1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the interfaces that X86 uses to lower LLVM code into a 11 // selection DAG. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "X86ISelLowering.h" 16 #include "Utils/X86ShuffleDecode.h" 17 #include "X86CallingConv.h" 18 #include "X86FrameLowering.h" 19 #include "X86InstrBuilder.h" 20 #include "X86MachineFunctionInfo.h" 21 #include "X86TargetMachine.h" 22 #include "X86TargetObjectFile.h" 23 #include "llvm/ADT/SmallBitVector.h" 24 #include "llvm/ADT/SmallSet.h" 25 #include "llvm/ADT/Statistic.h" 26 #include "llvm/ADT/StringExtras.h" 27 #include "llvm/ADT/StringSwitch.h" 28 #include "llvm/CodeGen/IntrinsicLowering.h" 29 #include "llvm/CodeGen/MachineFrameInfo.h" 30 #include "llvm/CodeGen/MachineFunction.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineJumpTableInfo.h" 33 #include "llvm/CodeGen/MachineModuleInfo.h" 34 #include "llvm/CodeGen/MachineRegisterInfo.h" 35 #include "llvm/CodeGen/WinEHFuncInfo.h" 36 #include "llvm/IR/CallSite.h" 37 #include "llvm/IR/CallingConv.h" 38 #include "llvm/IR/Constants.h" 39 #include "llvm/IR/DerivedTypes.h" 40 #include "llvm/IR/Function.h" 41 #include "llvm/IR/GlobalAlias.h" 42 #include "llvm/IR/GlobalVariable.h" 43 #include "llvm/IR/Instructions.h" 44 #include "llvm/IR/Intrinsics.h" 45 #include "llvm/MC/MCAsmInfo.h" 46 #include "llvm/MC/MCContext.h" 47 #include "llvm/MC/MCExpr.h" 48 #include "llvm/MC/MCSymbol.h" 49 #include "llvm/Support/CommandLine.h" 50 #include "llvm/Support/Debug.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Support/MathExtras.h" 53 #include "llvm/Target/TargetOptions.h" 54 #include "X86IntrinsicsInfo.h" 55 #include <bitset> 56 #include <numeric> 57 #include <cctype> 58 using namespace llvm; 59 60 #define DEBUG_TYPE "x86-isel" 61 62 STATISTIC(NumTailCalls, "Number of tail calls"); 63 64 static cl::opt<bool> ExperimentalVectorWideningLegalization( 65 "x86-experimental-vector-widening-legalization", cl::init(false), 66 cl::desc("Enable an experimental vector type legalization through widening " 67 "rather than promotion."), 68 cl::Hidden); 69 70 static cl::opt<int> ReciprocalEstimateRefinementSteps( 71 "x86-recip-refinement-steps", cl::init(1), 72 cl::desc("Specify the number of Newton-Raphson iterations applied to the " 73 "result of the hardware reciprocal estimate instruction."), 74 cl::NotHidden); 75 76 // Forward declarations. 77 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, 78 SDValue V2); 79 80 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, 81 const X86Subtarget &STI) 82 : TargetLowering(TM), Subtarget(&STI) { 83 X86ScalarSSEf64 = Subtarget->hasSSE2(); 84 X86ScalarSSEf32 = Subtarget->hasSSE1(); 85 TD = getDataLayout(); 86 87 // Set up the TargetLowering object. 88 static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; 89 90 // X86 is weird. It always uses i8 for shift amounts and setcc results. 91 setBooleanContents(ZeroOrOneBooleanContent); 92 // X86-SSE is even stranger. It uses -1 or 0 for vector masks. 
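  // SSE/AVX compares write an all-ones or all-zeros mask per lane, which is
  // exactly what ZeroOrNegativeOneBooleanContent describes to the rest of
  // codegen.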
93 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 94 95 // For 64-bit, since we have so many registers, use the ILP scheduler. 96 // For 32-bit, use the register pressure specific scheduling. 97 // For Atom, always use ILP scheduling. 98 if (Subtarget->isAtom()) 99 setSchedulingPreference(Sched::ILP); 100 else if (Subtarget->is64Bit()) 101 setSchedulingPreference(Sched::ILP); 102 else 103 setSchedulingPreference(Sched::RegPressure); 104 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 105 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); 106 107 // Bypass expensive divides on Atom when compiling with O2. 108 if (TM.getOptLevel() >= CodeGenOpt::Default) { 109 if (Subtarget->hasSlowDivide32()) 110 addBypassSlowDiv(32, 8); 111 if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) 112 addBypassSlowDiv(64, 16); 113 } 114 115 if (Subtarget->isTargetKnownWindowsMSVC()) { 116 // Setup Windows compiler runtime calls. 117 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 118 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 119 setLibcallName(RTLIB::SREM_I64, "_allrem"); 120 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 121 setLibcallName(RTLIB::MUL_I64, "_allmul"); 122 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 123 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 124 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 125 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 126 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 127 128 // The _ftol2 runtime function has an unusual calling conv, which 129 // is modeled by a special pseudo-instruction. 130 setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); 131 setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); 132 setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); 133 setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); 134 } 135 136 if (Subtarget->isTargetDarwin()) { 137 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 138 setUseUnderscoreSetJmp(false); 139 setUseUnderscoreLongJmp(false); 140 } else if (Subtarget->isTargetWindowsGNU()) { 141 // MS runtime is weird: it exports _setjmp, but longjmp! 142 setUseUnderscoreSetJmp(true); 143 setUseUnderscoreLongJmp(false); 144 } else { 145 setUseUnderscoreSetJmp(true); 146 setUseUnderscoreLongJmp(true); 147 } 148 149 // Set up the register classes. 150 addRegisterClass(MVT::i8, &X86::GR8RegClass); 151 addRegisterClass(MVT::i16, &X86::GR16RegClass); 152 addRegisterClass(MVT::i32, &X86::GR32RegClass); 153 if (Subtarget->is64Bit()) 154 addRegisterClass(MVT::i64, &X86::GR64RegClass); 155 156 for (MVT VT : MVT::integer_valuetypes()) 157 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 158 159 // We don't accept any truncstore of integer registers. 160 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 161 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 162 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 163 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 164 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 165 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 166 167 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 168 169 // SETOEQ and SETUNE require checking two conditions. 
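  // UCOMISS/UCOMISD set ZF for both "equal" and "unordered", so an ordered
  // equality test needs ZF=1 together with PF=0 (SETUNE needs the dual).
  // Expanding the condition code lets the legalizer split the compare into
  // two SETCCs combined with AND/OR, which ends up as roughly:
  //   ucomiss %xmm1, %xmm0
  //   setnp   %al
  //   sete    %cl
  //   andb    %al, %cl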
170 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 171 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 172 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 173 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 174 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 175 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 176 177 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 178 // operation. 179 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 180 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 181 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 182 183 if (Subtarget->is64Bit()) { 184 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 185 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 186 } else if (!TM.Options.UseSoftFloat) { 187 // We have an algorithm for SSE2->double, and we turn this into a 188 // 64-bit FILD followed by conditional FADD for other targets. 189 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 190 // We have an algorithm for SSE2, and we turn this into a 64-bit 191 // FILD for other targets. 192 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 193 } 194 195 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 196 // this operation. 197 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 198 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 199 200 if (!TM.Options.UseSoftFloat) { 201 // SSE has no i16 to fp conversion, only i32 202 if (X86ScalarSSEf32) { 203 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 204 // f32 and f64 cases are Legal, f80 case is not 205 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 206 } else { 207 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 208 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 209 } 210 } else { 211 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 212 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 213 } 214 215 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 216 // are Legal, f80 is custom lowered. 217 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 218 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 219 220 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 221 // this operation. 222 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 223 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 224 225 if (X86ScalarSSEf32) { 226 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 227 // f32 and f64 cases are Legal, f80 case is not 228 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 229 } else { 230 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 231 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 232 } 233 234 // Handle FP_TO_UINT by promoting the destination to a larger signed 235 // conversion. 236 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 237 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 238 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 239 240 if (Subtarget->is64Bit()) { 241 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 242 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 243 } else if (!TM.Options.UseSoftFloat) { 244 // Since AVX is a superset of SSE3, only check for SSE here. 245 if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) 246 // Expand FP_TO_UINT into a select. 247 // FIXME: We would like to use a Custom expander here eventually to do 248 // the optimal thing for SSE vs. 
the default expansion in the legalizer. 249 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 250 else 251 // With SSE3 we can use fisttpll to convert to a signed i64; without 252 // SSE, we're stuck with a fistpll. 253 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 254 } 255 256 if (isTargetFTOL()) { 257 // Use the _ftol2 runtime function, which has a pseudo-instruction 258 // to handle its weird calling convention. 259 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); 260 } 261 262 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 263 if (!X86ScalarSSEf64) { 264 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 265 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 266 if (Subtarget->is64Bit()) { 267 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 268 // Without SSE, i64->f64 goes through memory. 269 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 270 } 271 } 272 273 // Scalar integer divide and remainder are lowered to use operations that 274 // produce two results, to match the available instructions. This exposes 275 // the two-result form to trivial CSE, which is able to combine x/y and x%y 276 // into a single instruction. 277 // 278 // Scalar integer multiply-high is also lowered to use two-result 279 // operations, to match the available instructions. However, plain multiply 280 // (low) operations are left as Legal, as there are single-result 281 // instructions for this in x86. Using the two-result multiply instructions 282 // when both high and low results are needed must be arranged by dagcombine. 283 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { 284 MVT VT = IntVTs[i]; 285 setOperationAction(ISD::MULHS, VT, Expand); 286 setOperationAction(ISD::MULHU, VT, Expand); 287 setOperationAction(ISD::SDIV, VT, Expand); 288 setOperationAction(ISD::UDIV, VT, Expand); 289 setOperationAction(ISD::SREM, VT, Expand); 290 setOperationAction(ISD::UREM, VT, Expand); 291 292 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 
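    // Keeping the carry in EFLAGS lets ADDC/ADDE map onto ADD/ADC and
    // SUBC/SUBE onto SUB/SBB.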
293 setOperationAction(ISD::ADDC, VT, Custom); 294 setOperationAction(ISD::ADDE, VT, Custom); 295 setOperationAction(ISD::SUBC, VT, Custom); 296 setOperationAction(ISD::SUBE, VT, Custom); 297 } 298 299 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 300 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 301 setOperationAction(ISD::BR_CC , MVT::f32, Expand); 302 setOperationAction(ISD::BR_CC , MVT::f64, Expand); 303 setOperationAction(ISD::BR_CC , MVT::f80, Expand); 304 setOperationAction(ISD::BR_CC , MVT::i8, Expand); 305 setOperationAction(ISD::BR_CC , MVT::i16, Expand); 306 setOperationAction(ISD::BR_CC , MVT::i32, Expand); 307 setOperationAction(ISD::BR_CC , MVT::i64, Expand); 308 setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); 309 setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); 310 setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); 311 setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); 312 setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); 313 setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); 314 setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); 315 if (Subtarget->is64Bit()) 316 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 317 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 318 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 319 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 320 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 321 setOperationAction(ISD::FREM , MVT::f32 , Expand); 322 setOperationAction(ISD::FREM , MVT::f64 , Expand); 323 setOperationAction(ISD::FREM , MVT::f80 , Expand); 324 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 325 326 // Promote the i8 variants and force them on up to i32 which has a shorter 327 // encoding. 328 setOperationAction(ISD::CTTZ , MVT::i8 , Promote); 329 AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); 330 setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); 331 AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); 332 if (Subtarget->hasBMI()) { 333 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); 334 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); 335 if (Subtarget->is64Bit()) 336 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); 337 } else { 338 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 339 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 340 if (Subtarget->is64Bit()) 341 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 342 } 343 344 if (Subtarget->hasLZCNT()) { 345 // When promoting the i8 variants, force them to i32 for a shorter 346 // encoding. 
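    // LZCNT has no 8-bit register form, so the i8 count is computed in 32
    // bits.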
347 setOperationAction(ISD::CTLZ , MVT::i8 , Promote); 348 AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); 349 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); 350 AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); 351 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); 352 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); 353 if (Subtarget->is64Bit()) 354 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); 355 } else { 356 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 357 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 358 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 359 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); 360 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); 361 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); 362 if (Subtarget->is64Bit()) { 363 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 364 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); 365 } 366 } 367 368 // Special handling for half-precision floating point conversions. 369 // If we don't have F16C support, then lower half float conversions 370 // into library calls. 371 if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { 372 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 373 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 374 } 375 376 // There's never any support for operations beyond MVT::f32. 377 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 378 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); 379 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 380 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); 381 382 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 383 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 384 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); 385 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 386 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 387 setTruncStoreAction(MVT::f80, MVT::f16, Expand); 388 389 if (Subtarget->hasPOPCNT()) { 390 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 391 } else { 392 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 393 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 394 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 395 if (Subtarget->is64Bit()) 396 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 397 } 398 399 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 400 401 if (!Subtarget->hasMOVBE()) 402 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 403 404 // These should be promoted to a larger select which is supported. 405 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 406 // X86 wants to expand cmov itself. 
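  // The custom lowering turns these selects into X86ISD::CMOV nodes driven by
  // EFLAGS rather than letting the generic expansion fall back to branches.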
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your own
  // exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 480 } 481 482 if (Subtarget->is64Bit()) { 483 setExceptionPointerRegister(X86::RAX); 484 setExceptionSelectorRegister(X86::RDX); 485 } else { 486 setExceptionPointerRegister(X86::EAX); 487 setExceptionSelectorRegister(X86::EDX); 488 } 489 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 490 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 491 492 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); 493 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); 494 495 setOperationAction(ISD::TRAP, MVT::Other, Legal); 496 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 497 498 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 499 setOperationAction(ISD::VASTART , MVT::Other, Custom); 500 setOperationAction(ISD::VAEND , MVT::Other, Expand); 501 if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { 502 // TargetInfo::X86_64ABIBuiltinVaList 503 setOperationAction(ISD::VAARG , MVT::Other, Custom); 504 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 505 } else { 506 // TargetInfo::CharPtrBuiltinVaList 507 setOperationAction(ISD::VAARG , MVT::Other, Expand); 508 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 509 } 510 511 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 512 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 513 514 setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); 515 516 if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { 517 // f32 and f64 use SSE. 518 // Set up the FP register classes. 519 addRegisterClass(MVT::f32, &X86::FR32RegClass); 520 addRegisterClass(MVT::f64, &X86::FR64RegClass); 521 522 // Use ANDPD to simulate FABS. 523 setOperationAction(ISD::FABS , MVT::f64, Custom); 524 setOperationAction(ISD::FABS , MVT::f32, Custom); 525 526 // Use XORP to simulate FNEG. 527 setOperationAction(ISD::FNEG , MVT::f64, Custom); 528 setOperationAction(ISD::FNEG , MVT::f32, Custom); 529 530 // Use ANDPD and ORPD to simulate FCOPYSIGN. 531 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 532 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 533 534 // Lower this to FGETSIGNx86 plus an AND. 535 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 536 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 537 538 // We don't support sin/cos/fmod 539 setOperationAction(ISD::FSIN , MVT::f64, Expand); 540 setOperationAction(ISD::FCOS , MVT::f64, Expand); 541 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 542 setOperationAction(ISD::FSIN , MVT::f32, Expand); 543 setOperationAction(ISD::FCOS , MVT::f32, Expand); 544 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 545 546 // Expand FP immediates into loads from the stack, except for the special 547 // cases we handle. 548 addLegalFPImmediate(APFloat(+0.0)); // xorpd 549 addLegalFPImmediate(APFloat(+0.0f)); // xorps 550 } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { 551 // Use SSE for f32, x87 for f64. 552 // Set up the FP register classes. 553 addRegisterClass(MVT::f32, &X86::FR32RegClass); 554 addRegisterClass(MVT::f64, &X86::RFP64RegClass); 555 556 // Use ANDPS to simulate FABS. 557 setOperationAction(ISD::FABS , MVT::f32, Custom); 558 559 // Use XORP to simulate FNEG. 560 setOperationAction(ISD::FNEG , MVT::f32, Custom); 561 562 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 563 564 // Use ANDPS and ORPS to simulate FCOPYSIGN. 
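    // Only the f32 flavor gets the SSE bit-mask lowering here; f64 lives in
    // x87 registers in this configuration, so it falls back to the generic
    // expansion.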
565 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 566 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 567 568 // We don't support sin/cos/fmod 569 setOperationAction(ISD::FSIN , MVT::f32, Expand); 570 setOperationAction(ISD::FCOS , MVT::f32, Expand); 571 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 572 573 // Special cases we handle for FP constants. 574 addLegalFPImmediate(APFloat(+0.0f)); // xorps 575 addLegalFPImmediate(APFloat(+0.0)); // FLD0 576 addLegalFPImmediate(APFloat(+1.0)); // FLD1 577 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 578 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 579 580 if (!TM.Options.UnsafeFPMath) { 581 setOperationAction(ISD::FSIN , MVT::f64, Expand); 582 setOperationAction(ISD::FCOS , MVT::f64, Expand); 583 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 584 } 585 } else if (!TM.Options.UseSoftFloat) { 586 // f32 and f64 in x87. 587 // Set up the FP register classes. 588 addRegisterClass(MVT::f64, &X86::RFP64RegClass); 589 addRegisterClass(MVT::f32, &X86::RFP32RegClass); 590 591 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 592 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 593 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 594 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 595 596 if (!TM.Options.UnsafeFPMath) { 597 setOperationAction(ISD::FSIN , MVT::f64, Expand); 598 setOperationAction(ISD::FSIN , MVT::f32, Expand); 599 setOperationAction(ISD::FCOS , MVT::f64, Expand); 600 setOperationAction(ISD::FCOS , MVT::f32, Expand); 601 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 602 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 603 } 604 addLegalFPImmediate(APFloat(+0.0)); // FLD0 605 addLegalFPImmediate(APFloat(+1.0)); // FLD1 606 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 607 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 608 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 609 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 610 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 611 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 612 } 613 614 // We don't support FMA. 615 setOperationAction(ISD::FMA, MVT::f64, Expand); 616 setOperationAction(ISD::FMA, MVT::f32, Expand); 617 618 // Long double always uses X87. 619 if (!TM.Options.UseSoftFloat) { 620 addRegisterClass(MVT::f80, &X86::RFP80RegClass); 621 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 622 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 623 { 624 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 625 addLegalFPImmediate(TmpFlt); // FLD0 626 TmpFlt.changeSign(); 627 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 628 629 bool ignored; 630 APFloat TmpFlt2(+1.0); 631 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 632 &ignored); 633 addLegalFPImmediate(TmpFlt2); // FLD1 634 TmpFlt2.changeSign(); 635 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 636 } 637 638 if (!TM.Options.UnsafeFPMath) { 639 setOperationAction(ISD::FSIN , MVT::f80, Expand); 640 setOperationAction(ISD::FCOS , MVT::f80, Expand); 641 setOperationAction(ISD::FSINCOS, MVT::f80, Expand); 642 } 643 644 setOperationAction(ISD::FFLOOR, MVT::f80, Expand); 645 setOperationAction(ISD::FCEIL, MVT::f80, Expand); 646 setOperationAction(ISD::FTRUNC, MVT::f80, Expand); 647 setOperationAction(ISD::FRINT, MVT::f80, Expand); 648 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); 649 setOperationAction(ISD::FMA, MVT::f80, Expand); 650 } 651 652 // Always use a library call for pow. 
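  // Expand turns FPOW into calls to powf/pow/powl; there is no x87 or SSE
  // instruction for it.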
653 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 654 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 655 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 656 657 setOperationAction(ISD::FLOG, MVT::f80, Expand); 658 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 659 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 660 setOperationAction(ISD::FEXP, MVT::f80, Expand); 661 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 662 setOperationAction(ISD::FMINNUM, MVT::f80, Expand); 663 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); 664 665 // First set operation action for all vector types to either promote 666 // (for widening) or expand (for scalarization). Then we will selectively 667 // turn on ones that can be effectively codegen'd. 668 for (MVT VT : MVT::vector_valuetypes()) { 669 setOperationAction(ISD::ADD , VT, Expand); 670 setOperationAction(ISD::SUB , VT, Expand); 671 setOperationAction(ISD::FADD, VT, Expand); 672 setOperationAction(ISD::FNEG, VT, Expand); 673 setOperationAction(ISD::FSUB, VT, Expand); 674 setOperationAction(ISD::MUL , VT, Expand); 675 setOperationAction(ISD::FMUL, VT, Expand); 676 setOperationAction(ISD::SDIV, VT, Expand); 677 setOperationAction(ISD::UDIV, VT, Expand); 678 setOperationAction(ISD::FDIV, VT, Expand); 679 setOperationAction(ISD::SREM, VT, Expand); 680 setOperationAction(ISD::UREM, VT, Expand); 681 setOperationAction(ISD::LOAD, VT, Expand); 682 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 683 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); 684 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); 685 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); 686 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); 687 setOperationAction(ISD::FABS, VT, Expand); 688 setOperationAction(ISD::FSIN, VT, Expand); 689 setOperationAction(ISD::FSINCOS, VT, Expand); 690 setOperationAction(ISD::FCOS, VT, Expand); 691 setOperationAction(ISD::FSINCOS, VT, Expand); 692 setOperationAction(ISD::FREM, VT, Expand); 693 setOperationAction(ISD::FMA, VT, Expand); 694 setOperationAction(ISD::FPOWI, VT, Expand); 695 setOperationAction(ISD::FSQRT, VT, Expand); 696 setOperationAction(ISD::FCOPYSIGN, VT, Expand); 697 setOperationAction(ISD::FFLOOR, VT, Expand); 698 setOperationAction(ISD::FCEIL, VT, Expand); 699 setOperationAction(ISD::FTRUNC, VT, Expand); 700 setOperationAction(ISD::FRINT, VT, Expand); 701 setOperationAction(ISD::FNEARBYINT, VT, Expand); 702 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 703 setOperationAction(ISD::MULHS, VT, Expand); 704 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 705 setOperationAction(ISD::MULHU, VT, Expand); 706 setOperationAction(ISD::SDIVREM, VT, Expand); 707 setOperationAction(ISD::UDIVREM, VT, Expand); 708 setOperationAction(ISD::FPOW, VT, Expand); 709 setOperationAction(ISD::CTPOP, VT, Expand); 710 setOperationAction(ISD::CTTZ, VT, Expand); 711 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); 712 setOperationAction(ISD::CTLZ, VT, Expand); 713 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); 714 setOperationAction(ISD::SHL, VT, Expand); 715 setOperationAction(ISD::SRA, VT, Expand); 716 setOperationAction(ISD::SRL, VT, Expand); 717 setOperationAction(ISD::ROTL, VT, Expand); 718 setOperationAction(ISD::ROTR, VT, Expand); 719 setOperationAction(ISD::BSWAP, VT, Expand); 720 setOperationAction(ISD::SETCC, VT, Expand); 721 setOperationAction(ISD::FLOG, VT, Expand); 722 setOperationAction(ISD::FLOG2, VT, Expand); 723 setOperationAction(ISD::FLOG10, VT, Expand); 724 setOperationAction(ISD::FEXP, VT, 
Expand); 725 setOperationAction(ISD::FEXP2, VT, Expand); 726 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 727 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 728 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 729 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 730 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); 731 setOperationAction(ISD::TRUNCATE, VT, Expand); 732 setOperationAction(ISD::SIGN_EXTEND, VT, Expand); 733 setOperationAction(ISD::ZERO_EXTEND, VT, Expand); 734 setOperationAction(ISD::ANY_EXTEND, VT, Expand); 735 setOperationAction(ISD::VSELECT, VT, Expand); 736 setOperationAction(ISD::SELECT_CC, VT, Expand); 737 for (MVT InnerVT : MVT::vector_valuetypes()) { 738 setTruncStoreAction(InnerVT, VT, Expand); 739 740 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); 741 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); 742 743 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like 744 // types, we have to deal with them whether we ask for Expansion or not. 745 // Setting Expand causes its own optimisation problems though, so leave 746 // them legal. 747 if (VT.getVectorElementType() == MVT::i1) 748 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 749 750 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are 751 // split/scalarized right now. 752 if (VT.getVectorElementType() == MVT::f16) 753 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 754 } 755 } 756 757 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 758 // with -msoft-float, disable use of MMX as well. 759 if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { 760 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); 761 // No operations on x86mmx supported, everything uses intrinsics. 762 } 763 764 // MMX-sized vectors (other than x86mmx) are expected to be expanded 765 // into smaller operations. 
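  // x86mmx itself is only used via intrinsics and bitcasts; the plain 64-bit
  // vector types below are split or widened by type legalization instead.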
766 for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { 767 setOperationAction(ISD::MULHS, MMXTy, Expand); 768 setOperationAction(ISD::AND, MMXTy, Expand); 769 setOperationAction(ISD::OR, MMXTy, Expand); 770 setOperationAction(ISD::XOR, MMXTy, Expand); 771 setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); 772 setOperationAction(ISD::SELECT, MMXTy, Expand); 773 setOperationAction(ISD::BITCAST, MMXTy, Expand); 774 } 775 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 776 777 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { 778 addRegisterClass(MVT::v4f32, &X86::VR128RegClass); 779 780 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 781 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 782 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 783 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 784 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 785 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 786 setOperationAction(ISD::FABS, MVT::v4f32, Custom); 787 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 788 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 789 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 790 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); 791 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 792 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 793 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); 794 } 795 796 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { 797 addRegisterClass(MVT::v2f64, &X86::VR128RegClass); 798 799 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM 800 // registers cannot be used even for integer operations. 801 addRegisterClass(MVT::v16i8, &X86::VR128RegClass); 802 addRegisterClass(MVT::v8i16, &X86::VR128RegClass); 803 addRegisterClass(MVT::v4i32, &X86::VR128RegClass); 804 addRegisterClass(MVT::v2i64, &X86::VR128RegClass); 805 806 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 807 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 808 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 809 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 810 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 811 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 812 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); 813 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); 814 setOperationAction(ISD::MULHU, MVT::v8i16, Legal); 815 setOperationAction(ISD::MULHS, MVT::v8i16, Legal); 816 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 817 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 818 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 819 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 820 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 821 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 822 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 823 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 824 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 825 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 826 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 827 setOperationAction(ISD::FABS, MVT::v2f64, Custom); 828 829 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 830 setOperationAction(ISD::SETCC, MVT::v16i8, Custom); 831 setOperationAction(ISD::SETCC, MVT::v8i16, Custom); 832 setOperationAction(ISD::SETCC, MVT::v4i32, Custom); 833 834 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 835 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 836 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, 
Custom); 837 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 838 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 839 840 // Only provide customized ctpop vector bit twiddling for vector types we 841 // know to perform better than using the popcnt instructions on each vector 842 // element. If popcnt isn't supported, always provide the custom version. 843 if (!Subtarget->hasPOPCNT()) { 844 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 845 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 846 } 847 848 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 849 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { 850 MVT VT = (MVT::SimpleValueType)i; 851 // Do not attempt to custom lower non-power-of-2 vectors 852 if (!isPowerOf2_32(VT.getVectorNumElements())) 853 continue; 854 // Do not attempt to custom lower non-128-bit vectors 855 if (!VT.is128BitVector()) 856 continue; 857 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 858 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 859 setOperationAction(ISD::VSELECT, VT, Custom); 860 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 861 } 862 863 // We support custom legalizing of sext and anyext loads for specific 864 // memory vector types which we can load as a scalar (or sequence of 865 // scalars) and extend in-register to a legal 128-bit vector type. For sext 866 // loads these must work with a single scalar load. 867 for (MVT VT : MVT::integer_vector_valuetypes()) { 868 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); 869 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); 870 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); 871 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); 872 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); 873 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); 874 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); 875 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); 876 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); 877 } 878 879 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 880 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 881 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 882 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 883 setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); 884 setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); 885 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 886 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 887 888 if (Subtarget->is64Bit()) { 889 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 890 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 891 } 892 893 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
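    // Bitwise logic and whole-register loads don't care about the element
    // type, so funnelling every 128-bit integer type through v2i64 lets one
    // set of patterns cover them all.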
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
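    // These are the PMOVSX family: a narrow memory operand is loaded and
    // sign-extended in a single instruction.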
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
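    // Without AVX2's per-element shift instructions, the same lowering falls
    // back to splat-count shifts or scalarized sequences.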
1013 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 1014 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 1015 1016 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 1017 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 1018 1019 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 1020 } 1021 1022 if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { 1023 addRegisterClass(MVT::v32i8, &X86::VR256RegClass); 1024 addRegisterClass(MVT::v16i16, &X86::VR256RegClass); 1025 addRegisterClass(MVT::v8i32, &X86::VR256RegClass); 1026 addRegisterClass(MVT::v8f32, &X86::VR256RegClass); 1027 addRegisterClass(MVT::v4i64, &X86::VR256RegClass); 1028 addRegisterClass(MVT::v4f64, &X86::VR256RegClass); 1029 1030 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 1031 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 1032 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 1033 1034 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 1035 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 1036 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 1037 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 1038 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 1039 setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); 1040 setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); 1041 setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); 1042 setOperationAction(ISD::FRINT, MVT::v8f32, Legal); 1043 setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); 1044 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 1045 setOperationAction(ISD::FABS, MVT::v8f32, Custom); 1046 1047 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 1048 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 1049 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 1050 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 1051 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 1052 setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); 1053 setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); 1054 setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); 1055 setOperationAction(ISD::FRINT, MVT::v4f64, Legal); 1056 setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); 1057 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 1058 setOperationAction(ISD::FABS, MVT::v4f64, Custom); 1059 1060 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted 1061 // even though v8i16 is a legal type. 
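    // The conversion is done as v8f32 -> v8i32 (VCVTTPS2DQ) and the result is
    // then truncated to v8i16.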
1062 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); 1063 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); 1064 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 1065 1066 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); 1067 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 1068 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 1069 1070 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); 1071 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 1072 1073 for (MVT VT : MVT::fp_vector_valuetypes()) 1074 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); 1075 1076 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 1077 setOperationAction(ISD::SRL, MVT::v32i8, Custom); 1078 1079 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 1080 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 1081 1082 setOperationAction(ISD::SRA, MVT::v16i16, Custom); 1083 setOperationAction(ISD::SRA, MVT::v32i8, Custom); 1084 1085 setOperationAction(ISD::SETCC, MVT::v32i8, Custom); 1086 setOperationAction(ISD::SETCC, MVT::v16i16, Custom); 1087 setOperationAction(ISD::SETCC, MVT::v8i32, Custom); 1088 setOperationAction(ISD::SETCC, MVT::v4i64, Custom); 1089 1090 setOperationAction(ISD::SELECT, MVT::v4f64, Custom); 1091 setOperationAction(ISD::SELECT, MVT::v4i64, Custom); 1092 setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 1093 1094 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); 1095 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); 1096 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1097 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); 1098 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); 1099 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); 1100 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); 1101 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); 1102 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); 1103 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); 1104 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); 1105 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); 1106 1107 if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { 1108 setOperationAction(ISD::FMA, MVT::v8f32, Legal); 1109 setOperationAction(ISD::FMA, MVT::v4f64, Legal); 1110 setOperationAction(ISD::FMA, MVT::v4f32, Legal); 1111 setOperationAction(ISD::FMA, MVT::v2f64, Legal); 1112 setOperationAction(ISD::FMA, MVT::f32, Legal); 1113 setOperationAction(ISD::FMA, MVT::f64, Legal); 1114 } 1115 1116 if (Subtarget->hasInt256()) { 1117 setOperationAction(ISD::ADD, MVT::v4i64, Legal); 1118 setOperationAction(ISD::ADD, MVT::v8i32, Legal); 1119 setOperationAction(ISD::ADD, MVT::v16i16, Legal); 1120 setOperationAction(ISD::ADD, MVT::v32i8, Legal); 1121 1122 setOperationAction(ISD::SUB, MVT::v4i64, Legal); 1123 setOperationAction(ISD::SUB, MVT::v8i32, Legal); 1124 setOperationAction(ISD::SUB, MVT::v16i16, Legal); 1125 setOperationAction(ISD::SUB, MVT::v32i8, Legal); 1126 1127 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1128 setOperationAction(ISD::MUL, MVT::v8i32, Legal); 1129 setOperationAction(ISD::MUL, MVT::v16i16, Legal); 1130 // Don't lower v32i8 because there is no 128-bit byte mul 1131 1132 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); 1133 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); 1134 setOperationAction(ISD::MULHU, MVT::v16i16, Legal); 1135 setOperationAction(ISD::MULHS, MVT::v16i16, Legal); 1136 1137 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting 1138 // 
when we have a 256bit-wide blend with immediate. 1139 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); 1140 1141 // Only provide customized ctpop vector bit twiddling for vector types we 1142 // know to perform better than using the popcnt instructions on each 1143 // vector element. If popcnt isn't supported, always provide the custom 1144 // version. 1145 if (!Subtarget->hasPOPCNT()) 1146 setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); 1147 1148 // Custom CTPOP always performs better on natively supported v8i32 1149 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); 1150 1151 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X 1152 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); 1153 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); 1154 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); 1155 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); 1156 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); 1157 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); 1158 1159 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); 1160 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); 1161 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); 1162 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); 1163 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); 1164 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); 1165 } else { 1166 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 1167 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 1168 setOperationAction(ISD::ADD, MVT::v16i16, Custom); 1169 setOperationAction(ISD::ADD, MVT::v32i8, Custom); 1170 1171 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 1172 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 1173 setOperationAction(ISD::SUB, MVT::v16i16, Custom); 1174 setOperationAction(ISD::SUB, MVT::v32i8, Custom); 1175 1176 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1177 setOperationAction(ISD::MUL, MVT::v8i32, Custom); 1178 setOperationAction(ISD::MUL, MVT::v16i16, Custom); 1179 // Don't lower v32i8 because there is no 128-bit byte mul 1180 } 1181 1182 // In the customized shift lowering, the legal cases in AVX2 will be 1183 // recognized. 1184 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 1185 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 1186 1187 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 1188 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 1189 1190 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 1191 1192 // Custom lower several nodes for 256-bit types. 1193 for (MVT VT : MVT::vector_valuetypes()) { 1194 if (VT.getScalarSizeInBits() >= 32) { 1195 setOperationAction(ISD::MLOAD, VT, Legal); 1196 setOperationAction(ISD::MSTORE, VT, Legal); 1197 } 1198 // Extract subvector is special because the value type 1199 // (result) is 128-bit but the source is 256-bit wide. 
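      // That is why the action is keyed on the 128-bit result type even
      // though the node consumes a 256-bit source.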
1200 if (VT.is128BitVector()) { 1201 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1202 } 1203 // Do not attempt to custom lower other non-256-bit vectors 1204 if (!VT.is256BitVector()) 1205 continue; 1206 1207 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1208 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1209 setOperationAction(ISD::VSELECT, VT, Custom); 1210 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1211 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1212 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1213 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1214 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1215 } 1216 1217 if (Subtarget->hasInt256()) 1218 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); 1219 1220 1221 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1222 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { 1223 MVT VT = (MVT::SimpleValueType)i; 1224 1225 // Do not attempt to promote non-256-bit vectors 1226 if (!VT.is256BitVector()) 1227 continue; 1228 1229 setOperationAction(ISD::AND, VT, Promote); 1230 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 1231 setOperationAction(ISD::OR, VT, Promote); 1232 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 1233 setOperationAction(ISD::XOR, VT, Promote); 1234 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 1235 setOperationAction(ISD::LOAD, VT, Promote); 1236 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 1237 setOperationAction(ISD::SELECT, VT, Promote); 1238 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 1239 } 1240 } 1241 1242 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { 1243 addRegisterClass(MVT::v16i32, &X86::VR512RegClass); 1244 addRegisterClass(MVT::v16f32, &X86::VR512RegClass); 1245 addRegisterClass(MVT::v8i64, &X86::VR512RegClass); 1246 addRegisterClass(MVT::v8f64, &X86::VR512RegClass); 1247 1248 addRegisterClass(MVT::i1, &X86::VK1RegClass); 1249 addRegisterClass(MVT::v8i1, &X86::VK8RegClass); 1250 addRegisterClass(MVT::v16i1, &X86::VK16RegClass); 1251 1252 for (MVT VT : MVT::fp_vector_valuetypes()) 1253 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); 1254 1255 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 1256 setOperationAction(ISD::SETCC, MVT::i1, Custom); 1257 setOperationAction(ISD::XOR, MVT::i1, Legal); 1258 setOperationAction(ISD::OR, MVT::i1, Legal); 1259 setOperationAction(ISD::AND, MVT::i1, Legal); 1260 setOperationAction(ISD::LOAD, MVT::v16f32, Legal); 1261 setOperationAction(ISD::LOAD, MVT::v8f64, Legal); 1262 setOperationAction(ISD::LOAD, MVT::v8i64, Legal); 1263 setOperationAction(ISD::LOAD, MVT::v16i32, Legal); 1264 setOperationAction(ISD::LOAD, MVT::v16i1, Legal); 1265 1266 setOperationAction(ISD::FADD, MVT::v16f32, Legal); 1267 setOperationAction(ISD::FSUB, MVT::v16f32, Legal); 1268 setOperationAction(ISD::FMUL, MVT::v16f32, Legal); 1269 setOperationAction(ISD::FDIV, MVT::v16f32, Legal); 1270 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); 1271 setOperationAction(ISD::FNEG, MVT::v16f32, Custom); 1272 1273 setOperationAction(ISD::FADD, MVT::v8f64, Legal); 1274 setOperationAction(ISD::FSUB, MVT::v8f64, Legal); 1275 setOperationAction(ISD::FMUL, MVT::v8f64, Legal); 1276 setOperationAction(ISD::FDIV, MVT::v8f64, Legal); 1277 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); 1278 setOperationAction(ISD::FNEG, MVT::v8f64, Custom); 1279 setOperationAction(ISD::FMA, MVT::v8f64, Legal); 1280 setOperationAction(ISD::FMA, MVT::v16f32, Legal); 1281 1282 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); 1283 
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); 1284 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); 1285 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); 1286 if (Subtarget->is64Bit()) { 1287 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); 1288 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); 1289 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); 1290 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); 1291 } 1292 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); 1293 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); 1294 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); 1295 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); 1296 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); 1297 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); 1298 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); 1299 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); 1300 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); 1301 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); 1302 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); 1303 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); 1304 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); 1305 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); 1306 1307 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); 1308 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); 1309 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); 1310 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); 1311 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); 1312 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); 1313 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 1314 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); 1315 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 1316 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); 1317 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); 1318 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); 1319 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1320 1321 setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); 1322 setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); 1323 setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); 1324 setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); 1325 setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); 1326 setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); 1327 setOperationAction(ISD::FRINT, MVT::v16f32, Legal); 1328 setOperationAction(ISD::FRINT, MVT::v8f64, Legal); 1329 setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); 1330 setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); 1331 1332 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); 1333 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); 1334 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); 1335 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); 1336 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); 1337 1338 setOperationAction(ISD::SETCC, MVT::v16i1, Custom); 1339 setOperationAction(ISD::SETCC, MVT::v8i1, Custom); 1340 1341 setOperationAction(ISD::MUL, MVT::v8i64, Custom); 1342 1343 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); 1344 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); 1345 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); 1346 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); 1347 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, 
Custom); 1348 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); 1349 setOperationAction(ISD::SELECT, MVT::v8f64, Custom); 1350 setOperationAction(ISD::SELECT, MVT::v8i64, Custom); 1351 setOperationAction(ISD::SELECT, MVT::v16f32, Custom); 1352 1353 setOperationAction(ISD::ADD, MVT::v8i64, Legal); 1354 setOperationAction(ISD::ADD, MVT::v16i32, Legal); 1355 1356 setOperationAction(ISD::SUB, MVT::v8i64, Legal); 1357 setOperationAction(ISD::SUB, MVT::v16i32, Legal); 1358 1359 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 1360 1361 setOperationAction(ISD::SRL, MVT::v8i64, Custom); 1362 setOperationAction(ISD::SRL, MVT::v16i32, Custom); 1363 1364 setOperationAction(ISD::SHL, MVT::v8i64, Custom); 1365 setOperationAction(ISD::SHL, MVT::v16i32, Custom); 1366 1367 setOperationAction(ISD::SRA, MVT::v8i64, Custom); 1368 setOperationAction(ISD::SRA, MVT::v16i32, Custom); 1369 1370 setOperationAction(ISD::AND, MVT::v8i64, Legal); 1371 setOperationAction(ISD::OR, MVT::v8i64, Legal); 1372 setOperationAction(ISD::XOR, MVT::v8i64, Legal); 1373 setOperationAction(ISD::AND, MVT::v16i32, Legal); 1374 setOperationAction(ISD::OR, MVT::v16i32, Legal); 1375 setOperationAction(ISD::XOR, MVT::v16i32, Legal); 1376 1377 if (Subtarget->hasCDI()) { 1378 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); 1379 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); 1380 } 1381 1382 // Custom lower several nodes. 1383 for (MVT VT : MVT::vector_valuetypes()) { 1384 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1385 // Extract subvector is special because the value type 1386 // (result) is 256/128-bit but the source is 512-bit wide. 1387 if (VT.is128BitVector() || VT.is256BitVector()) { 1388 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1389 } 1390 if (VT.getVectorElementType() == MVT::i1) 1391 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 1392 1393 // Do not attempt to custom lower other non-512-bit vectors 1394 if (!VT.is512BitVector()) 1395 continue; 1396 1397 if ( EltSize >= 32) { 1398 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1399 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1400 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1401 setOperationAction(ISD::VSELECT, VT, Legal); 1402 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1403 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1404 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1405 setOperationAction(ISD::MLOAD, VT, Legal); 1406 setOperationAction(ISD::MSTORE, VT, Legal); 1407 } 1408 } 1409 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { 1410 MVT VT = (MVT::SimpleValueType)i; 1411 1412 // Do not attempt to promote non-512-bit vectors. 
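      // The 512-bit types that remain have their SELECT promoted to v8i64, so a
      // single v8i64 lowering pattern handles every element width.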
1413 if (!VT.is512BitVector()) 1414 continue; 1415 1416 setOperationAction(ISD::SELECT, VT, Promote); 1417 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); 1418 } 1419 }// has AVX-512 1420 1421 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { 1422 addRegisterClass(MVT::v32i16, &X86::VR512RegClass); 1423 addRegisterClass(MVT::v64i8, &X86::VR512RegClass); 1424 1425 addRegisterClass(MVT::v32i1, &X86::VK32RegClass); 1426 addRegisterClass(MVT::v64i1, &X86::VK64RegClass); 1427 1428 setOperationAction(ISD::LOAD, MVT::v32i16, Legal); 1429 setOperationAction(ISD::LOAD, MVT::v64i8, Legal); 1430 setOperationAction(ISD::SETCC, MVT::v32i1, Custom); 1431 setOperationAction(ISD::SETCC, MVT::v64i1, Custom); 1432 setOperationAction(ISD::ADD, MVT::v32i16, Legal); 1433 setOperationAction(ISD::ADD, MVT::v64i8, Legal); 1434 setOperationAction(ISD::SUB, MVT::v32i16, Legal); 1435 setOperationAction(ISD::SUB, MVT::v64i8, Legal); 1436 setOperationAction(ISD::MUL, MVT::v32i16, Legal); 1437 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); 1438 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); 1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); 1440 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); 1441 1442 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { 1443 const MVT VT = (MVT::SimpleValueType)i; 1444 1445 const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1446 1447 // Do not attempt to promote non-512-bit vectors. 1448 if (!VT.is512BitVector()) 1449 continue; 1450 1451 if (EltSize < 32) { 1452 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1453 setOperationAction(ISD::VSELECT, VT, Legal); 1454 } 1455 } 1456 } 1457 1458 if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { 1459 addRegisterClass(MVT::v4i1, &X86::VK4RegClass); 1460 addRegisterClass(MVT::v2i1, &X86::VK2RegClass); 1461 1462 setOperationAction(ISD::SETCC, MVT::v4i1, Custom); 1463 setOperationAction(ISD::SETCC, MVT::v2i1, Custom); 1464 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); 1465 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 1466 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); 1467 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); 1468 1469 setOperationAction(ISD::AND, MVT::v8i32, Legal); 1470 setOperationAction(ISD::OR, MVT::v8i32, Legal); 1471 setOperationAction(ISD::XOR, MVT::v8i32, Legal); 1472 setOperationAction(ISD::AND, MVT::v4i32, Legal); 1473 setOperationAction(ISD::OR, MVT::v4i32, Legal); 1474 setOperationAction(ISD::XOR, MVT::v4i32, Legal); 1475 } 1476 1477 // We want to custom lower some of our intrinsics. 1478 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1479 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 1480 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1481 if (!Subtarget->is64Bit()) 1482 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); 1483 1484 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1485 // handle type legalization for these operations here. 1486 // 1487 // FIXME: We really should do custom legalization for addition and 1488 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1489 // than generic legalization for 64-bit multiplication-with-overflow, though. 1490 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1491 // Add/Sub/Mul with overflow operations are custom lowered. 
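    // Custom lowering turns each of these into an X86 arithmetic node plus a
    // flag read, so the overflow bit comes straight from EFLAGS.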
1492 MVT VT = IntVTs[i]; 1493 setOperationAction(ISD::SADDO, VT, Custom); 1494 setOperationAction(ISD::UADDO, VT, Custom); 1495 setOperationAction(ISD::SSUBO, VT, Custom); 1496 setOperationAction(ISD::USUBO, VT, Custom); 1497 setOperationAction(ISD::SMULO, VT, Custom); 1498 setOperationAction(ISD::UMULO, VT, Custom); 1499 } 1500 1501 1502 if (!Subtarget->is64Bit()) { 1503 // These libcalls are not available in 32-bit. 1504 setLibcallName(RTLIB::SHL_I128, nullptr); 1505 setLibcallName(RTLIB::SRL_I128, nullptr); 1506 setLibcallName(RTLIB::SRA_I128, nullptr); 1507 } 1508 1509 // Combine sin / cos into one node or libcall if possible. 1510 if (Subtarget->hasSinCos()) { 1511 setLibcallName(RTLIB::SINCOS_F32, "sincosf"); 1512 setLibcallName(RTLIB::SINCOS_F64, "sincos"); 1513 if (Subtarget->isTargetDarwin()) { 1514 // For MacOSX, we don't want the normal expansion of a libcall to sincos. 1515 // We want to issue a libcall to __sincos_stret to avoid memory traffic. 1516 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1517 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1518 } 1519 } 1520 1521 if (Subtarget->isTargetWin64()) { 1522 setOperationAction(ISD::SDIV, MVT::i128, Custom); 1523 setOperationAction(ISD::UDIV, MVT::i128, Custom); 1524 setOperationAction(ISD::SREM, MVT::i128, Custom); 1525 setOperationAction(ISD::UREM, MVT::i128, Custom); 1526 setOperationAction(ISD::SDIVREM, MVT::i128, Custom); 1527 setOperationAction(ISD::UDIVREM, MVT::i128, Custom); 1528 } 1529 1530 // We have target-specific dag combine patterns for the following nodes: 1531 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1532 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1533 setTargetDAGCombine(ISD::BITCAST); 1534 setTargetDAGCombine(ISD::VSELECT); 1535 setTargetDAGCombine(ISD::SELECT); 1536 setTargetDAGCombine(ISD::SHL); 1537 setTargetDAGCombine(ISD::SRA); 1538 setTargetDAGCombine(ISD::SRL); 1539 setTargetDAGCombine(ISD::OR); 1540 setTargetDAGCombine(ISD::AND); 1541 setTargetDAGCombine(ISD::ADD); 1542 setTargetDAGCombine(ISD::FADD); 1543 setTargetDAGCombine(ISD::FSUB); 1544 setTargetDAGCombine(ISD::FMA); 1545 setTargetDAGCombine(ISD::SUB); 1546 setTargetDAGCombine(ISD::LOAD); 1547 setTargetDAGCombine(ISD::MLOAD); 1548 setTargetDAGCombine(ISD::STORE); 1549 setTargetDAGCombine(ISD::MSTORE); 1550 setTargetDAGCombine(ISD::ZERO_EXTEND); 1551 setTargetDAGCombine(ISD::ANY_EXTEND); 1552 setTargetDAGCombine(ISD::SIGN_EXTEND); 1553 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 1554 setTargetDAGCombine(ISD::TRUNCATE); 1555 setTargetDAGCombine(ISD::SINT_TO_FP); 1556 setTargetDAGCombine(ISD::SETCC); 1557 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 1558 setTargetDAGCombine(ISD::BUILD_VECTOR); 1559 setTargetDAGCombine(ISD::MUL); 1560 setTargetDAGCombine(ISD::XOR); 1561 1562 computeRegisterProperties(Subtarget->getRegisterInfo()); 1563 1564 // On Darwin, -Os means optimize for size without hurting performance, 1565 // do not reduce the limit. 1566 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1567 MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 1568 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1569 MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1570 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1571 MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1572 setPrefLoopAlignment(4); // 2^4 bytes. 1573 1574 // Predictable cmov don't hurt on atom because it's in-order. 
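  // On out-of-order cores a well-predicted branch is usually cheaper than a
  // cmov, so selects are marked expensive everywhere except Atom.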
1575 PredictableSelectIsExpensive = !Subtarget->isAtom(); 1576 EnableExtLdPromotion = true; 1577 setPrefFunctionAlignment(4); // 2^4 bytes. 1578 1579 verifyIntrinsicTables(); 1580 } 1581 1582 // This has so far only been implemented for 64-bit MachO. 1583 bool X86TargetLowering::useLoadStackGuardNode() const { 1584 return Subtarget->isTargetMachO() && Subtarget->is64Bit(); 1585 } 1586 1587 TargetLoweringBase::LegalizeTypeAction 1588 X86TargetLowering::getPreferredVectorAction(EVT VT) const { 1589 if (ExperimentalVectorWideningLegalization && 1590 VT.getVectorNumElements() != 1 && 1591 VT.getVectorElementType().getSimpleVT() != MVT::i1) 1592 return TypeWidenVector; 1593 1594 return TargetLoweringBase::getPreferredVectorAction(VT); 1595 } 1596 1597 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { 1598 if (!VT.isVector()) 1599 return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; 1600 1601 const unsigned NumElts = VT.getVectorNumElements(); 1602 const EVT EltVT = VT.getVectorElementType(); 1603 if (VT.is512BitVector()) { 1604 if (Subtarget->hasAVX512()) 1605 if (EltVT == MVT::i32 || EltVT == MVT::i64 || 1606 EltVT == MVT::f32 || EltVT == MVT::f64) 1607 switch(NumElts) { 1608 case 8: return MVT::v8i1; 1609 case 16: return MVT::v16i1; 1610 } 1611 if (Subtarget->hasBWI()) 1612 if (EltVT == MVT::i8 || EltVT == MVT::i16) 1613 switch(NumElts) { 1614 case 32: return MVT::v32i1; 1615 case 64: return MVT::v64i1; 1616 } 1617 } 1618 1619 if (VT.is256BitVector() || VT.is128BitVector()) { 1620 if (Subtarget->hasVLX()) 1621 if (EltVT == MVT::i32 || EltVT == MVT::i64 || 1622 EltVT == MVT::f32 || EltVT == MVT::f64) 1623 switch(NumElts) { 1624 case 2: return MVT::v2i1; 1625 case 4: return MVT::v4i1; 1626 case 8: return MVT::v8i1; 1627 } 1628 if (Subtarget->hasBWI() && Subtarget->hasVLX()) 1629 if (EltVT == MVT::i8 || EltVT == MVT::i16) 1630 switch(NumElts) { 1631 case 8: return MVT::v8i1; 1632 case 16: return MVT::v16i1; 1633 case 32: return MVT::v32i1; 1634 } 1635 } 1636 1637 return VT.changeVectorElementTypeToInteger(); 1638 } 1639 1640 /// Helper for getByValTypeAlignment to determine 1641 /// the desired ByVal argument alignment. 1642 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1643 if (MaxAlign == 16) 1644 return; 1645 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1646 if (VTy->getBitWidth() == 128) 1647 MaxAlign = 16; 1648 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1649 unsigned EltAlign = 0; 1650 getMaxByValAlign(ATy->getElementType(), EltAlign); 1651 if (EltAlign > MaxAlign) 1652 MaxAlign = EltAlign; 1653 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1654 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1655 unsigned EltAlign = 0; 1656 getMaxByValAlign(STy->getElementType(i), EltAlign); 1657 if (EltAlign > MaxAlign) 1658 MaxAlign = EltAlign; 1659 if (MaxAlign == 16) 1660 break; 1661 } 1662 } 1663 } 1664 1665 /// Return the desired alignment for ByVal aggregate 1666 /// function arguments in the caller parameter area. For X86, aggregates 1667 /// that contain SSE vectors are placed at 16-byte boundaries while the rest 1668 /// are at 4-byte boundaries. 1669 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 1670 if (Subtarget->is64Bit()) { 1671 // Max of 8 and alignment of type. 
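    // That is, a byval aggregate is never aligned to less than 8 bytes on
    // 64-bit targets.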
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// Returns the target-specific optimal type for load and store operations as
/// a result of memset, memcpy, and memmove lowering. If DstAlign is zero,
/// the destination's alignment can satisfy any constraint. Similarly, if
/// SrcAlign is zero, there is no need to check it against an alignment
/// requirement, probably because the source does not need to be loaded. If
/// 'IsMemset' is true, we are expanding a memset; if 'ZeroMemset' is also
/// true, it is a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant, so it does not need to be loaded.
/// Returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if ((!IsMemset || ZeroMemset) &&
      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      if (Size >= 32) {
        if (Subtarget->hasInt256())
          return MVT::v8i32;
        if (Subtarget->hasFp256())
          return MVT::v8f32;
      }
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if the source is a string constant.
      // It's better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast)
    *Fast = Subtarget->isUnalignedMemAccessFast();
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
1760 return TargetLowering::getJumpTableEncoding(); 1761 } 1762 1763 const MCExpr * 1764 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1765 const MachineBasicBlock *MBB, 1766 unsigned uid,MCContext &Ctx) const{ 1767 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && 1768 Subtarget->isPICStyleGOT()); 1769 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1770 // entries. 1771 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1772 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1773 } 1774 1775 /// Returns relocation base for the given PIC jumptable. 1776 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1777 SelectionDAG &DAG) const { 1778 if (!Subtarget->is64Bit()) 1779 // This doesn't have SDLoc associated with it, but is not really the 1780 // same as a Register. 1781 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); 1782 return Table; 1783 } 1784 1785 /// This returns the relocation base for the given PIC jumptable, 1786 /// the same as getPICJumpTableRelocBase, but as an MCExpr. 1787 const MCExpr *X86TargetLowering:: 1788 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1789 MCContext &Ctx) const { 1790 // X86-64 uses RIP relative addressing based on the jump table label. 1791 if (Subtarget->isPICStyleRIPRel()) 1792 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1793 1794 // Otherwise, the reference is relative to the PIC base. 1795 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1796 } 1797 1798 std::pair<const TargetRegisterClass *, uint8_t> 1799 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1800 MVT VT) const { 1801 const TargetRegisterClass *RRC = nullptr; 1802 uint8_t Cost = 1; 1803 switch (VT.SimpleTy) { 1804 default: 1805 return TargetLowering::findRepresentativeClass(TRI, VT); 1806 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1807 RRC = Subtarget->is64Bit() ? 
&X86::GR64RegClass : &X86::GR32RegClass; 1808 break; 1809 case MVT::x86mmx: 1810 RRC = &X86::VR64RegClass; 1811 break; 1812 case MVT::f32: case MVT::f64: 1813 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1814 case MVT::v4f32: case MVT::v2f64: 1815 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1816 case MVT::v4f64: 1817 RRC = &X86::VR128RegClass; 1818 break; 1819 } 1820 return std::make_pair(RRC, Cost); 1821 } 1822 1823 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1824 unsigned &Offset) const { 1825 if (!Subtarget->isTargetLinux()) 1826 return false; 1827 1828 if (Subtarget->is64Bit()) { 1829 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1830 Offset = 0x28; 1831 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1832 AddressSpace = 256; 1833 else 1834 AddressSpace = 257; 1835 } else { 1836 // %gs:0x14 on i386 1837 Offset = 0x14; 1838 AddressSpace = 256; 1839 } 1840 return true; 1841 } 1842 1843 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 1844 unsigned DestAS) const { 1845 assert(SrcAS != DestAS && "Expected different address spaces!"); 1846 1847 return SrcAS < 256 && DestAS < 256; 1848 } 1849 1850 //===----------------------------------------------------------------------===// 1851 // Return Value Calling Convention Implementation 1852 //===----------------------------------------------------------------------===// 1853 1854 #include "X86GenCallingConv.inc" 1855 1856 bool 1857 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1858 MachineFunction &MF, bool isVarArg, 1859 const SmallVectorImpl<ISD::OutputArg> &Outs, 1860 LLVMContext &Context) const { 1861 SmallVector<CCValAssign, 16> RVLocs; 1862 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 1863 return CCInfo.CheckReturn(Outs, RetCC_X86); 1864 } 1865 1866 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 1867 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; 1868 return ScratchRegs; 1869 } 1870 1871 SDValue 1872 X86TargetLowering::LowerReturn(SDValue Chain, 1873 CallingConv::ID CallConv, bool isVarArg, 1874 const SmallVectorImpl<ISD::OutputArg> &Outs, 1875 const SmallVectorImpl<SDValue> &OutVals, 1876 SDLoc dl, SelectionDAG &DAG) const { 1877 MachineFunction &MF = DAG.getMachineFunction(); 1878 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1879 1880 SmallVector<CCValAssign, 16> RVLocs; 1881 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); 1882 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1883 1884 SDValue Flag; 1885 SmallVector<SDValue, 6> RetOps; 1886 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1887 // Operand #1 = Bytes To Pop 1888 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1889 MVT::i16)); 1890 1891 // Copy the result values into the output registers. 1892 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1893 CCValAssign &VA = RVLocs[i]; 1894 assert(VA.isRegLoc() && "Can only return in registers!"); 1895 SDValue ValToCopy = OutVals[i]; 1896 EVT ValVT = ValToCopy.getValueType(); 1897 1898 // Promote values to the appropriate types. 
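    // The CCValAssign location info says whether the value must be sign-,
    // zero-, or any-extended (or bitcast) to the location's type before the copy.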
1899 if (VA.getLocInfo() == CCValAssign::SExt) 1900 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1901 else if (VA.getLocInfo() == CCValAssign::ZExt) 1902 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1903 else if (VA.getLocInfo() == CCValAssign::AExt) 1904 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1905 else if (VA.getLocInfo() == CCValAssign::BCvt) 1906 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1907 1908 assert(VA.getLocInfo() != CCValAssign::FPExt && 1909 "Unexpected FP-extend for return value."); 1910 1911 // If this is x86-64, and we disabled SSE, we can't return FP values, 1912 // or SSE or MMX vectors. 1913 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1914 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1915 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1916 report_fatal_error("SSE register return with SSE disabled"); 1917 } 1918 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1919 // llvm-gcc has never done it right and no one has noticed, so this 1920 // should be OK for now. 1921 if (ValVT == MVT::f64 && 1922 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1923 report_fatal_error("SSE2 register return with SSE2 disabled"); 1924 1925 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1926 // the RET instruction and handled by the FP Stackifier. 1927 if (VA.getLocReg() == X86::FP0 || 1928 VA.getLocReg() == X86::FP1) { 1929 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1930 // change the value to the FP stack register class. 1931 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1932 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1933 RetOps.push_back(ValToCopy); 1934 // Don't emit a copytoreg. 1935 continue; 1936 } 1937 1938 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1939 // which is returned in RAX / RDX. 1940 if (Subtarget->is64Bit()) { 1941 if (ValVT == MVT::x86mmx) { 1942 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1943 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1944 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1945 ValToCopy); 1946 // If we don't have SSE2 available, convert to v4f32 so the generated 1947 // register is legal. 1948 if (!Subtarget->hasSSE2()) 1949 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1950 } 1951 } 1952 } 1953 1954 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1955 Flag = Chain.getValue(1); 1956 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1957 } 1958 1959 // The x86-64 ABIs require that for returning structs by value we copy 1960 // the sret argument into %rax/%eax (depending on ABI) for the return. 1961 // Win32 requires us to put the sret argument to %eax as well. 1962 // We saved the argument into a virtual register in the entry block, 1963 // so now we copy the value out and into %rax/%eax. 1964 // 1965 // Checking Function.hasStructRetAttr() here is insufficient because the IR 1966 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is 1967 // false, then an sret argument may be implicitly inserted in the SelDAG. In 1968 // either case FuncInfo->setSRetReturnReg() will have been called. 
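  // Keying on the saved sret register below therefore covers both the
  // explicit and the implicit sret cases.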
1969 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { 1970 assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) && 1971 "No need for an sret register"); 1972 SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy()); 1973 1974 unsigned RetValReg 1975 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 1976 X86::RAX : X86::EAX; 1977 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); 1978 Flag = Chain.getValue(1); 1979 1980 // RAX/EAX now acts like a return value. 1981 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); 1982 } 1983 1984 RetOps[0] = Chain; // Update chain. 1985 1986 // Add the flag if we have it. 1987 if (Flag.getNode()) 1988 RetOps.push_back(Flag); 1989 1990 return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); 1991 } 1992 1993 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1994 if (N->getNumValues() != 1) 1995 return false; 1996 if (!N->hasNUsesOfValue(1, 0)) 1997 return false; 1998 1999 SDValue TCChain = Chain; 2000 SDNode *Copy = *N->use_begin(); 2001 if (Copy->getOpcode() == ISD::CopyToReg) { 2002 // If the copy has a glue operand, we conservatively assume it isn't safe to 2003 // perform a tail call. 2004 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2005 return false; 2006 TCChain = Copy->getOperand(0); 2007 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 2008 return false; 2009 2010 bool HasRet = false; 2011 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2012 UI != UE; ++UI) { 2013 if (UI->getOpcode() != X86ISD::RET_FLAG) 2014 return false; 2015 // If we are returning more than one value, we can definitely 2016 // not make a tail call see PR19530 2017 if (UI->getNumOperands() > 4) 2018 return false; 2019 if (UI->getNumOperands() == 4 && 2020 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) 2021 return false; 2022 HasRet = true; 2023 } 2024 2025 if (!HasRet) 2026 return false; 2027 2028 Chain = TCChain; 2029 return true; 2030 } 2031 2032 EVT 2033 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 2034 ISD::NodeType ExtendKind) const { 2035 MVT ReturnMVT; 2036 // TODO: Is this also valid on 32-bit? 2037 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 2038 ReturnMVT = MVT::i8; 2039 else 2040 ReturnMVT = MVT::i32; 2041 2042 EVT MinVT = getRegisterType(Context, ReturnMVT); 2043 return VT.bitsLT(MinVT) ? MinVT : VT; 2044 } 2045 2046 /// Lower the result values of a call into the 2047 /// appropriate copies out of appropriate physical registers. 2048 /// 2049 SDValue 2050 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 2051 CallingConv::ID CallConv, bool isVarArg, 2052 const SmallVectorImpl<ISD::InputArg> &Ins, 2053 SDLoc dl, SelectionDAG &DAG, 2054 SmallVectorImpl<SDValue> &InVals) const { 2055 2056 // Assign locations to each value returned by this call. 2057 SmallVector<CCValAssign, 16> RVLocs; 2058 bool Is64Bit = Subtarget->is64Bit(); 2059 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2060 *DAG.getContext()); 2061 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2062 2063 // Copy all of the result registers out of their specified physreg. 
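  // Results that come back on the x87 stack (FP0/FP1) are copied out as f80
  // and rounded back down to the expected value type below.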
2064 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2065 CCValAssign &VA = RVLocs[i]; 2066 EVT CopyVT = VA.getValVT(); 2067 2068 // If this is x86-64, and we disabled SSE, we can't return FP values 2069 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 2070 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 2071 report_fatal_error("SSE register return with SSE disabled"); 2072 } 2073 2074 // If we prefer to use the value in xmm registers, copy it out as f80 and 2075 // use a truncate to move it from fp stack reg to xmm reg. 2076 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && 2077 isScalarFPTypeInSSEReg(VA.getValVT())) 2078 CopyVT = MVT::f80; 2079 2080 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 2081 CopyVT, InFlag).getValue(1); 2082 SDValue Val = Chain.getValue(0); 2083 2084 if (CopyVT != VA.getValVT()) 2085 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 2086 // This truncation won't change the value. 2087 DAG.getIntPtrConstant(1)); 2088 2089 InFlag = Chain.getValue(2); 2090 InVals.push_back(Val); 2091 } 2092 2093 return Chain; 2094 } 2095 2096 //===----------------------------------------------------------------------===// 2097 // C & StdCall & Fast Calling Convention implementation 2098 //===----------------------------------------------------------------------===// 2099 // StdCall calling convention seems to be standard for many Windows' API 2100 // routines and around. It differs from C calling convention just a little: 2101 // callee should clean up the stack, not caller. Symbols should be also 2102 // decorated in some fancy way :) It doesn't support any vector arguments. 2103 // For info on fast calling convention see Fast Calling Convention (tail call) 2104 // implementation LowerX86_32FastCCCallTo. 2105 2106 /// CallIsStructReturn - Determines whether a call uses struct return 2107 /// semantics. 2108 enum StructReturnType { 2109 NotStructReturn, 2110 RegStructReturn, 2111 StackStructReturn 2112 }; 2113 static StructReturnType 2114 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 2115 if (Outs.empty()) 2116 return NotStructReturn; 2117 2118 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 2119 if (!Flags.isSRet()) 2120 return NotStructReturn; 2121 if (Flags.isInReg()) 2122 return RegStructReturn; 2123 return StackStructReturn; 2124 } 2125 2126 /// Determines whether a function uses struct return semantics. 2127 static StructReturnType 2128 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 2129 if (Ins.empty()) 2130 return NotStructReturn; 2131 2132 const ISD::ArgFlagsTy &Flags = Ins[0].Flags; 2133 if (!Flags.isSRet()) 2134 return NotStructReturn; 2135 if (Flags.isInReg()) 2136 return RegStructReturn; 2137 return StackStructReturn; 2138 } 2139 2140 /// Make a copy of an aggregate at address specified by "Src" to address 2141 /// "Dst" with size and alignment information specified by the specific 2142 /// parameter attribute. The copy will be passed as a byval function parameter. 
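/// The copy is emitted as an always-inline memcpy node, so no libcall is
/// introduced in the middle of the call sequence.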
2143 static SDValue 2144 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 2145 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 2146 SDLoc dl) { 2147 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 2148 2149 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 2150 /*isVolatile*/false, /*AlwaysInline=*/true, 2151 /*isTailCall*/false, 2152 MachinePointerInfo(), MachinePointerInfo()); 2153 } 2154 2155 /// Return true if the calling convention is one that 2156 /// supports tail call optimization. 2157 static bool IsTailCallConvention(CallingConv::ID CC) { 2158 return (CC == CallingConv::Fast || CC == CallingConv::GHC || 2159 CC == CallingConv::HiPE); 2160 } 2161 2162 /// \brief Return true if the calling convention is a C calling convention. 2163 static bool IsCCallConvention(CallingConv::ID CC) { 2164 return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || 2165 CC == CallingConv::X86_64_SysV); 2166 } 2167 2168 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 2169 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) 2170 return false; 2171 2172 CallSite CS(CI); 2173 CallingConv::ID CalleeCC = CS.getCallingConv(); 2174 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 2175 return false; 2176 2177 return true; 2178 } 2179 2180 /// Return true if the function is being made into 2181 /// a tailcall target by changing its ABI. 2182 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, 2183 bool GuaranteedTailCallOpt) { 2184 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 2185 } 2186 2187 SDValue 2188 X86TargetLowering::LowerMemArgument(SDValue Chain, 2189 CallingConv::ID CallConv, 2190 const SmallVectorImpl<ISD::InputArg> &Ins, 2191 SDLoc dl, SelectionDAG &DAG, 2192 const CCValAssign &VA, 2193 MachineFrameInfo *MFI, 2194 unsigned i) const { 2195 // Create the nodes corresponding to a load from this parameter slot. 2196 ISD::ArgFlagsTy Flags = Ins[i].Flags; 2197 bool AlwaysUseMutable = FuncIsMadeTailCallSafe( 2198 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); 2199 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 2200 EVT ValVT; 2201 2202 // If value is passed by pointer we have address passed instead of the value 2203 // itself. 2204 if (VA.getLocInfo() == CCValAssign::Indirect) 2205 ValVT = VA.getLocVT(); 2206 else 2207 ValVT = VA.getValVT(); 2208 2209 // FIXME: For now, all byval parameter objects are marked mutable. This can be 2210 // changed with more analysis. 2211 // In case of tail call optimization mark all arguments mutable. Since they 2212 // could be overwritten by lowering of arguments in case of a tail call. 2213 if (Flags.isByVal()) { 2214 unsigned Bytes = Flags.getByValSize(); 2215 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 2216 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 2217 return DAG.getFrameIndex(FI, getPointerTy()); 2218 } else { 2219 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 2220 VA.getLocMemOffset(), isImmutable); 2221 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 2222 return DAG.getLoad(ValVT, dl, Chain, FIN, 2223 MachinePointerInfo::getFixedStack(FI), 2224 false, false, false, 0); 2225 } 2226 } 2227 2228 // FIXME: Get this from tablegen. 
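// Win64 passes integer arguments in RCX, RDX, R8 and R9; the 64-bit SysV
// convention uses RDI, RSI, RDX, RCX, R8 and R9 (see the arrays below).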
2229 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, 2230 const X86Subtarget *Subtarget) { 2231 assert(Subtarget->is64Bit()); 2232 2233 if (Subtarget->isCallingConvWin64(CallConv)) { 2234 static const MCPhysReg GPR64ArgRegsWin64[] = { 2235 X86::RCX, X86::RDX, X86::R8, X86::R9 2236 }; 2237 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); 2238 } 2239 2240 static const MCPhysReg GPR64ArgRegs64Bit[] = { 2241 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 2242 }; 2243 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); 2244 } 2245 2246 // FIXME: Get this from tablegen. 2247 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, 2248 CallingConv::ID CallConv, 2249 const X86Subtarget *Subtarget) { 2250 assert(Subtarget->is64Bit()); 2251 if (Subtarget->isCallingConvWin64(CallConv)) { 2252 // The XMM registers which might contain var arg parameters are shadowed 2253 // in their paired GPR. So we only need to save the GPR to their home 2254 // slots. 2255 // TODO: __vectorcall will change this. 2256 return None; 2257 } 2258 2259 const Function *Fn = MF.getFunction(); 2260 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); 2261 assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && 2262 "SSE register cannot be used when SSE is disabled!"); 2263 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 2264 !Subtarget->hasSSE1()) 2265 // Kernel mode asks for SSE to be disabled, so there are no XMM argument 2266 // registers. 2267 return None; 2268 2269 static const MCPhysReg XMMArgRegs64Bit[] = { 2270 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2271 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2272 }; 2273 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); 2274 } 2275 2276 SDValue 2277 X86TargetLowering::LowerFormalArguments(SDValue Chain, 2278 CallingConv::ID CallConv, 2279 bool isVarArg, 2280 const SmallVectorImpl<ISD::InputArg> &Ins, 2281 SDLoc dl, 2282 SelectionDAG &DAG, 2283 SmallVectorImpl<SDValue> &InVals) 2284 const { 2285 MachineFunction &MF = DAG.getMachineFunction(); 2286 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2287 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); 2288 2289 const Function* Fn = MF.getFunction(); 2290 if (Fn->hasExternalLinkage() && 2291 Subtarget->isTargetCygMing() && 2292 Fn->getName() == "main") 2293 FuncInfo->setForceFramePointer(true); 2294 2295 MachineFrameInfo *MFI = MF.getFrameInfo(); 2296 bool Is64Bit = Subtarget->is64Bit(); 2297 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2298 2299 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2300 "Var args not supported with calling convention fastcc, ghc or hipe"); 2301 2302 // Assign locations to all of the incoming arguments. 2303 SmallVector<CCValAssign, 16> ArgLocs; 2304 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 2305 2306 // Allocate shadow area for Win64 2307 if (IsWin64) 2308 CCInfo.AllocateStack(32, 8); 2309 2310 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 2311 2312 unsigned LastVal = ~0U; 2313 SDValue ArgValue; 2314 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2315 CCValAssign &VA = ArgLocs[i]; 2316 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 2317 // places. 
2318 assert(VA.getValNo() != LastVal && 2319 "Don't support value assigned to multiple locs yet"); 2320 (void)LastVal; 2321 LastVal = VA.getValNo(); 2322 2323 if (VA.isRegLoc()) { 2324 EVT RegVT = VA.getLocVT(); 2325 const TargetRegisterClass *RC; 2326 if (RegVT == MVT::i32) 2327 RC = &X86::GR32RegClass; 2328 else if (Is64Bit && RegVT == MVT::i64) 2329 RC = &X86::GR64RegClass; 2330 else if (RegVT == MVT::f32) 2331 RC = &X86::FR32RegClass; 2332 else if (RegVT == MVT::f64) 2333 RC = &X86::FR64RegClass; 2334 else if (RegVT.is512BitVector()) 2335 RC = &X86::VR512RegClass; 2336 else if (RegVT.is256BitVector()) 2337 RC = &X86::VR256RegClass; 2338 else if (RegVT.is128BitVector()) 2339 RC = &X86::VR128RegClass; 2340 else if (RegVT == MVT::x86mmx) 2341 RC = &X86::VR64RegClass; 2342 else if (RegVT == MVT::i1) 2343 RC = &X86::VK1RegClass; 2344 else if (RegVT == MVT::v8i1) 2345 RC = &X86::VK8RegClass; 2346 else if (RegVT == MVT::v16i1) 2347 RC = &X86::VK16RegClass; 2348 else if (RegVT == MVT::v32i1) 2349 RC = &X86::VK32RegClass; 2350 else if (RegVT == MVT::v64i1) 2351 RC = &X86::VK64RegClass; 2352 else 2353 llvm_unreachable("Unknown argument type!"); 2354 2355 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2356 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 2357 2358 // If this is an 8 or 16-bit value, it is really passed promoted to 32 2359 // bits. Insert an assert[sz]ext to capture this, then truncate to the 2360 // right size. 2361 if (VA.getLocInfo() == CCValAssign::SExt) 2362 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 2363 DAG.getValueType(VA.getValVT())); 2364 else if (VA.getLocInfo() == CCValAssign::ZExt) 2365 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 2366 DAG.getValueType(VA.getValVT())); 2367 else if (VA.getLocInfo() == CCValAssign::BCvt) 2368 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 2369 2370 if (VA.isExtInLoc()) { 2371 // Handle MMX values passed in XMM regs. 2372 if (RegVT.isVector()) 2373 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 2374 else 2375 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 2376 } 2377 } else { 2378 assert(VA.isMemLoc()); 2379 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 2380 } 2381 2382 // If value is passed via pointer - do a load. 2383 if (VA.getLocInfo() == CCValAssign::Indirect) 2384 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2385 MachinePointerInfo(), false, false, false, 0); 2386 2387 InVals.push_back(ArgValue); 2388 } 2389 2390 if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { 2391 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2392 // The x86-64 ABIs require that for returning structs by value we copy 2393 // the sret argument into %rax/%eax (depending on ABI) for the return. 2394 // Win32 requires us to put the sret argument to %eax as well. 2395 // Save the argument into a virtual register so that we can access it 2396 // from the return points. 
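      // Only the first sret argument is captured; the loop breaks as soon as
      // it has been copied into the virtual register.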
2397 if (Ins[i].Flags.isSRet()) { 2398 unsigned Reg = FuncInfo->getSRetReturnReg(); 2399 if (!Reg) { 2400 MVT PtrTy = getPointerTy(); 2401 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 2402 FuncInfo->setSRetReturnReg(Reg); 2403 } 2404 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); 2405 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2406 break; 2407 } 2408 } 2409 } 2410 2411 unsigned StackSize = CCInfo.getNextStackOffset(); 2412 // Align stack specially for tail calls. 2413 if (FuncIsMadeTailCallSafe(CallConv, 2414 MF.getTarget().Options.GuaranteedTailCallOpt)) 2415 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2416 2417 // If the function takes variable number of arguments, make a frame index for 2418 // the start of the first vararg value... for expansion of llvm.va_start. We 2419 // can skip this if there are no va_start calls. 2420 if (MFI->hasVAStart() && 2421 (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2422 CallConv != CallingConv::X86_ThisCall))) { 2423 FuncInfo->setVarArgsFrameIndex( 2424 MFI->CreateFixedObject(1, StackSize, true)); 2425 } 2426 2427 MachineModuleInfo &MMI = MF.getMMI(); 2428 const Function *WinEHParent = nullptr; 2429 if (IsWin64 && MMI.hasWinEHFuncInfo(Fn)) 2430 WinEHParent = MMI.getWinEHParent(Fn); 2431 bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; 2432 bool IsWinEHParent = WinEHParent && WinEHParent == Fn; 2433 2434 // Figure out if XMM registers are in use. 2435 assert(!(MF.getTarget().Options.UseSoftFloat && 2436 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && 2437 "SSE register cannot be used when SSE is disabled!"); 2438 2439 // 64-bit calling conventions support varargs and register parameters, so we 2440 // have to do extra work to spill them in the prologue. 2441 if (Is64Bit && isVarArg && MFI->hasVAStart()) { 2442 // Find the first unallocated argument registers. 2443 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); 2444 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); 2445 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); 2446 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); 2447 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 2448 "SSE register cannot be used when SSE is disabled!"); 2449 2450 // Gather all the live in physical registers. 2451 SmallVector<SDValue, 6> LiveGPRs; 2452 SmallVector<SDValue, 8> LiveXMMRegs; 2453 SDValue ALVal; 2454 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { 2455 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); 2456 LiveGPRs.push_back( 2457 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); 2458 } 2459 if (!ArgXMMs.empty()) { 2460 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2461 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); 2462 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { 2463 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); 2464 LiveXMMRegs.push_back( 2465 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); 2466 } 2467 } 2468 2469 if (IsWin64) { 2470 // Get to the caller-allocated home save location. Add 8 to account 2471 // for the return address. 2472 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 2473 FuncInfo->setRegSaveFrameIndex( 2474 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 2475 // Fixup to set vararg frame on shadow area (4 x i64). 
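      // If fewer than four integer registers were used, the first vararg lives
      // in the unused part of the shadow area, so the vararg frame index can
      // simply reuse the register save area's frame index.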
2476 if (NumIntRegs < 4) 2477 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 2478 } else { 2479 // For X86-64, if there are vararg parameters that are passed via 2480 // registers, then we must store them to their spots on the stack so 2481 // they may be loaded by deferencing the result of va_next. 2482 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 2483 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); 2484 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( 2485 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); 2486 } 2487 2488 // Store the integer parameter registers. 2489 SmallVector<SDValue, 8> MemOps; 2490 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 2491 getPointerTy()); 2492 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 2493 for (SDValue Val : LiveGPRs) { 2494 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 2495 DAG.getIntPtrConstant(Offset)); 2496 SDValue Store = 2497 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2498 MachinePointerInfo::getFixedStack( 2499 FuncInfo->getRegSaveFrameIndex(), Offset), 2500 false, false, 0); 2501 MemOps.push_back(Store); 2502 Offset += 8; 2503 } 2504 2505 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { 2506 // Now store the XMM (fp + vector) parameter registers. 2507 SmallVector<SDValue, 12> SaveXMMOps; 2508 SaveXMMOps.push_back(Chain); 2509 SaveXMMOps.push_back(ALVal); 2510 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2511 FuncInfo->getRegSaveFrameIndex())); 2512 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2513 FuncInfo->getVarArgsFPOffset())); 2514 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), 2515 LiveXMMRegs.end()); 2516 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2517 MVT::Other, SaveXMMOps)); 2518 } 2519 2520 if (!MemOps.empty()) 2521 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 2522 } else if (IsWinEHOutlined) { 2523 // Get to the caller-allocated home save location. Add 8 to account 2524 // for the return address. 2525 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 2526 FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject( 2527 /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false)); 2528 2529 MMI.getWinEHFuncInfo(Fn) 2530 .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] = 2531 FuncInfo->getRegSaveFrameIndex(); 2532 2533 // Store the second integer parameter (rdx) into rsp+16 relative to the 2534 // stack pointer at the entry of the function. 2535 SDValue RSFIN = 2536 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); 2537 unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); 2538 SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); 2539 Chain = DAG.getStore( 2540 Val.getValue(1), dl, Val, RSFIN, 2541 MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), 2542 /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); 2543 } 2544 2545 if (isVarArg && MFI->hasMustTailInVarArgFunc()) { 2546 // Find the largest legal vector type. 2547 MVT VecVT = MVT::Other; 2548 // FIXME: Only some x86_32 calling conventions support AVX512. 2549 if (Subtarget->hasAVX512() && 2550 (Is64Bit || (CallConv == CallingConv::X86_VectorCall || 2551 CallConv == CallingConv::Intel_OCL_BI))) 2552 VecVT = MVT::v16f32; 2553 else if (Subtarget->hasAVX()) 2554 VecVT = MVT::v8f32; 2555 else if (Subtarget->hasSSE2()) 2556 VecVT = MVT::v4f32; 2557 2558 // We forward some GPRs and some vector types. 2559 SmallVector<MVT, 2> RegParmTypes; 2560 MVT IntVT = Is64Bit ? 
MVT::i64 : MVT::i32; 2561 RegParmTypes.push_back(IntVT); 2562 if (VecVT != MVT::Other) 2563 RegParmTypes.push_back(VecVT); 2564 2565 // Compute the set of forwarded registers. The rest are scratch. 2566 SmallVectorImpl<ForwardedRegister> &Forwards = 2567 FuncInfo->getForwardedMustTailRegParms(); 2568 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); 2569 2570 // Conservatively forward AL on x86_64, since it might be used for varargs. 2571 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { 2572 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2573 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); 2574 } 2575 2576 // Copy all forwards from physical to virtual registers. 2577 for (ForwardedRegister &F : Forwards) { 2578 // FIXME: Can we use a less constrained schedule? 2579 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 2580 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); 2581 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); 2582 } 2583 } 2584 2585 // Some CCs need callee pop. 2586 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2587 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2588 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2589 } else { 2590 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2591 // If this is an sret function, the return should pop the hidden pointer. 2592 if (!Is64Bit && !IsTailCallConvention(CallConv) && 2593 !Subtarget->getTargetTriple().isOSMSVCRT() && 2594 argsAreStructReturn(Ins) == StackStructReturn) 2595 FuncInfo->setBytesToPopOnReturn(4); 2596 } 2597 2598 if (!Is64Bit) { 2599 // RegSaveFrameIndex is X86-64 only. 2600 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2601 if (CallConv == CallingConv::X86_FastCall || 2602 CallConv == CallingConv::X86_ThisCall) 2603 // fastcc functions can't have varargs. 2604 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2605 } 2606 2607 FuncInfo->setArgumentStackSize(StackSize); 2608 2609 if (IsWinEHParent) { 2610 int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); 2611 SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); 2612 MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; 2613 SDValue Neg2 = DAG.getConstant(-2, MVT::i64); 2614 Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, 2615 MachinePointerInfo::getFixedStack(UnwindHelpFI), 2616 /*isVolatile=*/true, 2617 /*isNonTemporal=*/false, /*Alignment=*/0); 2618 } 2619 2620 return Chain; 2621 } 2622 2623 SDValue 2624 X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2625 SDValue StackPtr, SDValue Arg, 2626 SDLoc dl, SelectionDAG &DAG, 2627 const CCValAssign &VA, 2628 ISD::ArgFlagsTy Flags) const { 2629 unsigned LocMemOffset = VA.getLocMemOffset(); 2630 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2631 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2632 if (Flags.isByVal()) 2633 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2634 2635 return DAG.getStore(Chain, dl, Arg, PtrOff, 2636 MachinePointerInfo::getStack(LocMemOffset), 2637 false, false, 0); 2638 } 2639 2640 /// Emit a load of return address if tail call 2641 /// optimization is performed and it is required. 2642 SDValue 2643 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2644 SDValue &OutRetAddr, SDValue Chain, 2645 bool IsTailCall, bool Is64Bit, 2646 int FPDiff, SDLoc dl) const { 2647 // Adjust the Return address stack slot. 
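  // The old return address is loaded here; EmitTailCallStoreRetAddr below
  // writes it back out to the slot adjusted by FPDiff.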
2648 EVT VT = getPointerTy(); 2649 OutRetAddr = getReturnAddressFrameIndex(DAG); 2650 2651 // Load the "old" Return address. 2652 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2653 false, false, false, 0); 2654 return SDValue(OutRetAddr.getNode(), 1); 2655 } 2656 2657 /// Emit a store of the return address if tail call 2658 /// optimization is performed and it is required (FPDiff!=0). 2659 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, 2660 SDValue Chain, SDValue RetAddrFrIdx, 2661 EVT PtrVT, unsigned SlotSize, 2662 int FPDiff, SDLoc dl) { 2663 // Store the return address to the appropriate stack slot. 2664 if (!FPDiff) return Chain; 2665 // Calculate the new stack slot for the return address. 2666 int NewReturnAddrFI = 2667 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 2668 false); 2669 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 2670 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2671 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2672 false, false, 0); 2673 return Chain; 2674 } 2675 2676 SDValue 2677 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2678 SmallVectorImpl<SDValue> &InVals) const { 2679 SelectionDAG &DAG = CLI.DAG; 2680 SDLoc &dl = CLI.DL; 2681 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2682 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2683 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2684 SDValue Chain = CLI.Chain; 2685 SDValue Callee = CLI.Callee; 2686 CallingConv::ID CallConv = CLI.CallConv; 2687 bool &isTailCall = CLI.IsTailCall; 2688 bool isVarArg = CLI.IsVarArg; 2689 2690 MachineFunction &MF = DAG.getMachineFunction(); 2691 bool Is64Bit = Subtarget->is64Bit(); 2692 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2693 StructReturnType SR = callIsStructReturn(Outs); 2694 bool IsSibcall = false; 2695 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2696 2697 if (MF.getTarget().Options.DisableTailCalls) 2698 isTailCall = false; 2699 2700 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); 2701 if (IsMustTail) { 2702 // Force this to be a tail call. The verifier rules are enough to ensure 2703 // that we can lower this successfully without moving the return address 2704 // around. 2705 isTailCall = true; 2706 } else if (isTailCall) { 2707 // Check if it's really possible to do a tail call. 2708 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2709 isVarArg, SR != NotStructReturn, 2710 MF.getFunction()->hasStructRetAttr(), CLI.RetTy, 2711 Outs, OutVals, Ins, DAG); 2712 2713 // Sibcalls are automatically detected tailcalls which do not require 2714 // ABI changes. 2715 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2716 IsSibcall = true; 2717 2718 if (isTailCall) 2719 ++NumTailCalls; 2720 } 2721 2722 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2723 "Var args not supported with calling convention fastcc, ghc or hipe"); 2724 2725 // Analyze operands of the call, assigning locations to each operand. 2726 SmallVector<CCValAssign, 16> ArgLocs; 2727 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 2728 2729 // Allocate shadow area for Win64 2730 if (IsWin64) 2731 CCInfo.AllocateStack(32, 8); 2732 2733 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2734 2735 // Get a count of how many bytes are to be pushed on the stack. 2736 unsigned NumBytes = CCInfo.getNextStackOffset(); 2737 if (IsSibcall) 2738 // This is a sibcall. 
The memory operands are available in the
2739 // caller's own stack.
2740 NumBytes = 0;
2741 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2742 IsTailCallConvention(CallConv))
2743 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2744
2745 int FPDiff = 0;
2746 if (isTailCall && !IsSibcall && !IsMustTail) {
2747 // Lower arguments at fp - stackoffset + fpdiff.
2748 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2749
2750 FPDiff = NumBytesCallerPushed - NumBytes;
2751
2752 // Set the delta of movement of the returnaddr stackslot.
2753 // But only set if delta is greater than previous delta.
2754 if (FPDiff < X86Info->getTCReturnAddrDelta())
2755 X86Info->setTCReturnAddrDelta(FPDiff);
2756 }
2757
2758 unsigned NumBytesToPush = NumBytes;
2759 unsigned NumBytesToPop = NumBytes;
2760
2761 // If we have an inalloca argument, all stack space has already been allocated
2762 // for us and is right at the top of the stack. We don't support multiple
2763 // arguments passed in memory when using inalloca.
2764 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2765 NumBytesToPush = 0;
2766 if (!ArgLocs.back().isMemLoc())
2767 report_fatal_error("cannot use inalloca attribute on a register "
2768 "parameter");
2769 if (ArgLocs.back().getLocMemOffset() != 0)
2770 report_fatal_error("any parameter with the inalloca attribute must be "
2771 "the only memory argument");
2772 }
2773
2774 if (!IsSibcall)
2775 Chain = DAG.getCALLSEQ_START(
2776 Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2777
2778 SDValue RetAddrFrIdx;
2779 // Load return address for tail calls.
2780 if (isTailCall && FPDiff)
2781 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2782 Is64Bit, FPDiff, dl);
2783
2784 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2785 SmallVector<SDValue, 8> MemOpChains;
2786 SDValue StackPtr;
2787
2788 // Walk the register/memloc assignments, inserting copies/loads. In the case
2789 // of tail call optimization, arguments are handled later.
2790 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2791 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2792 // Skip inalloca arguments, they have already been written.
2793 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2794 if (Flags.isInAlloca())
2795 continue;
2796
2797 CCValAssign &VA = ArgLocs[i];
2798 EVT RegVT = VA.getLocVT();
2799 SDValue Arg = OutVals[i];
2800 bool isByVal = Flags.isByVal();
2801
2802 // Promote the value if needed.
2803 switch (VA.getLocInfo()) {
2804 default: llvm_unreachable("Unknown loc info!");
2805 case CCValAssign::Full: break;
2806 case CCValAssign::SExt:
2807 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2808 break;
2809 case CCValAssign::ZExt:
2810 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2811 break;
2812 case CCValAssign::AExt:
2813 if (RegVT.is128BitVector()) {
2814 // Special case: passing MMX values in XMM registers.
2815 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2816 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2817 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2818 } else
2819 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2820 break;
2821 case CCValAssign::BCvt:
2822 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2823 break;
2824 case CCValAssign::Indirect: {
2825 // Store the argument.
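// For an Indirect location, the value is spilled to a stack temporary below and
// the address of that slot, rather than the value itself, is passed on.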
2826 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2827 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2828 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2829 MachinePointerInfo::getFixedStack(FI),
2830 false, false, 0);
2831 Arg = SpillSlot;
2832 break;
2833 }
2834 }
2835
2836 if (VA.isRegLoc()) {
2837 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2838 if (isVarArg && IsWin64) {
2839 // The Win64 ABI requires an argument in an XMM register to also be copied
2840 // to the corresponding shadow GPR when the callee is a varargs function.
2841 unsigned ShadowReg = 0;
2842 switch (VA.getLocReg()) {
2843 case X86::XMM0: ShadowReg = X86::RCX; break;
2844 case X86::XMM1: ShadowReg = X86::RDX; break;
2845 case X86::XMM2: ShadowReg = X86::R8; break;
2846 case X86::XMM3: ShadowReg = X86::R9; break;
2847 }
2848 if (ShadowReg)
2849 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2850 }
2851 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2852 assert(VA.isMemLoc());
2853 if (!StackPtr.getNode())
2854 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2855 getPointerTy());
2856 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2857 dl, DAG, VA, Flags));
2858 }
2859 }
2860
2861 if (!MemOpChains.empty())
2862 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2863
2864 if (Subtarget->isPICStyleGOT()) {
2865 // ELF / PIC requires the GOT pointer to be live in the EBX register before
2866 // making function calls via the PLT.
2867 if (!isTailCall) {
2868 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2869 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2870 } else {
2871 // If we are tail calling and generating PIC/GOT style code, load the
2872 // address of the callee into ECX. The value in ecx is used as target of
2873 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2874 // for tail calls on PIC/GOT architectures. Normally we would just put the
2875 // address of GOT into ebx and then call target@PLT. But for tail calls
2876 // ebx would be restored (since ebx is callee saved) before jumping to the
2877 // target@PLT.
2878
2879 // Note: The actual moving to ECX is done further down.
2880 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2881 if (G && !G->getGlobal()->hasHiddenVisibility() &&
2882 !G->getGlobal()->hasProtectedVisibility())
2883 Callee = LowerGlobalAddress(Callee, DAG);
2884 else if (isa<ExternalSymbolSDNode>(Callee))
2885 Callee = LowerExternalSymbol(Callee, DAG);
2886 }
2887 }
2888
2889 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
2890 // From the AMD64 ABI document:
2891 // For calls that may call functions that use varargs or stdargs
2892 // (prototype-less calls or calls to functions containing ellipsis (...) in
2893 // the declaration) %al is used as a hidden argument to specify the number
2894 // of SSE registers used. The contents of %al do not need to match exactly
2895 // the number of registers, but must be an upper bound on the number of SSE
2896 // registers used and is in the range 0 - 8 inclusive.
2897
2898 // Count the number of XMM registers allocated.
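// AL is loaded with the index of the first unallocated XMM register below,
// i.e. how many of XMM0-XMM7 this particular call actually uses.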
2899 static const MCPhysReg XMMArgRegs[] = { 2900 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2901 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2902 }; 2903 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); 2904 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2905 && "SSE registers cannot be used when SSE is disabled"); 2906 2907 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 2908 DAG.getConstant(NumXMMRegs, MVT::i8))); 2909 } 2910 2911 if (isVarArg && IsMustTail) { 2912 const auto &Forwards = X86Info->getForwardedMustTailRegParms(); 2913 for (const auto &F : Forwards) { 2914 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 2915 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); 2916 } 2917 } 2918 2919 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls 2920 // don't need this because the eligibility check rejects calls that require 2921 // shuffling arguments passed in memory. 2922 if (!IsSibcall && isTailCall) { 2923 // Force all the incoming stack arguments to be loaded from the stack 2924 // before any new outgoing arguments are stored to the stack, because the 2925 // outgoing stack slots may alias the incoming argument stack slots, and 2926 // the alias isn't otherwise explicit. This is slightly more conservative 2927 // than necessary, because it means that each store effectively depends 2928 // on every argument instead of just those arguments it would clobber. 2929 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2930 2931 SmallVector<SDValue, 8> MemOpChains2; 2932 SDValue FIN; 2933 int FI = 0; 2934 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2935 CCValAssign &VA = ArgLocs[i]; 2936 if (VA.isRegLoc()) 2937 continue; 2938 assert(VA.isMemLoc()); 2939 SDValue Arg = OutVals[i]; 2940 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2941 // Skip inalloca arguments. They don't require any work. 2942 if (Flags.isInAlloca()) 2943 continue; 2944 // Create frame index. 2945 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2946 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2947 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2948 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2949 2950 if (Flags.isByVal()) { 2951 // Copy relative to framepointer. 2952 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2953 if (!StackPtr.getNode()) 2954 StackPtr = DAG.getCopyFromReg(Chain, dl, 2955 RegInfo->getStackRegister(), 2956 getPointerTy()); 2957 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2958 2959 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2960 ArgChain, 2961 Flags, DAG, dl)); 2962 } else { 2963 // Store relative to framepointer. 2964 MemOpChains2.push_back( 2965 DAG.getStore(ArgChain, dl, Arg, FIN, 2966 MachinePointerInfo::getFixedStack(FI), 2967 false, false, 0)); 2968 } 2969 } 2970 2971 if (!MemOpChains2.empty()) 2972 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 2973 2974 // Store the return address to the appropriate stack slot. 2975 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 2976 getPointerTy(), RegInfo->getSlotSize(), 2977 FPDiff, dl); 2978 } 2979 2980 // Build a sequence of copy-to-reg nodes chained together with token chain 2981 // and flag operands which copy the outgoing args into registers. 
2982 SDValue InFlag; 2983 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2984 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2985 RegsToPass[i].second, InFlag); 2986 InFlag = Chain.getValue(1); 2987 } 2988 2989 if (DAG.getTarget().getCodeModel() == CodeModel::Large) { 2990 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2991 // In the 64-bit large code model, we have to make all calls 2992 // through a register, since the call instruction's 32-bit 2993 // pc-relative offset may not be large enough to hold the whole 2994 // address. 2995 } else if (Callee->getOpcode() == ISD::GlobalAddress) { 2996 // If the callee is a GlobalAddress node (quite common, every direct call 2997 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2998 // it. 2999 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); 3000 3001 // We should use extra load for direct calls to dllimported functions in 3002 // non-JIT mode. 3003 const GlobalValue *GV = G->getGlobal(); 3004 if (!GV->hasDLLImportStorageClass()) { 3005 unsigned char OpFlags = 0; 3006 bool ExtraLoad = false; 3007 unsigned WrapperKind = ISD::DELETED_NODE; 3008 3009 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 3010 // external symbols most go through the PLT in PIC mode. If the symbol 3011 // has hidden or protected visibility, or if it is static or local, then 3012 // we don't need to use the PLT - we can directly call it. 3013 if (Subtarget->isTargetELF() && 3014 DAG.getTarget().getRelocationModel() == Reloc::PIC_ && 3015 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 3016 OpFlags = X86II::MO_PLT; 3017 } else if (Subtarget->isPICStyleStubAny() && 3018 (GV->isDeclaration() || GV->isWeakForLinker()) && 3019 (!Subtarget->getTargetTriple().isMacOSX() || 3020 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 3021 // PC-relative references to external symbols should go through $stub, 3022 // unless we're building with the leopard linker or later, which 3023 // automatically synthesizes these stubs. 3024 OpFlags = X86II::MO_DARWIN_STUB; 3025 } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && 3026 cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) { 3027 // If the function is marked as non-lazy, generate an indirect call 3028 // which loads from the GOT directly. This avoids runtime overhead 3029 // at the cost of eager binding (and one extra byte of encoding). 3030 OpFlags = X86II::MO_GOTPCREL; 3031 WrapperKind = X86ISD::WrapperRIP; 3032 ExtraLoad = true; 3033 } 3034 3035 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 3036 G->getOffset(), OpFlags); 3037 3038 // Add a wrapper if needed. 3039 if (WrapperKind != ISD::DELETED_NODE) 3040 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 3041 // Add extra indirection if needed. 3042 if (ExtraLoad) 3043 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 3044 MachinePointerInfo::getGOT(), 3045 false, false, false, 0); 3046 } 3047 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 3048 unsigned char OpFlags = 0; 3049 3050 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 3051 // external symbols should go through the PLT. 
3052 if (Subtarget->isTargetELF() && 3053 DAG.getTarget().getRelocationModel() == Reloc::PIC_) { 3054 OpFlags = X86II::MO_PLT; 3055 } else if (Subtarget->isPICStyleStubAny() && 3056 (!Subtarget->getTargetTriple().isMacOSX() || 3057 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 3058 // PC-relative references to external symbols should go through $stub, 3059 // unless we're building with the leopard linker or later, which 3060 // automatically synthesizes these stubs. 3061 OpFlags = X86II::MO_DARWIN_STUB; 3062 } 3063 3064 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 3065 OpFlags); 3066 } else if (Subtarget->isTarget64BitILP32() && 3067 Callee->getValueType(0) == MVT::i32) { 3068 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI 3069 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); 3070 } 3071 3072 // Returns a chain & a flag for retval copy to use. 3073 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3074 SmallVector<SDValue, 8> Ops; 3075 3076 if (!IsSibcall && isTailCall) { 3077 Chain = DAG.getCALLSEQ_END(Chain, 3078 DAG.getIntPtrConstant(NumBytesToPop, true), 3079 DAG.getIntPtrConstant(0, true), InFlag, dl); 3080 InFlag = Chain.getValue(1); 3081 } 3082 3083 Ops.push_back(Chain); 3084 Ops.push_back(Callee); 3085 3086 if (isTailCall) 3087 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 3088 3089 // Add argument registers to the end of the list so that they are known live 3090 // into the call. 3091 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 3092 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 3093 RegsToPass[i].second.getValueType())); 3094 3095 // Add a register mask operand representing the call-preserved registers. 3096 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3097 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); 3098 assert(Mask && "Missing call preserved mask for calling convention"); 3099 Ops.push_back(DAG.getRegisterMask(Mask)); 3100 3101 if (InFlag.getNode()) 3102 Ops.push_back(InFlag); 3103 3104 if (isTailCall) { 3105 // We used to do: 3106 //// If this is the first return lowered for this function, add the regs 3107 //// to the liveout set for the function. 3108 // This isn't right, although it's probably harmless on x86; liveouts 3109 // should be computed from returns not tail calls. Consider a void 3110 // function making a tail call to a function returning int. 3111 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); 3112 } 3113 3114 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); 3115 InFlag = Chain.getValue(1); 3116 3117 // Create the CALLSEQ_END node. 3118 unsigned NumBytesForCalleeToPop; 3119 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 3120 DAG.getTarget().Options.GuaranteedTailCallOpt)) 3121 NumBytesForCalleeToPop = NumBytes; // Callee pops everything 3122 else if (!Is64Bit && !IsTailCallConvention(CallConv) && 3123 !Subtarget->getTargetTriple().isOSMSVCRT() && 3124 SR == StackStructReturn) 3125 // If this is a call to a struct-return function, the callee 3126 // pops the hidden struct pointer, so we have to push it back. 3127 // This is common for Darwin/X86, Linux & Mingw32 targets. 3128 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 3129 NumBytesForCalleeToPop = 4; 3130 else 3131 NumBytesForCalleeToPop = 0; // Callee pops nothing. 3132 3133 // Returns a flag for retval copy to use. 
3134 if (!IsSibcall) {
3135 Chain = DAG.getCALLSEQ_END(Chain,
3136 DAG.getIntPtrConstant(NumBytesToPop, true),
3137 DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3138 true),
3139 InFlag, dl);
3140 InFlag = Chain.getValue(1);
3141 }
3142
3143 // Handle result values, copying them out of physregs into vregs that we
3144 // return.
3145 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3146 Ins, dl, DAG, InVals);
3147 }
3148
3149 //===----------------------------------------------------------------------===//
3150 // Fast Calling Convention (tail call) implementation
3151 //===----------------------------------------------------------------------===//
3152
3153 // Like StdCall, the callee cleans up the arguments, except that ECX is
3154 // reserved for storing the tail-called function's address. Only 2 registers are
3155 // free for argument passing (inreg). Tail call optimization is performed
3156 // provided:
3157 // * tailcallopt is enabled
3158 // * caller/callee are fastcc
3159 // On X86_64 architecture with GOT-style position independent code only local
3160 // (within module) calls are supported at the moment.
3161 // To keep the stack aligned according to the platform ABI, the function
3162 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3163 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3164 // If a tail-called callee has more arguments than the caller, the caller
3165 // needs to make sure that there is room to move the RETADDR to. This is
3166 // achieved by reserving an area the size of the argument delta right after the
3167 // original RETADDR, but before the saved framepointer or the spilled registers
3168 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3169 // stack layout:
3170 // arg1
3171 // arg2
3172 // RETADDR
3173 // [ new RETADDR
3174 // move area ]
3175 // (possible EBP)
3176 // ESI
3177 // EDI
3178 // local1 ..
3179
3180 /// GetAlignedArgumentStackSize - Round the stack size so it ends one slot short
3181 /// of an alignment boundary, e.g. 16n + 12 for a 16 byte alignment requirement.
3182 unsigned
3183 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3184 SelectionDAG& DAG) const {
3185 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3186 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3187 unsigned StackAlignment = TFI.getStackAlignment();
3188 uint64_t AlignMask = StackAlignment - 1;
3189 int64_t Offset = StackSize;
3190 unsigned SlotSize = RegInfo->getSlotSize();
3191 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3192 // The remainder is no larger than StackAlignment - SlotSize, so just add the difference.
3193 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3194 } else {
3195 // Mask out the lower bits, then add one StackAlignment plus (StackAlignment - SlotSize) bytes.
3196 Offset = ((~AlignMask) & Offset) + StackAlignment +
3197 (StackAlignment-SlotSize);
3198 }
3199 return Offset;
3200 }
3201
3202 /// MatchingStackOffset - Return true if the given stack call argument is
3203 /// already available at the same relative position in the caller's
3204 /// incoming argument stack.
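/// This lets a tail call leave such an argument in place instead of copying it:
/// the value must originate from a fixed caller stack object with the same
/// offset and size as the outgoing argument.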
3205 static 3206 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 3207 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 3208 const X86InstrInfo *TII) { 3209 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 3210 int FI = INT_MAX; 3211 if (Arg.getOpcode() == ISD::CopyFromReg) { 3212 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 3213 if (!TargetRegisterInfo::isVirtualRegister(VR)) 3214 return false; 3215 MachineInstr *Def = MRI->getVRegDef(VR); 3216 if (!Def) 3217 return false; 3218 if (!Flags.isByVal()) { 3219 if (!TII->isLoadFromStackSlot(Def, FI)) 3220 return false; 3221 } else { 3222 unsigned Opcode = Def->getOpcode(); 3223 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || 3224 Opcode == X86::LEA64_32r) && 3225 Def->getOperand(1).isFI()) { 3226 FI = Def->getOperand(1).getIndex(); 3227 Bytes = Flags.getByValSize(); 3228 } else 3229 return false; 3230 } 3231 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 3232 if (Flags.isByVal()) 3233 // ByVal argument is passed in as a pointer but it's now being 3234 // dereferenced. e.g. 3235 // define @foo(%struct.X* %A) { 3236 // tail call @bar(%struct.X* byval %A) 3237 // } 3238 return false; 3239 SDValue Ptr = Ld->getBasePtr(); 3240 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 3241 if (!FINode) 3242 return false; 3243 FI = FINode->getIndex(); 3244 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 3245 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 3246 FI = FINode->getIndex(); 3247 Bytes = Flags.getByValSize(); 3248 } else 3249 return false; 3250 3251 assert(FI != INT_MAX); 3252 if (!MFI->isFixedObjectIndex(FI)) 3253 return false; 3254 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 3255 } 3256 3257 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 3258 /// for tail call optimization. Targets which want to do tail call 3259 /// optimization should implement this function. 3260 bool 3261 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 3262 CallingConv::ID CalleeCC, 3263 bool isVarArg, 3264 bool isCalleeStructRet, 3265 bool isCallerStructRet, 3266 Type *RetTy, 3267 const SmallVectorImpl<ISD::OutputArg> &Outs, 3268 const SmallVectorImpl<SDValue> &OutVals, 3269 const SmallVectorImpl<ISD::InputArg> &Ins, 3270 SelectionDAG &DAG) const { 3271 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 3272 return false; 3273 3274 // If -tailcallopt is specified, make fastcc functions tail-callable. 3275 const MachineFunction &MF = DAG.getMachineFunction(); 3276 const Function *CallerF = MF.getFunction(); 3277 3278 // If the function return type is x86_fp80 and the callee return type is not, 3279 // then the FP_EXTEND of the call result is not a nop. It's not safe to 3280 // perform a tailcall optimization here. 3281 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 3282 return false; 3283 3284 CallingConv::ID CallerCC = CallerF->getCallingConv(); 3285 bool CCMatch = CallerCC == CalleeCC; 3286 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 3287 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); 3288 3289 // Win64 functions have extra shadow space for argument homing. Don't do the 3290 // sibcall if the caller and callee have mismatched expectations for this 3291 // space. 
3292 if (IsCalleeWin64 != IsCallerWin64) 3293 return false; 3294 3295 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 3296 if (IsTailCallConvention(CalleeCC) && CCMatch) 3297 return true; 3298 return false; 3299 } 3300 3301 // Look for obvious safe cases to perform tail call optimization that do not 3302 // require ABI changes. This is what gcc calls sibcall. 3303 3304 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 3305 // emit a special epilogue. 3306 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 3307 if (RegInfo->needsStackRealignment(MF)) 3308 return false; 3309 3310 // Also avoid sibcall optimization if either caller or callee uses struct 3311 // return semantics. 3312 if (isCalleeStructRet || isCallerStructRet) 3313 return false; 3314 3315 // An stdcall/thiscall caller is expected to clean up its arguments; the 3316 // callee isn't going to do that. 3317 // FIXME: this is more restrictive than needed. We could produce a tailcall 3318 // when the stack adjustment matches. For example, with a thiscall that takes 3319 // only one argument. 3320 if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || 3321 CallerCC == CallingConv::X86_ThisCall)) 3322 return false; 3323 3324 // Do not sibcall optimize vararg calls unless all arguments are passed via 3325 // registers. 3326 if (isVarArg && !Outs.empty()) { 3327 3328 // Optimizing for varargs on Win64 is unlikely to be safe without 3329 // additional testing. 3330 if (IsCalleeWin64 || IsCallerWin64) 3331 return false; 3332 3333 SmallVector<CCValAssign, 16> ArgLocs; 3334 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 3335 *DAG.getContext()); 3336 3337 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3338 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 3339 if (!ArgLocs[i].isRegLoc()) 3340 return false; 3341 } 3342 3343 // If the call result is in ST0 / ST1, it needs to be popped off the x87 3344 // stack. Therefore, if it's not used by the call it is not safe to optimize 3345 // this into a sibcall. 3346 bool Unused = false; 3347 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3348 if (!Ins[i].Used) { 3349 Unused = true; 3350 break; 3351 } 3352 } 3353 if (Unused) { 3354 SmallVector<CCValAssign, 16> RVLocs; 3355 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, 3356 *DAG.getContext()); 3357 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 3358 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 3359 CCValAssign &VA = RVLocs[i]; 3360 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) 3361 return false; 3362 } 3363 } 3364 3365 // If the calling conventions do not match, then we'd better make sure the 3366 // results are returned in the same way as what the caller expects. 
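// We do that by analyzing the call result under both conventions and requiring
// identical CCValAssign lists (same location kind, loc info, and register or
// memory offset for every value).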
3367 if (!CCMatch) { 3368 SmallVector<CCValAssign, 16> RVLocs1; 3369 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, 3370 *DAG.getContext()); 3371 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 3372 3373 SmallVector<CCValAssign, 16> RVLocs2; 3374 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, 3375 *DAG.getContext()); 3376 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 3377 3378 if (RVLocs1.size() != RVLocs2.size()) 3379 return false; 3380 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 3381 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 3382 return false; 3383 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 3384 return false; 3385 if (RVLocs1[i].isRegLoc()) { 3386 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 3387 return false; 3388 } else { 3389 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 3390 return false; 3391 } 3392 } 3393 } 3394 3395 // If the callee takes no arguments then go on to check the results of the 3396 // call. 3397 if (!Outs.empty()) { 3398 // Check if stack adjustment is needed. For now, do not do this if any 3399 // argument is passed on the stack. 3400 SmallVector<CCValAssign, 16> ArgLocs; 3401 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 3402 *DAG.getContext()); 3403 3404 // Allocate shadow area for Win64 3405 if (IsCalleeWin64) 3406 CCInfo.AllocateStack(32, 8); 3407 3408 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3409 if (CCInfo.getNextStackOffset()) { 3410 MachineFunction &MF = DAG.getMachineFunction(); 3411 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 3412 return false; 3413 3414 // Check if the arguments are already laid out in the right way as 3415 // the caller's fixed stack objects. 3416 MachineFrameInfo *MFI = MF.getFrameInfo(); 3417 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3418 const X86InstrInfo *TII = Subtarget->getInstrInfo(); 3419 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3420 CCValAssign &VA = ArgLocs[i]; 3421 SDValue Arg = OutVals[i]; 3422 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3423 if (VA.getLocInfo() == CCValAssign::Indirect) 3424 return false; 3425 if (!VA.isRegLoc()) { 3426 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3427 MFI, MRI, TII)) 3428 return false; 3429 } 3430 } 3431 } 3432 3433 // If the tailcall address may be in a register, then make sure it's 3434 // possible to register allocate for it. In 32-bit, the call address can 3435 // only target EAX, EDX, or ECX since the tail call must be scheduled after 3436 // callee-saved registers are restored. These happen to be the same 3437 // registers used to pass 'inreg' arguments so watch out for those. 3438 if (!Subtarget->is64Bit() && 3439 ((!isa<GlobalAddressSDNode>(Callee) && 3440 !isa<ExternalSymbolSDNode>(Callee)) || 3441 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { 3442 unsigned NumInRegs = 0; 3443 // In PIC we need an extra register to formulate the address computation 3444 // for the callee. 3445 unsigned MaxInRegs = 3446 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; 3447 3448 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3449 CCValAssign &VA = ArgLocs[i]; 3450 if (!VA.isRegLoc()) 3451 continue; 3452 unsigned Reg = VA.getLocReg(); 3453 switch (Reg) { 3454 default: break; 3455 case X86::EAX: case X86::EDX: case X86::ECX: 3456 if (++NumInRegs == MaxInRegs) 3457 return false; 3458 break; 3459 } 3460 } 3461 } 3462 } 3463 3464 return true; 3465 } 3466 3467 FastISel * 3468 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3469 const TargetLibraryInfo *libInfo) const { 3470 return X86::createFastISel(funcInfo, libInfo); 3471 } 3472 3473 //===----------------------------------------------------------------------===// 3474 // Other Lowering Hooks 3475 //===----------------------------------------------------------------------===// 3476 3477 static bool MayFoldLoad(SDValue Op) { 3478 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3479 } 3480 3481 static bool MayFoldIntoStore(SDValue Op) { 3482 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3483 } 3484 3485 static bool isTargetShuffle(unsigned Opcode) { 3486 switch(Opcode) { 3487 default: return false; 3488 case X86ISD::BLENDI: 3489 case X86ISD::PSHUFB: 3490 case X86ISD::PSHUFD: 3491 case X86ISD::PSHUFHW: 3492 case X86ISD::PSHUFLW: 3493 case X86ISD::SHUFP: 3494 case X86ISD::PALIGNR: 3495 case X86ISD::MOVLHPS: 3496 case X86ISD::MOVLHPD: 3497 case X86ISD::MOVHLPS: 3498 case X86ISD::MOVLPS: 3499 case X86ISD::MOVLPD: 3500 case X86ISD::MOVSHDUP: 3501 case X86ISD::MOVSLDUP: 3502 case X86ISD::MOVDDUP: 3503 case X86ISD::MOVSS: 3504 case X86ISD::MOVSD: 3505 case X86ISD::UNPCKL: 3506 case X86ISD::UNPCKH: 3507 case X86ISD::VPERMILPI: 3508 case X86ISD::VPERM2X128: 3509 case X86ISD::VPERMI: 3510 return true; 3511 } 3512 } 3513 3514 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3515 SDValue V1, unsigned TargetMask, 3516 SelectionDAG &DAG) { 3517 switch(Opc) { 3518 default: llvm_unreachable("Unknown x86 shuffle node"); 3519 case X86ISD::PSHUFD: 3520 case X86ISD::PSHUFHW: 3521 case X86ISD::PSHUFLW: 3522 case X86ISD::VPERMILPI: 3523 case X86ISD::VPERMI: 3524 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 3525 } 3526 } 3527 3528 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3529 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3530 switch(Opc) { 3531 default: llvm_unreachable("Unknown x86 shuffle node"); 3532 case X86ISD::MOVLHPS: 3533 case X86ISD::MOVLHPD: 3534 case X86ISD::MOVHLPS: 3535 case X86ISD::MOVLPS: 3536 case X86ISD::MOVLPD: 3537 case X86ISD::MOVSS: 3538 case X86ISD::MOVSD: 3539 case X86ISD::UNPCKL: 3540 case X86ISD::UNPCKH: 3541 return DAG.getNode(Opc, dl, VT, V1, V2); 3542 } 3543 } 3544 3545 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3546 MachineFunction &MF = DAG.getMachineFunction(); 3547 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 3548 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 3549 int ReturnAddrIndex = FuncInfo->getRAIndex(); 3550 3551 if (ReturnAddrIndex == 0) { 3552 // Set up a frame object for the return address. 
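// Create a fixed, slot-sized frame object at offset -SlotSize and cache its
// index so later queries for the return address reuse the same slot.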
3553 unsigned SlotSize = RegInfo->getSlotSize(); 3554 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, 3555 -(int64_t)SlotSize, 3556 false); 3557 FuncInfo->setRAIndex(ReturnAddrIndex); 3558 } 3559 3560 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 3561 } 3562 3563 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 3564 bool hasSymbolicDisplacement) { 3565 // Offset should fit into 32 bit immediate field. 3566 if (!isInt<32>(Offset)) 3567 return false; 3568 3569 // If we don't have a symbolic displacement - we don't have any extra 3570 // restrictions. 3571 if (!hasSymbolicDisplacement) 3572 return true; 3573 3574 // FIXME: Some tweaks might be needed for medium code model. 3575 if (M != CodeModel::Small && M != CodeModel::Kernel) 3576 return false; 3577 3578 // For small code model we assume that latest object is 16MB before end of 31 3579 // bits boundary. We may also accept pretty large negative constants knowing 3580 // that all objects are in the positive half of address space. 3581 if (M == CodeModel::Small && Offset < 16*1024*1024) 3582 return true; 3583 3584 // For kernel code model we know that all object resist in the negative half 3585 // of 32bits address space. We may not accept negative offsets, since they may 3586 // be just off and we may accept pretty large positive ones. 3587 if (M == CodeModel::Kernel && Offset >= 0) 3588 return true; 3589 3590 return false; 3591 } 3592 3593 /// isCalleePop - Determines whether the callee is required to pop its 3594 /// own arguments. Callee pop is necessary to support tail calls. 3595 bool X86::isCalleePop(CallingConv::ID CallingConv, 3596 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3597 switch (CallingConv) { 3598 default: 3599 return false; 3600 case CallingConv::X86_StdCall: 3601 case CallingConv::X86_FastCall: 3602 case CallingConv::X86_ThisCall: 3603 return !is64Bit; 3604 case CallingConv::Fast: 3605 case CallingConv::GHC: 3606 case CallingConv::HiPE: 3607 if (IsVarArg) 3608 return false; 3609 return TailCallOpt; 3610 } 3611 } 3612 3613 /// \brief Return true if the condition is an unsigned comparison operation. 3614 static bool isX86CCUnsigned(unsigned X86CC) { 3615 switch (X86CC) { 3616 default: llvm_unreachable("Invalid integer condition!"); 3617 case X86::COND_E: return true; 3618 case X86::COND_G: return false; 3619 case X86::COND_GE: return false; 3620 case X86::COND_L: return false; 3621 case X86::COND_LE: return false; 3622 case X86::COND_NE: return true; 3623 case X86::COND_B: return true; 3624 case X86::COND_A: return true; 3625 case X86::COND_BE: return true; 3626 case X86::COND_AE: return true; 3627 } 3628 llvm_unreachable("covered switch fell through?!"); 3629 } 3630 3631 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 3632 /// specific condition code, returning the condition code and the LHS/RHS of the 3633 /// comparison to make. 3634 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3635 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3636 if (!isFP) { 3637 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3638 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3639 // X > -1 -> X == 0, jump !sign. 3640 RHS = DAG.getConstant(0, RHS.getValueType()); 3641 return X86::COND_NS; 3642 } 3643 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3644 // X < 0 -> X == 0, jump on sign. 
3645 return X86::COND_S; 3646 } 3647 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3648 // X < 1 -> X <= 0 3649 RHS = DAG.getConstant(0, RHS.getValueType()); 3650 return X86::COND_LE; 3651 } 3652 } 3653 3654 switch (SetCCOpcode) { 3655 default: llvm_unreachable("Invalid integer condition!"); 3656 case ISD::SETEQ: return X86::COND_E; 3657 case ISD::SETGT: return X86::COND_G; 3658 case ISD::SETGE: return X86::COND_GE; 3659 case ISD::SETLT: return X86::COND_L; 3660 case ISD::SETLE: return X86::COND_LE; 3661 case ISD::SETNE: return X86::COND_NE; 3662 case ISD::SETULT: return X86::COND_B; 3663 case ISD::SETUGT: return X86::COND_A; 3664 case ISD::SETULE: return X86::COND_BE; 3665 case ISD::SETUGE: return X86::COND_AE; 3666 } 3667 } 3668 3669 // First determine if it is required or is profitable to flip the operands. 3670 3671 // If LHS is a foldable load, but RHS is not, flip the condition. 3672 if (ISD::isNON_EXTLoad(LHS.getNode()) && 3673 !ISD::isNON_EXTLoad(RHS.getNode())) { 3674 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 3675 std::swap(LHS, RHS); 3676 } 3677 3678 switch (SetCCOpcode) { 3679 default: break; 3680 case ISD::SETOLT: 3681 case ISD::SETOLE: 3682 case ISD::SETUGT: 3683 case ISD::SETUGE: 3684 std::swap(LHS, RHS); 3685 break; 3686 } 3687 3688 // On a floating point condition, the flags are set as follows: 3689 // ZF PF CF op 3690 // 0 | 0 | 0 | X > Y 3691 // 0 | 0 | 1 | X < Y 3692 // 1 | 0 | 0 | X == Y 3693 // 1 | 1 | 1 | unordered 3694 switch (SetCCOpcode) { 3695 default: llvm_unreachable("Condcode should be pre-legalized away"); 3696 case ISD::SETUEQ: 3697 case ISD::SETEQ: return X86::COND_E; 3698 case ISD::SETOLT: // flipped 3699 case ISD::SETOGT: 3700 case ISD::SETGT: return X86::COND_A; 3701 case ISD::SETOLE: // flipped 3702 case ISD::SETOGE: 3703 case ISD::SETGE: return X86::COND_AE; 3704 case ISD::SETUGT: // flipped 3705 case ISD::SETULT: 3706 case ISD::SETLT: return X86::COND_B; 3707 case ISD::SETUGE: // flipped 3708 case ISD::SETULE: 3709 case ISD::SETLE: return X86::COND_BE; 3710 case ISD::SETONE: 3711 case ISD::SETNE: return X86::COND_NE; 3712 case ISD::SETUO: return X86::COND_P; 3713 case ISD::SETO: return X86::COND_NP; 3714 case ISD::SETOEQ: 3715 case ISD::SETUNE: return X86::COND_INVALID; 3716 } 3717 } 3718 3719 /// hasFPCMov - is there a floating point cmov for the specific X86 condition 3720 /// code. Current x86 isa includes the following FP cmov instructions: 3721 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 3722 static bool hasFPCMov(unsigned X86CC) { 3723 switch (X86CC) { 3724 default: 3725 return false; 3726 case X86::COND_B: 3727 case X86::COND_BE: 3728 case X86::COND_E: 3729 case X86::COND_P: 3730 case X86::COND_A: 3731 case X86::COND_AE: 3732 case X86::COND_NE: 3733 case X86::COND_NP: 3734 return true; 3735 } 3736 } 3737 3738 /// isFPImmLegal - Returns true if the target can instruction select the 3739 /// specified FP immediate natively. If false, the legalizer will 3740 /// materialize the FP immediate as a load from a constant pool. 
3741 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3742 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3743 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3744 return true; 3745 } 3746 return false; 3747 } 3748 3749 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, 3750 ISD::LoadExtType ExtTy, 3751 EVT NewVT) const { 3752 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF 3753 // relocation target a movq or addq instruction: don't let the load shrink. 3754 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); 3755 if (BasePtr.getOpcode() == X86ISD::WrapperRIP) 3756 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) 3757 return GA->getTargetFlags() != X86II::MO_GOTTPOFF; 3758 return true; 3759 } 3760 3761 /// \brief Returns true if it is beneficial to convert a load of a constant 3762 /// to just the constant itself. 3763 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 3764 Type *Ty) const { 3765 assert(Ty->isIntegerTy()); 3766 3767 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 3768 if (BitSize == 0 || BitSize > 64) 3769 return false; 3770 return true; 3771 } 3772 3773 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, 3774 unsigned Index) const { 3775 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 3776 return false; 3777 3778 return (Index == 0 || Index == ResVT.getVectorNumElements()); 3779 } 3780 3781 bool X86TargetLowering::isCheapToSpeculateCttz() const { 3782 // Speculate cttz only if we can directly use TZCNT. 3783 return Subtarget->hasBMI(); 3784 } 3785 3786 bool X86TargetLowering::isCheapToSpeculateCtlz() const { 3787 // Speculate ctlz only if we can directly use LZCNT. 3788 return Subtarget->hasLZCNT(); 3789 } 3790 3791 /// isUndefOrInRange - Return true if Val is undef or if its value falls within 3792 /// the specified range (L, H]. 3793 static bool isUndefOrInRange(int Val, int Low, int Hi) { 3794 return (Val < 0) || (Val >= Low && Val < Hi); 3795 } 3796 3797 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3798 /// specified value. 3799 static bool isUndefOrEqual(int Val, int CmpVal) { 3800 return (Val < 0 || Val == CmpVal); 3801 } 3802 3803 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning 3804 /// from position Pos and ending in Pos+Size, falls within the specified 3805 /// sequential range (Low, Low+Size]. or is undef. 3806 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, 3807 unsigned Pos, unsigned Size, int Low) { 3808 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) 3809 if (!isUndefOrEqual(Mask[i], Low)) 3810 return false; 3811 return true; 3812 } 3813 3814 /// isVEXTRACTIndex - Return true if the specified 3815 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3816 /// suitable for instruction that extract 128 or 256 bit vectors 3817 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { 3818 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 3819 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3820 return false; 3821 3822 // The index should be aligned on a vecWidth-bit boundary. 
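// For example, extracting a v4i32 (128-bit) subvector from a v8i32 source is
// only valid at element indices 0 and 4.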
3823 uint64_t Index =
3824 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3825
3826 MVT VT = N->getSimpleValueType(0);
3827 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
3828 bool Result = (Index * ElSize) % vecWidth == 0;
3829
3830 return Result;
3831 }
3832
3833 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
3834 /// operand specifies a subvector insert that is suitable for instructions
3835 /// that insert 128 or 256-bit subvectors.
3836 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
3837 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
3838 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3839 return false;
3840 // The index should be aligned on a vecWidth-bit boundary.
3841 uint64_t Index =
3842 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3843
3844 MVT VT = N->getSimpleValueType(0);
3845 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
3846 bool Result = (Index * ElSize) % vecWidth == 0;
3847
3848 return Result;
3849 }
3850
3851 bool X86::isVINSERT128Index(SDNode *N) {
3852 return isVINSERTIndex(N, 128);
3853 }
3854
3855 bool X86::isVINSERT256Index(SDNode *N) {
3856 return isVINSERTIndex(N, 256);
3857 }
3858
3859 bool X86::isVEXTRACT128Index(SDNode *N) {
3860 return isVEXTRACTIndex(N, 128);
3861 }
3862
3863 bool X86::isVEXTRACT256Index(SDNode *N) {
3864 return isVEXTRACTIndex(N, 256);
3865 }
3866
3867 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
3868 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
3869 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3870 llvm_unreachable("Illegal extract subvector for VEXTRACT");
3871
3872 uint64_t Index =
3873 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3874
3875 MVT VecVT = N->getOperand(0).getSimpleValueType();
3876 MVT ElVT = VecVT.getVectorElementType();
3877
3878 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
3879 return Index / NumElemsPerChunk;
3880 }
3881
3882 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
3883 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
3884 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3885 llvm_unreachable("Illegal insert subvector for VINSERT");
3886
3887 uint64_t Index =
3888 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3889
3890 MVT VecVT = N->getSimpleValueType(0);
3891 MVT ElVT = VecVT.getVectorElementType();
3892
3893 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
3894 return Index / NumElemsPerChunk;
3895 }
3896
3897 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
3898 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
3899 /// and VEXTRACTI128 instructions.
3900 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
3901 return getExtractVEXTRACTImmediate(N, 128);
3902 }
3903
3904 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
3905 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
3906 /// and VEXTRACTI64x4 instructions.
3907 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
3908 return getExtractVEXTRACTImmediate(N, 256);
3909 }
3910
3911 /// getInsertVINSERT128Immediate - Return the appropriate immediate
3912 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
3913 /// and VINSERTI128 instructions.
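/// For example, inserting a 128-bit (4 x i32) chunk into a v8i32 result at
/// element index 4 yields the immediate 1.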
3914 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
3915 return getInsertVINSERTImmediate(N, 128);
3916 }
3917
3918 /// getInsertVINSERT256Immediate - Return the appropriate immediate
3919 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
3920 /// and VINSERTI64x4 instructions.
3921 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
3922 return getInsertVINSERTImmediate(N, 256);
3923 }
3924
3925 /// isZero - Returns true if V is a constant integer zero.
3926 static bool isZero(SDValue V) {
3927 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
3928 return C && C->isNullValue();
3929 }
3930
3931 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
3932 /// constant +0.0.
3933 bool X86::isZeroNode(SDValue Elt) {
3934 if (isZero(Elt))
3935 return true;
3936 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
3937 return CFP->getValueAPF().isPosZero();
3938 return false;
3939 }
3940
3941 /// getZeroVector - Returns a vector of specified type with all zero elements.
3942 ///
3943 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
3944 SelectionDAG &DAG, SDLoc dl) {
3945 assert(VT.isVector() && "Expected a vector type");
3946
3947 // Always build SSE zero vectors as <4 x i32> bitcasted
3948 // to their dest type. This ensures they get CSE'd.
3949 SDValue Vec;
3950 if (VT.is128BitVector()) { // SSE
3951 if (Subtarget->hasSSE2()) { // SSE2
3952 SDValue Cst = DAG.getConstant(0, MVT::i32);
3953 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3954 } else { // SSE1
3955 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
3956 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3957 }
3958 } else if (VT.is256BitVector()) { // AVX
3959 if (Subtarget->hasInt256()) { // AVX2
3960 SDValue Cst = DAG.getConstant(0, MVT::i32);
3961 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
3962 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
3963 } else {
3964 // 256-bit logic and arithmetic instructions in AVX are all
3965 // floating-point, no support for integer ops. Emit fp zeroed vectors.
3966 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); 3967 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3968 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); 3969 } 3970 } else if (VT.is512BitVector()) { // AVX-512 3971 SDValue Cst = DAG.getConstant(0, MVT::i32); 3972 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 3973 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3974 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); 3975 } else if (VT.getScalarType() == MVT::i1) { 3976 3977 assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) 3978 && "Unexpected vector type"); 3979 assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) 3980 && "Unexpected vector type"); 3981 SDValue Cst = DAG.getConstant(0, MVT::i1); 3982 SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); 3983 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 3984 } else 3985 llvm_unreachable("Unexpected vector type"); 3986 3987 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3988 } 3989 3990 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, 3991 SelectionDAG &DAG, SDLoc dl, 3992 unsigned vectorWidth) { 3993 assert((vectorWidth == 128 || vectorWidth == 256) && 3994 "Unsupported vector width"); 3995 EVT VT = Vec.getValueType(); 3996 EVT ElVT = VT.getVectorElementType(); 3997 unsigned Factor = VT.getSizeInBits()/vectorWidth; 3998 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, 3999 VT.getVectorNumElements()/Factor); 4000 4001 // Extract from UNDEF is UNDEF. 4002 if (Vec.getOpcode() == ISD::UNDEF) 4003 return DAG.getUNDEF(ResultVT); 4004 4005 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR 4006 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); 4007 4008 // This is the index of the first element of the vectorWidth-bit chunk 4009 // we want. 4010 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) 4011 * ElemsPerChunk); 4012 4013 // If the input is a buildvector just emit a smaller one. 4014 if (Vec.getOpcode() == ISD::BUILD_VECTOR) 4015 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, 4016 makeArrayRef(Vec->op_begin() + NormalizedIdxVal, 4017 ElemsPerChunk)); 4018 4019 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); 4020 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); 4021 } 4022 4023 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This 4024 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 4025 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 4026 /// instructions or a simple subregister reference. Idx is an index in the 4027 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes 4028 /// lowering EXTRACT_VECTOR_ELT operations easier. 4029 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, 4030 SelectionDAG &DAG, SDLoc dl) { 4031 assert((Vec.getValueType().is256BitVector() || 4032 Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); 4033 return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); 4034 } 4035 4036 /// Generate a DAG to grab 256-bits from a 512-bit vector. 
4037 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, 4038 SelectionDAG &DAG, SDLoc dl) { 4039 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); 4040 return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); 4041 } 4042 4043 static SDValue InsertSubVector(SDValue Result, SDValue Vec, 4044 unsigned IdxVal, SelectionDAG &DAG, 4045 SDLoc dl, unsigned vectorWidth) { 4046 assert((vectorWidth == 128 || vectorWidth == 256) && 4047 "Unsupported vector width"); 4048 // Inserting UNDEF is Result 4049 if (Vec.getOpcode() == ISD::UNDEF) 4050 return Result; 4051 EVT VT = Vec.getValueType(); 4052 EVT ElVT = VT.getVectorElementType(); 4053 EVT ResultVT = Result.getValueType(); 4054 4055 // Insert the relevant vectorWidth bits. 4056 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); 4057 4058 // This is the index of the first element of the vectorWidth-bit chunk 4059 // we want. 4060 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) 4061 * ElemsPerChunk); 4062 4063 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); 4064 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); 4065 } 4066 4067 /// Generate a DAG to put 128-bits into a vector > 128 bits. This 4068 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or 4069 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a 4070 /// simple superregister reference. Idx is an index in the 128 bits 4071 /// we want. It need not be aligned to a 128-bit boundary. That makes 4072 /// lowering INSERT_VECTOR_ELT operations easier. 4073 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, 4074 SelectionDAG &DAG, SDLoc dl) { 4075 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); 4076 4077 // For insertion into the zero index (low half) of a 256-bit vector, it is 4078 // more efficient to generate a blend with immediate instead of an insert*128. 4079 // We are still creating an INSERT_SUBVECTOR below with an undef node to 4080 // extend the subvector to the size of the result vector. Make sure that 4081 // we are not recursing on that node by checking for undef here. 4082 if (IdxVal == 0 && Result.getValueType().is256BitVector() && 4083 Result.getOpcode() != ISD::UNDEF) { 4084 EVT ResultVT = Result.getValueType(); 4085 SDValue ZeroIndex = DAG.getIntPtrConstant(0); 4086 SDValue Undef = DAG.getUNDEF(ResultVT); 4087 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, 4088 Vec, ZeroIndex); 4089 4090 // The blend instruction, and therefore its mask, depend on the data type. 4091 MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); 4092 if (ScalarType.isFloatingPoint()) { 4093 // Choose either vblendps (float) or vblendpd (double). 4094 unsigned ScalarSize = ScalarType.getSizeInBits(); 4095 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); 4096 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; 4097 SDValue Mask = DAG.getConstant(MaskVal, MVT::i8); 4098 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); 4099 } 4100 4101 const X86Subtarget &Subtarget = 4102 static_cast<const X86Subtarget &>(DAG.getSubtarget()); 4103 4104 // AVX2 is needed for 256-bit integer blend support. 4105 // Integers must be cast to 32-bit because there is only vpblendd; 4106 // vpblendw can't be used for this because it has a handicapped mask. 4107 4108 // If we don't have AVX2, then cast to float. 
Using a wrong domain blend
4109 // is still more efficient than using the wrong domain vinsertf128 that
4110 // will be created by InsertSubVector().
4111 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4112
4113 SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
4114 Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
4115 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4116 return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
4117 }
4118
4119 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4120 }
4121
4122 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4123 SelectionDAG &DAG, SDLoc dl) {
4124 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4125 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4126 }
4127
4128 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
4129 /// instructions. This is used because creating CONCAT_VECTOR nodes of
4130 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4131 /// large BUILD_VECTORS.
4132 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4133 unsigned NumElems, SelectionDAG &DAG,
4134 SDLoc dl) {
4135 SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4136 return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
4137 }
4138
4139 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4140 unsigned NumElems, SelectionDAG &DAG,
4141 SDLoc dl) {
4142 SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4143 return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
4144 }
4145
4146 /// getOnesVector - Returns a vector of specified type with all bits set.
4147 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4148 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4149 /// Then bitcast to their original type, ensuring they get CSE'd.
4150 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
4151 SDLoc dl) {
4152 assert(VT.isVector() && "Expected a vector type");
4153
4154 SDValue Cst = DAG.getConstant(~0U, MVT::i32);
4155 SDValue Vec;
4156 if (VT.is256BitVector()) {
4157 if (HasInt256) { // AVX2
4158 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4159 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
4160 } else { // AVX
4161 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4162 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4163 }
4164 } else if (VT.is128BitVector()) {
4165 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4166 } else
4167 llvm_unreachable("Unexpected vector type");
4168
4169 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4170 }
4171
4172 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
4173 /// operation of the specified width.
4174 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
4175 SDValue V2) {
4176 unsigned NumElems = VT.getVectorNumElements();
4177 SmallVector<int, 8> Mask;
4178 Mask.push_back(NumElems);
4179 for (unsigned i = 1; i != NumElems; ++i)
4180 Mask.push_back(i);
4181 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4182 }
4183
4184 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
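/// e.g. for v4i32 the mask is <0, 4, 1, 5>, interleaving the low halves of V1
/// and V2.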
4185 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 4186 SDValue V2) { 4187 unsigned NumElems = VT.getVectorNumElements(); 4188 SmallVector<int, 8> Mask; 4189 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4190 Mask.push_back(i); 4191 Mask.push_back(i + NumElems); 4192 } 4193 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4194 } 4195 4196 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4197 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 4198 SDValue V2) { 4199 unsigned NumElems = VT.getVectorNumElements(); 4200 SmallVector<int, 8> Mask; 4201 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { 4202 Mask.push_back(i + Half); 4203 Mask.push_back(i + NumElems + Half); 4204 } 4205 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4206 } 4207 4208 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4209 /// vector and a zero or undef vector. This produces a shuffle where the low 4210 /// element of V2 is swizzled into the zero/undef vector, landing at element 4211 /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4212 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4213 bool IsZero, 4214 const X86Subtarget *Subtarget, 4215 SelectionDAG &DAG) { 4216 MVT VT = V2.getSimpleValueType(); 4217 SDValue V1 = IsZero 4218 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); 4219 unsigned NumElems = VT.getVectorNumElements(); 4220 SmallVector<int, 16> MaskVec; 4221 for (unsigned i = 0; i != NumElems; ++i) 4222 // If this is the insertion idx, put the low elt of V2 here. 4223 MaskVec.push_back(i == Idx ? NumElems : i); 4224 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); 4225 } 4226 4227 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the 4228 /// target specific opcode. Returns true if the Mask could be calculated. Sets 4229 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for 4230 /// shuffles which use a single input multiple times, and in those cases it will 4231 /// adjust the mask to only have indices within that single input.
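/// Mask entries follow the usual shuffle-mask convention: values in
/// [0, NumElems) select from the first source, values in [NumElems, 2*NumElems)
/// select from the second source, and negative sentinel values denote lanes
/// that are undef (or known zero).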
4232 static bool getTargetShuffleMask(SDNode *N, MVT VT, 4233 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4234 unsigned NumElems = VT.getVectorNumElements(); 4235 SDValue ImmN; 4236 4237 IsUnary = false; 4238 bool IsFakeUnary = false; 4239 switch(N->getOpcode()) { 4240 case X86ISD::BLENDI: 4241 ImmN = N->getOperand(N->getNumOperands()-1); 4242 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4243 break; 4244 case X86ISD::SHUFP: 4245 ImmN = N->getOperand(N->getNumOperands()-1); 4246 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4247 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4248 break; 4249 case X86ISD::UNPCKH: 4250 DecodeUNPCKHMask(VT, Mask); 4251 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4252 break; 4253 case X86ISD::UNPCKL: 4254 DecodeUNPCKLMask(VT, Mask); 4255 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4256 break; 4257 case X86ISD::MOVHLPS: 4258 DecodeMOVHLPSMask(NumElems, Mask); 4259 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4260 break; 4261 case X86ISD::MOVLHPS: 4262 DecodeMOVLHPSMask(NumElems, Mask); 4263 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4264 break; 4265 case X86ISD::PALIGNR: 4266 ImmN = N->getOperand(N->getNumOperands()-1); 4267 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4268 break; 4269 case X86ISD::PSHUFD: 4270 case X86ISD::VPERMILPI: 4271 ImmN = N->getOperand(N->getNumOperands()-1); 4272 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4273 IsUnary = true; 4274 break; 4275 case X86ISD::PSHUFHW: 4276 ImmN = N->getOperand(N->getNumOperands()-1); 4277 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4278 IsUnary = true; 4279 break; 4280 case X86ISD::PSHUFLW: 4281 ImmN = N->getOperand(N->getNumOperands()-1); 4282 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4283 IsUnary = true; 4284 break; 4285 case X86ISD::PSHUFB: { 4286 IsUnary = true; 4287 SDValue MaskNode = N->getOperand(1); 4288 while (MaskNode->getOpcode() == ISD::BITCAST) 4289 MaskNode = MaskNode->getOperand(0); 4290 4291 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { 4292 // If we have a build-vector, then things are easy. 4293 EVT VT = MaskNode.getValueType(); 4294 assert(VT.isVector() && 4295 "Can't produce a non-vector with a build_vector!"); 4296 if (!VT.isInteger()) 4297 return false; 4298 4299 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; 4300 4301 SmallVector<uint64_t, 32> RawMask; 4302 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { 4303 SDValue Op = MaskNode->getOperand(i); 4304 if (Op->getOpcode() == ISD::UNDEF) { 4305 RawMask.push_back((uint64_t)SM_SentinelUndef); 4306 continue; 4307 } 4308 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); 4309 if (!CN) 4310 return false; 4311 APInt MaskElement = CN->getAPIntValue(); 4312 4313 // We now have to decode the element which could be any integer size and 4314 // extract each byte of it. 4315 for (int j = 0; j < NumBytesPerElement; ++j) { 4316 // Note that this is x86 and so always little endian: the low byte is 4317 // the first byte of the mask. 
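// For example, a v4i32 mask element 0x80030201 contributes the bytes
// 0x01, 0x02, 0x03, 0x80 to RawMask, in that order (a byte with the top bit
// set tells PSHUFB to zero that lane).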
4318 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); 4319 MaskElement = MaskElement.lshr(8); 4320 } 4321 } 4322 DecodePSHUFBMask(RawMask, Mask); 4323 break; 4324 } 4325 4326 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); 4327 if (!MaskLoad) 4328 return false; 4329 4330 SDValue Ptr = MaskLoad->getBasePtr(); 4331 if (Ptr->getOpcode() == X86ISD::Wrapper) 4332 Ptr = Ptr->getOperand(0); 4333 4334 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); 4335 if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) 4336 return false; 4337 4338 if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { 4339 DecodePSHUFBMask(C, Mask); 4340 if (Mask.empty()) 4341 return false; 4342 break; 4343 } 4344 4345 return false; 4346 } 4347 case X86ISD::VPERMI: 4348 ImmN = N->getOperand(N->getNumOperands()-1); 4349 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4350 IsUnary = true; 4351 break; 4352 case X86ISD::MOVSS: 4353 case X86ISD::MOVSD: 4354 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); 4355 break; 4356 case X86ISD::VPERM2X128: 4357 ImmN = N->getOperand(N->getNumOperands()-1); 4358 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4359 if (Mask.empty()) return false; 4360 break; 4361 case X86ISD::MOVSLDUP: 4362 DecodeMOVSLDUPMask(VT, Mask); 4363 IsUnary = true; 4364 break; 4365 case X86ISD::MOVSHDUP: 4366 DecodeMOVSHDUPMask(VT, Mask); 4367 IsUnary = true; 4368 break; 4369 case X86ISD::MOVDDUP: 4370 DecodeMOVDDUPMask(VT, Mask); 4371 IsUnary = true; 4372 break; 4373 case X86ISD::MOVLHPD: 4374 case X86ISD::MOVLPD: 4375 case X86ISD::MOVLPS: 4376 // Not yet implemented 4377 return false; 4378 default: llvm_unreachable("unknown target shuffle node"); 4379 } 4380 4381 // If we have a fake unary shuffle, the shuffle mask is spread across two 4382 // inputs that are actually the same node. Re-map the mask to always point 4383 // into the first input. 4384 if (IsFakeUnary) 4385 for (int &M : Mask) 4386 if (M >= (int)Mask.size()) 4387 M -= Mask.size(); 4388 4389 return true; 4390 } 4391 4392 /// getShuffleScalarElt - Returns the scalar element that will make up the ith 4393 /// element of the result of the vector shuffle. 4394 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 4395 unsigned Depth) { 4396 if (Depth == 6) 4397 return SDValue(); // Limit search depth. 4398 4399 SDValue V = SDValue(N, 0); 4400 EVT VT = V.getValueType(); 4401 unsigned Opcode = V.getOpcode(); 4402 4403 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4404 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4405 int Elt = SV->getMaskElt(Index); 4406 4407 if (Elt < 0) 4408 return DAG.getUNDEF(VT.getVectorElementType()); 4409 4410 unsigned NumElems = VT.getVectorNumElements(); 4411 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 4412 : SV->getOperand(1); 4413 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 4414 } 4415 4416 // Recurse into target specific vector shuffles to find scalars. 4417 if (isTargetShuffle(Opcode)) { 4418 MVT ShufVT = V.getSimpleValueType(); 4419 unsigned NumElems = ShufVT.getVectorNumElements(); 4420 SmallVector<int, 16> ShuffleMask; 4421 bool IsUnary; 4422 4423 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) 4424 return SDValue(); 4425 4426 int Elt = ShuffleMask[Index]; 4427 if (Elt < 0) 4428 return DAG.getUNDEF(ShufVT.getVectorElementType()); 4429 4430 SDValue NewV = (Elt < (int)NumElems) ? 
N->getOperand(0) 4431 : N->getOperand(1); 4432 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 4433 Depth+1); 4434 } 4435 4436 // Actual nodes that may contain scalar elements 4437 if (Opcode == ISD::BITCAST) { 4438 V = V.getOperand(0); 4439 EVT SrcVT = V.getValueType(); 4440 unsigned NumElems = VT.getVectorNumElements(); 4441 4442 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 4443 return SDValue(); 4444 } 4445 4446 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 4447 return (Index == 0) ? V.getOperand(0) 4448 : DAG.getUNDEF(VT.getVectorElementType()); 4449 4450 if (V.getOpcode() == ISD::BUILD_VECTOR) 4451 return V.getOperand(Index); 4452 4453 return SDValue(); 4454 } 4455 4456 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 4457 /// 4458 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4459 unsigned NumNonZero, unsigned NumZero, 4460 SelectionDAG &DAG, 4461 const X86Subtarget* Subtarget, 4462 const TargetLowering &TLI) { 4463 if (NumNonZero > 8) 4464 return SDValue(); 4465 4466 SDLoc dl(Op); 4467 SDValue V; 4468 bool First = true; 4469 4470 // SSE4.1 - use PINSRB to insert each byte directly. 4471 if (Subtarget->hasSSE41()) { 4472 for (unsigned i = 0; i < 16; ++i) { 4473 bool isNonZero = (NonZeros & (1 << i)) != 0; 4474 if (isNonZero) { 4475 if (First) { 4476 if (NumZero) 4477 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); 4478 else 4479 V = DAG.getUNDEF(MVT::v16i8); 4480 First = false; 4481 } 4482 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4483 MVT::v16i8, V, Op.getOperand(i), 4484 DAG.getIntPtrConstant(i)); 4485 } 4486 } 4487 4488 return V; 4489 } 4490 4491 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 4492 for (unsigned i = 0; i < 16; ++i) { 4493 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4494 if (ThisIsNonZero && First) { 4495 if (NumZero) 4496 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4497 else 4498 V = DAG.getUNDEF(MVT::v8i16); 4499 First = false; 4500 } 4501 4502 if ((i & 1) != 0) { 4503 SDValue ThisElt, LastElt; 4504 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4505 if (LastIsNonZero) { 4506 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4507 MVT::i16, Op.getOperand(i-1)); 4508 } 4509 if (ThisIsNonZero) { 4510 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4511 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4512 ThisElt, DAG.getConstant(8, MVT::i8)); 4513 if (LastIsNonZero) 4514 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4515 } else 4516 ThisElt = LastElt; 4517 4518 if (ThisElt.getNode()) 4519 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4520 DAG.getIntPtrConstant(i/2)); 4521 } 4522 } 4523 4524 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4525 } 4526 4527 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
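/// Only handles the case where at most four of the eight elements are
/// non-zero; each such element is inserted individually (matched as PINSRW).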
4528 /// 4529 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4530 unsigned NumNonZero, unsigned NumZero, 4531 SelectionDAG &DAG, 4532 const X86Subtarget* Subtarget, 4533 const TargetLowering &TLI) { 4534 if (NumNonZero > 4) 4535 return SDValue(); 4536 4537 SDLoc dl(Op); 4538 SDValue V; 4539 bool First = true; 4540 for (unsigned i = 0; i < 8; ++i) { 4541 bool isNonZero = (NonZeros & (1 << i)) != 0; 4542 if (isNonZero) { 4543 if (First) { 4544 if (NumZero) 4545 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4546 else 4547 V = DAG.getUNDEF(MVT::v8i16); 4548 First = false; 4549 } 4550 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4551 MVT::v8i16, V, Op.getOperand(i), 4552 DAG.getIntPtrConstant(i)); 4553 } 4554 } 4555 4556 return V; 4557 } 4558 4559 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. 4560 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, 4561 const X86Subtarget *Subtarget, 4562 const TargetLowering &TLI) { 4563 // Find all zeroable elements. 4564 std::bitset<4> Zeroable; 4565 for (int i=0; i < 4; ++i) { 4566 SDValue Elt = Op->getOperand(i); 4567 Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); 4568 } 4569 assert(Zeroable.size() - Zeroable.count() > 1 && 4570 "We expect at least two non-zero elements!"); 4571 4572 // We only know how to deal with build_vector nodes where elements are either 4573 // zeroable or extract_vector_elt with constant index. 4574 SDValue FirstNonZero; 4575 unsigned FirstNonZeroIdx; 4576 for (unsigned i=0; i < 4; ++i) { 4577 if (Zeroable[i]) 4578 continue; 4579 SDValue Elt = Op->getOperand(i); 4580 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 4581 !isa<ConstantSDNode>(Elt.getOperand(1))) 4582 return SDValue(); 4583 // Make sure that this node is extracting from a 128-bit vector. 4584 MVT VT = Elt.getOperand(0).getSimpleValueType(); 4585 if (!VT.is128BitVector()) 4586 return SDValue(); 4587 if (!FirstNonZero.getNode()) { 4588 FirstNonZero = Elt; 4589 FirstNonZeroIdx = i; 4590 } 4591 } 4592 4593 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); 4594 SDValue V1 = FirstNonZero.getOperand(0); 4595 MVT VT = V1.getSimpleValueType(); 4596 4597 // See if this build_vector can be lowered as a blend with zero. 4598 SDValue Elt; 4599 unsigned EltMaskIdx, EltIdx; 4600 int Mask[4]; 4601 for (EltIdx = 0; EltIdx < 4; ++EltIdx) { 4602 if (Zeroable[EltIdx]) { 4603 // The zero vector will be on the right hand side. 4604 Mask[EltIdx] = EltIdx+4; 4605 continue; 4606 } 4607 4608 Elt = Op->getOperand(EltIdx); 4609 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. 4610 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); 4611 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) 4612 break; 4613 Mask[EltIdx] = EltIdx; 4614 } 4615 4616 if (EltIdx == 4) { 4617 // Let the shuffle legalizer deal with blend operations. 4618 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); 4619 if (V1.getSimpleValueType() != VT) 4620 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); 4621 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); 4622 } 4623 4624 // See if we can lower this build_vector to a INSERTPS. 
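// The INSERTPS immediate computed below encodes the source element in
// bits [7:6], the destination element in bits [5:4] and the zero mask in
// bits [3:0].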
4625 if (!Subtarget->hasSSE41()) 4626 return SDValue(); 4627 4628 SDValue V2 = Elt.getOperand(0); 4629 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) 4630 V1 = SDValue(); 4631 4632 bool CanFold = true; 4633 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { 4634 if (Zeroable[i]) 4635 continue; 4636 4637 SDValue Current = Op->getOperand(i); 4638 SDValue SrcVector = Current->getOperand(0); 4639 if (!V1.getNode()) 4640 V1 = SrcVector; 4641 CanFold = SrcVector == V1 && 4642 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; 4643 } 4644 4645 if (!CanFold) 4646 return SDValue(); 4647 4648 assert(V1.getNode() && "Expected at least two non-zero elements!"); 4649 if (V1.getSimpleValueType() != MVT::v4f32) 4650 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); 4651 if (V2.getSimpleValueType() != MVT::v4f32) 4652 V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); 4653 4654 // Ok, we can emit an INSERTPS instruction. 4655 unsigned ZMask = Zeroable.to_ulong(); 4656 4657 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; 4658 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 4659 SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2, 4660 DAG.getIntPtrConstant(InsertPSMask)); 4661 return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); 4662 } 4663 4664 /// Return a vector logical shift node. 4665 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4666 unsigned NumBits, SelectionDAG &DAG, 4667 const TargetLowering &TLI, SDLoc dl) { 4668 assert(VT.is128BitVector() && "Unknown type for VShift"); 4669 MVT ShVT = MVT::v2i64; 4670 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 4671 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4672 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType()); 4673 assert(NumBits % 8 == 0 && "Only support byte sized shifts"); 4674 SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy); 4675 return DAG.getNode(ISD::BITCAST, dl, VT, 4676 DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); 4677 } 4678 4679 static SDValue 4680 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { 4681 4682 // Check if the scalar load can be widened into a vector load. And if 4683 // the address is "base + cst" see if the cst can be "absorbed" into 4684 // the shuffle mask. 4685 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4686 SDValue Ptr = LD->getBasePtr(); 4687 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4688 return SDValue(); 4689 EVT PVT = LD->getValueType(0); 4690 if (PVT != MVT::i32 && PVT != MVT::f32) 4691 return SDValue(); 4692 4693 int FI = -1; 4694 int64_t Offset = 0; 4695 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4696 FI = FINode->getIndex(); 4697 Offset = 0; 4698 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4699 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4700 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4701 Offset = Ptr.getConstantOperandVal(1); 4702 Ptr = Ptr.getOperand(0); 4703 } else { 4704 return SDValue(); 4705 } 4706 4707 // FIXME: 256-bit vector instructions don't require a strict alignment, 4708 // improve this code to support it better. 4709 unsigned RequiredAlign = VT.getSizeInBits()/8; 4710 SDValue Chain = LD->getChain(); 4711 // Make sure the stack object alignment is at least 16 or 32. 
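// If the underlying object is a fixed frame index we cannot raise its
// alignment, so we have to give up; otherwise we simply bump the alignment so
// the widened load below is safe.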
4712 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4713 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4714 if (MFI->isFixedObjectIndex(FI)) { 4715 // Can't change the alignment. FIXME: It's possible to compute 4716 // the exact stack offset and reference FI + adjust offset instead. 4717 // If someone *really* cares about this. That's the way to implement it. 4718 return SDValue(); 4719 } else { 4720 MFI->setObjectAlignment(FI, RequiredAlign); 4721 } 4722 } 4723 4724 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4725 // Ptr + (Offset & ~15). 4726 if (Offset < 0) 4727 return SDValue(); 4728 if ((Offset % RequiredAlign) & 3) 4729 return SDValue(); 4730 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4731 if (StartOffset) 4732 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), 4733 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4734 4735 int EltNo = (Offset - StartOffset) >> 2; 4736 unsigned NumElems = VT.getVectorNumElements(); 4737 4738 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4739 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4740 LD->getPointerInfo().getWithOffset(StartOffset), 4741 false, false, false, 0); 4742 4743 SmallVector<int, 8> Mask(NumElems, EltNo); 4744 4745 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 4746 } 4747 4748 return SDValue(); 4749 } 4750 4751 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the 4752 /// elements can be replaced by a single large load which has the same value as 4753 /// a build_vector or insert_subvector whose loaded operands are 'Elts'. 4754 /// 4755 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4756 /// 4757 /// FIXME: we'd also like to handle the case where the last elements are zero 4758 /// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4759 /// There's even a handy isZeroNode for that purpose. 4760 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, 4761 SDLoc &DL, SelectionDAG &DAG, 4762 bool isAfterLegalize) { 4763 unsigned NumElems = Elts.size(); 4764 4765 LoadSDNode *LDBase = nullptr; 4766 unsigned LastLoadedElt = -1U; 4767 4768 // For each element in the initializer, see if we've found a load or an undef. 4769 // If we don't find an initial load element, or later load elements are 4770 // non-consecutive, bail out. 4771 for (unsigned i = 0; i < NumElems; ++i) { 4772 SDValue Elt = Elts[i]; 4773 // Look through a bitcast. 4774 if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) 4775 Elt = Elt.getOperand(0); 4776 if (!Elt.getNode() || 4777 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4778 return SDValue(); 4779 if (!LDBase) { 4780 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4781 return SDValue(); 4782 LDBase = cast<LoadSDNode>(Elt.getNode()); 4783 LastLoadedElt = i; 4784 continue; 4785 } 4786 if (Elt.getOpcode() == ISD::UNDEF) 4787 continue; 4788 4789 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4790 EVT LdVT = Elt.getValueType(); 4791 // Each loaded element must be the correct fractional portion of the 4792 // requested vector load. 4793 if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) 4794 return SDValue(); 4795 if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) 4796 return SDValue(); 4797 LastLoadedElt = i; 4798 } 4799 4800 // If we have found an entire vector of loads and undefs, then return a large 4801 // load of the entire vector width starting at the base pointer. 
If we found 4802 // consecutive loads for the low half, generate a vzext_load node. 4803 if (LastLoadedElt == NumElems - 1) { 4804 assert(LDBase && "Did not find base load for merging consecutive loads"); 4805 EVT EltVT = LDBase->getValueType(0); 4806 // Ensure that the input vector size for the merged loads matches the 4807 // cumulative size of the input elements. 4808 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) 4809 return SDValue(); 4810 4811 if (isAfterLegalize && 4812 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) 4813 return SDValue(); 4814 4815 SDValue NewLd = SDValue(); 4816 4817 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4818 LDBase->getPointerInfo(), LDBase->isVolatile(), 4819 LDBase->isNonTemporal(), LDBase->isInvariant(), 4820 LDBase->getAlignment()); 4821 4822 if (LDBase->hasAnyUseOfValue(1)) { 4823 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 4824 SDValue(LDBase, 1), 4825 SDValue(NewLd.getNode(), 1)); 4826 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 4827 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 4828 SDValue(NewLd.getNode(), 1)); 4829 } 4830 4831 return NewLd; 4832 } 4833 4834 //TODO: The code below fires only for for loading the low v2i32 / v2f32 4835 //of a v4i32 / v4f32. It's probably worth generalizing. 4836 EVT EltVT = VT.getVectorElementType(); 4837 if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && 4838 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4839 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4840 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4841 SDValue ResNode = 4842 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, 4843 LDBase->getPointerInfo(), 4844 LDBase->getAlignment(), 4845 false/*isVolatile*/, true/*ReadMem*/, 4846 false/*WriteMem*/); 4847 4848 // Make sure the newly-created LOAD is in the same position as LDBase in 4849 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and 4850 // update uses of LDBase's output chain to use the TokenFactor. 4851 if (LDBase->hasAnyUseOfValue(1)) { 4852 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 4853 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 4854 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 4855 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 4856 SDValue(ResNode.getNode(), 1)); 4857 } 4858 4859 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4860 } 4861 return SDValue(); 4862 } 4863 4864 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 4865 /// to generate a splat value for the following cases: 4866 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 4867 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from 4868 /// a scalar load, or a constant. 4869 /// The VBROADCAST node is returned when a pattern is found, 4870 /// or SDValue() otherwise. 4871 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, 4872 SelectionDAG &DAG) { 4873 // VBROADCAST requires AVX. 4874 // TODO: Splats could be generated for non-AVX CPUs using SSE 4875 // instructions, but there's less potential gain for only 128-bit vectors. 
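// For example, (v8f32 (build_vector (load p), (load p), ..., (load p))) can
// be turned into (v8f32 (X86ISD::VBROADCAST (load p))), which selects to a
// single vbroadcastss from memory.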
4876 if (!Subtarget->hasAVX()) 4877 return SDValue(); 4878 4879 MVT VT = Op.getSimpleValueType(); 4880 SDLoc dl(Op); 4881 4882 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 4883 "Unsupported vector type for broadcast."); 4884 4885 SDValue Ld; 4886 bool ConstSplatVal; 4887 4888 switch (Op.getOpcode()) { 4889 default: 4890 // Unknown pattern found. 4891 return SDValue(); 4892 4893 case ISD::BUILD_VECTOR: { 4894 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); 4895 BitVector UndefElements; 4896 SDValue Splat = BVOp->getSplatValue(&UndefElements); 4897 4898 // We need a splat of a single value to use broadcast, and it doesn't 4899 // make any sense if the value is only in one element of the vector. 4900 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) 4901 return SDValue(); 4902 4903 Ld = Splat; 4904 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 4905 Ld.getOpcode() == ISD::ConstantFP); 4906 4907 // Make sure that all of the users of a non-constant load are from the 4908 // BUILD_VECTOR node. 4909 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) 4910 return SDValue(); 4911 break; 4912 } 4913 4914 case ISD::VECTOR_SHUFFLE: { 4915 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4916 4917 // Shuffles must have a splat mask where the first element is 4918 // broadcasted. 4919 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 4920 return SDValue(); 4921 4922 SDValue Sc = Op.getOperand(0); 4923 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 4924 Sc.getOpcode() != ISD::BUILD_VECTOR) { 4925 4926 if (!Subtarget->hasInt256()) 4927 return SDValue(); 4928 4929 // Use the register form of the broadcast instruction available on AVX2. 4930 if (VT.getSizeInBits() >= 256) 4931 Sc = Extract128BitVector(Sc, 0, DAG, dl); 4932 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 4933 } 4934 4935 Ld = Sc.getOperand(0); 4936 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 4937 Ld.getOpcode() == ISD::ConstantFP); 4938 4939 // The scalar_to_vector node and the suspected 4940 // load node must have exactly one user. 4941 // Constants may have multiple users. 4942 4943 // AVX-512 has register version of the broadcast 4944 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && 4945 Ld.getValueType().getSizeInBits() >= 32; 4946 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 4947 !hasRegVer)) 4948 return SDValue(); 4949 break; 4950 } 4951 } 4952 4953 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 4954 bool IsGE256 = (VT.getSizeInBits() >= 256); 4955 4956 // When optimizing for size, generate up to 5 extra bytes for a broadcast 4957 // instruction to save 8 or more bytes of constant pool data. 4958 // TODO: If multiple splats are generated to load the same constant, 4959 // it may be detrimental to overall size. There needs to be a way to detect 4960 // that condition to know if this is truly a size win. 4961 const Function *F = DAG.getMachineFunction().getFunction(); 4962 bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); 4963 4964 // Handle broadcasting a single constant scalar from the constant pool 4965 // into a vector. 4966 // On Sandybridge (no AVX2), it is still better to load a constant vector 4967 // from the constant pool and not to broadcast it from a scalar. 4968 // But override that restriction when optimizing for size. 4969 // TODO: Check if splatting is recommended for other AVX-capable CPUs. 
4970 if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { 4971 EVT CVT = Ld.getValueType(); 4972 assert(!CVT.isVector() && "Must not broadcast a vector type"); 4973 4974 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. 4975 // For size optimization, also splat v2f64 and v2i64, and for size opt 4976 // with AVX2, also splat i8 and i16. 4977 // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 4978 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || 4979 (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { 4980 const Constant *C = nullptr; 4981 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 4982 C = CI->getConstantIntValue(); 4983 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 4984 C = CF->getConstantFPValue(); 4985 4986 assert(C && "Invalid constant type"); 4987 4988 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 4989 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 4990 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 4991 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 4992 MachinePointerInfo::getConstantPool(), 4993 false, false, false, Alignment); 4994 4995 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 4996 } 4997 } 4998 4999 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5000 5001 // Handle AVX2 in-register broadcasts. 5002 if (!IsLoad && Subtarget->hasInt256() && 5003 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) 5004 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5005 5006 // The scalar source must be a normal load. 5007 if (!IsLoad) 5008 return SDValue(); 5009 5010 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || 5011 (Subtarget->hasVLX() && ScalarSize == 64)) 5012 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5013 5014 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 5015 // double since there is no vbroadcastsd xmm 5016 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { 5017 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5018 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5019 } 5020 5021 // Unsupported broadcast. 5022 return SDValue(); 5023 } 5024 5025 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real 5026 /// underlying vector and index. 5027 /// 5028 /// Modifies \p ExtractedFromVec to the real vector and returns the real 5029 /// index. 5030 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, 5031 SDValue ExtIdx) { 5032 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); 5033 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) 5034 return Idx; 5035 5036 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already 5037 // lowered this: 5038 // (extract_vector_elt (v8f32 %vreg1), Constant<6>) 5039 // to: 5040 // (extract_vector_elt (vector_shuffle<2,u,u,u> 5041 // (extract_subvector (v8f32 %vreg0), Constant<4>), 5042 // undef) 5043 // Constant<0>) 5044 // In this case the vector is the extract_subvector expression and the index 5045 // is 2, as specified by the shuffle. 
5046 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); 5047 SDValue ShuffleVec = SVOp->getOperand(0); 5048 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); 5049 assert(ShuffleVecVT.getVectorElementType() == 5050 ExtractedFromVec.getSimpleValueType().getVectorElementType()); 5051 5052 int ShuffleIdx = SVOp->getMaskElt(Idx); 5053 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { 5054 ExtractedFromVec = ShuffleVec; 5055 return ShuffleIdx; 5056 } 5057 return Idx; 5058 } 5059 5060 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { 5061 MVT VT = Op.getSimpleValueType(); 5062 5063 // Skip if insert_vec_elt is not supported. 5064 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5065 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 5066 return SDValue(); 5067 5068 SDLoc DL(Op); 5069 unsigned NumElems = Op.getNumOperands(); 5070 5071 SDValue VecIn1; 5072 SDValue VecIn2; 5073 SmallVector<unsigned, 4> InsertIndices; 5074 SmallVector<int, 8> Mask(NumElems, -1); 5075 5076 for (unsigned i = 0; i != NumElems; ++i) { 5077 unsigned Opc = Op.getOperand(i).getOpcode(); 5078 5079 if (Opc == ISD::UNDEF) 5080 continue; 5081 5082 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 5083 // Quit if more than 1 elements need inserting. 5084 if (InsertIndices.size() > 1) 5085 return SDValue(); 5086 5087 InsertIndices.push_back(i); 5088 continue; 5089 } 5090 5091 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 5092 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 5093 // Quit if non-constant index. 5094 if (!isa<ConstantSDNode>(ExtIdx)) 5095 return SDValue(); 5096 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); 5097 5098 // Quit if extracted from vector of different type. 5099 if (ExtractedFromVec.getValueType() != VT) 5100 return SDValue(); 5101 5102 if (!VecIn1.getNode()) 5103 VecIn1 = ExtractedFromVec; 5104 else if (VecIn1 != ExtractedFromVec) { 5105 if (!VecIn2.getNode()) 5106 VecIn2 = ExtractedFromVec; 5107 else if (VecIn2 != ExtractedFromVec) 5108 // Quit if more than 2 vectors to shuffle 5109 return SDValue(); 5110 } 5111 5112 if (ExtractedFromVec == VecIn1) 5113 Mask[i] = Idx; 5114 else if (ExtractedFromVec == VecIn2) 5115 Mask[i] = Idx + NumElems; 5116 } 5117 5118 if (!VecIn1.getNode()) 5119 return SDValue(); 5120 5121 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); 5122 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); 5123 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { 5124 unsigned Idx = InsertIndices[i]; 5125 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), 5126 DAG.getIntPtrConstant(Idx)); 5127 } 5128 5129 return NV; 5130 } 5131 5132 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
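// These are AVX-512 mask register types. The cases handled below are: an
// all-constant vector (materialized from a scalar immediate), a vector with a
// single non-constant element (a single INSERT_VECTOR_ELT into the constant
// part), and a splat of one i1 value (a SELECT between two scalar immediates).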
5133 SDValue 5134 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { 5135 5136 MVT VT = Op.getSimpleValueType(); 5137 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && 5138 "Unexpected type in LowerBUILD_VECTORvXi1!"); 5139 5140 SDLoc dl(Op); 5141 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5142 SDValue Cst = DAG.getTargetConstant(0, MVT::i1); 5143 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); 5144 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 5145 } 5146 5147 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5148 SDValue Cst = DAG.getTargetConstant(1, MVT::i1); 5149 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); 5150 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 5151 } 5152 5153 bool AllContants = true; 5154 uint64_t Immediate = 0; 5155 int NonConstIdx = -1; 5156 bool IsSplat = true; 5157 unsigned NumNonConsts = 0; 5158 unsigned NumConsts = 0; 5159 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { 5160 SDValue In = Op.getOperand(idx); 5161 if (In.getOpcode() == ISD::UNDEF) 5162 continue; 5163 if (!isa<ConstantSDNode>(In)) { 5164 AllContants = false; 5165 NonConstIdx = idx; 5166 NumNonConsts++; 5167 } else { 5168 NumConsts++; 5169 if (cast<ConstantSDNode>(In)->getZExtValue()) 5170 Immediate |= (1ULL << idx); 5171 } 5172 if (In != Op.getOperand(0)) 5173 IsSplat = false; 5174 } 5175 5176 if (AllContants) { 5177 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, 5178 DAG.getConstant(Immediate, MVT::i16)); 5179 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, 5180 DAG.getIntPtrConstant(0)); 5181 } 5182 5183 if (NumNonConsts == 1 && NonConstIdx != 0) { 5184 SDValue DstVec; 5185 if (NumConsts) { 5186 SDValue VecAsImm = DAG.getConstant(Immediate, 5187 MVT::getIntegerVT(VT.getSizeInBits())); 5188 DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); 5189 } 5190 else 5191 DstVec = DAG.getUNDEF(VT); 5192 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 5193 Op.getOperand(NonConstIdx), 5194 DAG.getIntPtrConstant(NonConstIdx)); 5195 } 5196 if (!IsSplat && (NonConstIdx != 0)) 5197 llvm_unreachable("Unsupported BUILD_VECTOR operation"); 5198 MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; 5199 SDValue Select; 5200 if (IsSplat) 5201 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), 5202 DAG.getConstant(-1, SelectVT), 5203 DAG.getConstant(0, SelectVT)); 5204 else 5205 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), 5206 DAG.getConstant((Immediate | 1), SelectVT), 5207 DAG.getConstant(Immediate, SelectVT)); 5208 return DAG.getNode(ISD::BITCAST, dl, VT, Select); 5209 } 5210 5211 /// \brief Return true if \p N implements a horizontal binop and return the 5212 /// operands for the horizontal binop into V0 and V1. 5213 /// 5214 /// This is a helper function of PerformBUILD_VECTORCombine. 5215 /// This function checks that the build_vector \p N in input implements a 5216 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal 5217 /// operation to match. 5218 /// For example, if \p Opcode is equal to ISD::ADD, then this function 5219 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode 5220 /// is equal to ISD::SUB, then this function checks if this is a horizontal 5221 /// arithmetic sub. 5222 /// 5223 /// This function only analyzes elements of \p N whose indices are 5224 /// in range [BaseIdx, LastIdx). 
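/// For example, with \p Opcode == ISD::FADD, \p BaseIdx == 0 and
/// \p LastIdx == 4, the v4f32 build_vector
///   (fadd (extract A, 0), (extract A, 1)), (fadd (extract A, 2), (extract A, 3)),
///   (fadd (extract B, 0), (extract B, 1)), (fadd (extract B, 2), (extract B, 3))
/// matches, and \p V0 and \p V1 are set to A and B respectively.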
5225 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, 5226 SelectionDAG &DAG, 5227 unsigned BaseIdx, unsigned LastIdx, 5228 SDValue &V0, SDValue &V1) { 5229 EVT VT = N->getValueType(0); 5230 5231 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); 5232 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && 5233 "Invalid Vector in input!"); 5234 5235 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); 5236 bool CanFold = true; 5237 unsigned ExpectedVExtractIdx = BaseIdx; 5238 unsigned NumElts = LastIdx - BaseIdx; 5239 V0 = DAG.getUNDEF(VT); 5240 V1 = DAG.getUNDEF(VT); 5241 5242 // Check if N implements a horizontal binop. 5243 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { 5244 SDValue Op = N->getOperand(i + BaseIdx); 5245 5246 // Skip UNDEFs. 5247 if (Op->getOpcode() == ISD::UNDEF) { 5248 // Update the expected vector extract index. 5249 if (i * 2 == NumElts) 5250 ExpectedVExtractIdx = BaseIdx; 5251 ExpectedVExtractIdx += 2; 5252 continue; 5253 } 5254 5255 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); 5256 5257 if (!CanFold) 5258 break; 5259 5260 SDValue Op0 = Op.getOperand(0); 5261 SDValue Op1 = Op.getOperand(1); 5262 5263 // Try to match the following pattern: 5264 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) 5265 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 5266 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 5267 Op0.getOperand(0) == Op1.getOperand(0) && 5268 isa<ConstantSDNode>(Op0.getOperand(1)) && 5269 isa<ConstantSDNode>(Op1.getOperand(1))); 5270 if (!CanFold) 5271 break; 5272 5273 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); 5274 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); 5275 5276 if (i * 2 < NumElts) { 5277 if (V0.getOpcode() == ISD::UNDEF) 5278 V0 = Op0.getOperand(0); 5279 } else { 5280 if (V1.getOpcode() == ISD::UNDEF) 5281 V1 = Op0.getOperand(0); 5282 if (i * 2 == NumElts) 5283 ExpectedVExtractIdx = BaseIdx; 5284 } 5285 5286 SDValue Expected = (i * 2 < NumElts) ? V0 : V1; 5287 if (I0 == ExpectedVExtractIdx) 5288 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; 5289 else if (IsCommutable && I1 == ExpectedVExtractIdx) { 5290 // Try to match the following dag sequence: 5291 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) 5292 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; 5293 } else 5294 CanFold = false; 5295 5296 ExpectedVExtractIdx += 2; 5297 } 5298 5299 return CanFold; 5300 } 5301 5302 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by 5303 /// a concat_vector. 5304 /// 5305 /// This is a helper function of PerformBUILD_VECTORCombine. 5306 /// This function expects two 256-bit vectors called V0 and V1. 5307 /// At first, each vector is split into two separate 128-bit vectors. 5308 /// Then, the resulting 128-bit vectors are used to implement two 5309 /// horizontal binary operations. 5310 /// 5311 /// The kind of horizontal binary operation is defined by \p X86Opcode. 5312 /// 5313 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to 5314 /// the two new horizontal binop. 5315 /// When Mode is set, the first horizontal binop dag node would take as input 5316 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second 5317 /// horizontal binop dag node would take as input the lower 128-bit of V1 5318 /// and the upper 128-bit of V1. 
5319 /// Example: 5320 /// HADD V0_LO, V0_HI 5321 /// HADD V1_LO, V1_HI 5322 /// 5323 /// Otherwise, the first horizontal binop dag node takes as input the lower 5324 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop 5325 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. 5326 /// Example: 5327 /// HADD V0_LO, V1_LO 5328 /// HADD V0_HI, V1_HI 5329 /// 5330 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower 5331 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to 5332 /// the upper 128-bits of the result. 5333 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, 5334 SDLoc DL, SelectionDAG &DAG, 5335 unsigned X86Opcode, bool Mode, 5336 bool isUndefLO, bool isUndefHI) { 5337 EVT VT = V0.getValueType(); 5338 assert(VT.is256BitVector() && VT == V1.getValueType() && 5339 "Invalid nodes in input!"); 5340 5341 unsigned NumElts = VT.getVectorNumElements(); 5342 SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); 5343 SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); 5344 SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); 5345 SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); 5346 EVT NewVT = V0_LO.getValueType(); 5347 5348 SDValue LO = DAG.getUNDEF(NewVT); 5349 SDValue HI = DAG.getUNDEF(NewVT); 5350 5351 if (Mode) { 5352 // Don't emit a horizontal binop if the result is expected to be UNDEF. 5353 if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) 5354 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); 5355 if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) 5356 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); 5357 } else { 5358 // Don't emit a horizontal binop if the result is expected to be UNDEF. 5359 if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || 5360 V1_LO->getOpcode() != ISD::UNDEF)) 5361 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); 5362 5363 if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || 5364 V1_HI->getOpcode() != ISD::UNDEF)) 5365 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); 5366 } 5367 5368 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); 5369 } 5370 5371 /// \brief Try to fold a build_vector that performs an 'addsub' into the 5372 /// sequence of 'vadd + vsub + blendi'. 5373 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, 5374 const X86Subtarget *Subtarget) { 5375 SDLoc DL(BV); 5376 EVT VT = BV->getValueType(0); 5377 unsigned NumElts = VT.getVectorNumElements(); 5378 SDValue InVec0 = DAG.getUNDEF(VT); 5379 SDValue InVec1 = DAG.getUNDEF(VT); 5380 5381 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || 5382 VT == MVT::v2f64) && "build_vector with an invalid type found!"); 5383 5384 // Odd-numbered elements in the input build vector are obtained from 5385 // adding two integer/float elements. 5386 // Even-numbered elements in the input build vector are obtained from 5387 // subtracting two integer/float elements. 5388 unsigned ExpectedOpcode = ISD::FSUB; 5389 unsigned NextExpectedOpcode = ISD::FADD; 5390 bool AddFound = false; 5391 bool SubFound = false; 5392 5393 for (unsigned i = 0, e = NumElts; i != e; ++i) { 5394 SDValue Op = BV->getOperand(i); 5395 5396 // Skip 'undef' values. 5397 unsigned Opcode = Op.getOpcode(); 5398 if (Opcode == ISD::UNDEF) { 5399 std::swap(ExpectedOpcode, NextExpectedOpcode); 5400 continue; 5401 } 5402 5403 // Early exit if we found an unexpected opcode. 
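// (The elements must strictly alternate FSUB, FADD, FSUB, FADD, ... starting
// with an FSUB at index 0, modulo undef elements.)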
5404 if (Opcode != ExpectedOpcode) 5405 return SDValue(); 5406 5407 SDValue Op0 = Op.getOperand(0); 5408 SDValue Op1 = Op.getOperand(1); 5409 5410 // Try to match the following pattern: 5411 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) 5412 // Early exit if we cannot match that sequence. 5413 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5414 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5415 !isa<ConstantSDNode>(Op0.getOperand(1)) || 5416 !isa<ConstantSDNode>(Op1.getOperand(1)) || 5417 Op0.getOperand(1) != Op1.getOperand(1)) 5418 return SDValue(); 5419 5420 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); 5421 if (I0 != i) 5422 return SDValue(); 5423 5424 // We found a valid add/sub node. Update the information accordingly. 5425 if (i & 1) 5426 AddFound = true; 5427 else 5428 SubFound = true; 5429 5430 // Update InVec0 and InVec1. 5431 if (InVec0.getOpcode() == ISD::UNDEF) 5432 InVec0 = Op0.getOperand(0); 5433 if (InVec1.getOpcode() == ISD::UNDEF) 5434 InVec1 = Op1.getOperand(0); 5435 5436 // Make sure that operands in input to each add/sub node always 5437 // come from a same pair of vectors. 5438 if (InVec0 != Op0.getOperand(0)) { 5439 if (ExpectedOpcode == ISD::FSUB) 5440 return SDValue(); 5441 5442 // FADD is commutable. Try to commute the operands 5443 // and then test again. 5444 std::swap(Op0, Op1); 5445 if (InVec0 != Op0.getOperand(0)) 5446 return SDValue(); 5447 } 5448 5449 if (InVec1 != Op1.getOperand(0)) 5450 return SDValue(); 5451 5452 // Update the pair of expected opcodes. 5453 std::swap(ExpectedOpcode, NextExpectedOpcode); 5454 } 5455 5456 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. 5457 if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && 5458 InVec1.getOpcode() != ISD::UNDEF) 5459 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); 5460 5461 return SDValue(); 5462 } 5463 5464 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, 5465 const X86Subtarget *Subtarget) { 5466 SDLoc DL(N); 5467 EVT VT = N->getValueType(0); 5468 unsigned NumElts = VT.getVectorNumElements(); 5469 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N); 5470 SDValue InVec0, InVec1; 5471 5472 // Try to match an ADDSUB. 5473 if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 5474 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { 5475 SDValue Value = matchAddSub(BV, DAG, Subtarget); 5476 if (Value.getNode()) 5477 return Value; 5478 } 5479 5480 // Try to match horizontal ADD/SUB. 5481 unsigned NumUndefsLO = 0; 5482 unsigned NumUndefsHI = 0; 5483 unsigned Half = NumElts/2; 5484 5485 // Count the number of UNDEF operands in the build_vector in input. 5486 for (unsigned i = 0, e = Half; i != e; ++i) 5487 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) 5488 NumUndefsLO++; 5489 5490 for (unsigned i = Half, e = NumElts; i != e; ++i) 5491 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) 5492 NumUndefsHI++; 5493 5494 // Early exit if this is either a build_vector of all UNDEFs or all the 5495 // operands but one are UNDEF. 5496 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) 5497 return SDValue(); 5498 5499 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { 5500 // Try to match an SSE3 float HADD/HSUB. 
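// For v4f32, a successful match means the build_vector computes
// <A0+A1, A2+A3, B0+B1, B2+B3>, which is exactly (X86ISD::FHADD A, B).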
5501 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) 5502 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); 5503 5504 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) 5505 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); 5506 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { 5507 // Try to match an SSSE3 integer HADD/HSUB. 5508 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) 5509 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); 5510 5511 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) 5512 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); 5513 } 5514 5515 if (!Subtarget->hasAVX()) 5516 return SDValue(); 5517 5518 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { 5519 // Try to match an AVX horizontal add/sub of packed single/double 5520 // precision floating point values from 256-bit vectors. 5521 SDValue InVec2, InVec3; 5522 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && 5523 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && 5524 ((InVec0.getOpcode() == ISD::UNDEF || 5525 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 5526 ((InVec1.getOpcode() == ISD::UNDEF || 5527 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 5528 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); 5529 5530 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && 5531 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && 5532 ((InVec0.getOpcode() == ISD::UNDEF || 5533 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 5534 ((InVec1.getOpcode() == ISD::UNDEF || 5535 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 5536 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); 5537 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { 5538 // Try to match an AVX2 horizontal add/sub of signed integers. 5539 SDValue InVec2, InVec3; 5540 unsigned X86Opcode; 5541 bool CanFold = true; 5542 5543 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && 5544 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && 5545 ((InVec0.getOpcode() == ISD::UNDEF || 5546 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 5547 ((InVec1.getOpcode() == ISD::UNDEF || 5548 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 5549 X86Opcode = X86ISD::HADD; 5550 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && 5551 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && 5552 ((InVec0.getOpcode() == ISD::UNDEF || 5553 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 5554 ((InVec1.getOpcode() == ISD::UNDEF || 5555 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 5556 X86Opcode = X86ISD::HSUB; 5557 else 5558 CanFold = false; 5559 5560 if (CanFold) { 5561 // Fold this build_vector into a single horizontal add/sub. 5562 // Do this only if the target has AVX2. 5563 if (Subtarget->hasAVX2()) 5564 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); 5565 5566 // Do not try to expand this build_vector into a pair of horizontal 5567 // add/sub if we can emit a pair of scalar add/sub. 5568 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) 5569 return SDValue(); 5570 5571 // Convert this build_vector into a pair of horizontal binop followed by 5572 // a concat vector. 
5573 bool isUndefLO = NumUndefsLO == Half; 5574 bool isUndefHI = NumUndefsHI == Half; 5575 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, 5576 isUndefLO, isUndefHI); 5577 } 5578 } 5579 5580 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || 5581 VT == MVT::v16i16) && Subtarget->hasAVX()) { 5582 unsigned X86Opcode; 5583 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) 5584 X86Opcode = X86ISD::HADD; 5585 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) 5586 X86Opcode = X86ISD::HSUB; 5587 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) 5588 X86Opcode = X86ISD::FHADD; 5589 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) 5590 X86Opcode = X86ISD::FHSUB; 5591 else 5592 return SDValue(); 5593 5594 // Don't try to expand this build_vector into a pair of horizontal add/sub 5595 // if we can simply emit a pair of scalar add/sub. 5596 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) 5597 return SDValue(); 5598 5599 // Convert this build_vector into two horizontal add/sub followed by 5600 // a concat vector. 5601 bool isUndefLO = NumUndefsLO == Half; 5602 bool isUndefHI = NumUndefsHI == Half; 5603 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, 5604 isUndefLO, isUndefHI); 5605 } 5606 5607 return SDValue(); 5608 } 5609 5610 SDValue 5611 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5612 SDLoc dl(Op); 5613 5614 MVT VT = Op.getSimpleValueType(); 5615 MVT ExtVT = VT.getVectorElementType(); 5616 unsigned NumElems = Op.getNumOperands(); 5617 5618 // Generate vectors for predicate vectors. 5619 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) 5620 return LowerBUILD_VECTORvXi1(Op, DAG); 5621 5622 // Vectors containing all zeros can be matched by pxor and xorps later 5623 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5624 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5625 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5626 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) 5627 return Op; 5628 5629 return getZeroVector(VT, Subtarget, DAG, dl); 5630 } 5631 5632 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5633 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5634 // vpcmpeqd on 256-bit vectors. 5635 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 5636 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 5637 return Op; 5638 5639 if (!VT.is512BitVector()) 5640 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 5641 } 5642 5643 if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) 5644 return Broadcast; 5645 5646 unsigned EVTBits = ExtVT.getSizeInBits(); 5647 5648 unsigned NumZero = 0; 5649 unsigned NumNonZero = 0; 5650 unsigned NonZeros = 0; 5651 bool IsAllConstants = true; 5652 SmallSet<SDValue, 8> Values; 5653 for (unsigned i = 0; i < NumElems; ++i) { 5654 SDValue Elt = Op.getOperand(i); 5655 if (Elt.getOpcode() == ISD::UNDEF) 5656 continue; 5657 Values.insert(Elt); 5658 if (Elt.getOpcode() != ISD::Constant && 5659 Elt.getOpcode() != ISD::ConstantFP) 5660 IsAllConstants = false; 5661 if (X86::isZeroNode(Elt)) 5662 NumZero++; 5663 else { 5664 NonZeros |= (1 << i); 5665 NumNonZero++; 5666 } 5667 } 5668 5669 // All undef vector. Return an UNDEF. All zero vectors were handled above. 
5670 if (NumNonZero == 0) 5671 return DAG.getUNDEF(VT); 5672 5673 // Special case for single non-zero, non-undef, element. 5674 if (NumNonZero == 1) { 5675 unsigned Idx = countTrailingZeros(NonZeros); 5676 SDValue Item = Op.getOperand(Idx); 5677 5678 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5679 // the value are obviously zero, truncate the value to i32 and do the 5680 // insertion that way. Only do this if the value is non-constant or if the 5681 // value is a constant being inserted into element 0. It is cheaper to do 5682 // a constant pool load than it is to do a movd + shuffle. 5683 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5684 (!IsAllConstants || Idx == 0)) { 5685 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5686 // Handle SSE only. 5687 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5688 EVT VecVT = MVT::v4i32; 5689 5690 // Truncate the value (which may itself be a constant) to i32, and 5691 // convert it to a vector with movd (S2V+shuffle to zero extend). 5692 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5693 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5694 return DAG.getNode( 5695 ISD::BITCAST, dl, VT, 5696 getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); 5697 } 5698 } 5699 5700 // If we have a constant or non-constant insertion into the low element of 5701 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5702 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5703 // depending on what the source datatype is. 5704 if (Idx == 0) { 5705 if (NumZero == 0) 5706 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5707 5708 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5709 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5710 if (VT.is512BitVector()) { 5711 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5712 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5713 Item, DAG.getIntPtrConstant(0)); 5714 } 5715 assert((VT.is128BitVector() || VT.is256BitVector()) && 5716 "Expected an SSE value type!"); 5717 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5718 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5719 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5720 } 5721 5722 // We can't directly insert an i8 or i16 into a vector, so zero extend 5723 // it to i32 first. 5724 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5725 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5726 if (VT.is256BitVector()) { 5727 if (Subtarget->hasAVX()) { 5728 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); 5729 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5730 } else { 5731 // Without AVX, we need to extend to a 128-bit vector and then 5732 // insert into the 256-bit vector. 5733 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5734 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5735 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5736 } 5737 } else { 5738 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5739 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5740 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5741 } 5742 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5743 } 5744 } 5745 5746 // Is it a vector logical left shift? 
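// i.e. a two-element build_vector <0, X> with X non-zero, which is equivalent
// to shifting (scalar_to_vector X) left by half the vector width (emitted
// below as a whole-register byte shift, VSHLDQ).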
5747 if (NumElems == 2 && Idx == 1 && 5748 X86::isZeroNode(Op.getOperand(0)) && 5749 !X86::isZeroNode(Op.getOperand(1))) { 5750 unsigned NumBits = VT.getSizeInBits(); 5751 return getVShift(true, VT, 5752 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5753 VT, Op.getOperand(1)), 5754 NumBits/2, DAG, *this, dl); 5755 } 5756 5757 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5758 return SDValue(); 5759 5760 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5761 // is a non-constant being inserted into an element other than the low one, 5762 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5763 // movd/movss) to move this into the low element, then shuffle it into 5764 // place. 5765 if (EVTBits == 32) { 5766 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5767 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); 5768 } 5769 } 5770 5771 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5772 if (Values.size() == 1) { 5773 if (EVTBits == 32) { 5774 // Instead of a shuffle like this: 5775 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5776 // Check if it's possible to issue this instead. 5777 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5778 unsigned Idx = countTrailingZeros(NonZeros); 5779 SDValue Item = Op.getOperand(Idx); 5780 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5781 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5782 } 5783 return SDValue(); 5784 } 5785 5786 // A vector full of immediates; various special cases are already 5787 // handled, so this is best done with a single constant-pool load. 5788 if (IsAllConstants) 5789 return SDValue(); 5790 5791 // For AVX-length vectors, see if we can use a vector load to get all of the 5792 // elements, otherwise build the individual 128-bit pieces and use 5793 // shuffles to put them in place. 5794 if (VT.is256BitVector() || VT.is512BitVector()) { 5795 SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems); 5796 5797 // Check for a build vector of consecutive loads. 5798 if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) 5799 return LD; 5800 5801 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5802 5803 // Build both the lower and upper subvector. 5804 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, 5805 makeArrayRef(&V[0], NumElems/2)); 5806 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, 5807 makeArrayRef(&V[NumElems / 2], NumElems/2)); 5808 5809 // Recreate the wider vector with the lower and upper part. 5810 if (VT.is256BitVector()) 5811 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 5812 return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 5813 } 5814 5815 // Let legalizer expand 2-wide build_vectors. 5816 if (EVTBits == 64) { 5817 if (NumNonZero == 1) { 5818 // One half is zero or undef. 5819 unsigned Idx = countTrailingZeros(NonZeros); 5820 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5821 Op.getOperand(Idx)); 5822 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 5823 } 5824 return SDValue(); 5825 } 5826 5827 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
5828 if (EVTBits == 8 && NumElems == 16) 5829 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5830 Subtarget, *this)) 5831 return V; 5832 5833 if (EVTBits == 16 && NumElems == 8) 5834 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5835 Subtarget, *this)) 5836 return V; 5837 5838 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS 5839 if (EVTBits == 32 && NumElems == 4) 5840 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) 5841 return V; 5842 5843 // If element VT is == 32 bits, turn it into a number of shuffles. 5844 SmallVector<SDValue, 8> V(NumElems); 5845 if (NumElems == 4 && NumZero > 0) { 5846 for (unsigned i = 0; i < 4; ++i) { 5847 bool isZero = !(NonZeros & (1 << i)); 5848 if (isZero) 5849 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 5850 else 5851 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5852 } 5853 5854 for (unsigned i = 0; i < 2; ++i) { 5855 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5856 default: break; 5857 case 0: 5858 V[i] = V[i*2]; // Must be a zero vector. 5859 break; 5860 case 1: 5861 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5862 break; 5863 case 2: 5864 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5865 break; 5866 case 3: 5867 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5868 break; 5869 } 5870 } 5871 5872 bool Reverse1 = (NonZeros & 0x3) == 2; 5873 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5874 int MaskVec[] = { 5875 Reverse1 ? 1 : 0, 5876 Reverse1 ? 0 : 1, 5877 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 5878 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 5879 }; 5880 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5881 } 5882 5883 if (Values.size() > 1 && VT.is128BitVector()) { 5884 // Check for a build vector of consecutive loads. 5885 for (unsigned i = 0; i < NumElems; ++i) 5886 V[i] = Op.getOperand(i); 5887 5888 // Check for elements which are consecutive loads. 5889 if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) 5890 return LD; 5891 5892 // Check for a build vector from mostly shuffle plus few inserting. 5893 if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) 5894 return Sh; 5895 5896 // For SSE 4.1, use insertps to put the high elements into the low element. 5897 if (Subtarget->hasSSE41()) { 5898 SDValue Result; 5899 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5900 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5901 else 5902 Result = DAG.getUNDEF(VT); 5903 5904 for (unsigned i = 1; i < NumElems; ++i) { 5905 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5906 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5907 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5908 } 5909 return Result; 5910 } 5911 5912 // Otherwise, expand into a number of unpckl*, start by extending each of 5913 // our (non-undef) elements to the full vector width with the element in the 5914 // bottom slot of the vector (which generates no code for SSE). 5915 for (unsigned i = 0; i < NumElems; ++i) { 5916 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5917 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5918 else 5919 V[i] = DAG.getUNDEF(VT); 5920 } 5921 5922 // Next, we iteratively mix elements, e.g. 
for v4f32: 5923 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5924 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5925 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5926 unsigned EltStride = NumElems >> 1; 5927 while (EltStride != 0) { 5928 for (unsigned i = 0; i < EltStride; ++i) { 5929 // If V[i+EltStride] is undef and this is the first round of mixing, 5930 // then it is safe to just drop this shuffle: V[i] is already in the 5931 // right place, the one element (since it's the first round) being 5932 // inserted as undef can be dropped. This isn't safe for successive 5933 // rounds because they will permute elements within both vectors. 5934 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5935 EltStride == NumElems/2) 5936 continue; 5937 5938 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5939 } 5940 EltStride >>= 1; 5941 } 5942 return V[0]; 5943 } 5944 return SDValue(); 5945 } 5946 5947 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5948 // to create 256-bit vectors from two other 128-bit ones. 5949 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5950 SDLoc dl(Op); 5951 MVT ResVT = Op.getSimpleValueType(); 5952 5953 assert((ResVT.is256BitVector() || 5954 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); 5955 5956 SDValue V1 = Op.getOperand(0); 5957 SDValue V2 = Op.getOperand(1); 5958 unsigned NumElems = ResVT.getVectorNumElements(); 5959 if (ResVT.is256BitVector()) 5960 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 5961 5962 if (Op.getNumOperands() == 4) { 5963 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), 5964 ResVT.getVectorNumElements()/2); 5965 SDValue V3 = Op.getOperand(2); 5966 SDValue V4 = Op.getOperand(3); 5967 return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), 5968 Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); 5969 } 5970 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 5971 } 5972 5973 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, 5974 const X86Subtarget *Subtarget, 5975 SelectionDAG & DAG) { 5976 SDLoc dl(Op); 5977 MVT ResVT = Op.getSimpleValueType(); 5978 unsigned NumOfOperands = Op.getNumOperands(); 5979 5980 assert(isPowerOf2_32(NumOfOperands) && 5981 "Unexpected number of operands in CONCAT_VECTORS"); 5982 5983 if (NumOfOperands > 2) { 5984 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), 5985 ResVT.getVectorNumElements()/2); 5986 SmallVector<SDValue, 2> Ops; 5987 for (unsigned i = 0; i < NumOfOperands/2; i++) 5988 Ops.push_back(Op.getOperand(i)); 5989 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); 5990 Ops.clear(); 5991 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) 5992 Ops.push_back(Op.getOperand(i)); 5993 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); 5994 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); 5995 } 5996 5997 SDValue V1 = Op.getOperand(0); 5998 SDValue V2 = Op.getOperand(1); 5999 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); 6000 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); 6001 6002 if (IsZeroV1 && IsZeroV2) 6003 return getZeroVector(ResVT, Subtarget, DAG, dl); 6004 6005 SDValue ZeroIdx = DAG.getIntPtrConstant(0); 6006 SDValue Undef = DAG.getUNDEF(ResVT); 6007 unsigned NumElems = ResVT.getVectorNumElements(); 6008 SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8); 6009 6010 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); 6011 V2 = DAG.getNode(X86ISD::VSHLI, dl, 
ResVT, V2, ShiftBits); 6012 if (IsZeroV1) 6013 return V2; 6014 6015 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); 6016 // Zero the upper bits of V1 6017 V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); 6018 V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); 6019 if (IsZeroV2) 6020 return V1; 6021 return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); 6022 } 6023 6024 static SDValue LowerCONCAT_VECTORS(SDValue Op, 6025 const X86Subtarget *Subtarget, 6026 SelectionDAG &DAG) { 6027 MVT VT = Op.getSimpleValueType(); 6028 if (VT.getVectorElementType() == MVT::i1) 6029 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); 6030 6031 assert((VT.is256BitVector() && Op.getNumOperands() == 2) || 6032 (VT.is512BitVector() && (Op.getNumOperands() == 2 || 6033 Op.getNumOperands() == 4))); 6034 6035 // AVX can use the vinsertf128 instruction to create 256-bit vectors 6036 // from two other 128-bit ones. 6037 6038 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors 6039 return LowerAVXCONCAT_VECTORS(Op, DAG); 6040 } 6041 6042 6043 //===----------------------------------------------------------------------===// 6044 // Vector shuffle lowering 6045 // 6046 // This is an experimental code path for lowering vector shuffles on x86. It is 6047 // designed to handle arbitrary vector shuffles and blends, gracefully 6048 // degrading performance as necessary. It works hard to recognize idiomatic 6049 // shuffles and lower them to optimal instruction patterns without leaving 6050 // a framework that allows reasonably efficient handling of all vector shuffle 6051 // patterns. 6052 //===----------------------------------------------------------------------===// 6053 6054 /// \brief Tiny helper function to identify a no-op mask. 6055 /// 6056 /// This is a somewhat boring predicate function. It checks whether the mask 6057 /// array input, which is assumed to be a single-input shuffle mask of the kind 6058 /// used by the X86 shuffle instructions (not a fully general 6059 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an 6060 /// in-place shuffle are 'no-op's. 6061 static bool isNoopShuffleMask(ArrayRef<int> Mask) { 6062 for (int i = 0, Size = Mask.size(); i < Size; ++i) 6063 if (Mask[i] != -1 && Mask[i] != i) 6064 return false; 6065 return true; 6066 } 6067 6068 /// \brief Helper function to classify a mask as a single-input mask. 6069 /// 6070 /// This isn't a generic single-input test because in the vector shuffle 6071 /// lowering we canonicalize single inputs to be the first input operand. This 6072 /// means we can more quickly test for a single input by only checking whether 6073 /// an input from the second operand exists. We also assume that the size of 6074 /// mask corresponds to the size of the input vectors which isn't true in the 6075 /// fully general case. 6076 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { 6077 for (int M : Mask) 6078 if (M >= (int)Mask.size()) 6079 return false; 6080 return true; 6081 } 6082 6083 /// \brief Test whether there are elements crossing 128-bit lanes in this 6084 /// shuffle mask. 6085 /// 6086 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations 6087 /// and we routinely test for these. 
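/// For example, with v8f32 each 128-bit lane holds four elements, so the
/// identity mask <0, 1, 2, 3, 4, 5, 6, 7> never crosses a lane, while a mask
/// such as <4, 1, 2, 3, 0, 5, 6, 7> moves elements across the lane boundary.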
6088 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { 6089 int LaneSize = 128 / VT.getScalarSizeInBits(); 6090 int Size = Mask.size(); 6091 for (int i = 0; i < Size; ++i) 6092 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) 6093 return true; 6094 return false; 6095 } 6096 6097 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. 6098 /// 6099 /// This checks a shuffle mask to see if it is performing the same 6100 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies 6101 /// that it is also not lane-crossing. It may however involve a blend from the 6102 /// same lane of a second vector. 6103 /// 6104 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is 6105 /// non-trivial to compute in the face of undef lanes. The representation is 6106 /// *not* suitable for use with existing 128-bit shuffles as it will contain 6107 /// entries from both V1 and V2 inputs to the wider mask. 6108 static bool 6109 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, 6110 SmallVectorImpl<int> &RepeatedMask) { 6111 int LaneSize = 128 / VT.getScalarSizeInBits(); 6112 RepeatedMask.resize(LaneSize, -1); 6113 int Size = Mask.size(); 6114 for (int i = 0; i < Size; ++i) { 6115 if (Mask[i] < 0) 6116 continue; 6117 if ((Mask[i] % Size) / LaneSize != i / LaneSize) 6118 // This entry crosses lanes, so there is no way to model this shuffle. 6119 return false; 6120 6121 // Ok, handle the in-lane shuffles by detecting if and when they repeat. 6122 if (RepeatedMask[i % LaneSize] == -1) 6123 // This is the first non-undef entry in this slot of a 128-bit lane. 6124 RepeatedMask[i % LaneSize] = 6125 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; 6126 else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) 6127 // Found a mismatch with the repeated mask. 6128 return false; 6129 } 6130 return true; 6131 } 6132 6133 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of 6134 /// arguments. 6135 /// 6136 /// This is a fast way to test a shuffle mask against a fixed pattern: 6137 /// 6138 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } 6139 /// 6140 /// It returns true if the mask is exactly as wide as the argument list, and 6141 /// each element of the mask is either -1 (signifying undef) or the value given 6142 /// in the argument. 6143 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, 6144 ArrayRef<int> ExpectedMask) { 6145 if (Mask.size() != ExpectedMask.size()) 6146 return false; 6147 6148 int Size = Mask.size(); 6149 6150 // If the values are build vectors, we can look through them to find 6151 // equivalent inputs that make the shuffles equivalent. 6152 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); 6153 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); 6154 6155 for (int i = 0; i < Size; ++i) 6156 if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { 6157 auto *MaskBV = Mask[i] < Size ? BV1 : BV2; 6158 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; 6159 if (!MaskBV || !ExpectedBV || 6160 MaskBV->getOperand(Mask[i] % Size) != 6161 ExpectedBV->getOperand(ExpectedMask[i] % Size)) 6162 return false; 6163 } 6164 6165 return true; 6166 } 6167 6168 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. 6169 /// 6170 /// This helper function produces an 8-bit shuffle immediate corresponding to 6171 /// the ubiquitous shuffle encoding scheme used in x86 instructions for 6172 /// shuffling 4 lanes. 
It can be used with most of the PSHUF instructions for 6173 /// example. 6174 /// 6175 /// NB: We rely heavily on "undef" masks preserving the input lane. 6176 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, 6177 SelectionDAG &DAG) { 6178 assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); 6179 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); 6180 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); 6181 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); 6182 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); 6183 6184 unsigned Imm = 0; 6185 Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; 6186 Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; 6187 Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; 6188 Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; 6189 return DAG.getConstant(Imm, MVT::i8); 6190 } 6191 6192 /// \brief Try to emit a blend instruction for a shuffle using bit math. 6193 /// 6194 /// This is used as a fallback approach when first class blend instructions are 6195 /// unavailable. Currently it is only suitable for integer vectors, but could 6196 /// be generalized for floating point vectors if desirable. 6197 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, 6198 SDValue V2, ArrayRef<int> Mask, 6199 SelectionDAG &DAG) { 6200 assert(VT.isInteger() && "Only supports integer vector types!"); 6201 MVT EltVT = VT.getScalarType(); 6202 int NumEltBits = EltVT.getSizeInBits(); 6203 SDValue Zero = DAG.getConstant(0, EltVT); 6204 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT); 6205 SmallVector<SDValue, 16> MaskOps; 6206 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 6207 if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) 6208 return SDValue(); // Shuffled input! 6209 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); 6210 } 6211 6212 SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); 6213 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); 6214 // We have to cast V2 around. 6215 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); 6216 V2 = DAG.getNode(ISD::BITCAST, DL, VT, 6217 DAG.getNode(X86ISD::ANDNP, DL, MaskVT, 6218 DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask), 6219 DAG.getNode(ISD::BITCAST, DL, MaskVT, V2))); 6220 return DAG.getNode(ISD::OR, DL, VT, V1, V2); 6221 } 6222 6223 /// \brief Try to emit a blend instruction for a shuffle. 6224 /// 6225 /// This doesn't do any checks for the availability of instructions for blending 6226 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to 6227 /// be matched in the backend with the type given. What it does check for is 6228 /// that the shuffle mask is in fact a blend. 6229 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, 6230 SDValue V2, ArrayRef<int> Mask, 6231 const X86Subtarget *Subtarget, 6232 SelectionDAG &DAG) { 6233 unsigned BlendMask = 0; 6234 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 6235 if (Mask[i] >= Size) { 6236 if (Mask[i] != i + Size) 6237 return SDValue(); // Shuffled V2 input! 6238 BlendMask |= 1u << i; 6239 continue; 6240 } 6241 if (Mask[i] >= 0 && Mask[i] != i) 6242 return SDValue(); // Shuffled V1 input! 
6243 } 6244 switch (VT.SimpleTy) { 6245 case MVT::v2f64: 6246 case MVT::v4f32: 6247 case MVT::v4f64: 6248 case MVT::v8f32: 6249 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, 6250 DAG.getConstant(BlendMask, MVT::i8)); 6251 6252 case MVT::v4i64: 6253 case MVT::v8i32: 6254 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); 6255 // FALLTHROUGH 6256 case MVT::v2i64: 6257 case MVT::v4i32: 6258 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into 6259 // that instruction. 6260 if (Subtarget->hasAVX2()) { 6261 // Scale the blend by the number of 32-bit dwords per element. 6262 int Scale = VT.getScalarSizeInBits() / 32; 6263 BlendMask = 0; 6264 for (int i = 0, Size = Mask.size(); i < Size; ++i) 6265 if (Mask[i] >= Size) 6266 for (int j = 0; j < Scale; ++j) 6267 BlendMask |= 1u << (i * Scale + j); 6268 6269 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; 6270 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); 6271 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); 6272 return DAG.getNode(ISD::BITCAST, DL, VT, 6273 DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, 6274 DAG.getConstant(BlendMask, MVT::i8))); 6275 } 6276 // FALLTHROUGH 6277 case MVT::v8i16: { 6278 // For integer shuffles we need to expand the mask and cast the inputs to 6279 // v8i16s prior to blending. 6280 int Scale = 8 / VT.getVectorNumElements(); 6281 BlendMask = 0; 6282 for (int i = 0, Size = Mask.size(); i < Size; ++i) 6283 if (Mask[i] >= Size) 6284 for (int j = 0; j < Scale; ++j) 6285 BlendMask |= 1u << (i * Scale + j); 6286 6287 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); 6288 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); 6289 return DAG.getNode(ISD::BITCAST, DL, VT, 6290 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, 6291 DAG.getConstant(BlendMask, MVT::i8))); 6292 } 6293 6294 case MVT::v16i16: { 6295 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); 6296 SmallVector<int, 8> RepeatedMask; 6297 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { 6298 // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 6299 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); 6300 BlendMask = 0; 6301 for (int i = 0; i < 8; ++i) 6302 if (RepeatedMask[i] >= 16) 6303 BlendMask |= 1u << i; 6304 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, 6305 DAG.getConstant(BlendMask, MVT::i8)); 6306 } 6307 } 6308 // FALLTHROUGH 6309 case MVT::v16i8: 6310 case MVT::v32i8: { 6311 assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && 6312 "256-bit byte-blends require AVX2 support!"); 6313 6314 // Scale the blend by the number of bytes per element. 6315 int Scale = VT.getScalarSizeInBits() / 8; 6316 6317 // This form of blend is always done on bytes. Compute the byte vector 6318 // type. 6319 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); 6320 6321 // Compute the VSELECT mask. Note that VSELECT is really confusing in the 6322 // mix of LLVM's code generator and the x86 backend. We tell the code 6323 // generator that boolean values in the elements of an x86 vector register 6324 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' 6325 // mapping a select to operand #1, and 'false' mapping to operand #2. The 6326 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit 6327 // of the element (the remaining are ignored) and 0 in that high bit would 6328 // mean operand #1 while 1 in the high bit would mean operand #2. 
So while
6329 // the LLVM model for boolean values in vector elements gets the relevant
6330 // bit set, it is set backwards and over-constrained relative to x86's
6331 // actual model.
6332 SmallVector<SDValue, 32> VSELECTMask;
6333 for (int i = 0, Size = Mask.size(); i < Size; ++i)
6334 for (int j = 0; j < Scale; ++j)
6335 VSELECTMask.push_back(
6336 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
6337 : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
6338
6339 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
6340 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
6341 return DAG.getNode(
6342 ISD::BITCAST, DL, VT,
6343 DAG.getNode(ISD::VSELECT, DL, BlendVT,
6344 DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
6345 V1, V2));
6346 }
6347
6348 default:
6349 llvm_unreachable("Not a supported integer vector type!");
6350 }
6351 }
6352
6353 /// \brief Try to lower as a blend of elements from two inputs followed by
6354 /// a single-input permutation.
6355 ///
6356 /// This matches the pattern where we can blend elements from two inputs and
6357 /// then reduce the shuffle to a single-input permutation.
6358 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
6359 SDValue V2,
6360 ArrayRef<int> Mask,
6361 SelectionDAG &DAG) {
6362 // We build up the blend mask while checking whether a blend is a viable way
6363 // to reduce the shuffle.
6364 SmallVector<int, 32> BlendMask(Mask.size(), -1);
6365 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
6366
6367 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6368 if (Mask[i] < 0)
6369 continue;
6370
6371 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
6372
6373 if (BlendMask[Mask[i] % Size] == -1)
6374 BlendMask[Mask[i] % Size] = Mask[i];
6375 else if (BlendMask[Mask[i] % Size] != Mask[i])
6376 return SDValue(); // Can't blend in the needed input!
6377
6378 PermuteMask[i] = Mask[i] % Size;
6379 }
6380
6381 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
6382 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
6383 }
6384
6385 /// \brief Generic routine to decompose a shuffle and blend into independent
6386 /// blends and permutes.
6387 ///
6388 /// This matches the extremely common pattern for handling combined
6389 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
6390 /// operations. It will try to pick the best arrangement of shuffles and
6391 /// blends.
6392 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
6393 SDValue V1,
6394 SDValue V2,
6395 ArrayRef<int> Mask,
6396 SelectionDAG &DAG) {
6397 // Shuffle the input elements into the desired positions in V1 and V2 and
6398 // blend them together.
6399 SmallVector<int, 32> V1Mask(Mask.size(), -1);
6400 SmallVector<int, 32> V2Mask(Mask.size(), -1);
6401 SmallVector<int, 32> BlendMask(Mask.size(), -1);
6402 for (int i = 0, Size = Mask.size(); i < Size; ++i)
6403 if (Mask[i] >= 0 && Mask[i] < Size) {
6404 V1Mask[i] = Mask[i];
6405 BlendMask[i] = i;
6406 } else if (Mask[i] >= Size) {
6407 V2Mask[i] = Mask[i] - Size;
6408 BlendMask[i] = i + Size;
6409 }
6410
6411 // Try to lower with the simpler initial blend strategy unless one of the
6412 // input shuffles would be a no-op. We prefer to shuffle inputs as the
6413 // shuffle may be able to fold with a load or other benefit. However, when
6414 // we'll have to do 2x as many shuffles in order to achieve this, blending
6415 // first is a better strategy.
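// E.g. for a v4i32 mask <1, 4, 3, 6>, V1Mask is <1, -1, 3, -1> and V2Mask is
// <-1, 0, -1, 2>; neither is a no-op, so we first try the blend+permute form
// (blend with mask <4, 1, 6, 3>, then permute with <1, 0, 3, 2>) before
// falling back to shuffling each input separately and blending the results.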
6416 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
6417 if (SDValue BlendPerm =
6418 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
6419 return BlendPerm;
6420
6421 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
6422 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
6423 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
6424 }
6425
6426 /// \brief Try to lower a vector shuffle as a byte rotation.
6427 ///
6428 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
6429 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
6430 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
6431 /// try to generically lower a vector shuffle through such a pattern. It
6432 /// does not check for the profitability of lowering either as PALIGNR or
6433 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
6434 /// This matches shuffle vectors that look like:
6435 ///
6436 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
6437 ///
6438 /// Essentially it concatenates V1 and V2, shifts right by some number of
6439 /// elements, and takes the low elements as the result. Note that while this is
6440 /// specified as a *right shift* because x86 is little-endian, it is a *left
6441 /// rotate* of the vector lanes.
6442 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
6443 SDValue V2,
6444 ArrayRef<int> Mask,
6445 const X86Subtarget *Subtarget,
6446 SelectionDAG &DAG) {
6447 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
6448
6449 int NumElts = Mask.size();
6450 int NumLanes = VT.getSizeInBits() / 128;
6451 int NumLaneElts = NumElts / NumLanes;
6452
6453 // We need to detect various ways of spelling a rotation:
6454 // [11, 12, 13, 14, 15, 0, 1, 2]
6455 // [-1, 12, 13, 14, -1, -1, 1, -1]
6456 // [-1, -1, -1, -1, -1, -1, 1, 2]
6457 // [ 3, 4, 5, 6, 7, 8, 9, 10]
6458 // [-1, 4, 5, 6, -1, -1, 9, -1]
6459 // [-1, 4, 5, 6, -1, -1, -1, -1]
6460 int Rotation = 0;
6461 SDValue Lo, Hi;
6462 for (int l = 0; l < NumElts; l += NumLaneElts) {
6463 for (int i = 0; i < NumLaneElts; ++i) {
6464 if (Mask[l + i] == -1)
6465 continue;
6466 assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
6467
6468 // Get the mod-Size index and lane correct it.
6469 int LaneIdx = (Mask[l + i] % NumElts) - l;
6470 // Make sure it was in this lane.
6471 if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
6472 return SDValue();
6473
6474 // Determine where a rotated vector would have started.
6475 int StartIdx = i - LaneIdx;
6476 if (StartIdx == 0)
6477 // The identity rotation isn't interesting, stop.
6478 return SDValue();
6479
6480 // If we found the tail of a vector the rotation must be the missing
6481 // front. If we found the head of a vector, the rotation must be how much
6482 // of the head is present.
6483 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
6484
6485 if (Rotation == 0)
6486 Rotation = CandidateRotation;
6487 else if (Rotation != CandidateRotation)
6488 // The rotations don't match, so we can't match this mask.
6489 return SDValue();
6490
6491 // Compute which value this mask is pointing at.
6492 SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
6493
6494 // Compute which of the two target values this index should be assigned
6495 // to. This reflects whether the high elements are remaining or the low
6496 // elements are remaining.
6497 SDValue &TargetV = StartIdx < 0 ?
Hi : Lo; 6498 6499 // Either set up this value if we've not encountered it before, or check 6500 // that it remains consistent. 6501 if (!TargetV) 6502 TargetV = MaskV; 6503 else if (TargetV != MaskV) 6504 // This may be a rotation, but it pulls from the inputs in some 6505 // unsupported interleaving. 6506 return SDValue(); 6507 } 6508 } 6509 6510 // Check that we successfully analyzed the mask, and normalize the results. 6511 assert(Rotation != 0 && "Failed to locate a viable rotation!"); 6512 assert((Lo || Hi) && "Failed to find a rotated input vector!"); 6513 if (!Lo) 6514 Lo = Hi; 6515 else if (!Hi) 6516 Hi = Lo; 6517 6518 // The actual rotate instruction rotates bytes, so we need to scale the 6519 // rotation based on how many bytes are in the vector lane. 6520 int Scale = 16 / NumLaneElts; 6521 6522 // SSSE3 targets can use the palignr instruction. 6523 if (Subtarget->hasSSSE3()) { 6524 // Cast the inputs to i8 vector of correct length to match PALIGNR. 6525 MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); 6526 Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo); 6527 Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi); 6528 6529 return DAG.getNode(ISD::BITCAST, DL, VT, 6530 DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, 6531 DAG.getConstant(Rotation * Scale, MVT::i8))); 6532 } 6533 6534 assert(VT.getSizeInBits() == 128 && 6535 "Rotate-based lowering only supports 128-bit lowering!"); 6536 assert(Mask.size() <= 16 && 6537 "Can shuffle at most 16 bytes in a 128-bit vector!"); 6538 6539 // Default SSE2 implementation 6540 int LoByteShift = 16 - Rotation * Scale; 6541 int HiByteShift = Rotation * Scale; 6542 6543 // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. 6544 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo); 6545 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); 6546 6547 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, 6548 DAG.getConstant(LoByteShift, MVT::i8)); 6549 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, 6550 DAG.getConstant(HiByteShift, MVT::i8)); 6551 return DAG.getNode(ISD::BITCAST, DL, VT, 6552 DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); 6553 } 6554 6555 /// \brief Compute whether each element of a shuffle is zeroable. 6556 /// 6557 /// A "zeroable" vector shuffle element is one which can be lowered to zero. 6558 /// Either it is an undef element in the shuffle mask, the element of the input 6559 /// referenced is undef, or the element of the input referenced is known to be 6560 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle 6561 /// as many lanes with this technique as possible to simplify the remaining 6562 /// shuffle. 6563 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, 6564 SDValue V1, SDValue V2) { 6565 SmallBitVector Zeroable(Mask.size(), false); 6566 6567 while (V1.getOpcode() == ISD::BITCAST) 6568 V1 = V1->getOperand(0); 6569 while (V2.getOpcode() == ISD::BITCAST) 6570 V2 = V2->getOperand(0); 6571 6572 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); 6573 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); 6574 6575 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 6576 int M = Mask[i]; 6577 // Handle the easy cases. 6578 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { 6579 Zeroable[i] = true; 6580 continue; 6581 } 6582 6583 // If this is an index into a build_vector node (which has the same number 6584 // of elements), dig out the input value and use it. 6585 SDValue V = M < Size ? 
V1 : V2; 6586 if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) 6587 continue; 6588 6589 SDValue Input = V.getOperand(M % Size); 6590 // The UNDEF opcode check really should be dead code here, but not quite 6591 // worth asserting on (it isn't invalid, just unexpected). 6592 if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) 6593 Zeroable[i] = true; 6594 } 6595 6596 return Zeroable; 6597 } 6598 6599 /// \brief Try to emit a bitmask instruction for a shuffle. 6600 /// 6601 /// This handles cases where we can model a blend exactly as a bitmask due to 6602 /// one of the inputs being zeroable. 6603 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, 6604 SDValue V2, ArrayRef<int> Mask, 6605 SelectionDAG &DAG) { 6606 MVT EltVT = VT.getScalarType(); 6607 int NumEltBits = EltVT.getSizeInBits(); 6608 MVT IntEltVT = MVT::getIntegerVT(NumEltBits); 6609 SDValue Zero = DAG.getConstant(0, IntEltVT); 6610 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT); 6611 if (EltVT.isFloatingPoint()) { 6612 Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero); 6613 AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes); 6614 } 6615 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); 6616 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 6617 SDValue V; 6618 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 6619 if (Zeroable[i]) 6620 continue; 6621 if (Mask[i] % Size != i) 6622 return SDValue(); // Not a blend. 6623 if (!V) 6624 V = Mask[i] < Size ? V1 : V2; 6625 else if (V != (Mask[i] < Size ? V1 : V2)) 6626 return SDValue(); // Can only let one input through the mask. 6627 6628 VMaskOps[i] = AllOnes; 6629 } 6630 if (!V) 6631 return SDValue(); // No non-zeroable elements! 6632 6633 SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); 6634 V = DAG.getNode(VT.isFloatingPoint() 6635 ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, 6636 DL, VT, V, VMask); 6637 return V; 6638 } 6639 6640 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). 6641 /// 6642 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and 6643 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function 6644 /// matches elements from one of the input vectors shuffled to the left or 6645 /// right with zeroable elements 'shifted in'. It handles both the strictly 6646 /// bit-wise element shifts and the byte shift across an entire 128-bit double 6647 /// quad word lane. 6648 /// 6649 /// PSHL : (little-endian) left bit shift. 6650 /// [ zz, 0, zz, 2 ] 6651 /// [ -1, 4, zz, -1 ] 6652 /// PSRL : (little-endian) right bit shift. 
6653 /// [ 1, zz, 3, zz] 6654 /// [ -1, -1, 7, zz] 6655 /// PSLLDQ : (little-endian) left byte shift 6656 /// [ zz, 0, 1, 2, 3, 4, 5, 6] 6657 /// [ zz, zz, -1, -1, 2, 3, 4, -1] 6658 /// [ zz, zz, zz, zz, zz, zz, -1, 1] 6659 /// PSRLDQ : (little-endian) right byte shift 6660 /// [ 5, 6, 7, zz, zz, zz, zz, zz] 6661 /// [ -1, 5, 6, 7, zz, zz, zz, zz] 6662 /// [ 1, 2, -1, -1, -1, -1, zz, zz] 6663 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, 6664 SDValue V2, ArrayRef<int> Mask, 6665 SelectionDAG &DAG) { 6666 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 6667 6668 int Size = Mask.size(); 6669 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); 6670 6671 auto CheckZeros = [&](int Shift, int Scale, bool Left) { 6672 for (int i = 0; i < Size; i += Scale) 6673 for (int j = 0; j < Shift; ++j) 6674 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) 6675 return false; 6676 6677 return true; 6678 }; 6679 6680 auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { 6681 for (int i = 0; i != Size; i += Scale) { 6682 unsigned Pos = Left ? i + Shift : i; 6683 unsigned Low = Left ? i : i + Shift; 6684 unsigned Len = Scale - Shift; 6685 if (!isSequentialOrUndefInRange(Mask, Pos, Len, 6686 Low + (V == V1 ? 0 : Size))) 6687 return SDValue(); 6688 } 6689 6690 int ShiftEltBits = VT.getScalarSizeInBits() * Scale; 6691 bool ByteShift = ShiftEltBits > 64; 6692 unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) 6693 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); 6694 int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); 6695 6696 // Normalize the scale for byte shifts to still produce an i64 element 6697 // type. 6698 Scale = ByteShift ? Scale / 2 : Scale; 6699 6700 // We need to round trip through the appropriate type for the shift. 6701 MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); 6702 MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); 6703 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && 6704 "Illegal integer vector type"); 6705 V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V); 6706 6707 V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8)); 6708 return DAG.getNode(ISD::BITCAST, DL, VT, V); 6709 }; 6710 6711 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just 6712 // keep doubling the size of the integer elements up to that. We can 6713 // then shift the elements of the integer vector by whole multiples of 6714 // their width within the elements of the larger integer vector. Test each 6715 // multiple to see if we can find a match with the moved element indices 6716 // and that the shifted in elements are all zeroable. 6717 for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) 6718 for (int Shift = 1; Shift != Scale; ++Shift) 6719 for (bool Left : {true, false}) 6720 if (CheckZeros(Shift, Scale, Left)) 6721 for (SDValue V : {V1, V2}) 6722 if (SDValue Match = MatchShift(Shift, Scale, Left, V)) 6723 return Match; 6724 6725 // no match 6726 return SDValue(); 6727 } 6728 6729 /// \brief Lower a vector shuffle as a zero or any extension. 6730 /// 6731 /// Given a specific number of elements, element bit width, and extension 6732 /// stride, produce either a zero or any extension based on the available 6733 /// features of the subtarget. 
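/// For example, widening the even elements of a v16i8 by Scale == 2 produces
/// a v8i16: with SSE4.1 this is a single X86ISD::VZEXT (the PMOVZX family of
/// instructions), while older targets fall through to the unpack loop at the
/// end, which interleaves the input with zeros (or undef for an any-extend).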
6734 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( 6735 SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV, 6736 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 6737 assert(Scale > 1 && "Need a scale to extend."); 6738 int NumElements = VT.getVectorNumElements(); 6739 int EltBits = VT.getScalarSizeInBits(); 6740 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 6741 "Only 8, 16, and 32 bit elements can be extended."); 6742 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); 6743 6744 // Found a valid zext mask! Try various lowering strategies based on the 6745 // input type and available ISA extensions. 6746 if (Subtarget->hasSSE41()) { 6747 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), 6748 NumElements / Scale); 6749 return DAG.getNode(ISD::BITCAST, DL, VT, 6750 DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); 6751 } 6752 6753 // For any extends we can cheat for larger element sizes and use shuffle 6754 // instructions that can fold with a load and/or copy. 6755 if (AnyExt && EltBits == 32) { 6756 int PSHUFDMask[4] = {0, -1, 1, -1}; 6757 return DAG.getNode( 6758 ISD::BITCAST, DL, VT, 6759 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 6760 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), 6761 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); 6762 } 6763 if (AnyExt && EltBits == 16 && Scale > 2) { 6764 int PSHUFDMask[4] = {0, -1, 0, -1}; 6765 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 6766 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), 6767 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)); 6768 int PSHUFHWMask[4] = {1, -1, -1, -1}; 6769 return DAG.getNode( 6770 ISD::BITCAST, DL, VT, 6771 DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, 6772 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV), 6773 getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG))); 6774 } 6775 6776 // If this would require more than 2 unpack instructions to expand, use 6777 // pshufb when available. We can only use more than 2 unpack instructions 6778 // when zero extending i8 elements which also makes it easier to use pshufb. 6779 if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { 6780 assert(NumElements == 16 && "Unexpected byte vector width!"); 6781 SDValue PSHUFBMask[16]; 6782 for (int i = 0; i < 16; ++i) 6783 PSHUFBMask[i] = 6784 DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8); 6785 InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV); 6786 return DAG.getNode(ISD::BITCAST, DL, VT, 6787 DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, 6788 DAG.getNode(ISD::BUILD_VECTOR, DL, 6789 MVT::v16i8, PSHUFBMask))); 6790 } 6791 6792 // Otherwise emit a sequence of unpacks. 6793 do { 6794 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); 6795 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) 6796 : getZeroVector(InputVT, Subtarget, DAG, DL); 6797 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); 6798 InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); 6799 Scale /= 2; 6800 EltBits *= 2; 6801 NumElements /= 2; 6802 } while (Scale > 1); 6803 return DAG.getNode(ISD::BITCAST, DL, VT, InputV); 6804 } 6805 6806 /// \brief Try to lower a vector shuffle as a zero extension on any microarch. 6807 /// 6808 /// This routine will try to do everything in its power to cleverly lower 6809 /// a shuffle which happens to match the pattern of a zero extend. It doesn't 6810 /// check for the profitability of this lowering, it tries to aggressively 6811 /// match this pattern. 
It will use all of the micro-architectural details it 6812 /// can to emit an efficient lowering. It handles both blends with all-zero 6813 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to 6814 /// masking out later). 6815 /// 6816 /// The reason we have dedicated lowering for zext-style shuffles is that they 6817 /// are both incredibly common and often quite performance sensitive. 6818 static SDValue lowerVectorShuffleAsZeroOrAnyExtend( 6819 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 6820 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 6821 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 6822 6823 int Bits = VT.getSizeInBits(); 6824 int NumElements = VT.getVectorNumElements(); 6825 assert(VT.getScalarSizeInBits() <= 32 && 6826 "Exceeds 32-bit integer zero extension limit"); 6827 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); 6828 6829 // Define a helper function to check a particular ext-scale and lower to it if 6830 // valid. 6831 auto Lower = [&](int Scale) -> SDValue { 6832 SDValue InputV; 6833 bool AnyExt = true; 6834 for (int i = 0; i < NumElements; ++i) { 6835 if (Mask[i] == -1) 6836 continue; // Valid anywhere but doesn't tell us anything. 6837 if (i % Scale != 0) { 6838 // Each of the extended elements need to be zeroable. 6839 if (!Zeroable[i]) 6840 return SDValue(); 6841 6842 // We no longer are in the anyext case. 6843 AnyExt = false; 6844 continue; 6845 } 6846 6847 // Each of the base elements needs to be consecutive indices into the 6848 // same input vector. 6849 SDValue V = Mask[i] < NumElements ? V1 : V2; 6850 if (!InputV) 6851 InputV = V; 6852 else if (InputV != V) 6853 return SDValue(); // Flip-flopping inputs. 6854 6855 if (Mask[i] % NumElements != i / Scale) 6856 return SDValue(); // Non-consecutive strided elements. 6857 } 6858 6859 // If we fail to find an input, we have a zero-shuffle which should always 6860 // have already been handled. 6861 // FIXME: Maybe handle this here in case during blending we end up with one? 6862 if (!InputV) 6863 return SDValue(); 6864 6865 return lowerVectorShuffleAsSpecificZeroOrAnyExtend( 6866 DL, VT, Scale, AnyExt, InputV, Subtarget, DAG); 6867 }; 6868 6869 // The widest scale possible for extending is to a 64-bit integer. 6870 assert(Bits % 64 == 0 && 6871 "The number of bits in a vector must be divisible by 64 on x86!"); 6872 int NumExtElements = Bits / 64; 6873 6874 // Each iteration, try extending the elements half as much, but into twice as 6875 // many elements. 6876 for (; NumExtElements < NumElements; NumExtElements *= 2) { 6877 assert(NumElements % NumExtElements == 0 && 6878 "The input vector size must be divisible by the extended size."); 6879 if (SDValue V = Lower(NumElements / NumExtElements)) 6880 return V; 6881 } 6882 6883 // General extends failed, but 128-bit vectors may be able to use MOVQ. 6884 if (Bits != 128) 6885 return SDValue(); 6886 6887 // Returns one of the source operands if the shuffle can be reduced to a 6888 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. 
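// E.g. for v4i32, a mask of <0, 1, zz, zz> (upper half zeroable) keeps the
// low 64 bits of V1 and zeroes the rest, which is exactly MOVQ; likewise
// <4, 5, zz, zz> selects the low half of V2.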
6889 auto CanZExtLowHalf = [&]() { 6890 for (int i = NumElements / 2; i != NumElements; ++i) 6891 if (!Zeroable[i]) 6892 return SDValue(); 6893 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) 6894 return V1; 6895 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) 6896 return V2; 6897 return SDValue(); 6898 }; 6899 6900 if (SDValue V = CanZExtLowHalf()) { 6901 V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V); 6902 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); 6903 return DAG.getNode(ISD::BITCAST, DL, VT, V); 6904 } 6905 6906 // No viable ext lowering found. 6907 return SDValue(); 6908 } 6909 6910 /// \brief Try to get a scalar value for a specific element of a vector. 6911 /// 6912 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. 6913 static SDValue getScalarValueForVectorElement(SDValue V, int Idx, 6914 SelectionDAG &DAG) { 6915 MVT VT = V.getSimpleValueType(); 6916 MVT EltVT = VT.getVectorElementType(); 6917 while (V.getOpcode() == ISD::BITCAST) 6918 V = V.getOperand(0); 6919 // If the bitcasts shift the element size, we can't extract an equivalent 6920 // element from it. 6921 MVT NewVT = V.getSimpleValueType(); 6922 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) 6923 return SDValue(); 6924 6925 if (V.getOpcode() == ISD::BUILD_VECTOR || 6926 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) 6927 return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx)); 6928 6929 return SDValue(); 6930 } 6931 6932 /// \brief Helper to test for a load that can be folded with x86 shuffles. 6933 /// 6934 /// This is particularly important because the set of instructions varies 6935 /// significantly based on whether the operand is a load or not. 6936 static bool isShuffleFoldableLoad(SDValue V) { 6937 while (V.getOpcode() == ISD::BITCAST) 6938 V = V.getOperand(0); 6939 6940 return ISD::isNON_EXTLoad(V.getNode()); 6941 } 6942 6943 /// \brief Try to lower insertion of a single element into a zero vector. 6944 /// 6945 /// This is a common pattern that we have especially efficient patterns to lower 6946 /// across all subtarget feature sets. 6947 static SDValue lowerVectorShuffleAsElementInsertion( 6948 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 6949 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 6950 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 6951 MVT ExtVT = VT; 6952 MVT EltVT = VT.getVectorElementType(); 6953 6954 int V2Index = std::find_if(Mask.begin(), Mask.end(), 6955 [&Mask](int M) { return M >= (int)Mask.size(); }) - 6956 Mask.begin(); 6957 bool IsV1Zeroable = true; 6958 for (int i = 0, Size = Mask.size(); i < Size; ++i) 6959 if (i != V2Index && !Zeroable[i]) { 6960 IsV1Zeroable = false; 6961 break; 6962 } 6963 6964 // Check for a single input from a SCALAR_TO_VECTOR node. 6965 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and 6966 // all the smarts here sunk into that routine. However, the current 6967 // lowering of BUILD_VECTOR makes that nearly impossible until the old 6968 // vector shuffle lowering is dead. 6969 if (SDValue V2S = getScalarValueForVectorElement( 6970 V2, Mask[V2Index] - Mask.size(), DAG)) { 6971 // We need to zext the scalar if it is smaller than an i32. 6972 V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); 6973 if (EltVT == MVT::i8 || EltVT == MVT::i16) { 6974 // Using zext to expand a narrow element won't work for non-zero 6975 // insertions. 
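// (After widening to i32 the insert is performed in v4i32 lanes and the rest
// of the vector is zeroed, so non-zero V1 elements cannot survive; bail out.)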
6976 if (!IsV1Zeroable) 6977 return SDValue(); 6978 6979 // Zero-extend directly to i32. 6980 ExtVT = MVT::v4i32; 6981 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); 6982 } 6983 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); 6984 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || 6985 EltVT == MVT::i16) { 6986 // Either not inserting from the low element of the input or the input 6987 // element size is too small to use VZEXT_MOVL to clear the high bits. 6988 return SDValue(); 6989 } 6990 6991 if (!IsV1Zeroable) { 6992 // If V1 can't be treated as a zero vector we have fewer options to lower 6993 // this. We can't support integer vectors or non-zero targets cheaply, and 6994 // the V1 elements can't be permuted in any way. 6995 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); 6996 if (!VT.isFloatingPoint() || V2Index != 0) 6997 return SDValue(); 6998 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); 6999 V1Mask[V2Index] = -1; 7000 if (!isNoopShuffleMask(V1Mask)) 7001 return SDValue(); 7002 // This is essentially a special case blend operation, but if we have 7003 // general purpose blend operations, they are always faster. Bail and let 7004 // the rest of the lowering handle these as blends. 7005 if (Subtarget->hasSSE41()) 7006 return SDValue(); 7007 7008 // Otherwise, use MOVSD or MOVSS. 7009 assert((EltVT == MVT::f32 || EltVT == MVT::f64) && 7010 "Only two types of floating point element types to handle!"); 7011 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, 7012 ExtVT, V1, V2); 7013 } 7014 7015 // This lowering only works for the low element with floating point vectors. 7016 if (VT.isFloatingPoint() && V2Index != 0) 7017 return SDValue(); 7018 7019 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); 7020 if (ExtVT != VT) 7021 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); 7022 7023 if (V2Index != 0) { 7024 // If we have 4 or fewer lanes we can cheaply shuffle the element into 7025 // the desired position. Otherwise it is more efficient to do a vector 7026 // shift left. We know that we can do a vector shift left because all 7027 // the inputs are zero. 7028 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { 7029 SmallVector<int, 4> V2Shuffle(Mask.size(), 1); 7030 V2Shuffle[V2Index] = 0; 7031 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); 7032 } else { 7033 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); 7034 V2 = DAG.getNode( 7035 X86ISD::VSHLDQ, DL, MVT::v2i64, V2, 7036 DAG.getConstant( 7037 V2Index * EltVT.getSizeInBits()/8, 7038 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); 7039 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); 7040 } 7041 } 7042 return V2; 7043 } 7044 7045 /// \brief Try to lower broadcast of a single element. 7046 /// 7047 /// For convenience, this code also bundles all of the subtarget feature set 7048 /// filtering. While a little annoying to re-dispatch on type here, there isn't 7049 /// a convenient way to factor it out. 7050 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, 7051 ArrayRef<int> Mask, 7052 const X86Subtarget *Subtarget, 7053 SelectionDAG &DAG) { 7054 if (!Subtarget->hasAVX()) 7055 return SDValue(); 7056 if (VT.isInteger() && !Subtarget->hasAVX2()) 7057 return SDValue(); 7058 7059 // Check that the mask is a broadcast. 
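// E.g. <3, 3, 3, 3> and <-1, 3, -1, 3> both broadcast element 3, whereas
// <0, 3, 3, 3> mixes two different source elements and is rejected.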
7060 int BroadcastIdx = -1; 7061 for (int M : Mask) 7062 if (M >= 0 && BroadcastIdx == -1) 7063 BroadcastIdx = M; 7064 else if (M >= 0 && M != BroadcastIdx) 7065 return SDValue(); 7066 7067 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " 7068 "a sorted mask where the broadcast " 7069 "comes from V1."); 7070 7071 // Go up the chain of (vector) values to find a scalar load that we can 7072 // combine with the broadcast. 7073 for (;;) { 7074 switch (V.getOpcode()) { 7075 case ISD::CONCAT_VECTORS: { 7076 int OperandSize = Mask.size() / V.getNumOperands(); 7077 V = V.getOperand(BroadcastIdx / OperandSize); 7078 BroadcastIdx %= OperandSize; 7079 continue; 7080 } 7081 7082 case ISD::INSERT_SUBVECTOR: { 7083 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); 7084 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); 7085 if (!ConstantIdx) 7086 break; 7087 7088 int BeginIdx = (int)ConstantIdx->getZExtValue(); 7089 int EndIdx = 7090 BeginIdx + (int)VInner.getValueType().getVectorNumElements(); 7091 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { 7092 BroadcastIdx -= BeginIdx; 7093 V = VInner; 7094 } else { 7095 V = VOuter; 7096 } 7097 continue; 7098 } 7099 } 7100 break; 7101 } 7102 7103 // Check if this is a broadcast of a scalar. We special case lowering 7104 // for scalars so that we can more effectively fold with loads. 7105 if (V.getOpcode() == ISD::BUILD_VECTOR || 7106 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { 7107 V = V.getOperand(BroadcastIdx); 7108 7109 // If the scalar isn't a load, we can't broadcast from it in AVX1. 7110 // Only AVX2 has register broadcasts. 7111 if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) 7112 return SDValue(); 7113 } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { 7114 // We can't broadcast from a vector register without AVX2, and we can only 7115 // broadcast from the zero-element of a vector register. 7116 return SDValue(); 7117 } 7118 7119 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); 7120 } 7121 7122 // Check for whether we can use INSERTPS to perform the shuffle. We only use 7123 // INSERTPS when the V1 elements are already in the correct locations 7124 // because otherwise we can just always use two SHUFPS instructions which 7125 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also 7126 // perform INSERTPS if a single V1 element is out of place and all V2 7127 // elements are zeroable. 7128 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, 7129 ArrayRef<int> Mask, 7130 SelectionDAG &DAG) { 7131 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); 7132 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 7133 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 7134 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 7135 7136 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7137 7138 unsigned ZMask = 0; 7139 int V1DstIndex = -1; 7140 int V2DstIndex = -1; 7141 bool V1UsedInPlace = false; 7142 7143 for (int i = 0; i < 4; ++i) { 7144 // Synthesize a zero mask from the zeroable elements (includes undefs). 7145 if (Zeroable[i]) { 7146 ZMask |= 1 << i; 7147 continue; 7148 } 7149 7150 // Flag if we use any V1 inputs in place. 7151 if (i == Mask[i]) { 7152 V1UsedInPlace = true; 7153 continue; 7154 } 7155 7156 // We can only insert a single non-zeroable element. 
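// (INSERTPS can move only a single source element into a single destination
// slot per instruction, so a second out-of-place element cannot be handled
// here.)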
7157 if (V1DstIndex != -1 || V2DstIndex != -1) 7158 return SDValue(); 7159 7160 if (Mask[i] < 4) { 7161 // V1 input out of place for insertion. 7162 V1DstIndex = i; 7163 } else { 7164 // V2 input for insertion. 7165 V2DstIndex = i; 7166 } 7167 } 7168 7169 // Don't bother if we have no (non-zeroable) element for insertion. 7170 if (V1DstIndex == -1 && V2DstIndex == -1) 7171 return SDValue(); 7172 7173 // Determine element insertion src/dst indices. The src index is from the 7174 // start of the inserted vector, not the start of the concatenated vector. 7175 unsigned V2SrcIndex = 0; 7176 if (V1DstIndex != -1) { 7177 // If we have a V1 input out of place, we use V1 as the V2 element insertion 7178 // and don't use the original V2 at all. 7179 V2SrcIndex = Mask[V1DstIndex]; 7180 V2DstIndex = V1DstIndex; 7181 V2 = V1; 7182 } else { 7183 V2SrcIndex = Mask[V2DstIndex] - 4; 7184 } 7185 7186 // If no V1 inputs are used in place, then the result is created only from 7187 // the zero mask and the V2 insertion - so remove V1 dependency. 7188 if (!V1UsedInPlace) 7189 V1 = DAG.getUNDEF(MVT::v4f32); 7190 7191 unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; 7192 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 7193 7194 // Insert the V2 element into the desired position. 7195 SDLoc DL(Op); 7196 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, 7197 DAG.getConstant(InsertPSMask, MVT::i8)); 7198 } 7199 7200 /// \brief Try to lower a shuffle as a permute of the inputs followed by an 7201 /// UNPCK instruction. 7202 /// 7203 /// This specifically targets cases where we end up with alternating between 7204 /// the two inputs, and so can permute them into something that feeds a single 7205 /// UNPCK instruction. Note that this routine only targets integer vectors 7206 /// because for floating point vectors we have a generalized SHUFPS lowering 7207 /// strategy that handles everything that doesn't *exactly* match an unpack, 7208 /// making this clever lowering unnecessary. 7209 static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, 7210 SDValue V2, ArrayRef<int> Mask, 7211 SelectionDAG &DAG) { 7212 assert(!VT.isFloatingPoint() && 7213 "This routine only supports integer vectors."); 7214 assert(!isSingleInputShuffleMask(Mask) && 7215 "This routine should only be used when blending two inputs."); 7216 assert(Mask.size() >= 2 && "Single element masks are invalid."); 7217 7218 int Size = Mask.size(); 7219 7220 int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { 7221 return M >= 0 && M % Size < Size / 2; 7222 }); 7223 int NumHiInputs = std::count_if( 7224 Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); 7225 7226 bool UnpackLo = NumLoInputs >= NumHiInputs; 7227 7228 auto TryUnpack = [&](MVT UnpackVT, int Scale) { 7229 SmallVector<int, 32> V1Mask(Mask.size(), -1); 7230 SmallVector<int, 32> V2Mask(Mask.size(), -1); 7231 7232 for (int i = 0; i < Size; ++i) { 7233 if (Mask[i] < 0) 7234 continue; 7235 7236 // Each element of the unpack contains Scale elements from this mask. 7237 int UnpackIdx = i / Scale; 7238 7239 // We only handle the case where V1 feeds the first slots of the unpack. 7240 // We rely on canonicalization to ensure this is the case. 7241 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) 7242 return SDValue(); 7243 7244 // Setup the mask for this input. The indexing is tricky as we have to 7245 // handle the unpack stride. 7246 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? 
V1Mask : V2Mask; 7247 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = 7248 Mask[i] % Size; 7249 } 7250 7251 // If we will have to shuffle both inputs to use the unpack, check whether 7252 // we can just unpack first and shuffle the result. If so, skip this unpack. 7253 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && 7254 !isNoopShuffleMask(V2Mask)) 7255 return SDValue(); 7256 7257 // Shuffle the inputs into place. 7258 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); 7259 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); 7260 7261 // Cast the inputs to the type we will use to unpack them. 7262 V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1); 7263 V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2); 7264 7265 // Unpack the inputs and cast the result back to the desired type. 7266 return DAG.getNode(ISD::BITCAST, DL, VT, 7267 DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, 7268 DL, UnpackVT, V1, V2)); 7269 }; 7270 7271 // We try each unpack from the largest to the smallest to try and find one 7272 // that fits this mask. 7273 int OrigNumElements = VT.getVectorNumElements(); 7274 int OrigScalarSize = VT.getScalarSizeInBits(); 7275 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { 7276 int Scale = ScalarSize / OrigScalarSize; 7277 int NumElements = OrigNumElements / Scale; 7278 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); 7279 if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) 7280 return Unpack; 7281 } 7282 7283 // If none of the unpack-rooted lowerings worked (or were profitable) try an 7284 // initial unpack. 7285 if (NumLoInputs == 0 || NumHiInputs == 0) { 7286 assert((NumLoInputs > 0 || NumHiInputs > 0) && 7287 "We have to have *some* inputs!"); 7288 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; 7289 7290 // FIXME: We could consider the total complexity of the permute of each 7291 // possible unpacking. Or at the least we should consider how many 7292 // half-crossings are created. 7293 // FIXME: We could consider commuting the unpacks. 7294 7295 SmallVector<int, 32> PermMask; 7296 PermMask.assign(Size, -1); 7297 for (int i = 0; i < Size; ++i) { 7298 if (Mask[i] < 0) 7299 continue; 7300 7301 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); 7302 7303 PermMask[i] = 7304 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); 7305 } 7306 return DAG.getVectorShuffle( 7307 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, 7308 DL, VT, V1, V2), 7309 DAG.getUNDEF(VT), PermMask); 7310 } 7311 7312 return SDValue(); 7313 } 7314 7315 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. 7316 /// 7317 /// This is the basis function for the 2-lane 64-bit shuffles as we have full 7318 /// support for floating point shuffles but not integer shuffles. These 7319 /// instructions will incur a domain crossing penalty on some chips though so 7320 /// it is better to avoid lowering through this for integer vectors where 7321 /// possible. 
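/// For example, the single-input mask <1, 1> duplicates the high element of V1
/// and collapses to a single shuffle of V1 with itself using the immediate
/// 0b11 (MOVDDUP is only used for the <0, 0> mask on SSE3).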
7322 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 7323 const X86Subtarget *Subtarget, 7324 SelectionDAG &DAG) { 7325 SDLoc DL(Op); 7326 assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); 7327 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); 7328 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); 7329 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7330 ArrayRef<int> Mask = SVOp->getMask(); 7331 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); 7332 7333 if (isSingleInputShuffleMask(Mask)) { 7334 // Use low duplicate instructions for masks that match their pattern. 7335 if (Subtarget->hasSSE3()) 7336 if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) 7337 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); 7338 7339 // Straight shuffle of a single input vector. Simulate this by using the 7340 // single input as both of the "inputs" to this instruction.. 7341 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); 7342 7343 if (Subtarget->hasAVX()) { 7344 // If we have AVX, we can use VPERMILPS which will allow folding a load 7345 // into the shuffle. 7346 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, 7347 DAG.getConstant(SHUFPDMask, MVT::i8)); 7348 } 7349 7350 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, 7351 DAG.getConstant(SHUFPDMask, MVT::i8)); 7352 } 7353 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); 7354 assert(Mask[1] >= 2 && "Non-canonicalized blend!"); 7355 7356 // If we have a single input, insert that into V1 if we can do so cheaply. 7357 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { 7358 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 7359 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) 7360 return Insertion; 7361 // Try inverting the insertion since for v2 masks it is easy to do and we 7362 // can't reliably sort the mask one way or the other. 7363 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), 7364 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; 7365 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 7366 DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) 7367 return Insertion; 7368 } 7369 7370 // Try to use one of the special instruction patterns to handle two common 7371 // blend patterns if a zero-blend above didn't work. 7372 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || 7373 isShuffleEquivalent(V1, V2, Mask, {1, 3})) 7374 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) 7375 // We can either use a special instruction to load over the low double or 7376 // to move just the low double. 7377 return DAG.getNode( 7378 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, 7379 DL, MVT::v2f64, V2, 7380 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); 7381 7382 if (Subtarget->hasSSE41()) 7383 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, 7384 Subtarget, DAG)) 7385 return Blend; 7386 7387 // Use dedicated unpack instructions for masks that match their pattern. 
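  // For v2f64, the mask <0, 2> interleaves the low elements of the two inputs
  // (UNPCKLPD) and <1, 3> interleaves the high elements (UNPCKHPD).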
  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, MVT::i8));
}

/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (isSingleInputShuffleMask(Mask)) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getNode(
        ISD::BITCAST, DL, MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // If we have a blend of two PACKUS operations and the blend aligns with the
  // low and high halves, we can just merge the PACKUS operations. This is
  // particularly important as it lets us merge shuffles that this routine
  // itself creates.
  auto GetPackNode = [](SDValue V) {
    while (V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);

    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2))
      return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
                         DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
                                     Mask[0] == 0 ? V1Pack.getOperand(0)
                                                  : V1Pack.getOperand(1),
                                     Mask[1] == 2 ? V2Pack.getOperand(0)
                                                  : V2Pack.getOperand(1)));

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
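  // Note that the multi-input asserts above guarantee that neither mask
  // element is undef here, so the InverseMask below can simply XOR each index
  // with 2 to swap which operand it refers to.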
7464 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 7465 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) 7466 return Insertion; 7467 // Try inverting the insertion since for v2 masks it is easy to do and we 7468 // can't reliably sort the mask one way or the other. 7469 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; 7470 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 7471 DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) 7472 return Insertion; 7473 7474 // We have different paths for blend lowering, but they all must use the 7475 // *exact* same predicate. 7476 bool IsBlendSupported = Subtarget->hasSSE41(); 7477 if (IsBlendSupported) 7478 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, 7479 Subtarget, DAG)) 7480 return Blend; 7481 7482 // Use dedicated unpack instructions for masks that match their pattern. 7483 if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) 7484 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); 7485 if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) 7486 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); 7487 7488 // Try to use byte rotation instructions. 7489 // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 7490 if (Subtarget->hasSSSE3()) 7491 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 7492 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) 7493 return Rotate; 7494 7495 // If we have direct support for blends, we should lower by decomposing into 7496 // a permute. That will be faster than the domain cross. 7497 if (IsBlendSupported) 7498 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, 7499 Mask, DAG); 7500 7501 // We implement this with SHUFPD which is pretty lame because it will likely 7502 // incur 2 cycles of stall for integer vectors on Nehalem and older chips. 7503 // However, all the alternatives are still more cycles and newer chips don't 7504 // have this problem. It would be really nice if x86 had better shuffles here. 7505 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); 7506 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); 7507 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, 7508 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); 7509 } 7510 7511 /// \brief Test whether this can be lowered with a single SHUFPS instruction. 7512 /// 7513 /// This is used to disable more specialized lowerings when the shufps lowering 7514 /// will happen to be efficient. 7515 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { 7516 // This routine only handles 128-bit shufps. 7517 assert(Mask.size() == 4 && "Unsupported mask size!"); 7518 7519 // To lower with a single SHUFPS we need to have the low half and high half 7520 // each requiring a single input. 7521 if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) 7522 return false; 7523 if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) 7524 return false; 7525 7526 return true; 7527 } 7528 7529 /// \brief Lower a vector shuffle using the SHUFPS instruction. 7530 /// 7531 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. 7532 /// It makes no assumptions about whether this is the *best* lowering, it simply 7533 /// uses it. 
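///
/// SHUFPS takes its two low result elements from the first operand and its two
/// high result elements from the second operand, which is why the code below
/// sometimes needs a preparatory blend when V2 supplies only one element or
/// when both halves mix V1 and V2 elements.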
7534 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, 7535 ArrayRef<int> Mask, SDValue V1, 7536 SDValue V2, SelectionDAG &DAG) { 7537 SDValue LowV = V1, HighV = V2; 7538 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; 7539 7540 int NumV2Elements = 7541 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); 7542 7543 if (NumV2Elements == 1) { 7544 int V2Index = 7545 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - 7546 Mask.begin(); 7547 7548 // Compute the index adjacent to V2Index and in the same half by toggling 7549 // the low bit. 7550 int V2AdjIndex = V2Index ^ 1; 7551 7552 if (Mask[V2AdjIndex] == -1) { 7553 // Handles all the cases where we have a single V2 element and an undef. 7554 // This will only ever happen in the high lanes because we commute the 7555 // vector otherwise. 7556 if (V2Index < 2) 7557 std::swap(LowV, HighV); 7558 NewMask[V2Index] -= 4; 7559 } else { 7560 // Handle the case where the V2 element ends up adjacent to a V1 element. 7561 // To make this work, blend them together as the first step. 7562 int V1Index = V2AdjIndex; 7563 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; 7564 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, 7565 getV4X86ShuffleImm8ForMask(BlendMask, DAG)); 7566 7567 // Now proceed to reconstruct the final blend as we have the necessary 7568 // high or low half formed. 7569 if (V2Index < 2) { 7570 LowV = V2; 7571 HighV = V1; 7572 } else { 7573 HighV = V2; 7574 } 7575 NewMask[V1Index] = 2; // We put the V1 element in V2[2]. 7576 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. 7577 } 7578 } else if (NumV2Elements == 2) { 7579 if (Mask[0] < 4 && Mask[1] < 4) { 7580 // Handle the easy case where we have V1 in the low lanes and V2 in the 7581 // high lanes. 7582 NewMask[2] -= 4; 7583 NewMask[3] -= 4; 7584 } else if (Mask[2] < 4 && Mask[3] < 4) { 7585 // We also handle the reversed case because this utility may get called 7586 // when we detect a SHUFPS pattern but can't easily commute the shuffle to 7587 // arrange things in the right direction. 7588 NewMask[0] -= 4; 7589 NewMask[1] -= 4; 7590 HighV = V1; 7591 LowV = V2; 7592 } else { 7593 // We have a mixture of V1 and V2 in both low and high lanes. Rather than 7594 // trying to place elements directly, just blend them and set up the final 7595 // shuffle to place them. 7596 7597 // The first two blend mask elements are for V1, the second two are for 7598 // V2. 7599 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], 7600 Mask[2] < 4 ? Mask[2] : Mask[3], 7601 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, 7602 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; 7603 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, 7604 getV4X86ShuffleImm8ForMask(BlendMask, DAG)); 7605 7606 // Now we do a normal shuffle of V1 by giving V1 as both operands to 7607 // a blend. 7608 LowV = HighV = V1; 7609 NewMask[0] = Mask[0] < 4 ? 0 : 2; 7610 NewMask[1] = Mask[0] < 4 ? 2 : 0; 7611 NewMask[2] = Mask[2] < 4 ? 1 : 3; 7612 NewMask[3] = Mask[2] < 4 ? 3 : 1; 7613 } 7614 } 7615 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, 7616 getV4X86ShuffleImm8ForMask(NewMask, DAG)); 7617 } 7618 7619 /// \brief Lower 4-lane 32-bit floating point shuffles. 7620 /// 7621 /// Uses instructions exclusively from the floating point unit to minimize 7622 /// domain crossing penalties, as these are sufficient to implement all v4f32 7623 /// shuffles. 
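///
/// For example, the single-input mask <0, 0, 2, 2> becomes MOVSLDUP on SSE3,
/// and the two-input mask <4, 0, 5, 1> matches UNPCKLPS with the operands
/// commuted.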
7624 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 7625 const X86Subtarget *Subtarget, 7626 SelectionDAG &DAG) { 7627 SDLoc DL(Op); 7628 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); 7629 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 7630 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 7631 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7632 ArrayRef<int> Mask = SVOp->getMask(); 7633 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 7634 7635 int NumV2Elements = 7636 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); 7637 7638 if (NumV2Elements == 0) { 7639 // Check for being able to broadcast a single element. 7640 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, 7641 Mask, Subtarget, DAG)) 7642 return Broadcast; 7643 7644 // Use even/odd duplicate instructions for masks that match their pattern. 7645 if (Subtarget->hasSSE3()) { 7646 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) 7647 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); 7648 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) 7649 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); 7650 } 7651 7652 if (Subtarget->hasAVX()) { 7653 // If we have AVX, we can use VPERMILPS which will allow folding a load 7654 // into the shuffle. 7655 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, 7656 getV4X86ShuffleImm8ForMask(Mask, DAG)); 7657 } 7658 7659 // Otherwise, use a straight shuffle of a single input vector. We pass the 7660 // input vector to both operands to simulate this with a SHUFPS. 7661 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, 7662 getV4X86ShuffleImm8ForMask(Mask, DAG)); 7663 } 7664 7665 // There are special ways we can lower some single-element blends. However, we 7666 // have custom ways we can lower more complex single-element blends below that 7667 // we defer to if both this and BLENDPS fail to match, so restrict this to 7668 // when the V2 input is targeting element 0 of the mask -- that is the fast 7669 // case here. 7670 if (NumV2Elements == 1 && Mask[0] >= 4) 7671 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, 7672 Mask, Subtarget, DAG)) 7673 return V; 7674 7675 if (Subtarget->hasSSE41()) { 7676 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, 7677 Subtarget, DAG)) 7678 return Blend; 7679 7680 // Use INSERTPS if we can complete the shuffle efficiently. 7681 if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) 7682 return V; 7683 7684 if (!isSingleSHUFPSMask(Mask)) 7685 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( 7686 DL, MVT::v4f32, V1, V2, Mask, DAG)) 7687 return BlendPerm; 7688 } 7689 7690 // Use dedicated unpack instructions for masks that match their pattern. 7691 if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) 7692 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); 7693 if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) 7694 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); 7695 if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) 7696 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1); 7697 if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) 7698 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1); 7699 7700 // Otherwise fall back to a SHUFPS lowering strategy. 
7701 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); 7702 } 7703 7704 /// \brief Lower 4-lane i32 vector shuffles. 7705 /// 7706 /// We try to handle these with integer-domain shuffles where we can, but for 7707 /// blends we use the floating point domain blend instructions. 7708 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 7709 const X86Subtarget *Subtarget, 7710 SelectionDAG &DAG) { 7711 SDLoc DL(Op); 7712 assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); 7713 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); 7714 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); 7715 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7716 ArrayRef<int> Mask = SVOp->getMask(); 7717 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 7718 7719 // Whenever we can lower this as a zext, that instruction is strictly faster 7720 // than any alternative. It also allows us to fold memory operands into the 7721 // shuffle in many cases. 7722 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, 7723 Mask, Subtarget, DAG)) 7724 return ZExt; 7725 7726 int NumV2Elements = 7727 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); 7728 7729 if (NumV2Elements == 0) { 7730 // Check for being able to broadcast a single element. 7731 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, 7732 Mask, Subtarget, DAG)) 7733 return Broadcast; 7734 7735 // Straight shuffle of a single input vector. For everything from SSE2 7736 // onward this has a single fast instruction with no scary immediates. 7737 // We coerce the shuffle pattern to be compatible with UNPCK instructions 7738 // but we aren't actually going to use the UNPCK instruction because doing 7739 // so prevents folding a load into this instruction or making a copy. 7740 const int UnpackLoMask[] = {0, 0, 1, 1}; 7741 const int UnpackHiMask[] = {2, 2, 3, 3}; 7742 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) 7743 Mask = UnpackLoMask; 7744 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) 7745 Mask = UnpackHiMask; 7746 7747 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, 7748 getV4X86ShuffleImm8ForMask(Mask, DAG)); 7749 } 7750 7751 // Try to use shift instructions. 7752 if (SDValue Shift = 7753 lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) 7754 return Shift; 7755 7756 // There are special ways we can lower some single-element blends. 7757 if (NumV2Elements == 1) 7758 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, 7759 Mask, Subtarget, DAG)) 7760 return V; 7761 7762 // We have different paths for blend lowering, but they all must use the 7763 // *exact* same predicate. 7764 bool IsBlendSupported = Subtarget->hasSSE41(); 7765 if (IsBlendSupported) 7766 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, 7767 Subtarget, DAG)) 7768 return Blend; 7769 7770 if (SDValue Masked = 7771 lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) 7772 return Masked; 7773 7774 // Use dedicated unpack instructions for masks that match their pattern. 
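  // These are the same patterns as in the v4f32 path: <0, 4, 1, 5> is
  // PUNPCKLDQ, <2, 6, 3, 7> is PUNPCKHDQ, and the commuted forms simply swap
  // the operands.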
  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget->hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                      Mask, DAG);

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack =
          lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
    return Unpack;

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
                     DAG.getVectorShuffle(
                         MVT::v4f32, DL,
                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
}

/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
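///
/// The code below first counts, for each half of the mask, how many of its
/// inputs come from the low half of the source and how many come from the high
/// half (the NumLToL/NumHToL/NumLToH/NumHToH values), rebalances any 3-into-1
/// split with a dword swap, and only then forms the dword pairs.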
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
               [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
               [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs from each half going to each half. Once there, we can fall
  // through to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the half
  // other than the one we target for fixing) will be fixed when we re-enter
  // this path. We will also combine away any sequence of PSHUFD instructions
  // that results, leaving a single instruction. Here is an example of the
  // tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
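  // balanceSides performs the dword swap described above: it picks one dword
  // from each half, exchanges them with a single PSHUFD, rewrites the mask to
  // follow the moved words, and recurses so that the generic 2-into-2 code can
  // finish the job.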
7891 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, 7892 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, 7893 int AOffset, int BOffset) { 7894 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && 7895 "Must call this with A having 3 or 1 inputs from the A half."); 7896 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && 7897 "Must call this with B having 1 or 3 inputs from the B half."); 7898 assert(AToAInputs.size() + BToAInputs.size() == 4 && 7899 "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); 7900 7901 // Compute the index of dword with only one word among the three inputs in 7902 // a half by taking the sum of the half with three inputs and subtracting 7903 // the sum of the actual three inputs. The difference is the remaining 7904 // slot. 7905 int ADWord, BDWord; 7906 int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; 7907 int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; 7908 int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; 7909 ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; 7910 int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; 7911 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); 7912 int TripleNonInputIdx = 7913 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); 7914 TripleDWord = TripleNonInputIdx / 2; 7915 7916 // We use xor with one to compute the adjacent DWord to whichever one the 7917 // OneInput is in. 7918 OneInputDWord = (OneInput / 2) ^ 1; 7919 7920 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA 7921 // and BToA inputs. If there is also such a problem with the BToB and AToB 7922 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in 7923 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it 7924 // is essential that we don't *create* a 3<-1 as then we might oscillate. 7925 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { 7926 // Compute how many inputs will be flipped by swapping these DWords. We 7927 // need 7928 // to balance this to ensure we don't form a 3-1 shuffle in the other 7929 // half. 7930 int NumFlippedAToBInputs = 7931 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + 7932 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); 7933 int NumFlippedBToBInputs = 7934 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + 7935 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); 7936 if ((NumFlippedAToBInputs == 1 && 7937 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || 7938 (NumFlippedBToBInputs == 1 && 7939 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { 7940 // We choose whether to fix the A half or B half based on whether that 7941 // half has zero flipped inputs. At zero, we may not be able to fix it 7942 // with that half. We also bias towards fixing the B half because that 7943 // will more commonly be the high half, and we have to bias one way. 7944 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, 7945 ArrayRef<int> Inputs) { 7946 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. 7947 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), 7948 PinnedIdx ^ 1) != Inputs.end(); 7949 // Determine whether the free index is in the flipped dword or the 7950 // unflipped dword based on where the pinned index is. 
We use this bit 7951 // in an xor to conditionally select the adjacent dword. 7952 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); 7953 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), 7954 FixFreeIdx) != Inputs.end(); 7955 if (IsFixIdxInput == IsFixFreeIdxInput) 7956 FixFreeIdx += 1; 7957 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), 7958 FixFreeIdx) != Inputs.end(); 7959 assert(IsFixIdxInput != IsFixFreeIdxInput && 7960 "We need to be changing the number of flipped inputs!"); 7961 int PSHUFHalfMask[] = {0, 1, 2, 3}; 7962 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); 7963 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, 7964 MVT::v8i16, V, 7965 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); 7966 7967 for (int &M : Mask) 7968 if (M != -1 && M == FixIdx) 7969 M = FixFreeIdx; 7970 else if (M != -1 && M == FixFreeIdx) 7971 M = FixIdx; 7972 }; 7973 if (NumFlippedBToBInputs != 0) { 7974 int BPinnedIdx = 7975 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; 7976 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); 7977 } else { 7978 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); 7979 int APinnedIdx = 7980 AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; 7981 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); 7982 } 7983 } 7984 } 7985 7986 int PSHUFDMask[] = {0, 1, 2, 3}; 7987 PSHUFDMask[ADWord] = BDWord; 7988 PSHUFDMask[BDWord] = ADWord; 7989 V = DAG.getNode(ISD::BITCAST, DL, VT, 7990 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, 7991 DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), 7992 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); 7993 7994 // Adjust the mask to match the new locations of A and B. 7995 for (int &M : Mask) 7996 if (M != -1 && M/2 == ADWord) 7997 M = 2 * BDWord + M % 2; 7998 else if (M != -1 && M/2 == BDWord) 7999 M = 2 * ADWord + M % 2; 8000 8001 // Recurse back into this routine to re-compute state now that this isn't 8002 // a 3 and 1 problem. 8003 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, 8004 DAG); 8005 }; 8006 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) 8007 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); 8008 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) 8009 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); 8010 8011 // At this point there are at most two inputs to the low and high halves from 8012 // each half. That means the inputs can always be grouped into dwords and 8013 // those dwords can then be moved to the correct half with a dword shuffle. 8014 // We use at most one low and one high word shuffle to collect these paired 8015 // inputs into dwords, and finally a dword shuffle to place them. 8016 int PSHUFLMask[4] = {-1, -1, -1, -1}; 8017 int PSHUFHMask[4] = {-1, -1, -1, -1}; 8018 int PSHUFDMask[4] = {-1, -1, -1, -1}; 8019 8020 // First fix the masks for all the inputs that are staying in their 8021 // original halves. This will then dictate the targets of the cross-half 8022 // shuffles. 
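  // fixInPlaceInputs only updates the word shuffle of the half the inputs
  // already live in; when there are two in-place inputs it packs them into a
  // single dword (the adjacent slot is found by toggling the low bit of the
  // index) so that one lane of the later dword shuffle can keep them together.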
8023 auto fixInPlaceInputs = 8024 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, 8025 MutableArrayRef<int> SourceHalfMask, 8026 MutableArrayRef<int> HalfMask, int HalfOffset) { 8027 if (InPlaceInputs.empty()) 8028 return; 8029 if (InPlaceInputs.size() == 1) { 8030 SourceHalfMask[InPlaceInputs[0] - HalfOffset] = 8031 InPlaceInputs[0] - HalfOffset; 8032 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; 8033 return; 8034 } 8035 if (IncomingInputs.empty()) { 8036 // Just fix all of the in place inputs. 8037 for (int Input : InPlaceInputs) { 8038 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; 8039 PSHUFDMask[Input / 2] = Input / 2; 8040 } 8041 return; 8042 } 8043 8044 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); 8045 SourceHalfMask[InPlaceInputs[0] - HalfOffset] = 8046 InPlaceInputs[0] - HalfOffset; 8047 // Put the second input next to the first so that they are packed into 8048 // a dword. We find the adjacent index by toggling the low bit. 8049 int AdjIndex = InPlaceInputs[0] ^ 1; 8050 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; 8051 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); 8052 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; 8053 }; 8054 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); 8055 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); 8056 8057 // Now gather the cross-half inputs and place them into a free dword of 8058 // their target half. 8059 // FIXME: This operation could almost certainly be simplified dramatically to 8060 // look more like the 3-1 fixing operation. 8061 auto moveInputsToRightHalf = [&PSHUFDMask]( 8062 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, 8063 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, 8064 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, 8065 int DestOffset) { 8066 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { 8067 return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; 8068 }; 8069 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, 8070 int Word) { 8071 int LowWord = Word & ~1; 8072 int HighWord = Word | 1; 8073 return isWordClobbered(SourceHalfMask, LowWord) || 8074 isWordClobbered(SourceHalfMask, HighWord); 8075 }; 8076 8077 if (IncomingInputs.empty()) 8078 return; 8079 8080 if (ExistingInputs.empty()) { 8081 // Map any dwords with inputs from them into the right half. 8082 for (int Input : IncomingInputs) { 8083 // If the source half mask maps over the inputs, turn those into 8084 // swaps and use the swapped lane. 8085 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { 8086 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { 8087 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = 8088 Input - SourceOffset; 8089 // We have to swap the uses in our half mask in one sweep. 8090 for (int &M : HalfMask) 8091 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) 8092 M = Input; 8093 else if (M == Input) 8094 M = SourceHalfMask[Input - SourceOffset] + SourceOffset; 8095 } else { 8096 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == 8097 Input - SourceOffset && 8098 "Previous placement doesn't match!"); 8099 } 8100 // Note that this correctly re-maps both when we do a swap and when 8101 // we observe the other side of the swap above. We rely on that to 8102 // avoid swapping the members of the input list directly. 
8103 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; 8104 } 8105 8106 // Map the input's dword into the correct half. 8107 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) 8108 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; 8109 else 8110 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == 8111 Input / 2 && 8112 "Previous placement doesn't match!"); 8113 } 8114 8115 // And just directly shift any other-half mask elements to be same-half 8116 // as we will have mirrored the dword containing the element into the 8117 // same position within that half. 8118 for (int &M : HalfMask) 8119 if (M >= SourceOffset && M < SourceOffset + 4) { 8120 M = M - SourceOffset + DestOffset; 8121 assert(M >= 0 && "This should never wrap below zero!"); 8122 } 8123 return; 8124 } 8125 8126 // Ensure we have the input in a viable dword of its current half. This 8127 // is particularly tricky because the original position may be clobbered 8128 // by inputs being moved and *staying* in that half. 8129 if (IncomingInputs.size() == 1) { 8130 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { 8131 int InputFixed = std::find(std::begin(SourceHalfMask), 8132 std::end(SourceHalfMask), -1) - 8133 std::begin(SourceHalfMask) + SourceOffset; 8134 SourceHalfMask[InputFixed - SourceOffset] = 8135 IncomingInputs[0] - SourceOffset; 8136 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], 8137 InputFixed); 8138 IncomingInputs[0] = InputFixed; 8139 } 8140 } else if (IncomingInputs.size() == 2) { 8141 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || 8142 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { 8143 // We have two non-adjacent or clobbered inputs we need to extract from 8144 // the source half. To do this, we need to map them into some adjacent 8145 // dword slot in the source mask. 8146 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, 8147 IncomingInputs[1] - SourceOffset}; 8148 8149 // If there is a free slot in the source half mask adjacent to one of 8150 // the inputs, place the other input in it. We use (Index XOR 1) to 8151 // compute an adjacent index. 8152 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && 8153 SourceHalfMask[InputsFixed[0] ^ 1] == -1) { 8154 SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; 8155 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; 8156 InputsFixed[1] = InputsFixed[0] ^ 1; 8157 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && 8158 SourceHalfMask[InputsFixed[1] ^ 1] == -1) { 8159 SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; 8160 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; 8161 InputsFixed[0] = InputsFixed[1] ^ 1; 8162 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && 8163 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { 8164 // The two inputs are in the same DWord but it is clobbered and the 8165 // adjacent DWord isn't used at all. Move both inputs to the free 8166 // slot. 8167 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; 8168 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; 8169 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); 8170 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; 8171 } else { 8172 // The only way we hit this point is if there is no clobbering 8173 // (because there are no off-half inputs to this half) and there is no 8174 // free slot adjacent to one of the inputs. In this case, we have to 8175 // swap an input with a non-input. 
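  // The swap below moves InputsFixed[1] into the non-input slot adjacent to
  // InputsFixed[0], and the final source half mask is patched so the other
  // half's shuffle still finds the word that was displaced.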
8176 for (int i = 0; i < 4; ++i) 8177 assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && 8178 "We can't handle any clobbers here!"); 8179 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && 8180 "Cannot have adjacent inputs here!"); 8181 8182 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; 8183 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; 8184 8185 // We also have to update the final source mask in this case because 8186 // it may need to undo the above swap. 8187 for (int &M : FinalSourceHalfMask) 8188 if (M == (InputsFixed[0] ^ 1) + SourceOffset) 8189 M = InputsFixed[1] + SourceOffset; 8190 else if (M == InputsFixed[1] + SourceOffset) 8191 M = (InputsFixed[0] ^ 1) + SourceOffset; 8192 8193 InputsFixed[1] = InputsFixed[0] ^ 1; 8194 } 8195 8196 // Point everything at the fixed inputs. 8197 for (int &M : HalfMask) 8198 if (M == IncomingInputs[0]) 8199 M = InputsFixed[0] + SourceOffset; 8200 else if (M == IncomingInputs[1]) 8201 M = InputsFixed[1] + SourceOffset; 8202 8203 IncomingInputs[0] = InputsFixed[0] + SourceOffset; 8204 IncomingInputs[1] = InputsFixed[1] + SourceOffset; 8205 } 8206 } else { 8207 llvm_unreachable("Unhandled input size!"); 8208 } 8209 8210 // Now hoist the DWord down to the right half. 8211 int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; 8212 assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); 8213 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; 8214 for (int &M : HalfMask) 8215 for (int Input : IncomingInputs) 8216 if (M == Input) 8217 M = FreeDWord * 2 + Input % 2; 8218 }; 8219 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, 8220 /*SourceOffset*/ 4, /*DestOffset*/ 0); 8221 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, 8222 /*SourceOffset*/ 0, /*DestOffset*/ 4); 8223 8224 // Now enact all the shuffles we've computed to move the inputs into their 8225 // target half. 8226 if (!isNoopShuffleMask(PSHUFLMask)) 8227 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, 8228 getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); 8229 if (!isNoopShuffleMask(PSHUFHMask)) 8230 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, 8231 getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); 8232 if (!isNoopShuffleMask(PSHUFDMask)) 8233 V = DAG.getNode(ISD::BITCAST, DL, VT, 8234 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, 8235 DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), 8236 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); 8237 8238 // At this point, each half should contain all its inputs, and we can then 8239 // just shuffle them into their final position. 8240 assert(std::count_if(LoMask.begin(), LoMask.end(), 8241 [](int M) { return M >= 4; }) == 0 && 8242 "Failed to lift all the high half inputs to the low mask!"); 8243 assert(std::count_if(HiMask.begin(), HiMask.end(), 8244 [](int M) { return M >= 0 && M < 4; }) == 0 && 8245 "Failed to lift all the low half inputs to the high mask!"); 8246 8247 // Do a half shuffle for the low mask. 8248 if (!isNoopShuffleMask(LoMask)) 8249 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, 8250 getV4X86ShuffleImm8ForMask(LoMask, DAG)); 8251 8252 // Do a half shuffle with the high mask after shifting its values down. 8253 for (int &M : HiMask) 8254 if (M >= 0) 8255 M -= 4; 8256 if (!isNoopShuffleMask(HiMask)) 8257 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, 8258 getV4X86ShuffleImm8ForMask(HiMask, DAG)); 8259 8260 return V; 8261 } 8262 8263 /// \brief Helper to form a PSHUFB-based shuffle+blend. 
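///
/// Each input gets its own 16-byte PSHUFB control where lanes destined for the
/// other input are set to 0x80 (forcing zero); the two shuffled results are
/// then OR'ed together, so the zeroed lanes act as the blend mask. Inputs that
/// contribute no lanes skip their PSHUFB entirely.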
8264 static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, 8265 SDValue V2, ArrayRef<int> Mask, 8266 SelectionDAG &DAG, bool &V1InUse, 8267 bool &V2InUse) { 8268 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 8269 SDValue V1Mask[16]; 8270 SDValue V2Mask[16]; 8271 V1InUse = false; 8272 V2InUse = false; 8273 8274 int Size = Mask.size(); 8275 int Scale = 16 / Size; 8276 for (int i = 0; i < 16; ++i) { 8277 if (Mask[i / Scale] == -1) { 8278 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); 8279 } else { 8280 const int ZeroMask = 0x80; 8281 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale 8282 : ZeroMask; 8283 int V2Idx = Mask[i / Scale] < Size 8284 ? ZeroMask 8285 : (Mask[i / Scale] - Size) * Scale + i % Scale; 8286 if (Zeroable[i / Scale]) 8287 V1Idx = V2Idx = ZeroMask; 8288 V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); 8289 V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); 8290 V1InUse |= (ZeroMask != V1Idx); 8291 V2InUse |= (ZeroMask != V2Idx); 8292 } 8293 } 8294 8295 if (V1InUse) 8296 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, 8297 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1), 8298 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); 8299 if (V2InUse) 8300 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, 8301 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2), 8302 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); 8303 8304 // If we need shuffled inputs from both, blend the two. 8305 SDValue V; 8306 if (V1InUse && V2InUse) 8307 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); 8308 else 8309 V = V1InUse ? V1 : V2; 8310 8311 // Cast the result back to the correct type. 8312 return DAG.getNode(ISD::BITCAST, DL, VT, V); 8313 } 8314 8315 /// \brief Generic lowering of 8-lane i16 shuffles. 8316 /// 8317 /// This handles both single-input shuffles and combined shuffle/blends with 8318 /// two inputs. The single input shuffles are immediately delegated to 8319 /// a dedicated lowering routine. 8320 /// 8321 /// The blends are lowered in one of three fundamental ways. If there are few 8322 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle 8323 /// of the input is significantly cheaper when lowered as an interleaving of 8324 /// the two inputs, try to interleave them. Otherwise, blend the low and high 8325 /// halves of the inputs separately (making them have relatively few inputs) 8326 /// and then concatenate them. 8327 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 8328 const X86Subtarget *Subtarget, 8329 SelectionDAG &DAG) { 8330 SDLoc DL(Op); 8331 assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); 8332 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); 8333 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); 8334 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 8335 ArrayRef<int> OrigMask = SVOp->getMask(); 8336 int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], 8337 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; 8338 MutableArrayRef<int> Mask(MaskStorage); 8339 8340 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 8341 8342 // Whenever we can lower this as a zext, that instruction is strictly faster 8343 // than any alternative. 
8344 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( 8345 DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) 8346 return ZExt; 8347 8348 auto isV1 = [](int M) { return M >= 0 && M < 8; }; 8349 (void)isV1; 8350 auto isV2 = [](int M) { return M >= 8; }; 8351 8352 int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); 8353 8354 if (NumV2Inputs == 0) { 8355 // Check for being able to broadcast a single element. 8356 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, 8357 Mask, Subtarget, DAG)) 8358 return Broadcast; 8359 8360 // Try to use shift instructions. 8361 if (SDValue Shift = 8362 lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) 8363 return Shift; 8364 8365 // Use dedicated unpack instructions for masks that match their pattern. 8366 if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3})) 8367 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1); 8368 if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7})) 8369 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1); 8370 8371 // Try to use byte rotation instructions. 8372 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, 8373 Mask, Subtarget, DAG)) 8374 return Rotate; 8375 8376 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, 8377 Subtarget, DAG); 8378 } 8379 8380 assert(std::any_of(Mask.begin(), Mask.end(), isV1) && 8381 "All single-input shuffles should be canonicalized to be V1-input " 8382 "shuffles."); 8383 8384 // Try to use shift instructions. 8385 if (SDValue Shift = 8386 lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) 8387 return Shift; 8388 8389 // There are special ways we can lower some single-element blends. 8390 if (NumV2Inputs == 1) 8391 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, 8392 Mask, Subtarget, DAG)) 8393 return V; 8394 8395 // We have different paths for blend lowering, but they all must use the 8396 // *exact* same predicate. 8397 bool IsBlendSupported = Subtarget->hasSSE41(); 8398 if (IsBlendSupported) 8399 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, 8400 Subtarget, DAG)) 8401 return Blend; 8402 8403 if (SDValue Masked = 8404 lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) 8405 return Masked; 8406 8407 // Use dedicated unpack instructions for masks that match their pattern. 8408 if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11})) 8409 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); 8410 if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15})) 8411 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); 8412 8413 // Try to use byte rotation instructions. 8414 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 8415 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) 8416 return Rotate; 8417 8418 if (SDValue BitBlend = 8419 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) 8420 return BitBlend; 8421 8422 if (SDValue Unpack = 8423 lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) 8424 return Unpack; 8425 8426 // If we can't directly blend but can use PSHUFB, that will be better as it 8427 // can both shuffle and set up the inefficient blend. 
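  // This path is only reachable on pre-SSE4.1 targets that have SSSE3: the
  // PSHUFB helper zeroes the lanes destined for the other input, so the
  // trailing OR doubles as the blend.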
8428 if (!IsBlendSupported && Subtarget->hasSSSE3()) { 8429 bool V1InUse, V2InUse; 8430 return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, 8431 V1InUse, V2InUse); 8432 } 8433 8434 // We can always bit-blend if we have to so the fallback strategy is to 8435 // decompose into single-input permutes and blends. 8436 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, 8437 Mask, DAG); 8438 } 8439 8440 /// \brief Check whether a compaction lowering can be done by dropping even 8441 /// elements and compute how many times even elements must be dropped. 8442 /// 8443 /// This handles shuffles which take every Nth element where N is a power of 8444 /// two. Example shuffle masks: 8445 /// 8446 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 8447 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 8448 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 8449 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 8450 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 8451 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 8452 /// 8453 /// Any of these lanes can of course be undef. 8454 /// 8455 /// This routine only supports N <= 3. 8456 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here 8457 /// for larger N. 8458 /// 8459 /// \returns N above, or the number of times even elements must be dropped if 8460 /// there is such a number. Otherwise returns zero. 8461 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { 8462 // Figure out whether we're looping over two inputs or just one. 8463 bool IsSingleInput = isSingleInputShuffleMask(Mask); 8464 8465 // The modulus for the shuffle vector entries is based on whether this is 8466 // a single input or not. 8467 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); 8468 assert(isPowerOf2_32((uint32_t)ShuffleModulus) && 8469 "We should only be called with masks with a power-of-2 size!"); 8470 8471 uint64_t ModMask = (uint64_t)ShuffleModulus - 1; 8472 8473 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, 8474 // and 2^3 simultaneously. This is because we may have ambiguity with 8475 // partially undef inputs. 8476 bool ViableForN[3] = {true, true, true}; 8477 8478 for (int i = 0, e = Mask.size(); i < e; ++i) { 8479 // Ignore undef lanes, we'll optimistically collapse them to the pattern we 8480 // want. 8481 if (Mask[i] == -1) 8482 continue; 8483 8484 bool IsAnyViable = false; 8485 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) 8486 if (ViableForN[j]) { 8487 uint64_t N = j + 1; 8488 8489 // The shuffle mask must be equal to (i * 2^N) % M. 8490 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) 8491 IsAnyViable = true; 8492 else 8493 ViableForN[j] = false; 8494 } 8495 // Early exit if we exhaust the possible powers of two. 8496 if (!IsAnyViable) 8497 break; 8498 } 8499 8500 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) 8501 if (ViableForN[j]) 8502 return j + 1; 8503 8504 // Return 0 as there is no viable power of two. 8505 return 0; 8506 } 8507 8508 /// \brief Generic lowering of v16i8 shuffles. 8509 /// 8510 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to 8511 /// detect any complexity reducing interleaving. 
If that doesn't help, it uses 8512 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses 8513 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them 8514 /// back together. 8515 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 8516 const X86Subtarget *Subtarget, 8517 SelectionDAG &DAG) { 8518 SDLoc DL(Op); 8519 assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); 8520 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); 8521 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); 8522 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 8523 ArrayRef<int> Mask = SVOp->getMask(); 8524 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 8525 8526 // Try to use shift instructions. 8527 if (SDValue Shift = 8528 lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) 8529 return Shift; 8530 8531 // Try to use byte rotation instructions. 8532 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 8533 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) 8534 return Rotate; 8535 8536 // Try to use a zext lowering. 8537 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( 8538 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) 8539 return ZExt; 8540 8541 int NumV2Elements = 8542 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); 8543 8544 // For single-input shuffles, there are some nicer lowering tricks we can use. 8545 if (NumV2Elements == 0) { 8546 // Check for being able to broadcast a single element. 8547 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, 8548 Mask, Subtarget, DAG)) 8549 return Broadcast; 8550 8551 // Check whether we can widen this to an i16 shuffle by duplicating bytes. 8552 // Notably, this handles splat and partial-splat shuffles more efficiently. 8553 // However, it only makes sense if the pre-duplication shuffle simplifies 8554 // things significantly. Currently, this means we need to be able to 8555 // express the pre-duplication shuffle as an i16 shuffle. 8556 // 8557 // FIXME: We should check for other patterns which can be widened into an 8558 // i16 shuffle as well. 8559 auto canWidenViaDuplication = [](ArrayRef<int> Mask) { 8560 for (int i = 0; i < 16; i += 2) 8561 if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) 8562 return false; 8563 8564 return true; 8565 }; 8566 auto tryToWidenViaDuplication = [&]() -> SDValue { 8567 if (!canWidenViaDuplication(Mask)) 8568 return SDValue(); 8569 SmallVector<int, 4> LoInputs; 8570 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), 8571 [](int M) { return M >= 0 && M < 8; }); 8572 std::sort(LoInputs.begin(), LoInputs.end()); 8573 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), 8574 LoInputs.end()); 8575 SmallVector<int, 4> HiInputs; 8576 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), 8577 [](int M) { return M >= 8; }); 8578 std::sort(HiInputs.begin(), HiInputs.end()); 8579 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), 8580 HiInputs.end()); 8581 8582 bool TargetLo = LoInputs.size() >= HiInputs.size(); 8583 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; 8584 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; 8585 8586 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; 8587 SmallDenseMap<int, int, 8> LaneMap; 8588 for (int I : InPlaceInputs) { 8589 PreDupI16Shuffle[I/2] = I/2; 8590 LaneMap[I] = I; 8591 } 8592 int j = TargetLo ? 
0 : 4, je = j + 4; 8593 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { 8594 // Check if j is already a shuffle of this input. This happens when 8595 // there are two adjacent bytes after we move the low one. 8596 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { 8597 // If we haven't yet mapped the input, search for a slot into which 8598 // we can map it. 8599 while (j < je && PreDupI16Shuffle[j] != -1) 8600 ++j; 8601 8602 if (j == je) 8603 // We can't place the inputs into a single half with a simple i16 shuffle, so bail. 8604 return SDValue(); 8605 8606 // Map this input with the i16 shuffle. 8607 PreDupI16Shuffle[j] = MovingInputs[i] / 2; 8608 } 8609 8610 // Update the lane map based on the mapping we ended up with. 8611 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; 8612 } 8613 V1 = DAG.getNode( 8614 ISD::BITCAST, DL, MVT::v16i8, 8615 DAG.getVectorShuffle(MVT::v8i16, DL, 8616 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), 8617 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); 8618 8619 // Unpack the bytes to form the i16s that will be shuffled into place. 8620 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, 8621 MVT::v16i8, V1, V1); 8622 8623 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 8624 for (int i = 0; i < 16; ++i) 8625 if (Mask[i] != -1) { 8626 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); 8627 assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); 8628 if (PostDupI16Shuffle[i / 2] == -1) 8629 PostDupI16Shuffle[i / 2] = MappedMask; 8630 else 8631 assert(PostDupI16Shuffle[i / 2] == MappedMask && 8632 "Conflicting entrties in the original shuffle!"); 8633 } 8634 return DAG.getNode( 8635 ISD::BITCAST, DL, MVT::v16i8, 8636 DAG.getVectorShuffle(MVT::v8i16, DL, 8637 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), 8638 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); 8639 }; 8640 if (SDValue V = tryToWidenViaDuplication()) 8641 return V; 8642 } 8643 8644 // Use dedicated unpack instructions for masks that match their pattern. 8645 if (isShuffleEquivalent(V1, V2, Mask, {// Low half. 8646 0, 16, 1, 17, 2, 18, 3, 19, 8647 // High half. 8648 4, 20, 5, 21, 6, 22, 7, 23})) 8649 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); 8650 if (isShuffleEquivalent(V1, V2, Mask, {// Low half. 8651 8, 24, 9, 25, 10, 26, 11, 27, 8652 // High half. 8653 12, 28, 13, 29, 14, 30, 15, 31})) 8654 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); 8655 8656 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly 8657 // with PSHUFB. It is important to do this before we attempt to generate any 8658 // blends but after all of the single-input lowerings. If the single input 8659 // lowerings can find an instruction sequence that is faster than a PSHUFB, we 8660 // want to preserve that and we can DAG combine any longer sequences into 8661 // a PSHUFB in the end. But once we start blending from multiple inputs, 8662 // the complexity of DAG combining bad patterns back into PSHUFB is too high, 8663 // and there are *very* few patterns that would actually be faster than the 8664 // PSHUFB approach because of its ability to zero lanes. 8665 // 8666 // FIXME: The only exceptions to the above are blends which are exact 8667 // interleavings with direct instructions supporting them. We currently don't 8668 // handle those well here. 
8669 if (Subtarget->hasSSSE3()) { 8670 bool V1InUse = false; 8671 bool V2InUse = false; 8672 8673 SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, 8674 DAG, V1InUse, V2InUse); 8675 8676 // If both V1 and V2 are in use and we can use a direct blend or an unpack, 8677 // do so. This avoids using them to handle blends-with-zero which is 8678 // important as a single pshufb is significantly faster for that. 8679 if (V1InUse && V2InUse) { 8680 if (Subtarget->hasSSE41()) 8681 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, 8682 Mask, Subtarget, DAG)) 8683 return Blend; 8684 8685 // We can use an unpack to do the blending rather than an or in some 8686 // cases. Even though the or may be (very minorly) more efficient, we 8687 // preference this lowering because there are common cases where part of 8688 // the complexity of the shuffles goes away when we do the final blend as 8689 // an unpack. 8690 // FIXME: It might be worth trying to detect if the unpack-feeding 8691 // shuffles will both be pshufb, in which case we shouldn't bother with 8692 // this. 8693 if (SDValue Unpack = 8694 lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) 8695 return Unpack; 8696 } 8697 8698 return PSHUFB; 8699 } 8700 8701 // There are special ways we can lower some single-element blends. 8702 if (NumV2Elements == 1) 8703 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, 8704 Mask, Subtarget, DAG)) 8705 return V; 8706 8707 if (SDValue BitBlend = 8708 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) 8709 return BitBlend; 8710 8711 // Check whether a compaction lowering can be done. This handles shuffles 8712 // which take every Nth element for some even N. See the helper function for 8713 // details. 8714 // 8715 // We special case these as they can be particularly efficiently handled with 8716 // the PACKUSB instruction on x86 and they show up in common patterns of 8717 // rearranging bytes to truncate wide elements. 8718 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { 8719 // NumEvenDrops is the power of two stride of the elements. Another way of 8720 // thinking about it is that we need to drop the even elements this many 8721 // times to get the original input. 8722 bool IsSingleInput = isSingleInputShuffleMask(Mask); 8723 8724 // First we need to zero all the dropped bytes. 8725 assert(NumEvenDrops <= 3 && 8726 "No support for dropping even elements more than 3 times."); 8727 // We use the mask type to pick which bytes are preserved based on how many 8728 // elements are dropped. 8729 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; 8730 SDValue ByteClearMask = 8731 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, 8732 DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); 8733 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); 8734 if (!IsSingleInput) 8735 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); 8736 8737 // Now pack things back together. 8738 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); 8739 V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); 8740 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); 8741 for (int i = 1; i < NumEvenDrops; ++i) { 8742 Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); 8743 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); 8744 } 8745 8746 return Result; 8747 } 8748 8749 // Handle multi-input cases by blending single-input shuffles. 
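  // Conceptually this is (sketch, not the helper's literal code):
  //   V1' = shuffle(V1, undef, <Mask[i] if it reads V1, else -1>)
  //   V2' = shuffle(V2, undef, <Mask[i] - 16 if it reads V2, else -1>)
  //   Result = blend(V1', V2')   // per-lane select between V1' and V2'
  // so each half re-enters the cheaper single-input v16i8 lowerings above.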
8750 if (NumV2Elements > 0) 8751 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, 8752 Mask, DAG); 8753 8754 // The fallback path for single-input shuffles widens this into two v8i16 8755 // vectors with unpacks, shuffles those, and then pulls them back together 8756 // with a pack. 8757 SDValue V = V1; 8758 8759 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 8760 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 8761 for (int i = 0; i < 16; ++i) 8762 if (Mask[i] >= 0) 8763 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; 8764 8765 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); 8766 8767 SDValue VLoHalf, VHiHalf; 8768 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask 8769 // them out and avoid using UNPCK{L,H} to extract the elements of V as 8770 // i16s. 8771 if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), 8772 [](int M) { return M >= 0 && M % 2 == 1; }) && 8773 std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), 8774 [](int M) { return M >= 0 && M % 2 == 1; })) { 8775 // Use a mask to drop the high bytes. 8776 VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); 8777 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, 8778 DAG.getConstant(0x00FF, MVT::v8i16)); 8779 8780 // This will be a single vector shuffle instead of a blend so nuke VHiHalf. 8781 VHiHalf = DAG.getUNDEF(MVT::v8i16); 8782 8783 // Squash the masks to point directly into VLoHalf. 8784 for (int &M : LoBlendMask) 8785 if (M >= 0) 8786 M /= 2; 8787 for (int &M : HiBlendMask) 8788 if (M >= 0) 8789 M /= 2; 8790 } else { 8791 // Otherwise just unpack the low half of V into VLoHalf and the high half into 8792 // VHiHalf so that we can blend them as i16s. 8793 VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 8794 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); 8795 VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 8796 DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); 8797 } 8798 8799 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); 8800 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); 8801 8802 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); 8803 } 8804 8805 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. 8806 /// 8807 /// This routine breaks down the specific type of 128-bit shuffle and 8808 /// dispatches to the lowering routines accordingly. 8809 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, 8810 MVT VT, const X86Subtarget *Subtarget, 8811 SelectionDAG &DAG) { 8812 switch (VT.SimpleTy) { 8813 case MVT::v2i64: 8814 return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); 8815 case MVT::v2f64: 8816 return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); 8817 case MVT::v4i32: 8818 return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); 8819 case MVT::v4f32: 8820 return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); 8821 case MVT::v8i16: 8822 return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); 8823 case MVT::v16i8: 8824 return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); 8825 8826 default: 8827 llvm_unreachable("Unimplemented!"); 8828 } 8829 } 8830 8831 /// \brief Helper function to test whether a shuffle mask could be 8832 /// simplified by widening the elements being shuffled. 8833 /// 8834 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise 8835 /// leaves it in an unspecified state. 
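///
/// For example, a 4-element mask <2, 3, 0, 1> widens to <1, 0>, and
/// <0, 1, -1, -1> widens to <0, -1>, while <1, 2, -1, -1> cannot be widened
/// because the pair (1, 2) does not line up on a single wider element.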
8836 /// 8837 /// NOTE: This must handle normal vector shuffle masks and *target* vector 8838 /// shuffle masks. The latter have the special property of a '-2' representing 8839 /// a zero-ed lane of a vector. 8840 static bool canWidenShuffleElements(ArrayRef<int> Mask, 8841 SmallVectorImpl<int> &WidenedMask) { 8842 for (int i = 0, Size = Mask.size(); i < Size; i += 2) { 8843 // If both elements are undef, its trivial. 8844 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { 8845 WidenedMask.push_back(SM_SentinelUndef); 8846 continue; 8847 } 8848 8849 // Check for an undef mask and a mask value properly aligned to fit with 8850 // a pair of values. If we find such a case, use the non-undef mask's value. 8851 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { 8852 WidenedMask.push_back(Mask[i + 1] / 2); 8853 continue; 8854 } 8855 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { 8856 WidenedMask.push_back(Mask[i] / 2); 8857 continue; 8858 } 8859 8860 // When zeroing, we need to spread the zeroing across both lanes to widen. 8861 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { 8862 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && 8863 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { 8864 WidenedMask.push_back(SM_SentinelZero); 8865 continue; 8866 } 8867 return false; 8868 } 8869 8870 // Finally check if the two mask values are adjacent and aligned with 8871 // a pair. 8872 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { 8873 WidenedMask.push_back(Mask[i] / 2); 8874 continue; 8875 } 8876 8877 // Otherwise we can't safely widen the elements used in this shuffle. 8878 return false; 8879 } 8880 assert(WidenedMask.size() == Mask.size() / 2 && 8881 "Incorrect size of mask after widening the elements!"); 8882 8883 return true; 8884 } 8885 8886 /// \brief Generic routine to split vector shuffle into half-sized shuffles. 8887 /// 8888 /// This routine just extracts two subvectors, shuffles them independently, and 8889 /// then concatenates them back together. This should work effectively with all 8890 /// AVX vector shuffle types. 8891 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, 8892 SDValue V2, ArrayRef<int> Mask, 8893 SelectionDAG &DAG) { 8894 assert(VT.getSizeInBits() >= 256 && 8895 "Only for 256-bit or wider vector shuffles!"); 8896 assert(V1.getSimpleValueType() == VT && "Bad operand type!"); 8897 assert(V2.getSimpleValueType() == VT && "Bad operand type!"); 8898 8899 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); 8900 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); 8901 8902 int NumElements = VT.getVectorNumElements(); 8903 int SplitNumElements = NumElements / 2; 8904 MVT ScalarVT = VT.getScalarType(); 8905 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); 8906 8907 // Rather than splitting build-vectors, just build two narrower build 8908 // vectors. This helps shuffling with splats and zeros. 
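  // E.g. a v8i32 build_vector of constants is re-built as two v4i32
  // build_vectors rather than extracted as two opaque subvectors, so the
  // half-width shuffles below can still see zero and splat operands directly.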
8909 auto SplitVector = [&](SDValue V) { 8910 while (V.getOpcode() == ISD::BITCAST) 8911 V = V->getOperand(0); 8912 8913 MVT OrigVT = V.getSimpleValueType(); 8914 int OrigNumElements = OrigVT.getVectorNumElements(); 8915 int OrigSplitNumElements = OrigNumElements / 2; 8916 MVT OrigScalarVT = OrigVT.getScalarType(); 8917 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); 8918 8919 SDValue LoV, HiV; 8920 8921 auto *BV = dyn_cast<BuildVectorSDNode>(V); 8922 if (!BV) { 8923 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, 8924 DAG.getIntPtrConstant(0)); 8925 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, 8926 DAG.getIntPtrConstant(OrigSplitNumElements)); 8927 } else { 8928 8929 SmallVector<SDValue, 16> LoOps, HiOps; 8930 for (int i = 0; i < OrigSplitNumElements; ++i) { 8931 LoOps.push_back(BV->getOperand(i)); 8932 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); 8933 } 8934 LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); 8935 HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); 8936 } 8937 return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV), 8938 DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV)); 8939 }; 8940 8941 SDValue LoV1, HiV1, LoV2, HiV2; 8942 std::tie(LoV1, HiV1) = SplitVector(V1); 8943 std::tie(LoV2, HiV2) = SplitVector(V2); 8944 8945 // Now create two 4-way blends of these half-width vectors. 8946 auto HalfBlend = [&](ArrayRef<int> HalfMask) { 8947 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; 8948 SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; 8949 for (int i = 0; i < SplitNumElements; ++i) { 8950 int M = HalfMask[i]; 8951 if (M >= NumElements) { 8952 if (M >= NumElements + SplitNumElements) 8953 UseHiV2 = true; 8954 else 8955 UseLoV2 = true; 8956 V2BlendMask.push_back(M - NumElements); 8957 V1BlendMask.push_back(-1); 8958 BlendMask.push_back(SplitNumElements + i); 8959 } else if (M >= 0) { 8960 if (M >= SplitNumElements) 8961 UseHiV1 = true; 8962 else 8963 UseLoV1 = true; 8964 V2BlendMask.push_back(-1); 8965 V1BlendMask.push_back(M); 8966 BlendMask.push_back(i); 8967 } else { 8968 V2BlendMask.push_back(-1); 8969 V1BlendMask.push_back(-1); 8970 BlendMask.push_back(-1); 8971 } 8972 } 8973 8974 // Because the lowering happens after all combining takes place, we need to 8975 // manually combine these blend masks as much as possible so that we create 8976 // a minimal number of high-level vector shuffle nodes. 8977 8978 // First try just blending the halves of V1 or V2. 8979 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) 8980 return DAG.getUNDEF(SplitVT); 8981 if (!UseLoV2 && !UseHiV2) 8982 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); 8983 if (!UseLoV1 && !UseHiV1) 8984 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); 8985 8986 SDValue V1Blend, V2Blend; 8987 if (UseLoV1 && UseHiV1) { 8988 V1Blend = 8989 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); 8990 } else { 8991 // We only use half of V1 so map the usage down into the final blend mask. 8992 V1Blend = UseLoV1 ? LoV1 : HiV1; 8993 for (int i = 0; i < SplitNumElements; ++i) 8994 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) 8995 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); 8996 } 8997 if (UseLoV2 && UseHiV2) { 8998 V2Blend = 8999 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); 9000 } else { 9001 // We only use half of V2 so map the usage down into the final blend mask. 9002 V2Blend = UseLoV2 ? 
LoV2 : HiV2; 9003 for (int i = 0; i < SplitNumElements; ++i) 9004 if (BlendMask[i] >= SplitNumElements) 9005 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); 9006 } 9007 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); 9008 }; 9009 SDValue Lo = HalfBlend(LoMask); 9010 SDValue Hi = HalfBlend(HiMask); 9011 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); 9012 } 9013 9014 /// \brief Either split a vector in halves or decompose the shuffles and the 9015 /// blend. 9016 /// 9017 /// This is provided as a good fallback for many lowerings of non-single-input 9018 /// shuffles with more than one 128-bit lane. In those cases, we want to select 9019 /// between splitting the shuffle into 128-bit components and stitching those 9020 /// back together vs. extracting the single-input shuffles and blending those 9021 /// results. 9022 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, 9023 SDValue V2, ArrayRef<int> Mask, 9024 SelectionDAG &DAG) { 9025 assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " 9026 "lower single-input shuffles as it " 9027 "could then recurse on itself."); 9028 int Size = Mask.size(); 9029 9030 // If this can be modeled as a broadcast of two elements followed by a blend, 9031 // prefer that lowering. This is especially important because broadcasts can 9032 // often fold with memory operands. 9033 auto DoBothBroadcast = [&] { 9034 int V1BroadcastIdx = -1, V2BroadcastIdx = -1; 9035 for (int M : Mask) 9036 if (M >= Size) { 9037 if (V2BroadcastIdx == -1) 9038 V2BroadcastIdx = M - Size; 9039 else if (M - Size != V2BroadcastIdx) 9040 return false; 9041 } else if (M >= 0) { 9042 if (V1BroadcastIdx == -1) 9043 V1BroadcastIdx = M; 9044 else if (M != V1BroadcastIdx) 9045 return false; 9046 } 9047 return true; 9048 }; 9049 if (DoBothBroadcast()) 9050 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, 9051 DAG); 9052 9053 // If the inputs all stem from a single 128-bit lane of each input, then we 9054 // split them rather than blending because the split will decompose to 9055 // unusually few instructions. 9056 int LaneCount = VT.getSizeInBits() / 128; 9057 int LaneSize = Size / LaneCount; 9058 SmallBitVector LaneInputs[2]; 9059 LaneInputs[0].resize(LaneCount, false); 9060 LaneInputs[1].resize(LaneCount, false); 9061 for (int i = 0; i < Size; ++i) 9062 if (Mask[i] >= 0) 9063 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; 9064 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) 9065 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); 9066 9067 // Otherwise, just fall back to decomposed shuffles and a blend. This requires 9068 // that the decomposed single-input shuffles don't end up here. 9069 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); 9070 } 9071 9072 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as 9073 /// a permutation and blend of those lanes. 9074 /// 9075 /// This essentially blends the out-of-lane inputs to each lane into the lane 9076 /// from a permuted copy of the vector. This lowering strategy results in four 9077 /// instructions in the worst case for a single-input cross lane shuffle which 9078 /// is lower than any other fully general cross-lane shuffle strategy I'm aware 9079 /// of. Special cases for each particular shuffle pattern should be handled 9080 /// prior to trying this lowering. 
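///
/// Rough sketch of the single-input case: the 128-bit halves of the input are
/// swapped with one VPERM2X128, and each element whose source lives in the
/// other lane is then taken from that flipped copy by an in-lane shuffle. For
/// example, the v4f64 mask <2, 1, 3, 0> becomes a half-swap of V1 followed by
/// an in-lane shuffle mixing the original and flipped vectors.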
9081 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, 9082 SDValue V1, SDValue V2, 9083 ArrayRef<int> Mask, 9084 SelectionDAG &DAG) { 9085 // FIXME: This should probably be generalized for 512-bit vectors as well. 9086 assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); 9087 int LaneSize = Mask.size() / 2; 9088 9089 // If there are only inputs from one 128-bit lane, splitting will in fact be 9090 // less expensive. The flags track whether the given lane contains an element 9091 // that crosses to another lane. 9092 bool LaneCrossing[2] = {false, false}; 9093 for (int i = 0, Size = Mask.size(); i < Size; ++i) 9094 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) 9095 LaneCrossing[(Mask[i] % Size) / LaneSize] = true; 9096 if (!LaneCrossing[0] || !LaneCrossing[1]) 9097 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); 9098 9099 if (isSingleInputShuffleMask(Mask)) { 9100 SmallVector<int, 32> FlippedBlendMask; 9101 for (int i = 0, Size = Mask.size(); i < Size; ++i) 9102 FlippedBlendMask.push_back( 9103 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) 9104 ? Mask[i] 9105 : Mask[i] % LaneSize + 9106 (i / LaneSize) * LaneSize + Size)); 9107 9108 // Flip the vector, and blend the results which should now be in-lane. The 9109 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and 9110 // 5 for the high source. The value 3 selects the high half of source 2 and 9111 // the value 2 selects the low half of source 2. We only use source 2 to 9112 // allow folding it into a memory operand. 9113 unsigned PERMMask = 3 | 2 << 4; 9114 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), 9115 V1, DAG.getConstant(PERMMask, MVT::i8)); 9116 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); 9117 } 9118 9119 // This now reduces to two single-input shuffles of V1 and V2 which at worst 9120 // will be handled by the above logic and a blend of the results, much like 9121 // other patterns in AVX. 9122 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); 9123 } 9124 9125 /// \brief Handle lowering 2-lane 128-bit shuffles. 9126 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, 9127 SDValue V2, ArrayRef<int> Mask, 9128 const X86Subtarget *Subtarget, 9129 SelectionDAG &DAG) { 9130 // TODO: If minimizing size and one of the inputs is a zero vector and the 9131 // the zero vector has only one use, we could use a VPERM2X128 to save the 9132 // instruction bytes needed to explicitly generate the zero vector. 9133 9134 // Blends are faster and handle all the non-lane-crossing cases. 9135 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, 9136 Subtarget, DAG)) 9137 return Blend; 9138 9139 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); 9140 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); 9141 9142 // If either input operand is a zero vector, use VPERM2X128 because its mask 9143 // allows us to replace the zero input with an implicit zero. 9144 if (!IsV1Zero && !IsV2Zero) { 9145 // Check for patterns which can be matched with a single insert of a 128-bit 9146 // subvector. 
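    // E.g. <0, 1, 4, 5> keeps the low half of V1 and stacks the low half of
    // V2 on top, and <0, 1, 0, 1> repeats the low half of V1; both become an
    // EXTRACT_SUBVECTOR + CONCAT_VECTORS pair, which typically selects to a
    // single VINSERTF128/VINSERTI128.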
9147 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); 9148 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { 9149 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 9150 VT.getVectorNumElements() / 2); 9151 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, 9152 DAG.getIntPtrConstant(0)); 9153 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, 9154 OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); 9155 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); 9156 } 9157 } 9158 9159 // Otherwise form a 128-bit permutation. After accounting for undefs, 9160 // convert the 64-bit shuffle mask selection values into 128-bit 9161 // selection bits by dividing the indexes by 2 and shifting into positions 9162 // defined by a vperm2*128 instruction's immediate control byte. 9163 9164 // The immediate permute control byte looks like this: 9165 // [1:0] - select 128 bits from sources for low half of destination 9166 // [2] - ignore 9167 // [3] - zero low half of destination 9168 // [5:4] - select 128 bits from sources for high half of destination 9169 // [6] - ignore 9170 // [7] - zero high half of destination 9171 9172 int MaskLO = Mask[0]; 9173 if (MaskLO == SM_SentinelUndef) 9174 MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; 9175 9176 int MaskHI = Mask[2]; 9177 if (MaskHI == SM_SentinelUndef) 9178 MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; 9179 9180 unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; 9181 9182 // If either input is a zero vector, replace it with an undef input. 9183 // Shuffle mask values < 4 are selecting elements of V1. 9184 // Shuffle mask values >= 4 are selecting elements of V2. 9185 // Adjust each half of the permute mask by clearing the half that was 9186 // selecting the zero vector and setting the zero mask bit. 9187 if (IsV1Zero) { 9188 V1 = DAG.getUNDEF(VT); 9189 if (MaskLO < 4) 9190 PermMask = (PermMask & 0xf0) | 0x08; 9191 if (MaskHI < 4) 9192 PermMask = (PermMask & 0x0f) | 0x80; 9193 } 9194 if (IsV2Zero) { 9195 V2 = DAG.getUNDEF(VT); 9196 if (MaskLO >= 4) 9197 PermMask = (PermMask & 0xf0) | 0x08; 9198 if (MaskHI >= 4) 9199 PermMask = (PermMask & 0x0f) | 0x80; 9200 } 9201 9202 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, 9203 DAG.getConstant(PermMask, MVT::i8)); 9204 } 9205 9206 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then 9207 /// shuffling each lane. 9208 /// 9209 /// This will only succeed when the result of fixing the 128-bit lanes results 9210 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in 9211 /// each 128-bit lanes. This handles many cases where we can quickly blend away 9212 /// the lane crosses early and then use simpler shuffles within each lane. 9213 /// 9214 /// FIXME: It might be worthwhile at some point to support this without 9215 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently 9216 /// in x86 only floating point has interesting non-repeating shuffles, and even 9217 /// those are still *marginally* more expensive. 
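///
/// For example, the v8f32 mask <9, 8, 11, 10, 5, 4, 7, 6> first picks its
/// 128-bit lanes (the low lane of V2 and the high lane of V1) with a single
/// v4f64-typed lane shuffle, and then applies the repeating in-lane pattern
/// <1, 0, 3, 2> with one non-lane-crossing shuffle of that result.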
9218 static SDValue lowerVectorShuffleByMerging128BitLanes( 9219 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 9220 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 9221 assert(!isSingleInputShuffleMask(Mask) && 9222 "This is only useful with multiple inputs."); 9223 9224 int Size = Mask.size(); 9225 int LaneSize = 128 / VT.getScalarSizeInBits(); 9226 int NumLanes = Size / LaneSize; 9227 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); 9228 9229 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also 9230 // check whether the in-128-bit lane shuffles share a repeating pattern. 9231 SmallVector<int, 4> Lanes; 9232 Lanes.resize(NumLanes, -1); 9233 SmallVector<int, 4> InLaneMask; 9234 InLaneMask.resize(LaneSize, -1); 9235 for (int i = 0; i < Size; ++i) { 9236 if (Mask[i] < 0) 9237 continue; 9238 9239 int j = i / LaneSize; 9240 9241 if (Lanes[j] < 0) { 9242 // First entry we've seen for this lane. 9243 Lanes[j] = Mask[i] / LaneSize; 9244 } else if (Lanes[j] != Mask[i] / LaneSize) { 9245 // This doesn't match the lane selected previously! 9246 return SDValue(); 9247 } 9248 9249 // Check that within each lane we have a consistent shuffle mask. 9250 int k = i % LaneSize; 9251 if (InLaneMask[k] < 0) { 9252 InLaneMask[k] = Mask[i] % LaneSize; 9253 } else if (InLaneMask[k] != Mask[i] % LaneSize) { 9254 // This doesn't fit a repeating in-lane mask. 9255 return SDValue(); 9256 } 9257 } 9258 9259 // First shuffle the lanes into place. 9260 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, 9261 VT.getSizeInBits() / 64); 9262 SmallVector<int, 8> LaneMask; 9263 LaneMask.resize(NumLanes * 2, -1); 9264 for (int i = 0; i < NumLanes; ++i) 9265 if (Lanes[i] >= 0) { 9266 LaneMask[2 * i + 0] = 2*Lanes[i] + 0; 9267 LaneMask[2 * i + 1] = 2*Lanes[i] + 1; 9268 } 9269 9270 V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1); 9271 V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2); 9272 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); 9273 9274 // Cast it back to the type we actually want. 9275 LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle); 9276 9277 // Now do a simple shuffle that isn't lane crossing. 9278 SmallVector<int, 8> NewMask; 9279 NewMask.resize(Size, -1); 9280 for (int i = 0; i < Size; ++i) 9281 if (Mask[i] >= 0) 9282 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; 9283 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && 9284 "Must not introduce lane crosses at this point!"); 9285 9286 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); 9287 } 9288 9289 /// \brief Test whether the specified input (0 or 1) is in-place blended by the 9290 /// given mask. 9291 /// 9292 /// This returns true if the elements from a particular input are already in the 9293 /// slot required by the given mask and require no permutation. 9294 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { 9295 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); 9296 int Size = Mask.size(); 9297 for (int i = 0; i < Size; ++i) 9298 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) 9299 return false; 9300 9301 return true; 9302 } 9303 9304 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. 9305 /// 9306 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 9307 /// isn't available. 
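///
/// Roughly in order of preference this tries: a 128-bit lane shuffle when the
/// mask widens to two 128-bit selections, broadcasts and MOVDDUP/VPERMILPD for
/// single-input masks, UNPCKL/UNPCKH and blends for the common two-input
/// patterns, SHUFPD when the mask alternates between the inputs, and finally
/// the generic split-or-blend fallbacks.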
9308 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9309                                        const X86Subtarget *Subtarget,
9310                                        SelectionDAG &DAG) {
9311   SDLoc DL(Op);
9312   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
9313   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
9314   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9315   ArrayRef<int> Mask = SVOp->getMask();
9316   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9317
9318   SmallVector<int, 4> WidenedMask;
9319   if (canWidenShuffleElements(Mask, WidenedMask))
9320     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
9321                                     DAG);
9322
9323   if (isSingleInputShuffleMask(Mask)) {
9324     // Check for being able to broadcast a single element.
9325     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
9326                                                           Mask, Subtarget, DAG))
9327       return Broadcast;
9328
9329     // Use low duplicate instructions for masks that match their pattern.
9330     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9331       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
9332
9333     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
9334       // Non-half-crossing single input shuffles can be lowered with an
9335       // interleaved permutation.
9336       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
9337                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
9338       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
9339                          DAG.getConstant(VPERMILPMask, MVT::i8));
9340     }
9341
9342     // With AVX2 we have direct support for this permutation.
9343     if (Subtarget->hasAVX2())
9344       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
9345                          getV4X86ShuffleImm8ForMask(Mask, DAG));
9346
9347     // Otherwise, fall back.
9348     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
9349                                                    DAG);
9350   }
9351
9352   // X86 has dedicated unpack instructions that can handle specific blend
9353   // operations: UNPCKH and UNPCKL.
9354   if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
9355     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
9356   if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
9357     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
9358   if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
9359     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
9360   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
9361     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
9362
9363   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
9364                                                 Subtarget, DAG))
9365     return Blend;
9366
9367   // Check if the blend happens to exactly fit that of SHUFPD.
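  // Per 128-bit lane, SHUFPD writes an element of V1 into the even result
  // slot and an element of V2 into the odd result slot; immediate bit i set
  // means result slot i takes the high element of its lane. E.g. the mask
  // <0, 5, 2, 7> matches the first check below and yields SHUFPDMask = 0b1010,
  // producing {V1[0], V2[1], V1[2], V2[3]}.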
9368 if ((Mask[0] == -1 || Mask[0] < 2) && 9369 (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && 9370 (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && 9371 (Mask[3] == -1 || Mask[3] >= 6)) { 9372 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | 9373 ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); 9374 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, 9375 DAG.getConstant(SHUFPDMask, MVT::i8)); 9376 } 9377 if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && 9378 (Mask[1] == -1 || Mask[1] < 2) && 9379 (Mask[2] == -1 || Mask[2] >= 6) && 9380 (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { 9381 unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | 9382 ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); 9383 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, 9384 DAG.getConstant(SHUFPDMask, MVT::i8)); 9385 } 9386 9387 // Try to simplify this by merging 128-bit lanes to enable a lane-based 9388 // shuffle. However, if we have AVX2 and either inputs are already in place, 9389 // we will be able to shuffle even across lanes the other input in a single 9390 // instruction so skip this pattern. 9391 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || 9392 isShuffleMaskInputInPlace(1, Mask)))) 9393 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 9394 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) 9395 return Result; 9396 9397 // If we have AVX2 then we always want to lower with a blend because an v4 we 9398 // can fully permute the elements. 9399 if (Subtarget->hasAVX2()) 9400 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, 9401 Mask, DAG); 9402 9403 // Otherwise fall back on generic lowering. 9404 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); 9405 } 9406 9407 /// \brief Handle lowering of 4-lane 64-bit integer shuffles. 9408 /// 9409 /// This routine is only called when we have AVX2 and thus a reasonable 9410 /// instruction set for v4i64 shuffling.. 9411 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9412 const X86Subtarget *Subtarget, 9413 SelectionDAG &DAG) { 9414 SDLoc DL(Op); 9415 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); 9416 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); 9417 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9418 ArrayRef<int> Mask = SVOp->getMask(); 9419 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 9420 assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); 9421 9422 SmallVector<int, 4> WidenedMask; 9423 if (canWidenShuffleElements(Mask, WidenedMask)) 9424 return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, 9425 DAG); 9426 9427 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, 9428 Subtarget, DAG)) 9429 return Blend; 9430 9431 // Check for being able to broadcast a single element. 9432 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, 9433 Mask, Subtarget, DAG)) 9434 return Broadcast; 9435 9436 // When the shuffle is mirrored between the 128-bit lanes of the unit, we can 9437 // use lower latency instructions that will operate on both 128-bit lanes. 
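  // E.g. the single-input mask <1, 0, 3, 2> repeats as <1, 0> in each lane
  // and becomes a v8i32 PSHUFD with the per-lane dword pattern <2, 3, 0, 1>,
  // rather than a (typically higher latency) cross-lane VPERMQ.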
9438 SmallVector<int, 2> RepeatedMask; 9439 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { 9440 if (isSingleInputShuffleMask(Mask)) { 9441 int PSHUFDMask[] = {-1, -1, -1, -1}; 9442 for (int i = 0; i < 2; ++i) 9443 if (RepeatedMask[i] >= 0) { 9444 PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; 9445 PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; 9446 } 9447 return DAG.getNode( 9448 ISD::BITCAST, DL, MVT::v4i64, 9449 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, 9450 DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), 9451 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); 9452 } 9453 } 9454 9455 // AVX2 provides a direct instruction for permuting a single input across 9456 // lanes. 9457 if (isSingleInputShuffleMask(Mask)) 9458 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, 9459 getV4X86ShuffleImm8ForMask(Mask, DAG)); 9460 9461 // Try to use shift instructions. 9462 if (SDValue Shift = 9463 lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) 9464 return Shift; 9465 9466 // Use dedicated unpack instructions for masks that match their pattern. 9467 if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) 9468 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); 9469 if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) 9470 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); 9471 if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) 9472 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); 9473 if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) 9474 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); 9475 9476 // Try to simplify this by merging 128-bit lanes to enable a lane-based 9477 // shuffle. However, if we have AVX2 and either inputs are already in place, 9478 // we will be able to shuffle even across lanes the other input in a single 9479 // instruction so skip this pattern. 9480 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || 9481 isShuffleMaskInputInPlace(1, Mask)))) 9482 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 9483 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) 9484 return Result; 9485 9486 // Otherwise fall back on generic blend lowering. 9487 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, 9488 Mask, DAG); 9489 } 9490 9491 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. 9492 /// 9493 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 9494 /// isn't available. 9495 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9496 const X86Subtarget *Subtarget, 9497 SelectionDAG &DAG) { 9498 SDLoc DL(Op); 9499 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); 9500 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); 9501 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9502 ArrayRef<int> Mask = SVOp->getMask(); 9503 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 9504 9505 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, 9506 Subtarget, DAG)) 9507 return Blend; 9508 9509 // Check for being able to broadcast a single element. 9510 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, 9511 Mask, Subtarget, DAG)) 9512 return Broadcast; 9513 9514 // If the shuffle mask is repeated in each 128-bit lane, we have many more 9515 // options to efficiently lower the shuffle. 
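  // E.g. <1, 0, 3, 2, 5, 4, 7, 6> repeats as <1, 0, 3, 2> in both lanes and
  // maps onto a single in-lane VPERMILPS (or a SHUFPS sequence for two-input
  // repeated masks) instead of a cross-lane permute.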
9516 SmallVector<int, 4> RepeatedMask; 9517 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { 9518 assert(RepeatedMask.size() == 4 && 9519 "Repeated masks must be half the mask width!"); 9520 9521 // Use even/odd duplicate instructions for masks that match their pattern. 9522 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) 9523 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); 9524 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) 9525 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); 9526 9527 if (isSingleInputShuffleMask(Mask)) 9528 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, 9529 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); 9530 9531 // Use dedicated unpack instructions for masks that match their pattern. 9532 if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) 9533 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); 9534 if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) 9535 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); 9536 if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) 9537 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); 9538 if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) 9539 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); 9540 9541 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we 9542 // have already handled any direct blends. We also need to squash the 9543 // repeated mask into a simulated v4f32 mask. 9544 for (int i = 0; i < 4; ++i) 9545 if (RepeatedMask[i] >= 8) 9546 RepeatedMask[i] -= 4; 9547 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); 9548 } 9549 9550 // If we have a single input shuffle with different shuffle patterns in the 9551 // two 128-bit lanes use the variable mask to VPERMILPS. 9552 if (isSingleInputShuffleMask(Mask)) { 9553 SDValue VPermMask[8]; 9554 for (int i = 0; i < 8; ++i) 9555 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) 9556 : DAG.getConstant(Mask[i], MVT::i32); 9557 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) 9558 return DAG.getNode( 9559 X86ISD::VPERMILPV, DL, MVT::v8f32, V1, 9560 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); 9561 9562 if (Subtarget->hasAVX2()) 9563 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, 9564 DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, 9565 DAG.getNode(ISD::BUILD_VECTOR, DL, 9566 MVT::v8i32, VPermMask)), 9567 V1); 9568 9569 // Otherwise, fall back. 9570 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, 9571 DAG); 9572 } 9573 9574 // Try to simplify this by merging 128-bit lanes to enable a lane-based 9575 // shuffle. 9576 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 9577 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) 9578 return Result; 9579 9580 // If we have AVX2 then we always want to lower with a blend because at v8 we 9581 // can fully permute the elements. 9582 if (Subtarget->hasAVX2()) 9583 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, 9584 Mask, DAG); 9585 9586 // Otherwise fall back on generic lowering. 9587 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); 9588 } 9589 9590 /// \brief Handle lowering of 8-lane 32-bit integer shuffles. 9591 /// 9592 /// This routine is only called when we have AVX2 and thus a reasonable 9593 /// instruction set for v8i32 shuffling.. 
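///
/// For instance, a single-input mask that is not repeated per 128-bit lane,
/// such as the full reversal <7, 6, 5, 4, 3, 2, 1, 0>, is lowered with a
/// variable-mask VPERMD, while lane-repeated masks take the cheaper PSHUFD
/// form below.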
9594 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9595 const X86Subtarget *Subtarget, 9596 SelectionDAG &DAG) { 9597 SDLoc DL(Op); 9598 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); 9599 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); 9600 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9601 ArrayRef<int> Mask = SVOp->getMask(); 9602 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 9603 assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); 9604 9605 // Whenever we can lower this as a zext, that instruction is strictly faster 9606 // than any alternative. It also allows us to fold memory operands into the 9607 // shuffle in many cases. 9608 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, 9609 Mask, Subtarget, DAG)) 9610 return ZExt; 9611 9612 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, 9613 Subtarget, DAG)) 9614 return Blend; 9615 9616 // Check for being able to broadcast a single element. 9617 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, 9618 Mask, Subtarget, DAG)) 9619 return Broadcast; 9620 9621 // If the shuffle mask is repeated in each 128-bit lane we can use more 9622 // efficient instructions that mirror the shuffles across the two 128-bit 9623 // lanes. 9624 SmallVector<int, 4> RepeatedMask; 9625 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { 9626 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); 9627 if (isSingleInputShuffleMask(Mask)) 9628 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, 9629 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); 9630 9631 // Use dedicated unpack instructions for masks that match their pattern. 9632 if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) 9633 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); 9634 if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) 9635 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); 9636 if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) 9637 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); 9638 if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) 9639 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); 9640 } 9641 9642 // Try to use shift instructions. 9643 if (SDValue Shift = 9644 lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) 9645 return Shift; 9646 9647 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 9648 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) 9649 return Rotate; 9650 9651 // If the shuffle patterns aren't repeated but it is a single input, directly 9652 // generate a cross-lane VPERMD instruction. 9653 if (isSingleInputShuffleMask(Mask)) { 9654 SDValue VPermMask[8]; 9655 for (int i = 0; i < 8; ++i) 9656 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) 9657 : DAG.getConstant(Mask[i], MVT::i32); 9658 return DAG.getNode( 9659 X86ISD::VPERMV, DL, MVT::v8i32, 9660 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); 9661 } 9662 9663 // Try to simplify this by merging 128-bit lanes to enable a lane-based 9664 // shuffle. 9665 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 9666 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) 9667 return Result; 9668 9669 // Otherwise fall back on generic blend lowering. 
9670 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, 9671 Mask, DAG); 9672 } 9673 9674 /// \brief Handle lowering of 16-lane 16-bit integer shuffles. 9675 /// 9676 /// This routine is only called when we have AVX2 and thus a reasonable 9677 /// instruction set for v16i16 shuffling.. 9678 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9679 const X86Subtarget *Subtarget, 9680 SelectionDAG &DAG) { 9681 SDLoc DL(Op); 9682 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); 9683 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); 9684 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9685 ArrayRef<int> Mask = SVOp->getMask(); 9686 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 9687 assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); 9688 9689 // Whenever we can lower this as a zext, that instruction is strictly faster 9690 // than any alternative. It also allows us to fold memory operands into the 9691 // shuffle in many cases. 9692 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, 9693 Mask, Subtarget, DAG)) 9694 return ZExt; 9695 9696 // Check for being able to broadcast a single element. 9697 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, 9698 Mask, Subtarget, DAG)) 9699 return Broadcast; 9700 9701 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, 9702 Subtarget, DAG)) 9703 return Blend; 9704 9705 // Use dedicated unpack instructions for masks that match their pattern. 9706 if (isShuffleEquivalent(V1, V2, Mask, 9707 {// First 128-bit lane: 9708 0, 16, 1, 17, 2, 18, 3, 19, 9709 // Second 128-bit lane: 9710 8, 24, 9, 25, 10, 26, 11, 27})) 9711 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); 9712 if (isShuffleEquivalent(V1, V2, Mask, 9713 {// First 128-bit lane: 9714 4, 20, 5, 21, 6, 22, 7, 23, 9715 // Second 128-bit lane: 9716 12, 28, 13, 29, 14, 30, 15, 31})) 9717 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); 9718 9719 // Try to use shift instructions. 9720 if (SDValue Shift = 9721 lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) 9722 return Shift; 9723 9724 // Try to use byte rotation instructions. 9725 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 9726 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) 9727 return Rotate; 9728 9729 if (isSingleInputShuffleMask(Mask)) { 9730 // There are no generalized cross-lane shuffle operations available on i16 9731 // element types. 9732 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) 9733 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, 9734 Mask, DAG); 9735 9736 SmallVector<int, 8> RepeatedMask; 9737 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { 9738 // As this is a single-input shuffle, the repeated mask should be 9739 // a strictly valid v8i16 mask that we can pass through to the v8i16 9740 // lowering to handle even the v16 case. 9741 return lowerV8I16GeneralSingleInputVectorShuffle( 9742 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); 9743 } 9744 9745 SDValue PSHUFBMask[32]; 9746 for (int i = 0; i < 16; ++i) { 9747 if (Mask[i] == -1) { 9748 PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); 9749 continue; 9750 } 9751 9752 int M = i < 8 ? 
Mask[i] : Mask[i] - 8; 9753 assert(M >= 0 && M < 8 && "Invalid single-input mask!"); 9754 PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); 9755 PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); 9756 } 9757 return DAG.getNode( 9758 ISD::BITCAST, DL, MVT::v16i16, 9759 DAG.getNode( 9760 X86ISD::PSHUFB, DL, MVT::v32i8, 9761 DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), 9762 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); 9763 } 9764 9765 // Try to simplify this by merging 128-bit lanes to enable a lane-based 9766 // shuffle. 9767 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 9768 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) 9769 return Result; 9770 9771 // Otherwise fall back on generic lowering. 9772 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); 9773 } 9774 9775 /// \brief Handle lowering of 32-lane 8-bit integer shuffles. 9776 /// 9777 /// This routine is only called when we have AVX2 and thus a reasonable 9778 /// instruction set for v32i8 shuffling.. 9779 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9780 const X86Subtarget *Subtarget, 9781 SelectionDAG &DAG) { 9782 SDLoc DL(Op); 9783 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); 9784 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); 9785 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9786 ArrayRef<int> Mask = SVOp->getMask(); 9787 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); 9788 assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); 9789 9790 // Whenever we can lower this as a zext, that instruction is strictly faster 9791 // than any alternative. It also allows us to fold memory operands into the 9792 // shuffle in many cases. 9793 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, 9794 Mask, Subtarget, DAG)) 9795 return ZExt; 9796 9797 // Check for being able to broadcast a single element. 9798 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, 9799 Mask, Subtarget, DAG)) 9800 return Broadcast; 9801 9802 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, 9803 Subtarget, DAG)) 9804 return Blend; 9805 9806 // Use dedicated unpack instructions for masks that match their pattern. 9807 // Note that these are repeated 128-bit lane unpacks, not unpacks across all 9808 // 256-bit lanes. 9809 if (isShuffleEquivalent( 9810 V1, V2, Mask, 9811 {// First 128-bit lane: 9812 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 9813 // Second 128-bit lane: 9814 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) 9815 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); 9816 if (isShuffleEquivalent( 9817 V1, V2, Mask, 9818 {// First 128-bit lane: 9819 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 9820 // Second 128-bit lane: 9821 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) 9822 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); 9823 9824 // Try to use shift instructions. 9825 if (SDValue Shift = 9826 lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) 9827 return Shift; 9828 9829 // Try to use byte rotation instructions. 9830 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 9831 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) 9832 return Rotate; 9833 9834 if (isSingleInputShuffleMask(Mask)) { 9835 // There are no generalized cross-lane shuffle operations available on i8 9836 // element types. 
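    // AVX2's cross-lane permutes (VPERMD/VPERMPS/VPERMQ/VPERMPD) only work at
    // dword/qword granularity, so byte shuffles that cross the 128-bit
    // boundary go to the lane-permute-and-blend fallback, while in-lane
    // shuffles take the single VPSHUFB path below.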
9837 if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) 9838 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, 9839 Mask, DAG); 9840 9841 SDValue PSHUFBMask[32]; 9842 for (int i = 0; i < 32; ++i) 9843 PSHUFBMask[i] = 9844 Mask[i] < 0 9845 ? DAG.getUNDEF(MVT::i8) 9846 : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8); 9847 9848 return DAG.getNode( 9849 X86ISD::PSHUFB, DL, MVT::v32i8, V1, 9850 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); 9851 } 9852 9853 // Try to simplify this by merging 128-bit lanes to enable a lane-based 9854 // shuffle. 9855 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 9856 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) 9857 return Result; 9858 9859 // Otherwise fall back on generic lowering. 9860 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); 9861 } 9862 9863 /// \brief High-level routine to lower various 256-bit x86 vector shuffles. 9864 /// 9865 /// This routine either breaks down the specific type of a 256-bit x86 vector 9866 /// shuffle or splits it into two 128-bit shuffles and fuses the results back 9867 /// together based on the available instructions. 9868 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9869 MVT VT, const X86Subtarget *Subtarget, 9870 SelectionDAG &DAG) { 9871 SDLoc DL(Op); 9872 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9873 ArrayRef<int> Mask = SVOp->getMask(); 9874 9875 // If we have a single input to the zero element, insert that into V1 if we 9876 // can do so cheaply. 9877 int NumElts = VT.getVectorNumElements(); 9878 int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) { 9879 return M >= NumElts; 9880 }); 9881 9882 if (NumV2Elements == 1 && Mask[0] >= NumElts) 9883 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 9884 DL, VT, V1, V2, Mask, Subtarget, DAG)) 9885 return Insertion; 9886 9887 // There is a really nice hard cut-over between AVX1 and AVX2 that means we can 9888 // check for those subtargets here and avoid much of the subtarget querying in 9889 // the per-vector-type lowering routines. With AVX1 we have essentially *zero* 9890 // ability to manipulate a 256-bit vector with integer types. Since we'll use 9891 // floating point types there eventually, just immediately cast everything to 9892 // a float and operate entirely in that domain. 9893 if (VT.isInteger() && !Subtarget->hasAVX2()) { 9894 int ElementBits = VT.getScalarSizeInBits(); 9895 if (ElementBits < 32) 9896 // No floating point type available, decompose into 128-bit vectors. 
9897 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); 9898 9899 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), 9900 VT.getVectorNumElements()); 9901 V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); 9902 V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); 9903 return DAG.getNode(ISD::BITCAST, DL, VT, 9904 DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); 9905 } 9906 9907 switch (VT.SimpleTy) { 9908 case MVT::v4f64: 9909 return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); 9910 case MVT::v4i64: 9911 return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); 9912 case MVT::v8f32: 9913 return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); 9914 case MVT::v8i32: 9915 return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); 9916 case MVT::v16i16: 9917 return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); 9918 case MVT::v32i8: 9919 return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); 9920 9921 default: 9922 llvm_unreachable("Not a valid 256-bit x86 vector type!"); 9923 } 9924 } 9925 9926 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. 9927 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9928 const X86Subtarget *Subtarget, 9929 SelectionDAG &DAG) { 9930 SDLoc DL(Op); 9931 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); 9932 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); 9933 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9934 ArrayRef<int> Mask = SVOp->getMask(); 9935 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 9936 9937 // X86 has dedicated unpack instructions that can handle specific blend 9938 // operations: UNPCKH and UNPCKL. 9939 if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) 9940 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); 9941 if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) 9942 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); 9943 9944 // FIXME: Implement direct support for this type! 9945 return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); 9946 } 9947 9948 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. 9949 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9950 const X86Subtarget *Subtarget, 9951 SelectionDAG &DAG) { 9952 SDLoc DL(Op); 9953 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); 9954 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); 9955 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9956 ArrayRef<int> Mask = SVOp->getMask(); 9957 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 9958 9959 // Use dedicated unpack instructions for masks that match their pattern. 9960 if (isShuffleEquivalent(V1, V2, Mask, 9961 {// First 128-bit lane. 9962 0, 16, 1, 17, 4, 20, 5, 21, 9963 // Second 128-bit lane. 9964 8, 24, 9, 25, 12, 28, 13, 29})) 9965 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); 9966 if (isShuffleEquivalent(V1, V2, Mask, 9967 {// First 128-bit lane. 9968 2, 18, 3, 19, 6, 22, 7, 23, 9969 // Second 128-bit lane. 9970 10, 26, 11, 27, 14, 30, 15, 31})) 9971 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); 9972 9973 // FIXME: Implement direct support for this type! 9974 return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); 9975 } 9976 9977 /// \brief Handle lowering of 8-lane 64-bit integer shuffles. 
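///
/// Even at 512 bits the unpack instructions interleave within 128-bit lanes,
/// so the UNPCKL/UNPCKH masks matched below take element 0 (resp. 1) of each
/// input per lane, e.g. {0, 8, 2, 10, 4, 12, 6, 14} for UNPCKL; any other
/// mask is currently split into two 256-bit shuffles.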
9978 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 9979 const X86Subtarget *Subtarget, 9980 SelectionDAG &DAG) { 9981 SDLoc DL(Op); 9982 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); 9983 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); 9984 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 9985 ArrayRef<int> Mask = SVOp->getMask(); 9986 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 9987 9988 // X86 has dedicated unpack instructions that can handle specific blend 9989 // operations: UNPCKH and UNPCKL. 9990 if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) 9991 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); 9992 if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) 9993 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); 9994 9995 // FIXME: Implement direct support for this type! 9996 return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); 9997 } 9998 9999 /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 10000 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 10001 const X86Subtarget *Subtarget, 10002 SelectionDAG &DAG) { 10003 SDLoc DL(Op); 10004 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); 10005 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); 10006 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10007 ArrayRef<int> Mask = SVOp->getMask(); 10008 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 10009 10010 // Use dedicated unpack instructions for masks that match their pattern. 10011 if (isShuffleEquivalent(V1, V2, Mask, 10012 {// First 128-bit lane. 10013 0, 16, 1, 17, 4, 20, 5, 21, 10014 // Second 128-bit lane. 10015 8, 24, 9, 25, 12, 28, 13, 29})) 10016 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); 10017 if (isShuffleEquivalent(V1, V2, Mask, 10018 {// First 128-bit lane. 10019 2, 18, 3, 19, 6, 22, 7, 23, 10020 // Second 128-bit lane. 10021 10, 26, 11, 27, 14, 30, 15, 31})) 10022 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); 10023 10024 // FIXME: Implement direct support for this type! 10025 return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); 10026 } 10027 10028 /// \brief Handle lowering of 32-lane 16-bit integer shuffles. 10029 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 10030 const X86Subtarget *Subtarget, 10031 SelectionDAG &DAG) { 10032 SDLoc DL(Op); 10033 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); 10034 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); 10035 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10036 ArrayRef<int> Mask = SVOp->getMask(); 10037 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); 10038 assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); 10039 10040 // FIXME: Implement direct support for this type! 10041 return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); 10042 } 10043 10044 /// \brief Handle lowering of 64-lane 8-bit integer shuffles. 
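///
/// This requires AVX-512 BWI. No dedicated patterns are matched yet, so the
/// shuffle is simply split into two v32i8 halves.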
static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}

/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Subtarget->hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    if (Subtarget->hasBWI())
      return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
    break;
  case MVT::v64i8:
    if (Subtarget->hasBWI())
      return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
    break;

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }

  // Otherwise fall back on splitting.
  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
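///
/// For example, a v4i32 shuffle with mask <0, 1, 4, 5> is re-expressed as the
/// v2i64 shuffle <0, 2> so that the wider-element lowerings can handle it.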
10116 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, 10117 SelectionDAG &DAG) { 10118 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10119 ArrayRef<int> Mask = SVOp->getMask(); 10120 SDValue V1 = Op.getOperand(0); 10121 SDValue V2 = Op.getOperand(1); 10122 MVT VT = Op.getSimpleValueType(); 10123 int NumElements = VT.getVectorNumElements(); 10124 SDLoc dl(Op); 10125 10126 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 10127 10128 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 10129 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 10130 if (V1IsUndef && V2IsUndef) 10131 return DAG.getUNDEF(VT); 10132 10133 // When we create a shuffle node we put the UNDEF node to second operand, 10134 // but in some cases the first operand may be transformed to UNDEF. 10135 // In this case we should just commute the node. 10136 if (V1IsUndef) 10137 return DAG.getCommutedVectorShuffle(*SVOp); 10138 10139 // Check for non-undef masks pointing at an undef vector and make the masks 10140 // undef as well. This makes it easier to match the shuffle based solely on 10141 // the mask. 10142 if (V2IsUndef) 10143 for (int M : Mask) 10144 if (M >= NumElements) { 10145 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); 10146 for (int &M : NewMask) 10147 if (M >= NumElements) 10148 M = -1; 10149 return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); 10150 } 10151 10152 // We actually see shuffles that are entirely re-arrangements of a set of 10153 // zero inputs. This mostly happens while decomposing complex shuffles into 10154 // simple ones. Directly lower these as a buildvector of zeros. 10155 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 10156 if (Zeroable.all()) 10157 return getZeroVector(VT, Subtarget, DAG, dl); 10158 10159 // Try to collapse shuffles into using a vector type with fewer elements but 10160 // wider element types. We cap this to not form integers or floating point 10161 // elements wider than 64 bits, but it might be interesting to form i128 10162 // integers to handle flipping the low and high halves of AVX 256-bit vectors. 10163 SmallVector<int, 16> WidenedMask; 10164 if (VT.getScalarSizeInBits() < 64 && 10165 canWidenShuffleElements(Mask, WidenedMask)) { 10166 MVT NewEltVT = VT.isFloatingPoint() 10167 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) 10168 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); 10169 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); 10170 // Make sure that the new vector type is legal. For example, v2f64 isn't 10171 // legal on SSE1. 10172 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { 10173 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 10174 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 10175 return DAG.getNode(ISD::BITCAST, dl, VT, 10176 DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); 10177 } 10178 } 10179 10180 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; 10181 for (int M : SVOp->getMask()) 10182 if (M < 0) 10183 ++NumUndefElements; 10184 else if (M < NumElements) 10185 ++NumV1Elements; 10186 else 10187 ++NumV2Elements; 10188 10189 // Commute the shuffle as needed such that more elements come from V1 than 10190 // V2. This allows us to match the shuffle pattern strictly on how many 10191 // elements come from V1 without handling the symmetric cases. 
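  // (Commuting swaps V1 and V2 and rewrites each non-undef mask index M as
  //  M < NumElements ? M + NumElements : M - NumElements, which is what
  //  DAG.getCommutedVectorShuffle does below.)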
  if (NumV2Elements > NumV1Elements)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // When the number of V1 and V2 elements is the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum
  // of indices for V2. When those are equal, try to ensure that the number of
  // odd indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : SVOp->getMask().slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements) {
      return DAG.getCommutedVectorShuffle(*SVOp);
    } else if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
        if (SVOp->getMask()[i] >= NumElements)
          SumV2Indices += i;
        else if (SVOp->getMask()[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices) {
        return DAG.getCommutedVectorShuffle(*SVOp);
      } else if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
          if (SVOp->getMask()[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (SVOp->getMask()[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return DAG.getCommutedVectorShuffle(*SVOp);
      }
    }
  }

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.getSizeInBits() == 128)
    return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);

  if (VT.getSizeInBits() == 256)
    return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);

  // Lower AVX-512 vectors with the dedicated 512-bit lowering routines.
  // FIXME: Implement full AVX-512 support!
  if (VT.getSizeInBits() == 512)
    return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}

// This function assumes its argument is a BUILD_VECTOR of constants or
// undef SDNodes. i.e., ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
// true.
static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
                                    unsigned &MaskValue) {
  MaskValue = 0;
  unsigned NumElems = BuildVector->getNumOperands();
  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
  unsigned NumLanes = (NumElems - 1) / 8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;

  // A blend for v16i16 should be symmetric for both lanes.
  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    SDValue EltCond = BuildVector->getOperand(i);
    SDValue SndLaneEltCond =
        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;

    int Lane1Cond = -1, Lane2Cond = -1;
    if (isa<ConstantSDNode>(EltCond))
      Lane1Cond = !isZero(EltCond);
    if (isa<ConstantSDNode>(SndLaneEltCond))
      Lane2Cond = !isZero(SndLaneEltCond);

    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
      // Lane1Cond != 0 means we want the first argument.
      // Lane1Cond == 0 means we want the second argument.
      // The encoding of this argument is 0 for the first argument, 1
      // for the second. Therefore, invert the condition.
      MaskValue |= !Lane1Cond << i;
    else if (Lane1Cond < 0)
      MaskValue |= !Lane2Cond << i;
    else
      return false;
  }
  return true;
}

/// \brief Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
                                           const X86Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
  auto *CondBV = cast<BuildVectorSDNode>(Cond);

  // Only non-legal VSELECTs reach this lowering; convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
  SmallVector<int, 32> Mask;
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
    SDValue CondElt = CondBV->getOperand(i);
    Mask.push_back(
        isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
  }
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}

SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget->hasSSE41())
    return SDValue();

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
  // a null value.
  switch (Op.getSimpleValueType().SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget->hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget->hasBWI() && Subtarget->hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
10348 return SDValue(); 10349 } 10350 } 10351 10352 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { 10353 MVT VT = Op.getSimpleValueType(); 10354 SDLoc dl(Op); 10355 10356 if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) 10357 return SDValue(); 10358 10359 if (VT.getSizeInBits() == 8) { 10360 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 10361 Op.getOperand(0), Op.getOperand(1)); 10362 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 10363 DAG.getValueType(VT)); 10364 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 10365 } 10366 10367 if (VT.getSizeInBits() == 16) { 10368 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10369 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 10370 if (Idx == 0) 10371 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 10372 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 10373 DAG.getNode(ISD::BITCAST, dl, 10374 MVT::v4i32, 10375 Op.getOperand(0)), 10376 Op.getOperand(1))); 10377 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 10378 Op.getOperand(0), Op.getOperand(1)); 10379 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 10380 DAG.getValueType(VT)); 10381 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 10382 } 10383 10384 if (VT == MVT::f32) { 10385 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 10386 // the result back to FR32 register. It's only worth matching if the 10387 // result has a single use which is a store or a bitcast to i32. And in 10388 // the case of a store, it's not worth it if the index is a constant 0, 10389 // because a MOVSSmr can be used instead, which is smaller and faster. 10390 if (!Op.hasOneUse()) 10391 return SDValue(); 10392 SDNode *User = *Op.getNode()->use_begin(); 10393 if ((User->getOpcode() != ISD::STORE || 10394 (isa<ConstantSDNode>(Op.getOperand(1)) && 10395 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 10396 (User->getOpcode() != ISD::BITCAST || 10397 User->getValueType(0) != MVT::i32)) 10398 return SDValue(); 10399 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 10400 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 10401 Op.getOperand(0)), 10402 Op.getOperand(1)); 10403 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 10404 } 10405 10406 if (VT == MVT::i32 || VT == MVT::i64) { 10407 // ExtractPS/pextrq works with constant index. 10408 if (isa<ConstantSDNode>(Op.getOperand(1))) 10409 return Op; 10410 } 10411 return SDValue(); 10412 } 10413 10414 /// Extract one bit from mask vector, like v16i1 or v8i1. 10415 /// AVX-512 feature. 10416 SDValue 10417 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { 10418 SDValue Vec = Op.getOperand(0); 10419 SDLoc dl(Vec); 10420 MVT VecVT = Vec.getSimpleValueType(); 10421 SDValue Idx = Op.getOperand(1); 10422 MVT EltVT = Op.getSimpleValueType(); 10423 10424 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); 10425 assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && 10426 "Unexpected vector type in ExtractBitFromMaskVector"); 10427 10428 // variable index can't be handled in mask registers, 10429 // extend vector to VR512 10430 if (!isa<ConstantSDNode>(Idx)) { 10431 MVT ExtVT = (VecVT == MVT::v8i1 ? 
MVT::v8i64 : MVT::v16i32); 10432 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); 10433 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 10434 ExtVT.getVectorElementType(), Ext, Idx); 10435 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 10436 } 10437 10438 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 10439 const TargetRegisterClass* rc = getRegClassFor(VecVT); 10440 if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) 10441 rc = getRegClassFor(MVT::v16i1); 10442 unsigned MaxSift = rc->getSize()*8 - 1; 10443 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, 10444 DAG.getConstant(MaxSift - IdxVal, MVT::i8)); 10445 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, 10446 DAG.getConstant(MaxSift, MVT::i8)); 10447 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, 10448 DAG.getIntPtrConstant(0)); 10449 } 10450 10451 SDValue 10452 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 10453 SelectionDAG &DAG) const { 10454 SDLoc dl(Op); 10455 SDValue Vec = Op.getOperand(0); 10456 MVT VecVT = Vec.getSimpleValueType(); 10457 SDValue Idx = Op.getOperand(1); 10458 10459 if (Op.getSimpleValueType() == MVT::i1) 10460 return ExtractBitFromMaskVector(Op, DAG); 10461 10462 if (!isa<ConstantSDNode>(Idx)) { 10463 if (VecVT.is512BitVector() || 10464 (VecVT.is256BitVector() && Subtarget->hasInt256() && 10465 VecVT.getVectorElementType().getSizeInBits() == 32)) { 10466 10467 MVT MaskEltVT = 10468 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); 10469 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / 10470 MaskEltVT.getSizeInBits()); 10471 10472 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); 10473 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, 10474 getZeroVector(MaskVT, Subtarget, DAG, dl), 10475 Idx, DAG.getConstant(0, getPointerTy())); 10476 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); 10477 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), 10478 Perm, DAG.getConstant(0, getPointerTy())); 10479 } 10480 return SDValue(); 10481 } 10482 10483 // If this is a 256-bit vector result, first extract the 128-bit vector and 10484 // then extract the element from the 128-bit vector. 10485 if (VecVT.is256BitVector() || VecVT.is512BitVector()) { 10486 10487 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 10488 // Get the 128-bit vector. 10489 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); 10490 MVT EltVT = VecVT.getVectorElementType(); 10491 10492 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); 10493 10494 //if (IdxVal >= NumElems/2) 10495 // IdxVal -= NumElems/2; 10496 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; 10497 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 10498 DAG.getConstant(IdxVal, MVT::i32)); 10499 } 10500 10501 assert(VecVT.is128BitVector() && "Unexpected vector length"); 10502 10503 if (Subtarget->hasSSE41()) { 10504 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 10505 if (Res.getNode()) 10506 return Res; 10507 } 10508 10509 MVT VT = Op.getSimpleValueType(); 10510 // TODO: handle v16i8. 10511 if (VT.getSizeInBits() == 16) { 10512 SDValue Vec = Op.getOperand(0); 10513 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10514 if (Idx == 0) 10515 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 10516 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 10517 DAG.getNode(ISD::BITCAST, dl, 10518 MVT::v4i32, Vec), 10519 Op.getOperand(1))); 10520 // Transform it so it match pextrw which produces a 32-bit result. 
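    // The result is roughly
    //   (i16 (trunc (assertzext i16 (X86ISD::PEXTRW $vec, $idx)))).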
10521 MVT EltVT = MVT::i32; 10522 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 10523 Op.getOperand(0), Op.getOperand(1)); 10524 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 10525 DAG.getValueType(VT)); 10526 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 10527 } 10528 10529 if (VT.getSizeInBits() == 32) { 10530 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10531 if (Idx == 0) 10532 return Op; 10533 10534 // SHUFPS the element to the lowest double word, then movss. 10535 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 10536 MVT VVT = Op.getOperand(0).getSimpleValueType(); 10537 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 10538 DAG.getUNDEF(VVT), Mask); 10539 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 10540 DAG.getIntPtrConstant(0)); 10541 } 10542 10543 if (VT.getSizeInBits() == 64) { 10544 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 10545 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 10546 // to match extract_elt for f64. 10547 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10548 if (Idx == 0) 10549 return Op; 10550 10551 // UNPCKHPD the element to the lowest double word, then movsd. 10552 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 10553 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 10554 int Mask[2] = { 1, -1 }; 10555 MVT VVT = Op.getOperand(0).getSimpleValueType(); 10556 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 10557 DAG.getUNDEF(VVT), Mask); 10558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 10559 DAG.getIntPtrConstant(0)); 10560 } 10561 10562 return SDValue(); 10563 } 10564 10565 /// Insert one bit to mask vector, like v16i1 or v8i1. 10566 /// AVX-512 feature. 10567 SDValue 10568 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { 10569 SDLoc dl(Op); 10570 SDValue Vec = Op.getOperand(0); 10571 SDValue Elt = Op.getOperand(1); 10572 SDValue Idx = Op.getOperand(2); 10573 MVT VecVT = Vec.getSimpleValueType(); 10574 10575 if (!isa<ConstantSDNode>(Idx)) { 10576 // Non constant index. Extend source and destination, 10577 // insert element and then truncate the result. 10578 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); 10579 MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); 10580 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 10581 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), 10582 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); 10583 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); 10584 } 10585 10586 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 10587 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); 10588 if (Vec.getOpcode() == ISD::UNDEF) 10589 return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, 10590 DAG.getConstant(IdxVal, MVT::i8)); 10591 const TargetRegisterClass* rc = getRegClassFor(VecVT); 10592 unsigned MaxSift = rc->getSize()*8 - 1; 10593 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, 10594 DAG.getConstant(MaxSift, MVT::i8)); 10595 EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, 10596 DAG.getConstant(MaxSift - IdxVal, MVT::i8)); 10597 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); 10598 } 10599 10600 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 10601 SelectionDAG &DAG) const { 10602 MVT VT = Op.getSimpleValueType(); 10603 MVT EltVT = VT.getVectorElementType(); 10604 10605 if (EltVT == MVT::i1) 10606 return InsertBitToMaskVector(Op, DAG); 10607 10608 SDLoc dl(Op); 10609 SDValue N0 = Op.getOperand(0); 10610 SDValue N1 = Op.getOperand(1); 10611 SDValue N2 = Op.getOperand(2); 10612 if (!isa<ConstantSDNode>(N2)) 10613 return SDValue(); 10614 auto *N2C = cast<ConstantSDNode>(N2); 10615 unsigned IdxVal = N2C->getZExtValue(); 10616 10617 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert 10618 // into that, and then insert the subvector back into the result. 10619 if (VT.is256BitVector() || VT.is512BitVector()) { 10620 // With a 256-bit vector, we can insert into the zero element efficiently 10621 // using a blend if we have AVX or AVX2 and the right data type. 10622 if (VT.is256BitVector() && IdxVal == 0) { 10623 // TODO: It is worthwhile to cast integer to floating point and back 10624 // and incur a domain crossing penalty if that's what we'll end up 10625 // doing anyway after extracting to a 128-bit vector. 10626 if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || 10627 (Subtarget->hasAVX2() && EltVT == MVT::i32)) { 10628 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); 10629 N2 = DAG.getIntPtrConstant(1); 10630 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); 10631 } 10632 } 10633 10634 // Get the desired 128-bit vector chunk. 10635 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); 10636 10637 // Insert the element into the desired chunk. 10638 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); 10639 unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; 10640 10641 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 10642 DAG.getConstant(IdxIn128, MVT::i32)); 10643 10644 // Insert the changed part back into the bigger vector 10645 return Insert128BitVector(N0, V, IdxVal, DAG, dl); 10646 } 10647 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); 10648 10649 if (Subtarget->hasSSE41()) { 10650 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { 10651 unsigned Opc; 10652 if (VT == MVT::v8i16) { 10653 Opc = X86ISD::PINSRW; 10654 } else { 10655 assert(VT == MVT::v16i8); 10656 Opc = X86ISD::PINSRB; 10657 } 10658 10659 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 10660 // argument. 
10661 if (N1.getValueType() != MVT::i32) 10662 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 10663 if (N2.getValueType() != MVT::i32) 10664 N2 = DAG.getIntPtrConstant(IdxVal); 10665 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 10666 } 10667 10668 if (EltVT == MVT::f32) { 10669 // Bits [7:6] of the constant are the source select. This will always be 10670 // zero here. The DAG Combiner may combine an extract_elt index into 10671 // these bits. For example (insert (extract, 3), 2) could be matched by 10672 // putting the '3' into bits [7:6] of X86ISD::INSERTPS. 10673 // Bits [5:4] of the constant are the destination select. This is the 10674 // value of the incoming immediate. 10675 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 10676 // combine either bitwise AND or insert of float 0.0 to set these bits. 10677 10678 const Function *F = DAG.getMachineFunction().getFunction(); 10679 bool MinSize = F->hasFnAttribute(Attribute::MinSize); 10680 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { 10681 // If this is an insertion of 32-bits into the low 32-bits of 10682 // a vector, we prefer to generate a blend with immediate rather 10683 // than an insertps. Blends are simpler operations in hardware and so 10684 // will always have equal or better performance than insertps. 10685 // But if optimizing for size and there's a load folding opportunity, 10686 // generate insertps because blendps does not have a 32-bit memory 10687 // operand form. 10688 N2 = DAG.getIntPtrConstant(1); 10689 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 10690 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); 10691 } 10692 N2 = DAG.getIntPtrConstant(IdxVal << 4); 10693 // Create this as a scalar to vector.. 10694 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 10695 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 10696 } 10697 10698 if (EltVT == MVT::i32 || EltVT == MVT::i64) { 10699 // PINSR* works with constant index. 10700 return Op; 10701 } 10702 } 10703 10704 if (EltVT == MVT::i8) 10705 return SDValue(); 10706 10707 if (EltVT.getSizeInBits() == 16) { 10708 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 10709 // as its second argument. 10710 if (N1.getValueType() != MVT::i32) 10711 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 10712 if (N2.getValueType() != MVT::i32) 10713 N2 = DAG.getIntPtrConstant(IdxVal); 10714 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 10715 } 10716 return SDValue(); 10717 } 10718 10719 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 10720 SDLoc dl(Op); 10721 MVT OpVT = Op.getSimpleValueType(); 10722 10723 // If this is a 256-bit vector result, first insert into a 128-bit 10724 // vector and then insert into the 256-bit vector. 10725 if (!OpVT.is128BitVector()) { 10726 // Insert into a 128-bit vector. 10727 unsigned SizeFactor = OpVT.getSizeInBits()/128; 10728 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), 10729 OpVT.getVectorNumElements() / SizeFactor); 10730 10731 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 10732 10733 // Insert the 128-bit vector. 
10734 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 10735 } 10736 10737 if (OpVT == MVT::v1i64 && 10738 Op.getOperand(0).getValueType() == MVT::i64) 10739 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 10740 10741 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 10742 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 10743 return DAG.getNode(ISD::BITCAST, dl, OpVT, 10744 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 10745 } 10746 10747 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 10748 // a simple subregister reference or explicit instructions to grab 10749 // upper bits of a vector. 10750 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 10751 SelectionDAG &DAG) { 10752 SDLoc dl(Op); 10753 SDValue In = Op.getOperand(0); 10754 SDValue Idx = Op.getOperand(1); 10755 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 10756 MVT ResVT = Op.getSimpleValueType(); 10757 MVT InVT = In.getSimpleValueType(); 10758 10759 if (Subtarget->hasFp256()) { 10760 if (ResVT.is128BitVector() && 10761 (InVT.is256BitVector() || InVT.is512BitVector()) && 10762 isa<ConstantSDNode>(Idx)) { 10763 return Extract128BitVector(In, IdxVal, DAG, dl); 10764 } 10765 if (ResVT.is256BitVector() && InVT.is512BitVector() && 10766 isa<ConstantSDNode>(Idx)) { 10767 return Extract256BitVector(In, IdxVal, DAG, dl); 10768 } 10769 } 10770 return SDValue(); 10771 } 10772 10773 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 10774 // simple superregister reference or explicit instructions to insert 10775 // the upper bits of a vector. 10776 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 10777 SelectionDAG &DAG) { 10778 if (!Subtarget->hasAVX()) 10779 return SDValue(); 10780 10781 SDLoc dl(Op); 10782 SDValue Vec = Op.getOperand(0); 10783 SDValue SubVec = Op.getOperand(1); 10784 SDValue Idx = Op.getOperand(2); 10785 10786 if (!isa<ConstantSDNode>(Idx)) 10787 return SDValue(); 10788 10789 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 10790 MVT OpVT = Op.getSimpleValueType(); 10791 MVT SubVecVT = SubVec.getSimpleValueType(); 10792 10793 // Fold two 16-byte subvector loads into one 32-byte load: 10794 // (insert_subvector (insert_subvector undef, (load addr), 0), 10795 // (load addr + 16), Elts/2) 10796 // --> load32 addr 10797 if ((IdxVal == OpVT.getVectorNumElements() / 2) && 10798 Vec.getOpcode() == ISD::INSERT_SUBVECTOR && 10799 OpVT.is256BitVector() && SubVecVT.is128BitVector() && 10800 !Subtarget->isUnalignedMem32Slow()) { 10801 SDValue SubVec2 = Vec.getOperand(1); 10802 if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { 10803 if (Idx2->getZExtValue() == 0) { 10804 SDValue Ops[] = { SubVec2, SubVec }; 10805 SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false); 10806 if (LD.getNode()) 10807 return LD; 10808 } 10809 } 10810 } 10811 10812 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && 10813 SubVecVT.is128BitVector()) 10814 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); 10815 10816 if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) 10817 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); 10818 10819 if (OpVT.getVectorElementType() == MVT::i1) { 10820 if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal 10821 return Op; 10822 SDValue ZeroIdx = DAG.getIntPtrConstant(0); 10823 SDValue Undef = DAG.getUNDEF(OpVT); 10824 unsigned NumElems 
= OpVT.getVectorNumElements(); 10825 SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8); 10826 10827 if (IdxVal == OpVT.getVectorNumElements() / 2) { 10828 // Zero upper bits of the Vec 10829 Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); 10830 Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); 10831 10832 SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, 10833 SubVec, ZeroIdx); 10834 Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); 10835 return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); 10836 } 10837 if (IdxVal == 0) { 10838 SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, 10839 SubVec, ZeroIdx); 10840 // Zero upper bits of the Vec2 10841 Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); 10842 Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); 10843 // Zero lower bits of the Vec 10844 Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); 10845 Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); 10846 // Merge them together 10847 return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); 10848 } 10849 } 10850 return SDValue(); 10851 } 10852 10853 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 10854 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 10855 // one of the above mentioned nodes. It has to be wrapped because otherwise 10856 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 10857 // be used to form addressing mode. These wrapped nodes will be selected 10858 // into MOV32ri. 10859 SDValue 10860 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 10861 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 10862 10863 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 10864 // global base reg. 10865 unsigned char OpFlag = 0; 10866 unsigned WrapperKind = X86ISD::Wrapper; 10867 CodeModel::Model M = DAG.getTarget().getCodeModel(); 10868 10869 if (Subtarget->isPICStyleRIPRel() && 10870 (M == CodeModel::Small || M == CodeModel::Kernel)) 10871 WrapperKind = X86ISD::WrapperRIP; 10872 else if (Subtarget->isPICStyleGOT()) 10873 OpFlag = X86II::MO_GOTOFF; 10874 else if (Subtarget->isPICStyleStubPIC()) 10875 OpFlag = X86II::MO_PIC_BASE_OFFSET; 10876 10877 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 10878 CP->getAlignment(), 10879 CP->getOffset(), OpFlag); 10880 SDLoc DL(CP); 10881 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 10882 // With PIC, the address is actually $g + Offset. 10883 if (OpFlag) { 10884 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10885 DAG.getNode(X86ISD::GlobalBaseReg, 10886 SDLoc(), getPointerTy()), 10887 Result); 10888 } 10889 10890 return Result; 10891 } 10892 10893 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 10894 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 10895 10896 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 10897 // global base reg. 
10898 unsigned char OpFlag = 0; 10899 unsigned WrapperKind = X86ISD::Wrapper; 10900 CodeModel::Model M = DAG.getTarget().getCodeModel(); 10901 10902 if (Subtarget->isPICStyleRIPRel() && 10903 (M == CodeModel::Small || M == CodeModel::Kernel)) 10904 WrapperKind = X86ISD::WrapperRIP; 10905 else if (Subtarget->isPICStyleGOT()) 10906 OpFlag = X86II::MO_GOTOFF; 10907 else if (Subtarget->isPICStyleStubPIC()) 10908 OpFlag = X86II::MO_PIC_BASE_OFFSET; 10909 10910 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 10911 OpFlag); 10912 SDLoc DL(JT); 10913 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 10914 10915 // With PIC, the address is actually $g + Offset. 10916 if (OpFlag) 10917 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10918 DAG.getNode(X86ISD::GlobalBaseReg, 10919 SDLoc(), getPointerTy()), 10920 Result); 10921 10922 return Result; 10923 } 10924 10925 SDValue 10926 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 10927 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 10928 10929 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 10930 // global base reg. 10931 unsigned char OpFlag = 0; 10932 unsigned WrapperKind = X86ISD::Wrapper; 10933 CodeModel::Model M = DAG.getTarget().getCodeModel(); 10934 10935 if (Subtarget->isPICStyleRIPRel() && 10936 (M == CodeModel::Small || M == CodeModel::Kernel)) { 10937 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 10938 OpFlag = X86II::MO_GOTPCREL; 10939 WrapperKind = X86ISD::WrapperRIP; 10940 } else if (Subtarget->isPICStyleGOT()) { 10941 OpFlag = X86II::MO_GOT; 10942 } else if (Subtarget->isPICStyleStubPIC()) { 10943 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 10944 } else if (Subtarget->isPICStyleStubNoDynamic()) { 10945 OpFlag = X86II::MO_DARWIN_NONLAZY; 10946 } 10947 10948 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 10949 10950 SDLoc DL(Op); 10951 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 10952 10953 // With PIC, the address is actually $g + Offset. 10954 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && 10955 !Subtarget->is64Bit()) { 10956 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10957 DAG.getNode(X86ISD::GlobalBaseReg, 10958 SDLoc(), getPointerTy()), 10959 Result); 10960 } 10961 10962 // For symbols that require a load from a stub to get the address, emit the 10963 // load. 10964 if (isGlobalStubReference(OpFlag)) 10965 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 10966 MachinePointerInfo::getGOT(), false, false, false, 0); 10967 10968 return Result; 10969 } 10970 10971 SDValue 10972 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 10973 // Create the TargetBlockAddressAddress node. 10974 unsigned char OpFlags = 10975 Subtarget->ClassifyBlockAddressReference(); 10976 CodeModel::Model M = DAG.getTarget().getCodeModel(); 10977 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 10978 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); 10979 SDLoc dl(Op); 10980 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, 10981 OpFlags); 10982 10983 if (Subtarget->isPICStyleRIPRel() && 10984 (M == CodeModel::Small || M == CodeModel::Kernel)) 10985 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 10986 else 10987 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 10988 10989 // With PIC, the address is actually $g + Offset. 
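  // ($g is the PIC base address, materialized here by X86ISD::GlobalBaseReg.)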
10990 if (isGlobalRelativeToPICBase(OpFlags)) { 10991 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 10992 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 10993 Result); 10994 } 10995 10996 return Result; 10997 } 10998 10999 SDValue 11000 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, 11001 int64_t Offset, SelectionDAG &DAG) const { 11002 // Create the TargetGlobalAddress node, folding in the constant 11003 // offset if it is legal. 11004 unsigned char OpFlags = 11005 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); 11006 CodeModel::Model M = DAG.getTarget().getCodeModel(); 11007 SDValue Result; 11008 if (OpFlags == X86II::MO_NO_FLAG && 11009 X86::isOffsetSuitableForCodeModel(Offset, M)) { 11010 // A direct static reference to a global. 11011 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 11012 Offset = 0; 11013 } else { 11014 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 11015 } 11016 11017 if (Subtarget->isPICStyleRIPRel() && 11018 (M == CodeModel::Small || M == CodeModel::Kernel)) 11019 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 11020 else 11021 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 11022 11023 // With PIC, the address is actually $g + Offset. 11024 if (isGlobalRelativeToPICBase(OpFlags)) { 11025 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 11026 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 11027 Result); 11028 } 11029 11030 // For globals that require a load from a stub to get the address, emit the 11031 // load. 11032 if (isGlobalStubReference(OpFlags)) 11033 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 11034 MachinePointerInfo::getGOT(), false, false, false, 0); 11035 11036 // If there was a non-zero offset that we didn't fold, create an explicit 11037 // addition for it. 11038 if (Offset != 0) 11039 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 11040 DAG.getConstant(Offset, getPointerTy())); 11041 11042 return Result; 11043 } 11044 11045 SDValue 11046 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 11047 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 11048 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 11049 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); 11050 } 11051 11052 static SDValue 11053 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 11054 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 11055 unsigned char OperandFlags, bool LocalDynamic = false) { 11056 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 11057 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 11058 SDLoc dl(GA); 11059 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 11060 GA->getValueType(0), 11061 GA->getOffset(), 11062 OperandFlags); 11063 11064 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 11065 : X86ISD::TLSADDR; 11066 11067 if (InFlag) { 11068 SDValue Ops[] = { Chain, TGA, *InFlag }; 11069 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 11070 } else { 11071 SDValue Ops[] = { Chain, TGA }; 11072 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 11073 } 11074 11075 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
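  // (The pseudo is expanded late into the canonical, linker-recognized TLS
  //  code sequence that calls the TLS resolver; the resolver returns the
  //  address in EAX/RAX, which is why the result is read from ReturnReg
  //  below.)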
11076 MFI->setAdjustsStack(true); 11077 MFI->setHasCalls(true); 11078 11079 SDValue Flag = Chain.getValue(1); 11080 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 11081 } 11082 11083 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 11084 static SDValue 11085 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 11086 const EVT PtrVT) { 11087 SDValue InFlag; 11088 SDLoc dl(GA); // ? function entry point might be better 11089 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 11090 DAG.getNode(X86ISD::GlobalBaseReg, 11091 SDLoc(), PtrVT), InFlag); 11092 InFlag = Chain.getValue(1); 11093 11094 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 11095 } 11096 11097 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 11098 static SDValue 11099 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 11100 const EVT PtrVT) { 11101 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, 11102 X86::RAX, X86II::MO_TLSGD); 11103 } 11104 11105 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 11106 SelectionDAG &DAG, 11107 const EVT PtrVT, 11108 bool is64Bit) { 11109 SDLoc dl(GA); 11110 11111 // Get the start address of the TLS block for this module. 11112 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 11113 .getInfo<X86MachineFunctionInfo>(); 11114 MFI->incNumLocalDynamicTLSAccesses(); 11115 11116 SDValue Base; 11117 if (is64Bit) { 11118 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, 11119 X86II::MO_TLSLD, /*LocalDynamic=*/true); 11120 } else { 11121 SDValue InFlag; 11122 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 11123 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); 11124 InFlag = Chain.getValue(1); 11125 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 11126 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 11127 } 11128 11129 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 11130 // of Base. 11131 11132 // Build x@dtpoff. 11133 unsigned char OperandFlags = X86II::MO_DTPOFF; 11134 unsigned WrapperKind = X86ISD::Wrapper; 11135 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 11136 GA->getValueType(0), 11137 GA->getOffset(), OperandFlags); 11138 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 11139 11140 // Add x@dtpoff with the base. 11141 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 11142 } 11143 11144 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 11145 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 11146 const EVT PtrVT, TLSModel::Model model, 11147 bool is64Bit, bool isPIC) { 11148 SDLoc dl(GA); 11149 11150 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 11151 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 11152 is64Bit ? 257 : 256)); 11153 11154 SDValue ThreadPointer = 11155 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), 11156 MachinePointerInfo(Ptr), false, false, false, 0); 11157 11158 unsigned char OperandFlags = 0; 11159 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 11160 // initialexec. 11161 unsigned WrapperKind = X86ISD::Wrapper; 11162 if (model == TLSModel::LocalExec) { 11163 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 11164 } else if (model == TLSModel::InitialExec) { 11165 if (is64Bit) { 11166 OperandFlags = X86II::MO_GOTTPOFF; 11167 WrapperKind = X86ISD::WrapperRIP; 11168 } else { 11169 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 11170 } 11171 } else { 11172 llvm_unreachable("Unexpected model"); 11173 } 11174 11175 // emit "addl x@ntpoff,%eax" (local exec) 11176 // or "addl x@indntpoff,%eax" (initial exec) 11177 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 11178 SDValue TGA = 11179 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), 11180 GA->getOffset(), OperandFlags); 11181 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 11182 11183 if (model == TLSModel::InitialExec) { 11184 if (isPIC && !is64Bit) { 11185 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 11186 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), 11187 Offset); 11188 } 11189 11190 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 11191 MachinePointerInfo::getGOT(), false, false, false, 0); 11192 } 11193 11194 // The address of the thread local variable is the add of the thread 11195 // pointer with the offset of the variable. 11196 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 11197 } 11198 11199 SDValue 11200 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 11201 11202 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 11203 const GlobalValue *GV = GA->getGlobal(); 11204 11205 if (Subtarget->isTargetELF()) { 11206 TLSModel::Model model = DAG.getTarget().getTLSModel(GV); 11207 11208 switch (model) { 11209 case TLSModel::GeneralDynamic: 11210 if (Subtarget->is64Bit()) 11211 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 11212 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 11213 case TLSModel::LocalDynamic: 11214 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), 11215 Subtarget->is64Bit()); 11216 case TLSModel::InitialExec: 11217 case TLSModel::LocalExec: 11218 return LowerToTLSExecModel( 11219 GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), 11220 DAG.getTarget().getRelocationModel() == Reloc::PIC_); 11221 } 11222 llvm_unreachable("Unknown TLS model."); 11223 } 11224 11225 if (Subtarget->isTargetDarwin()) { 11226 // Darwin only has one model of TLS. Lower to that. 11227 unsigned char OpFlag = 0; 11228 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 11229 X86ISD::WrapperRIP : X86ISD::Wrapper; 11230 11231 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 11232 // global base reg. 11233 bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && 11234 !Subtarget->is64Bit(); 11235 if (PIC32) 11236 OpFlag = X86II::MO_TLVP_PIC_BASE; 11237 else 11238 OpFlag = X86II::MO_TLVP; 11239 SDLoc DL(Op); 11240 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 11241 GA->getValueType(0), 11242 GA->getOffset(), OpFlag); 11243 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 11244 11245 // With PIC32, the address is actually $g + Offset. 11246 if (PIC32) 11247 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 11248 DAG.getNode(X86ISD::GlobalBaseReg, 11249 SDLoc(), getPointerTy()), 11250 Offset); 11251 11252 // Lowering the machine isd will make sure everything is in the right 11253 // location. 
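    // (On Darwin every TLS variable is reached through a TLV descriptor whose
    //  first field is a getter function; the TLSCALL pseudo below calls
    //  through that pointer.)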
11254 SDValue Chain = DAG.getEntryNode(); 11255 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 11256 SDValue Args[] = { Chain, Offset }; 11257 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); 11258 11259 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 11260 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 11261 MFI->setAdjustsStack(true); 11262 11263 // And our return value (tls address) is in the standard call return value 11264 // location. 11265 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 11266 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), 11267 Chain.getValue(1)); 11268 } 11269 11270 if (Subtarget->isTargetKnownWindowsMSVC() || 11271 Subtarget->isTargetWindowsGNU()) { 11272 // Just use the implicit TLS architecture 11273 // Need to generate someting similar to: 11274 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage 11275 // ; from TEB 11276 // mov ecx, dword [rel _tls_index]: Load index (from C runtime) 11277 // mov rcx, qword [rdx+rcx*8] 11278 // mov eax, .tls$:tlsvar 11279 // [rax+rcx] contains the address 11280 // Windows 64bit: gs:0x58 11281 // Windows 32bit: fs:__tls_array 11282 11283 SDLoc dl(GA); 11284 SDValue Chain = DAG.getEntryNode(); 11285 11286 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or 11287 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly 11288 // use its literal value of 0x2C. 11289 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() 11290 ? Type::getInt8PtrTy(*DAG.getContext(), 11291 256) 11292 : Type::getInt32PtrTy(*DAG.getContext(), 11293 257)); 11294 11295 SDValue TlsArray = 11296 Subtarget->is64Bit() 11297 ? DAG.getIntPtrConstant(0x58) 11298 : (Subtarget->isTargetWindowsGNU() 11299 ? DAG.getIntPtrConstant(0x2C) 11300 : DAG.getExternalSymbol("_tls_array", getPointerTy())); 11301 11302 SDValue ThreadPointer = 11303 DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, 11304 MachinePointerInfo(Ptr), false, false, false, 0); 11305 11306 // Load the _tls_index variable 11307 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); 11308 if (Subtarget->is64Bit()) 11309 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, 11310 IDX, MachinePointerInfo(), MVT::i32, 11311 false, false, false, 0); 11312 else 11313 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), 11314 false, false, false, 0); 11315 11316 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), 11317 getPointerTy()); 11318 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); 11319 11320 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); 11321 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), 11322 false, false, false, 0); 11323 11324 // Get the offset of start of .tls section 11325 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 11326 GA->getValueType(0), 11327 GA->getOffset(), X86II::MO_SECREL); 11328 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); 11329 11330 // The address of the thread local variable is the add of the thread 11331 // pointer with the offset of the variable. 11332 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); 11333 } 11334 11335 llvm_unreachable("TLS not implemented for this target."); 11336 } 11337 11338 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values 11339 /// and take a 2 x i32 value to shift plus a shift amount. 
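// A minimal sketch of the Windows implicit-TLS address computation handled
// just above, written as plain C. 'TlsSlots' stands for the
// ThreadLocalStoragePointer read from gs:0x58, 'TlsIndex' for the CRT's
// _tls_index value, and 'SecRelOff' for the .tls$-section-relative offset of
// the variable; all three names are placeholders for this sketch only.
#if 0
#include <stdint.h>

static void *Win64TLSAddress(void **TlsSlots, uint32_t TlsIndex,
                             uintptr_t SecRelOff) {
  char *TlsBlock = (char *)TlsSlots[TlsIndex]; // mov rcx, qword [rdx+rcx*8]
  return TlsBlock + SecRelOff;                 // [rax+rcx] is the address
}
#endif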
11340 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { 11341 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 11342 MVT VT = Op.getSimpleValueType(); 11343 unsigned VTBits = VT.getSizeInBits(); 11344 SDLoc dl(Op); 11345 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 11346 SDValue ShOpLo = Op.getOperand(0); 11347 SDValue ShOpHi = Op.getOperand(1); 11348 SDValue ShAmt = Op.getOperand(2); 11349 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the 11350 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away 11351 // during isel. 11352 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 11353 DAG.getConstant(VTBits - 1, MVT::i8)); 11354 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 11355 DAG.getConstant(VTBits - 1, MVT::i8)) 11356 : DAG.getConstant(0, VT); 11357 11358 SDValue Tmp2, Tmp3; 11359 if (Op.getOpcode() == ISD::SHL_PARTS) { 11360 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 11361 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); 11362 } else { 11363 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 11364 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); 11365 } 11366 11367 // If the shift amount is larger or equal than the width of a part we can't 11368 // rely on the results of shld/shrd. Insert a test and select the appropriate 11369 // values for large shift amounts. 11370 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 11371 DAG.getConstant(VTBits, MVT::i8)); 11372 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 11373 AndNode, DAG.getConstant(0, MVT::i8)); 11374 11375 SDValue Hi, Lo; 11376 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 11377 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 11378 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 11379 11380 if (Op.getOpcode() == ISD::SHL_PARTS) { 11381 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 11382 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 11383 } else { 11384 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 11385 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 11386 } 11387 11388 SDValue Ops[2] = { Lo, Hi }; 11389 return DAG.getMergeValues(Ops, dl); 11390 } 11391 11392 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 11393 SelectionDAG &DAG) const { 11394 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 11395 SDLoc dl(Op); 11396 11397 if (SrcVT.isVector()) { 11398 if (SrcVT.getVectorElementType() == MVT::i1) { 11399 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); 11400 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 11401 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, 11402 Op.getOperand(0))); 11403 } 11404 return SDValue(); 11405 } 11406 11407 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && 11408 "Unknown SINT_TO_FP to lower!"); 11409 11410 // These are really Legal; return the operand so the caller accepts it as 11411 // Legal. 
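// A minimal sketch of the SHL_PARTS expansion built in LowerShiftParts above,
// for a 64-bit value split into two i32 halves. The funnel-shift expression
// models SHLD, and the final select mirrors the CMOV keyed on (Amt & 32).
#if 0
#include <stdint.h>

static void ShlParts32(uint32_t Lo, uint32_t Hi, unsigned Amt,
                       uint32_t *OutLo, uint32_t *OutHi) {
  unsigned Safe = Amt & 31;                                       // amt mod 32
  uint32_t Tmp2 = Safe ? (Hi << Safe) | (Lo >> (32 - Safe)) : Hi; // SHLD Hi, Lo
  uint32_t Tmp3 = Lo << Safe;                                     // SHL  Lo
  if (Amt & 32) { *OutHi = Tmp3; *OutLo = 0; }   // amt >= 32: select other pair
  else          { *OutHi = Tmp2; *OutLo = Tmp3; }
}
#endif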
11412 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 11413 return Op; 11414 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 11415 Subtarget->is64Bit()) { 11416 return Op; 11417 } 11418 11419 unsigned Size = SrcVT.getSizeInBits()/8; 11420 MachineFunction &MF = DAG.getMachineFunction(); 11421 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 11422 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 11423 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 11424 StackSlot, 11425 MachinePointerInfo::getFixedStack(SSFI), 11426 false, false, 0); 11427 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 11428 } 11429 11430 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 11431 SDValue StackSlot, 11432 SelectionDAG &DAG) const { 11433 // Build the FILD 11434 SDLoc DL(Op); 11435 SDVTList Tys; 11436 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 11437 if (useSSE) 11438 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 11439 else 11440 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 11441 11442 unsigned ByteSize = SrcVT.getSizeInBits()/8; 11443 11444 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 11445 MachineMemOperand *MMO; 11446 if (FI) { 11447 int SSFI = FI->getIndex(); 11448 MMO = 11449 DAG.getMachineFunction() 11450 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11451 MachineMemOperand::MOLoad, ByteSize, ByteSize); 11452 } else { 11453 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 11454 StackSlot = StackSlot.getOperand(1); 11455 } 11456 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 11457 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 11458 X86ISD::FILD, DL, 11459 Tys, Ops, SrcVT, MMO); 11460 11461 if (useSSE) { 11462 Chain = Result.getValue(1); 11463 SDValue InFlag = Result.getValue(2); 11464 11465 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 11466 // shouldn't be necessary except that RFP cannot be live across 11467 // multiple blocks. When stackifier is fixed, they can be uncoupled. 11468 MachineFunction &MF = DAG.getMachineFunction(); 11469 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 11470 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 11471 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 11472 Tys = DAG.getVTList(MVT::Other); 11473 SDValue Ops[] = { 11474 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 11475 }; 11476 MachineMemOperand *MMO = 11477 DAG.getMachineFunction() 11478 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11479 MachineMemOperand::MOStore, SSFISize, SSFISize); 11480 11481 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 11482 Ops, Op.getValueType(), MMO); 11483 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 11484 MachinePointerInfo::getFixedStack(SSFI), 11485 false, false, false, 0); 11486 } 11487 11488 return Result; 11489 } 11490 11491 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 11492 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 11493 SelectionDAG &DAG) const { 11494 // This algorithm is not obvious. 
Here it is what we're trying to output: 11495 /* 11496 movq %rax, %xmm0 11497 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 11498 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 11499 #ifdef __SSE3__ 11500 haddpd %xmm0, %xmm0 11501 #else 11502 pshufd $0x4e, %xmm0, %xmm1 11503 addpd %xmm1, %xmm0 11504 #endif 11505 */ 11506 11507 SDLoc dl(Op); 11508 LLVMContext *Context = DAG.getContext(); 11509 11510 // Build some magic constants. 11511 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 11512 Constant *C0 = ConstantDataVector::get(*Context, CV0); 11513 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 11514 11515 SmallVector<Constant*,2> CV1; 11516 CV1.push_back( 11517 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 11518 APInt(64, 0x4330000000000000ULL)))); 11519 CV1.push_back( 11520 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 11521 APInt(64, 0x4530000000000000ULL)))); 11522 Constant *C1 = ConstantVector::get(CV1); 11523 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 11524 11525 // Load the 64-bit value into an XMM register. 11526 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 11527 Op.getOperand(0)); 11528 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 11529 MachinePointerInfo::getConstantPool(), 11530 false, false, false, 16); 11531 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, 11532 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), 11533 CLod0); 11534 11535 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 11536 MachinePointerInfo::getConstantPool(), 11537 false, false, false, 16); 11538 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); 11539 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 11540 SDValue Result; 11541 11542 if (Subtarget->hasSSE3()) { 11543 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 11544 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 11545 } else { 11546 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); 11547 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 11548 S2F, 0x4E, DAG); 11549 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 11550 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), 11551 Sub); 11552 } 11553 11554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 11555 DAG.getIntPtrConstant(0)); 11556 } 11557 11558 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 11559 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 11560 SelectionDAG &DAG) const { 11561 SDLoc dl(Op); 11562 // FP constant to bias correct the final result. 11563 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 11564 MVT::f64); 11565 11566 // Load the 32-bit value into an XMM register. 11567 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 11568 Op.getOperand(0)); 11569 11570 // Zero out the upper parts of the register. 11571 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 11572 11573 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 11574 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 11575 DAG.getIntPtrConstant(0)); 11576 11577 // Or the load with the bias. 
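// A minimal sketch of the i64 -> f64 expansion above done with scalar
// arithmetic instead of SSE: pair each 32-bit half with the 0x43300000 /
// 0x45300000 exponent words (the punpckldq), subtract the same constants read
// back as doubles (the subpd), and add the two halves (the haddpd or
// pshufd+addpd step).
#if 0
#include <stdint.h>

static double UInt64ToDouble(uint64_t X) {
  union { uint64_t U; double D; } Lo = { (0x43300000ULL << 32) | (X & 0xffffffffULL) };
  union { uint64_t U; double D; } Hi = { (0x45300000ULL << 32) | (X >> 32) };
  return (Lo.D - 0x1.0p52) + (Hi.D - 0x1.0p84); // 0x1.0p84 == 0x1.0p52 * 0x1.0p32
}
#endif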
11578 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 11579 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 11580 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 11581 MVT::v2f64, Load)), 11582 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 11583 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 11584 MVT::v2f64, Bias))); 11585 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 11586 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 11587 DAG.getIntPtrConstant(0)); 11588 11589 // Subtract the bias. 11590 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 11591 11592 // Handle final rounding. 11593 EVT DestVT = Op.getValueType(); 11594 11595 if (DestVT.bitsLT(MVT::f64)) 11596 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 11597 DAG.getIntPtrConstant(0)); 11598 if (DestVT.bitsGT(MVT::f64)) 11599 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 11600 11601 // Handle final rounding. 11602 return Sub; 11603 } 11604 11605 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, 11606 const X86Subtarget &Subtarget) { 11607 // The algorithm is the following: 11608 // #ifdef __SSE4_1__ 11609 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); 11610 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), 11611 // (uint4) 0x53000000, 0xaa); 11612 // #else 11613 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; 11614 // uint4 hi = (v >> 16) | (uint4) 0x53000000; 11615 // #endif 11616 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); 11617 // return (float4) lo + fhi; 11618 11619 SDLoc DL(Op); 11620 SDValue V = Op->getOperand(0); 11621 EVT VecIntVT = V.getValueType(); 11622 bool Is128 = VecIntVT == MVT::v4i32; 11623 EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; 11624 // If we convert to something else than the supported type, e.g., to v4f64, 11625 // abort early. 11626 if (VecFloatVT != Op->getValueType(0)) 11627 return SDValue(); 11628 11629 unsigned NumElts = VecIntVT.getVectorNumElements(); 11630 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && 11631 "Unsupported custom type"); 11632 assert(NumElts <= 8 && "The size of the constant array must be fixed"); 11633 11634 // In the #idef/#else code, we have in common: 11635 // - The vector of constants: 11636 // -- 0x4b000000 11637 // -- 0x53000000 11638 // - A shift: 11639 // -- v >> 16 11640 11641 // Create the splat vector for 0x4b000000. 11642 SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32); 11643 SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, 11644 CstLow, CstLow, CstLow, CstLow}; 11645 SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, 11646 makeArrayRef(&CstLowArray[0], NumElts)); 11647 // Create the splat vector for 0x53000000. 11648 SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32); 11649 SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, 11650 CstHigh, CstHigh, CstHigh, CstHigh}; 11651 SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, 11652 makeArrayRef(&CstHighArray[0], NumElts)); 11653 11654 // Create the right shift. 11655 SDValue CstShift = DAG.getConstant(16, MVT::i32); 11656 SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, 11657 CstShift, CstShift, CstShift, CstShift}; 11658 SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, 11659 makeArrayRef(&CstShiftArray[0], NumElts)); 11660 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); 11661 11662 SDValue Low, High; 11663 if (Subtarget.hasSSE41()) { 11664 EVT VecI16VT = Is128 ? 
MVT::v8i16 : MVT::v16i16; 11665 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); 11666 SDValue VecCstLowBitcast = 11667 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow); 11668 SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V); 11669 // Low will be bitcasted right away, so do not bother bitcasting back to its 11670 // original type. 11671 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, 11672 VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32)); 11673 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), 11674 // (uint4) 0x53000000, 0xaa); 11675 SDValue VecCstHighBitcast = 11676 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh); 11677 SDValue VecShiftBitcast = 11678 DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift); 11679 // High will be bitcasted right away, so do not bother bitcasting back to 11680 // its original type. 11681 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, 11682 VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32)); 11683 } else { 11684 SDValue CstMask = DAG.getConstant(0xffff, MVT::i32); 11685 SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, 11686 CstMask, CstMask, CstMask); 11687 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; 11688 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); 11689 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); 11690 11691 // uint4 hi = (v >> 16) | (uint4) 0x53000000; 11692 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); 11693 } 11694 11695 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). 11696 SDValue CstFAdd = DAG.getConstantFP( 11697 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32); 11698 SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, 11699 CstFAdd, CstFAdd, CstFAdd, CstFAdd}; 11700 SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, 11701 makeArrayRef(&CstFAddArray[0], NumElts)); 11702 11703 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); 11704 SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High); 11705 SDValue FHigh = 11706 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); 11707 // return (float4) lo + fhi; 11708 SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low); 11709 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); 11710 } 11711 11712 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, 11713 SelectionDAG &DAG) const { 11714 SDValue N0 = Op.getOperand(0); 11715 MVT SVT = N0.getSimpleValueType(); 11716 SDLoc dl(Op); 11717 11718 switch (SVT.SimpleTy) { 11719 default: 11720 llvm_unreachable("Custom UINT_TO_FP is not supported!"); 11721 case MVT::v4i8: 11722 case MVT::v4i16: 11723 case MVT::v8i8: 11724 case MVT::v8i16: { 11725 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); 11726 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 11727 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); 11728 } 11729 case MVT::v4i32: 11730 case MVT::v8i32: 11731 return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); 11732 } 11733 llvm_unreachable(nullptr); 11734 } 11735 11736 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 11737 SelectionDAG &DAG) const { 11738 SDValue N0 = Op.getOperand(0); 11739 SDLoc dl(Op); 11740 11741 if (Op.getValueType().isVector()) 11742 return lowerUINT_TO_FP_vec(Op, DAG); 11743 11744 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 11745 // optimize it to a SINT_TO_FP when the sign bit is known zero. 
Perform 11746 // the optimization here. 11747 if (DAG.SignBitIsZero(N0)) 11748 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 11749 11750 MVT SrcVT = N0.getSimpleValueType(); 11751 MVT DstVT = Op.getSimpleValueType(); 11752 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 11753 return LowerUINT_TO_FP_i64(Op, DAG); 11754 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 11755 return LowerUINT_TO_FP_i32(Op, DAG); 11756 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 11757 return SDValue(); 11758 11759 // Make a 64-bit buffer, and use it to build an FILD. 11760 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 11761 if (SrcVT == MVT::i32) { 11762 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 11763 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 11764 getPointerTy(), StackSlot, WordOff); 11765 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 11766 StackSlot, MachinePointerInfo(), 11767 false, false, 0); 11768 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 11769 OffsetSlot, MachinePointerInfo(), 11770 false, false, 0); 11771 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 11772 return Fild; 11773 } 11774 11775 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 11776 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 11777 StackSlot, MachinePointerInfo(), 11778 false, false, 0); 11779 // For i64 source, we need to add the appropriate power of 2 if the input 11780 // was negative. This is the same as the optimization in 11781 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 11782 // we must be careful to do the computation in x87 extended precision, not 11783 // in SSE. (The generic code can't know it's OK to do this, or how to.) 11784 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 11785 MachineMemOperand *MMO = 11786 DAG.getMachineFunction() 11787 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11788 MachineMemOperand::MOLoad, 8, 8); 11789 11790 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 11791 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 11792 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 11793 MVT::i64, MMO); 11794 11795 APInt FF(32, 0x5F800000ULL); 11796 11797 // Check whether the sign bit is set. 11798 SDValue SignSet = DAG.getSetCC(dl, 11799 getSetCCResultType(*DAG.getContext(), MVT::i64), 11800 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 11801 ISD::SETLT); 11802 11803 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 11804 SDValue FudgePtr = DAG.getConstantPool( 11805 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 11806 getPointerTy()); 11807 11808 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 11809 SDValue Zero = DAG.getIntPtrConstant(0); 11810 SDValue Four = DAG.getIntPtrConstant(4); 11811 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 11812 Zero, Four); 11813 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 11814 11815 // Load the value out, extending it from f32 to f80. 11816 // FIXME: Avoid the extend by constructing the right constant pool? 11817 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 11818 FudgePtr, MachinePointerInfo::getConstantPool(), 11819 MVT::f32, false, false, false, 4); 11820 // Extend everything to 80 bits to force it to be done on x87. 
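// A minimal sketch of the x87 path above: FILD reads the i64 slot as signed,
// so when the sign bit was set the 0x5F800000 fudge constant (2^64 as a float)
// is added back; the long double arithmetic stands in for forcing the
// computation to x87 extended precision before the final FP_ROUND.
#if 0
#include <stdint.h>

static double UInt64ToDoubleX87(uint64_t X) {
  long double V = (long double)(int64_t)X; // what FILD of the 64-bit slot yields
  if ((int64_t)X < 0)                      // sign bit was set
    V += 0x1.0p64L;                        // add 2^64 back in
  return (double)V;                        // round to the destination type
}
#endif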
11821 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 11822 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 11823 } 11824 11825 std::pair<SDValue,SDValue> 11826 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, 11827 bool IsSigned, bool IsReplace) const { 11828 SDLoc DL(Op); 11829 11830 EVT DstTy = Op.getValueType(); 11831 11832 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { 11833 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 11834 DstTy = MVT::i64; 11835 } 11836 11837 assert(DstTy.getSimpleVT() <= MVT::i64 && 11838 DstTy.getSimpleVT() >= MVT::i16 && 11839 "Unknown FP_TO_INT to lower!"); 11840 11841 // These are really Legal. 11842 if (DstTy == MVT::i32 && 11843 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 11844 return std::make_pair(SDValue(), SDValue()); 11845 if (Subtarget->is64Bit() && 11846 DstTy == MVT::i64 && 11847 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 11848 return std::make_pair(SDValue(), SDValue()); 11849 11850 // We lower FP->int64 either into FISTP64 followed by a load from a temporary 11851 // stack slot, or into the FTOL runtime function. 11852 MachineFunction &MF = DAG.getMachineFunction(); 11853 unsigned MemSize = DstTy.getSizeInBits()/8; 11854 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 11855 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 11856 11857 unsigned Opc; 11858 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 11859 Opc = X86ISD::WIN_FTOL; 11860 else 11861 switch (DstTy.getSimpleVT().SimpleTy) { 11862 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 11863 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 11864 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 11865 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 11866 } 11867 11868 SDValue Chain = DAG.getEntryNode(); 11869 SDValue Value = Op.getOperand(0); 11870 EVT TheVT = Op.getOperand(0).getValueType(); 11871 // FIXME This causes a redundant load/store if the SSE-class value is already 11872 // in memory, such as if it is on the callstack. 
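// A minimal sketch of why FP_TO_UINT to i32 is widened to an i64 conversion in
// FP_TO_INTHelper below: a signed 64-bit conversion covers the whole u32
// range, so keeping only the low 32 bits of that result yields the unsigned
// value. This is an illustrative aside, not part of the lowering code.
#if 0
#include <stdint.h>

static uint32_t FPToUInt32(double D) {
  return (uint32_t)(int64_t)D; // FP_TO_SINT at i64, then truncate to 32 bits
}
#endif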
11873 if (isScalarFPTypeInSSEReg(TheVT)) { 11874 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 11875 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 11876 MachinePointerInfo::getFixedStack(SSFI), 11877 false, false, 0); 11878 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 11879 SDValue Ops[] = { 11880 Chain, StackSlot, DAG.getValueType(TheVT) 11881 }; 11882 11883 MachineMemOperand *MMO = 11884 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11885 MachineMemOperand::MOLoad, MemSize, MemSize); 11886 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); 11887 Chain = Value.getValue(1); 11888 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 11889 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 11890 } 11891 11892 MachineMemOperand *MMO = 11893 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11894 MachineMemOperand::MOStore, MemSize, MemSize); 11895 11896 if (Opc != X86ISD::WIN_FTOL) { 11897 // Build the FP_TO_INT*_IN_MEM 11898 SDValue Ops[] = { Chain, Value, StackSlot }; 11899 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 11900 Ops, DstTy, MMO); 11901 return std::make_pair(FIST, StackSlot); 11902 } else { 11903 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 11904 DAG.getVTList(MVT::Other, MVT::Glue), 11905 Chain, Value); 11906 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 11907 MVT::i32, ftol.getValue(1)); 11908 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 11909 MVT::i32, eax.getValue(2)); 11910 SDValue Ops[] = { eax, edx }; 11911 SDValue pair = IsReplace 11912 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) 11913 : DAG.getMergeValues(Ops, DL); 11914 return std::make_pair(pair, SDValue()); 11915 } 11916 } 11917 11918 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, 11919 const X86Subtarget *Subtarget) { 11920 MVT VT = Op->getSimpleValueType(0); 11921 SDValue In = Op->getOperand(0); 11922 MVT InVT = In.getSimpleValueType(); 11923 SDLoc dl(Op); 11924 11925 // Optimize vectors in AVX mode: 11926 // 11927 // v8i16 -> v8i32 11928 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 11929 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 11930 // Concat upper and lower parts. 11931 // 11932 // v4i32 -> v4i64 11933 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 11934 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 11935 // Concat upper and lower parts. 11936 // 11937 11938 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && 11939 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && 11940 ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) 11941 return SDValue(); 11942 11943 if (Subtarget->hasInt256()) 11944 return DAG.getNode(X86ISD::VZEXT, dl, VT, In); 11945 11946 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); 11947 SDValue Undef = DAG.getUNDEF(InVT); 11948 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; 11949 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 11950 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); 11951 11952 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), 11953 VT.getVectorNumElements()/2); 11954 11955 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 11956 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 11957 11958 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 11959 } 11960 11961 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, 11962 SelectionDAG &DAG) { 11963 MVT VT = Op->getSimpleValueType(0); 11964 SDValue In = Op->getOperand(0); 11965 MVT InVT = In.getSimpleValueType(); 11966 SDLoc DL(Op); 11967 unsigned int NumElts = VT.getVectorNumElements(); 11968 if (NumElts != 8 && NumElts != 16) 11969 return SDValue(); 11970 11971 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 11972 return DAG.getNode(X86ISD::VZEXT, DL, VT, In); 11973 11974 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; 11975 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11976 // Now we have only mask extension 11977 assert(InVT.getVectorElementType() == MVT::i1); 11978 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); 11979 const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue(); 11980 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 11981 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 11982 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 11983 MachinePointerInfo::getConstantPool(), 11984 false, false, false, Alignment); 11985 11986 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); 11987 if (VT.is512BitVector()) 11988 return Brcst; 11989 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); 11990 } 11991 11992 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 11993 SelectionDAG &DAG) { 11994 if (Subtarget->hasFp256()) { 11995 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 11996 if (Res.getNode()) 11997 return Res; 11998 } 11999 12000 return SDValue(); 12001 } 12002 12003 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 12004 SelectionDAG &DAG) { 12005 SDLoc DL(Op); 12006 MVT VT = Op.getSimpleValueType(); 12007 SDValue In = Op.getOperand(0); 12008 MVT SVT = In.getSimpleValueType(); 12009 12010 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) 12011 return LowerZERO_EXTEND_AVX512(Op, DAG); 12012 12013 if (Subtarget->hasFp256()) { 12014 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 12015 if (Res.getNode()) 12016 return Res; 12017 } 12018 12019 assert(!VT.is256BitVector() || !SVT.is128BitVector() || 12020 VT.getVectorNumElements() != SVT.getVectorNumElements()); 12021 return SDValue(); 12022 } 12023 12024 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 12025 SDLoc DL(Op); 12026 MVT VT = Op.getSimpleValueType(); 12027 SDValue In = Op.getOperand(0); 12028 MVT InVT = In.getSimpleValueType(); 12029 12030 if (VT == MVT::i1) { 12031 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && 12032 "Invalid scalar TRUNCATE operation"); 12033 if (InVT.getSizeInBits() >= 32) 12034 return SDValue(); 12035 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); 12036 return DAG.getNode(ISD::TRUNCATE, DL, VT, In); 12037 } 12038 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && 12039 "Invalid TRUNCATE operation"); 12040 12041 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { 12042 if (VT.getVectorElementType().getSizeInBits() >=8) 12043 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); 12044 12045 assert(VT.getVectorElementType() 
== MVT::i1 && "Unexpected vector type"); 12046 unsigned NumElts = InVT.getVectorNumElements(); 12047 assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); 12048 if (InVT.getSizeInBits() < 512) { 12049 MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; 12050 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); 12051 InVT = ExtVT; 12052 } 12053 12054 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); 12055 const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue(); 12056 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 12057 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 12058 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 12059 MachinePointerInfo::getConstantPool(), 12060 false, false, false, Alignment); 12061 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); 12062 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); 12063 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); 12064 } 12065 12066 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { 12067 // On AVX2, v4i64 -> v4i32 becomes VPERMD. 12068 if (Subtarget->hasInt256()) { 12069 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; 12070 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); 12071 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), 12072 ShufMask); 12073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, 12074 DAG.getIntPtrConstant(0)); 12075 } 12076 12077 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 12078 DAG.getIntPtrConstant(0)); 12079 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 12080 DAG.getIntPtrConstant(2)); 12081 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 12082 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 12083 static const int ShufMask[] = {0, 2, 4, 6}; 12084 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); 12085 } 12086 12087 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { 12088 // On AVX2, v8i32 -> v8i16 becomed PSHUFB. 
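// A minimal sketch of the 128-bit fallback built just below for v8i32 ->
// v8i16, using SSSE3/SSE intrinsics: each half is byte-shuffled so the low 16
// bits of every i32 land in the low 8 bytes, then the two halves are combined
// with a MOVLHPS-style {0,1,4,5} shuffle.
#if 0
#include <tmmintrin.h> // SSSE3 for _mm_shuffle_epi8

static __m128i TruncV8I32ToV8I16(__m128i Lo, __m128i Hi) {
  const __m128i Mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13,
                                     -1, -1, -1, -1, -1, -1, -1, -1);
  __m128i Lo16 = _mm_shuffle_epi8(Lo, Mask); // keep the low half of each i32
  __m128i Hi16 = _mm_shuffle_epi8(Hi, Mask);
  return _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(Lo16),
                                        _mm_castsi128_ps(Hi16)));
}
#endif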
12089 if (Subtarget->hasInt256()) { 12090 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); 12091 12092 SmallVector<SDValue,32> pshufbMask; 12093 for (unsigned i = 0; i < 2; ++i) { 12094 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); 12095 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); 12096 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); 12097 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); 12098 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); 12099 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); 12100 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); 12101 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); 12102 for (unsigned j = 0; j < 8; ++j) 12103 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 12104 } 12105 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); 12106 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); 12107 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); 12108 12109 static const int ShufMask[] = {0, 2, -1, -1}; 12110 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), 12111 &ShufMask[0]); 12112 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 12113 DAG.getIntPtrConstant(0)); 12114 return DAG.getNode(ISD::BITCAST, DL, VT, In); 12115 } 12116 12117 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 12118 DAG.getIntPtrConstant(0)); 12119 12120 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 12121 DAG.getIntPtrConstant(4)); 12122 12123 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); 12124 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); 12125 12126 // The PSHUFB mask: 12127 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 12128 -1, -1, -1, -1, -1, -1, -1, -1}; 12129 12130 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 12131 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 12132 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 12133 12134 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 12135 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 12136 12137 // The MOVLHPS Mask: 12138 static const int ShufMask2[] = {0, 1, 4, 5}; 12139 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 12140 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); 12141 } 12142 12143 // Handle truncation of V256 to V128 using shuffles. 12144 if (!VT.is128BitVector() || !InVT.is256BitVector()) 12145 return SDValue(); 12146 12147 assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); 12148 12149 unsigned NumElems = VT.getVectorNumElements(); 12150 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); 12151 12152 SmallVector<int, 16> MaskVec(NumElems * 2, -1); 12153 // Prepare truncation shuffle mask 12154 for (unsigned i = 0; i != NumElems; ++i) 12155 MaskVec[i] = i * 2; 12156 SDValue V = DAG.getVectorShuffle(NVT, DL, 12157 DAG.getNode(ISD::BITCAST, DL, NVT, In), 12158 DAG.getUNDEF(NVT), &MaskVec[0]); 12159 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, 12160 DAG.getIntPtrConstant(0)); 12161 } 12162 12163 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 12164 SelectionDAG &DAG) const { 12165 assert(!Op.getSimpleValueType().isVector()); 12166 12167 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 12168 /*IsSigned=*/ true, /*IsReplace=*/ false); 12169 SDValue FIST = Vals.first, StackSlot = Vals.second; 12170 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 
12171 if (!FIST.getNode()) return Op; 12172 12173 if (StackSlot.getNode()) 12174 // Load the result. 12175 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 12176 FIST, StackSlot, MachinePointerInfo(), 12177 false, false, false, 0); 12178 12179 // The node is the result. 12180 return FIST; 12181 } 12182 12183 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 12184 SelectionDAG &DAG) const { 12185 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 12186 /*IsSigned=*/ false, /*IsReplace=*/ false); 12187 SDValue FIST = Vals.first, StackSlot = Vals.second; 12188 assert(FIST.getNode() && "Unexpected failure"); 12189 12190 if (StackSlot.getNode()) 12191 // Load the result. 12192 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 12193 FIST, StackSlot, MachinePointerInfo(), 12194 false, false, false, 0); 12195 12196 // The node is the result. 12197 return FIST; 12198 } 12199 12200 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { 12201 SDLoc DL(Op); 12202 MVT VT = Op.getSimpleValueType(); 12203 SDValue In = Op.getOperand(0); 12204 MVT SVT = In.getSimpleValueType(); 12205 12206 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 12207 12208 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 12209 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 12210 In, DAG.getUNDEF(SVT))); 12211 } 12212 12213 /// The only differences between FABS and FNEG are the mask and the logic op. 12214 /// FNEG also has a folding opportunity for FNEG(FABS(x)). 12215 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { 12216 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && 12217 "Wrong opcode for lowering FABS or FNEG."); 12218 12219 bool IsFABS = (Op.getOpcode() == ISD::FABS); 12220 12221 // If this is a FABS and it has an FNEG user, bail out to fold the combination 12222 // into an FNABS. We'll lower the FABS after that if it is still in use. 12223 if (IsFABS) 12224 for (SDNode *User : Op->uses()) 12225 if (User->getOpcode() == ISD::FNEG) 12226 return Op; 12227 12228 SDValue Op0 = Op.getOperand(0); 12229 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); 12230 12231 SDLoc dl(Op); 12232 MVT VT = Op.getSimpleValueType(); 12233 // Assume scalar op for initialization; update for vector if needed. 12234 // Note that there are no scalar bitwise logical SSE/AVX instructions, so we 12235 // generate a 16-byte vector constant and logic op even for the scalar case. 12236 // Using a 16-byte mask allows folding the load of the mask with 12237 // the logic op, so it can save (~4 bytes) on code size. 12238 MVT EltVT = VT; 12239 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 12240 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to 12241 // decide if we should generate a 16-byte constant mask when we only need 4 or 12242 // 8 bytes for the scalar case. 12243 if (VT.isVector()) { 12244 EltVT = VT.getVectorElementType(); 12245 NumElts = VT.getVectorNumElements(); 12246 } 12247 12248 unsigned EltBits = EltVT.getSizeInBits(); 12249 LLVMContext *Context = DAG.getContext(); 12250 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... 12251 APInt MaskElt = 12252 IsFABS ? 
APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); 12253 Constant *C = ConstantInt::get(*Context, MaskElt); 12254 C = ConstantVector::getSplat(NumElts, C); 12255 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12256 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); 12257 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 12258 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 12259 MachinePointerInfo::getConstantPool(), 12260 false, false, false, Alignment); 12261 12262 if (VT.isVector()) { 12263 // For a vector, cast operands to a vector type, perform the logic op, 12264 // and cast the result back to the original value type. 12265 MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); 12266 SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); 12267 SDValue Operand = IsFNABS ? 12268 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : 12269 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); 12270 unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; 12271 return DAG.getNode(ISD::BITCAST, dl, VT, 12272 DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); 12273 } 12274 12275 // If not vector, then scalar. 12276 unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; 12277 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; 12278 return DAG.getNode(BitOp, dl, VT, Operand, Mask); 12279 } 12280 12281 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 12282 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12283 LLVMContext *Context = DAG.getContext(); 12284 SDValue Op0 = Op.getOperand(0); 12285 SDValue Op1 = Op.getOperand(1); 12286 SDLoc dl(Op); 12287 MVT VT = Op.getSimpleValueType(); 12288 MVT SrcVT = Op1.getSimpleValueType(); 12289 12290 // If second operand is smaller, extend it first. 12291 if (SrcVT.bitsLT(VT)) { 12292 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 12293 SrcVT = VT; 12294 } 12295 // And if it is bigger, shrink it first. 12296 if (SrcVT.bitsGT(VT)) { 12297 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 12298 SrcVT = VT; 12299 } 12300 12301 // At this point the operands and the result should have the same 12302 // type, and that won't be f80 since that is not custom lowered. 12303 12304 const fltSemantics &Sem = 12305 VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; 12306 const unsigned SizeInBits = VT.getSizeInBits(); 12307 12308 SmallVector<Constant *, 4> CV( 12309 VT == MVT::f64 ? 2 : 4, 12310 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); 12311 12312 // First, clear all bits but the sign bit from the second operand (sign). 12313 CV[0] = ConstantFP::get(*Context, 12314 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); 12315 Constant *C = ConstantVector::get(CV); 12316 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); 12317 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 12318 MachinePointerInfo::getConstantPool(), 12319 false, false, false, 16); 12320 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 12321 12322 // Next, clear the sign bit from the first operand (magnitude). 12323 // If it's a constant, we can clear it here. 12324 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { 12325 APFloat APF = Op0CN->getValueAPF(); 12326 // If the magnitude is a positive zero, the sign bit alone is enough. 
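// A minimal sketch of the sign-bit masking used by the FABS/FNEG/FCOPYSIGN
// lowerings above, shown on a scalar double viewed through its i64 bits; the
// real code applies the same masks with FAND/FXOR/FOR on 16-byte constants.
#if 0
#include <stdint.h>
#include <string.h>

static uint64_t Bits(double D)       { uint64_t U; memcpy(&U, &D, 8); return U; }
static double   FromBits(uint64_t U) { double D;   memcpy(&D, &U, 8); return D; }

static double AbsBits(double X) { return FromBits(Bits(X) & 0x7fffffffffffffffULL); }
static double NegBits(double X) { return FromBits(Bits(X) ^ 0x8000000000000000ULL); }
static double CopySignBits(double Mag, double Sgn) {
  return FromBits((Bits(Mag) & 0x7fffffffffffffffULL) | // clear sign of magnitude
                  (Bits(Sgn) & 0x8000000000000000ULL)); // keep only the sign bit
}
#endif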
12327 if (APF.isPosZero()) 12328 return SignBit; 12329 APF.clearSign(); 12330 CV[0] = ConstantFP::get(*Context, APF); 12331 } else { 12332 CV[0] = ConstantFP::get( 12333 *Context, 12334 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); 12335 } 12336 C = ConstantVector::get(CV); 12337 CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); 12338 SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 12339 MachinePointerInfo::getConstantPool(), 12340 false, false, false, 16); 12341 // If the magnitude operand wasn't a constant, we need to AND out the sign. 12342 if (!isa<ConstantFPSDNode>(Op0)) 12343 Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); 12344 12345 // OR the magnitude value with the sign bit. 12346 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 12347 } 12348 12349 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 12350 SDValue N0 = Op.getOperand(0); 12351 SDLoc dl(Op); 12352 MVT VT = Op.getSimpleValueType(); 12353 12354 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 12355 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 12356 DAG.getConstant(1, VT)); 12357 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 12358 } 12359 12360 // Check whether an OR'd tree is PTEST-able. 12361 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, 12362 SelectionDAG &DAG) { 12363 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); 12364 12365 if (!Subtarget->hasSSE41()) 12366 return SDValue(); 12367 12368 if (!Op->hasOneUse()) 12369 return SDValue(); 12370 12371 SDNode *N = Op.getNode(); 12372 SDLoc DL(N); 12373 12374 SmallVector<SDValue, 8> Opnds; 12375 DenseMap<SDValue, unsigned> VecInMap; 12376 SmallVector<SDValue, 8> VecIns; 12377 EVT VT = MVT::Other; 12378 12379 // Recognize a special case where a vector is casted into wide integer to 12380 // test all 0s. 12381 Opnds.push_back(N->getOperand(0)); 12382 Opnds.push_back(N->getOperand(1)); 12383 12384 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { 12385 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; 12386 // BFS traverse all OR'd operands. 12387 if (I->getOpcode() == ISD::OR) { 12388 Opnds.push_back(I->getOperand(0)); 12389 Opnds.push_back(I->getOperand(1)); 12390 // Re-evaluate the number of nodes to be traversed. 12391 e += 2; // 2 more nodes (LHS and RHS) are pushed. 12392 continue; 12393 } 12394 12395 // Quit if a non-EXTRACT_VECTOR_ELT 12396 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12397 return SDValue(); 12398 12399 // Quit if without a constant index. 12400 SDValue Idx = I->getOperand(1); 12401 if (!isa<ConstantSDNode>(Idx)) 12402 return SDValue(); 12403 12404 SDValue ExtractedFromVec = I->getOperand(0); 12405 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); 12406 if (M == VecInMap.end()) { 12407 VT = ExtractedFromVec.getValueType(); 12408 // Quit if not 128/256-bit vector. 12409 if (!VT.is128BitVector() && !VT.is256BitVector()) 12410 return SDValue(); 12411 // Quit if not the same type. 
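// A minimal sketch, with SSE4.1 intrinsics, of the pattern LowerVectorAllZeroTest
// recognizes (OR of all extracted elements compared against zero) and the
// single PTEST it is replaced with.
#if 0
#include <smmintrin.h> // SSE4.1

static int AllZeroViaExtracts(__m128i V) {   // the OR'd-extract tree
  return (_mm_extract_epi32(V, 0) | _mm_extract_epi32(V, 1) |
          _mm_extract_epi32(V, 2) | _mm_extract_epi32(V, 3)) == 0;
}

static int AllZeroViaPTest(__m128i V) {      // what the lowering emits
  return _mm_testz_si128(V, V);              // PTEST: ZF set iff V == 0
}
#endif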
12412 if (VecInMap.begin() != VecInMap.end() && 12413 VT != VecInMap.begin()->first.getValueType()) 12414 return SDValue(); 12415 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; 12416 VecIns.push_back(ExtractedFromVec); 12417 } 12418 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); 12419 } 12420 12421 assert((VT.is128BitVector() || VT.is256BitVector()) && 12422 "Not extracted from 128-/256-bit vector."); 12423 12424 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; 12425 12426 for (DenseMap<SDValue, unsigned>::const_iterator 12427 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { 12428 // Quit if not all elements are used. 12429 if (I->second != FullMask) 12430 return SDValue(); 12431 } 12432 12433 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 12434 12435 // Cast all vectors into TestVT for PTEST. 12436 for (unsigned i = 0, e = VecIns.size(); i < e; ++i) 12437 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); 12438 12439 // If more than one full vectors are evaluated, OR them first before PTEST. 12440 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { 12441 // Each iteration will OR 2 nodes and append the result until there is only 12442 // 1 node left, i.e. the final OR'd value of all vectors. 12443 SDValue LHS = VecIns[Slot]; 12444 SDValue RHS = VecIns[Slot + 1]; 12445 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); 12446 } 12447 12448 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, 12449 VecIns.back(), VecIns.back()); 12450 } 12451 12452 /// \brief return true if \c Op has a use that doesn't just read flags. 12453 static bool hasNonFlagsUse(SDValue Op) { 12454 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; 12455 ++UI) { 12456 SDNode *User = *UI; 12457 unsigned UOpNo = UI.getOperandNo(); 12458 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 12459 // Look pass truncate. 12460 UOpNo = User->use_begin().getOperandNo(); 12461 User = *User->use_begin(); 12462 } 12463 12464 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && 12465 !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) 12466 return true; 12467 } 12468 return false; 12469 } 12470 12471 /// Emit nodes that will be selected as "test Op0,Op0", or something 12472 /// equivalent. 12473 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, 12474 SelectionDAG &DAG) const { 12475 if (Op.getValueType() == MVT::i1) { 12476 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); 12477 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, 12478 DAG.getConstant(0, MVT::i8)); 12479 } 12480 // CF and OF aren't always set the way we want. Determine which 12481 // of these we need. 12482 bool NeedCF = false; 12483 bool NeedOF = false; 12484 switch (X86CC) { 12485 default: break; 12486 case X86::COND_A: case X86::COND_AE: 12487 case X86::COND_B: case X86::COND_BE: 12488 NeedCF = true; 12489 break; 12490 case X86::COND_G: case X86::COND_GE: 12491 case X86::COND_L: case X86::COND_LE: 12492 case X86::COND_O: case X86::COND_NO: { 12493 // Check if we really need to set the 12494 // Overflow flag. If NoSignedWrap is present 12495 // that is not actually needed. 
12496 switch (Op->getOpcode()) { 12497 case ISD::ADD: 12498 case ISD::SUB: 12499 case ISD::MUL: 12500 case ISD::SHL: { 12501 const BinaryWithFlagsSDNode *BinNode = 12502 cast<BinaryWithFlagsSDNode>(Op.getNode()); 12503 if (BinNode->hasNoSignedWrap()) 12504 break; 12505 } 12506 default: 12507 NeedOF = true; 12508 break; 12509 } 12510 break; 12511 } 12512 } 12513 // See if we can use the EFLAGS value from the operand instead of 12514 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 12515 // we prove that the arithmetic won't overflow, we can't use OF or CF. 12516 if (Op.getResNo() != 0 || NeedOF || NeedCF) { 12517 // Emit a CMP with 0, which is the TEST pattern. 12518 //if (Op.getValueType() == MVT::i1) 12519 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, 12520 // DAG.getConstant(0, MVT::i1)); 12521 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 12522 DAG.getConstant(0, Op.getValueType())); 12523 } 12524 unsigned Opcode = 0; 12525 unsigned NumOperands = 0; 12526 12527 // Truncate operations may prevent the merge of the SETCC instruction 12528 // and the arithmetic instruction before it. Attempt to truncate the operands 12529 // of the arithmetic instruction and use a reduced bit-width instruction. 12530 bool NeedTruncation = false; 12531 SDValue ArithOp = Op; 12532 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { 12533 SDValue Arith = Op->getOperand(0); 12534 // Both the trunc and the arithmetic op need to have one user each. 12535 if (Arith->hasOneUse()) 12536 switch (Arith.getOpcode()) { 12537 default: break; 12538 case ISD::ADD: 12539 case ISD::SUB: 12540 case ISD::AND: 12541 case ISD::OR: 12542 case ISD::XOR: { 12543 NeedTruncation = true; 12544 ArithOp = Arith; 12545 } 12546 } 12547 } 12548 12549 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation 12550 // which may be the result of a CAST. We use the variable 'Op', which is the 12551 // non-casted variable when we check for possible users. 12552 switch (ArithOp.getOpcode()) { 12553 case ISD::ADD: 12554 // Due to an isel shortcoming, be conservative if this add is likely to be 12555 // selected as part of a load-modify-store instruction. When the root node 12556 // in a match is a store, isel doesn't know how to remap non-chain non-flag 12557 // uses of other nodes in the match, such as the ADD in this case. This 12558 // leads to the ADD being left around and reselected, with the result being 12559 // two adds in the output. Alas, even if none our users are stores, that 12560 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 12561 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 12562 // climbing the DAG back to the root, and it doesn't seem to be worth the 12563 // effort. 12564 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 12565 UE = Op.getNode()->use_end(); UI != UE; ++UI) 12566 if (UI->getOpcode() != ISD::CopyToReg && 12567 UI->getOpcode() != ISD::SETCC && 12568 UI->getOpcode() != ISD::STORE) 12569 goto default_case; 12570 12571 if (ConstantSDNode *C = 12572 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { 12573 // An add of one will be selected as an INC. 12574 if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { 12575 Opcode = X86ISD::INC; 12576 NumOperands = 1; 12577 break; 12578 } 12579 12580 // An add of negative one (subtract of one) will be selected as a DEC. 
12581 if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { 12582 Opcode = X86ISD::DEC; 12583 NumOperands = 1; 12584 break; 12585 } 12586 } 12587 12588 // Otherwise use a regular EFLAGS-setting add. 12589 Opcode = X86ISD::ADD; 12590 NumOperands = 2; 12591 break; 12592 case ISD::SHL: 12593 case ISD::SRL: 12594 // If we have a constant logical shift that's only used in a comparison 12595 // against zero turn it into an equivalent AND. This allows turning it into 12596 // a TEST instruction later. 12597 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && 12598 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { 12599 EVT VT = Op.getValueType(); 12600 unsigned BitWidth = VT.getSizeInBits(); 12601 unsigned ShAmt = Op->getConstantOperandVal(1); 12602 if (ShAmt >= BitWidth) // Avoid undefined shifts. 12603 break; 12604 APInt Mask = ArithOp.getOpcode() == ISD::SRL 12605 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) 12606 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); 12607 if (!Mask.isSignedIntN(32)) // Avoid large immediates. 12608 break; 12609 SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), 12610 DAG.getConstant(Mask, VT)); 12611 DAG.ReplaceAllUsesWith(Op, New); 12612 Op = New; 12613 } 12614 break; 12615 12616 case ISD::AND: 12617 // If the primary and result isn't used, don't bother using X86ISD::AND, 12618 // because a TEST instruction will be better. 12619 if (!hasNonFlagsUse(Op)) 12620 break; 12621 // FALL THROUGH 12622 case ISD::SUB: 12623 case ISD::OR: 12624 case ISD::XOR: 12625 // Due to the ISEL shortcoming noted above, be conservative if this op is 12626 // likely to be selected as part of a load-modify-store instruction. 12627 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 12628 UE = Op.getNode()->use_end(); UI != UE; ++UI) 12629 if (UI->getOpcode() == ISD::STORE) 12630 goto default_case; 12631 12632 // Otherwise use a regular EFLAGS-setting instruction. 12633 switch (ArithOp.getOpcode()) { 12634 default: llvm_unreachable("unexpected operator!"); 12635 case ISD::SUB: Opcode = X86ISD::SUB; break; 12636 case ISD::XOR: Opcode = X86ISD::XOR; break; 12637 case ISD::AND: Opcode = X86ISD::AND; break; 12638 case ISD::OR: { 12639 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { 12640 SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); 12641 if (EFLAGS.getNode()) 12642 return EFLAGS; 12643 } 12644 Opcode = X86ISD::OR; 12645 break; 12646 } 12647 } 12648 12649 NumOperands = 2; 12650 break; 12651 case X86ISD::ADD: 12652 case X86ISD::SUB: 12653 case X86ISD::INC: 12654 case X86ISD::DEC: 12655 case X86ISD::OR: 12656 case X86ISD::XOR: 12657 case X86ISD::AND: 12658 return SDValue(Op.getNode(), 1); 12659 default: 12660 default_case: 12661 break; 12662 } 12663 12664 // If we found that truncation is beneficial, perform the truncation and 12665 // update 'Op'. 12666 if (NeedTruncation) { 12667 EVT VT = Op.getValueType(); 12668 SDValue WideVal = Op->getOperand(0); 12669 EVT WideVT = WideVal.getValueType(); 12670 unsigned ConvertedOp = 0; 12671 // Use a target machine opcode to prevent further DAGCombine 12672 // optimizations that may separate the arithmetic operations 12673 // from the setcc node. 
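// A minimal sketch of the narrowing performed just below: when only the
// truncated result of a wide operation is compared, the operation itself can
// be redone at the narrow width, so the flags come from a 32-bit instruction.
#if 0
#include <stdint.h>

static int WideThenTruncate(uint64_t A, uint64_t B) {
  return (uint32_t)(A + B) == 0;              // i64 add, truncate, compare
}

static int NarrowedAdd(uint64_t A, uint64_t B) {
  return ((uint32_t)A + (uint32_t)B) == 0;    // same result; 32-bit add sets flags
}
#endif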
12674 switch (WideVal.getOpcode()) { 12675 default: break; 12676 case ISD::ADD: ConvertedOp = X86ISD::ADD; break; 12677 case ISD::SUB: ConvertedOp = X86ISD::SUB; break; 12678 case ISD::AND: ConvertedOp = X86ISD::AND; break; 12679 case ISD::OR: ConvertedOp = X86ISD::OR; break; 12680 case ISD::XOR: ConvertedOp = X86ISD::XOR; break; 12681 } 12682 12683 if (ConvertedOp) { 12684 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12685 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { 12686 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); 12687 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); 12688 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); 12689 } 12690 } 12691 } 12692 12693 if (Opcode == 0) 12694 // Emit a CMP with 0, which is the TEST pattern. 12695 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 12696 DAG.getConstant(0, Op.getValueType())); 12697 12698 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 12699 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); 12700 12701 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); 12702 DAG.ReplaceAllUsesWith(Op, New); 12703 return SDValue(New.getNode(), 1); 12704 } 12705 12706 /// Emit nodes that will be selected as "cmp Op0,Op1", or something 12707 /// equivalent. 12708 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 12709 SDLoc dl, SelectionDAG &DAG) const { 12710 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { 12711 if (C->getAPIntValue() == 0) 12712 return EmitTest(Op0, X86CC, dl, DAG); 12713 12714 if (Op0.getValueType() == MVT::i1) 12715 llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); 12716 } 12717 12718 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 12719 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 12720 // Do the comparison at i32 if it's smaller, besides the Atom case. 12721 // This avoids subregister aliasing issues. Keep the smaller reference 12722 // if we're optimizing for size, however, as that'll allow better folding 12723 // of memory operations. 12724 if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && 12725 !DAG.getMachineFunction().getFunction()->hasFnAttribute( 12726 Attribute::MinSize) && 12727 !Subtarget->isAtom()) { 12728 unsigned ExtendOp = 12729 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; 12730 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); 12731 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); 12732 } 12733 // Use SUB instead of CMP to enable CSE between SUB and CMP. 12734 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 12735 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 12736 Op0, Op1); 12737 return SDValue(Sub.getNode(), 1); 12738 } 12739 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 12740 } 12741 12742 /// Convert a comparison if required by the subtarget. 12743 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 12744 SelectionDAG &DAG) const { 12745 // If the subtarget does not support the FUCOMI instruction, floating-point 12746 // comparisons have to be converted. 12747 if (Subtarget->hasCMov() || 12748 Cmp.getOpcode() != X86ISD::CMP || 12749 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 12750 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 12751 return Cmp; 12752 12753 // The instruction selector will select an FUCOM instruction instead of 12754 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. 
  // Hence build an SDNode sequence that transfers the result from FPSW into
  // EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                            DAGCombinerInfo &DCI,
                                            unsigned &RefinementSteps,
                                            bool &UseOneConstNR) const {
  // FIXME: We should use instruction latency models to calculate the cost of
  // each potential sequence, but this is very hard to do reliably because
  // at least Intel's Core* chips have variable timing based on the number of
  // significant digits in the divisor and/or sqrt operand.
  if (!Subtarget->useSqrtEst())
    return SDValue();

  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
    RefinementSteps = 1;
    UseOneConstNR = false;
    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  return SDValue();
}

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                            DAGCombinerInfo &DCI,
                                            unsigned &RefinementSteps) const {
  // FIXME: We should use instruction latency models to calculate the cost of
  // each potential sequence, but this is very hard to do reliably because
  // at least Intel's Core* chips have variable timing based on the number of
  // significant digits in the divisor.
  if (!Subtarget->useReciprocalEst())
    return SDValue();

  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
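  // Note: each refinement step requested here is expanded later by the generic
  // DAG combiner into roughly one Newton-Raphson iteration,
  // x1 = x0 * (2 - d * x0) for an estimate x0 of 1/d, which approximately
  // doubles the number of correct bits per step.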
  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
    RefinementSteps = ReciprocalEstimateRefinementSteps;
    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}

/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
  return NumUsers > 1;
}

static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
/// node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     SDLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Zeros, Ones;
          DAG.computeKnownBits(Op0, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
            return SDValue();
        }
        LHS = Op1;
        RHS = Op0.getOperand(1);
      }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}

/// \brief Turns an ISD::CondCode into a value suitable for SSE floating-point
/// mask CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; // Fallthrough
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; // Fallthrough
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}

// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
// ones, and then concatenate the result back.
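// For example, a v8i32 setcc is split into two v4i32 setccs on the low and
// high halves, and their results are rejoined with CONCAT_VECTORS.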
12959 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 12960 MVT VT = Op.getSimpleValueType(); 12961 12962 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && 12963 "Unsupported value type for operation"); 12964 12965 unsigned NumElems = VT.getVectorNumElements(); 12966 SDLoc dl(Op); 12967 SDValue CC = Op.getOperand(2); 12968 12969 // Extract the LHS vectors 12970 SDValue LHS = Op.getOperand(0); 12971 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 12972 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 12973 12974 // Extract the RHS vectors 12975 SDValue RHS = Op.getOperand(1); 12976 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 12977 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 12978 12979 // Issue the operation on the smaller types and concatenate the result back 12980 MVT EltVT = VT.getVectorElementType(); 12981 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 12982 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 12983 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 12984 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 12985 } 12986 12987 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, 12988 const X86Subtarget *Subtarget) { 12989 SDValue Op0 = Op.getOperand(0); 12990 SDValue Op1 = Op.getOperand(1); 12991 SDValue CC = Op.getOperand(2); 12992 MVT VT = Op.getSimpleValueType(); 12993 SDLoc dl(Op); 12994 12995 assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && 12996 Op.getValueType().getScalarType() == MVT::i1 && 12997 "Cannot set masked compare for this operation"); 12998 12999 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 13000 unsigned Opc = 0; 13001 bool Unsigned = false; 13002 bool Swap = false; 13003 unsigned SSECC; 13004 switch (SetCCOpcode) { 13005 default: llvm_unreachable("Unexpected SETCC condition"); 13006 case ISD::SETNE: SSECC = 4; break; 13007 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; 13008 case ISD::SETUGT: SSECC = 6; Unsigned = true; break; 13009 case ISD::SETLT: Swap = true; //fall-through 13010 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; 13011 case ISD::SETULT: SSECC = 1; Unsigned = true; break; 13012 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT 13013 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap 13014 case ISD::SETULE: Unsigned = true; //fall-through 13015 case ISD::SETLE: SSECC = 2; break; 13016 } 13017 13018 if (Swap) 13019 std::swap(Op0, Op1); 13020 if (Opc) 13021 return DAG.getNode(Opc, dl, VT, Op0, Op1); 13022 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; 13023 return DAG.getNode(Opc, dl, VT, Op0, Op1, 13024 DAG.getConstant(SSECC, MVT::i8)); 13025 } 13026 13027 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second 13028 /// operand \p Op1. If non-trivial (for example because it's not constant) 13029 /// return an empty value. 13030 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) 13031 { 13032 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); 13033 if (!BV) 13034 return SDValue(); 13035 13036 MVT VT = Op1.getSimpleValueType(); 13037 MVT EVT = VT.getVectorElementType(); 13038 unsigned n = VT.getVectorNumElements(); 13039 SmallVector<SDValue, 8> ULTOp1; 13040 13041 for (unsigned i = 0; i < n; ++i) { 13042 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); 13043 if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) 13044 return SDValue(); 13045 13046 // Avoid underflow. 
13047 APInt Val = Elt->getAPIntValue(); 13048 if (Val == 0) 13049 return SDValue(); 13050 13051 ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); 13052 } 13053 13054 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); 13055 } 13056 13057 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, 13058 SelectionDAG &DAG) { 13059 SDValue Op0 = Op.getOperand(0); 13060 SDValue Op1 = Op.getOperand(1); 13061 SDValue CC = Op.getOperand(2); 13062 MVT VT = Op.getSimpleValueType(); 13063 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 13064 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); 13065 SDLoc dl(Op); 13066 13067 if (isFP) { 13068 #ifndef NDEBUG 13069 MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); 13070 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 13071 #endif 13072 13073 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); 13074 unsigned Opc = X86ISD::CMPP; 13075 if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { 13076 assert(VT.getVectorNumElements() <= 16); 13077 Opc = X86ISD::CMPM; 13078 } 13079 // In the two special cases we can't handle, emit two comparisons. 13080 if (SSECC == 8) { 13081 unsigned CC0, CC1; 13082 unsigned CombineOpc; 13083 if (SetCCOpcode == ISD::SETUEQ) { 13084 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; 13085 } else { 13086 assert(SetCCOpcode == ISD::SETONE); 13087 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; 13088 } 13089 13090 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, 13091 DAG.getConstant(CC0, MVT::i8)); 13092 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, 13093 DAG.getConstant(CC1, MVT::i8)); 13094 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 13095 } 13096 // Handle all other FP comparisons here. 13097 return DAG.getNode(Opc, dl, VT, Op0, Op1, 13098 DAG.getConstant(SSECC, MVT::i8)); 13099 } 13100 13101 // Break 256-bit integer vector compare into smaller ones. 13102 if (VT.is256BitVector() && !Subtarget->hasInt256()) 13103 return Lower256IntVSETCC(Op, DAG); 13104 13105 bool MaskResult = (VT.getVectorElementType() == MVT::i1); 13106 EVT OpVT = Op1.getValueType(); 13107 if (Subtarget->hasAVX512()) { 13108 if (Op1.getValueType().is512BitVector() || 13109 (Subtarget->hasBWI() && Subtarget->hasVLX()) || 13110 (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) 13111 return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); 13112 13113 // In AVX-512 architecture setcc returns mask with i1 elements, 13114 // But there is no compare instruction for i8 and i16 elements in KNL. 13115 // We are not talking about 512-bit operands in this case, these 13116 // types are illegal. 13117 if (MaskResult && 13118 (OpVT.getVectorElementType().getSizeInBits() < 32 && 13119 OpVT.getVectorElementType().getSizeInBits() >= 8)) 13120 return DAG.getNode(ISD::TRUNCATE, dl, VT, 13121 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); 13122 } 13123 13124 // We are handling one of the integer comparisons here. Since SSE only has 13125 // GT and EQ comparisons for integer, swapping operands and multiple 13126 // operations may be required for some comparisons. 
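  // For example, x u< y is rewritten below as PCMPGT on swapped, sign-flipped
  // operands (there is no unsigned integer compare), and x s<= y becomes the
  // logical-not of PCMPGT x, y.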
13127 unsigned Opc; 13128 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; 13129 bool Subus = false; 13130 13131 switch (SetCCOpcode) { 13132 default: llvm_unreachable("Unexpected SETCC condition"); 13133 case ISD::SETNE: Invert = true; 13134 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 13135 case ISD::SETLT: Swap = true; 13136 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 13137 case ISD::SETGE: Swap = true; 13138 case ISD::SETLE: Opc = X86ISD::PCMPGT; 13139 Invert = true; break; 13140 case ISD::SETULT: Swap = true; 13141 case ISD::SETUGT: Opc = X86ISD::PCMPGT; 13142 FlipSigns = true; break; 13143 case ISD::SETUGE: Swap = true; 13144 case ISD::SETULE: Opc = X86ISD::PCMPGT; 13145 FlipSigns = true; Invert = true; break; 13146 } 13147 13148 // Special case: Use min/max operations for SETULE/SETUGE 13149 MVT VET = VT.getVectorElementType(); 13150 bool hasMinMax = 13151 (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) 13152 || (Subtarget->hasSSE2() && (VET == MVT::i8)); 13153 13154 if (hasMinMax) { 13155 switch (SetCCOpcode) { 13156 default: break; 13157 case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; 13158 case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; 13159 } 13160 13161 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } 13162 } 13163 13164 bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); 13165 if (!MinMax && hasSubus) { 13166 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for 13167 // Op0 u<= Op1: 13168 // t = psubus Op0, Op1 13169 // pcmpeq t, <0..0> 13170 switch (SetCCOpcode) { 13171 default: break; 13172 case ISD::SETULT: { 13173 // If the comparison is against a constant we can turn this into a 13174 // setule. With psubus, setule does not require a swap. This is 13175 // beneficial because the constant in the register is no longer 13176 // destructed as the destination so it can be hoisted out of a loop. 13177 // Only do this pre-AVX since vpcmp* is no longer destructive. 13178 if (Subtarget->hasAVX()) 13179 break; 13180 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); 13181 if (ULEOp1.getNode()) { 13182 Op1 = ULEOp1; 13183 Subus = true; Invert = false; Swap = false; 13184 } 13185 break; 13186 } 13187 // Psubus is better than flip-sign because it requires no inversion. 13188 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; 13189 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; 13190 } 13191 13192 if (Subus) { 13193 Opc = X86ISD::SUBUS; 13194 FlipSigns = false; 13195 } 13196 } 13197 13198 if (Swap) 13199 std::swap(Op0, Op1); 13200 13201 // Check that the operation in question is available (most are plain SSE2, 13202 // but PCMPGTQ and PCMPEQQ have different requirements). 13203 if (VT == MVT::v2i64) { 13204 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { 13205 assert(Subtarget->hasSSE2() && "Don't know how to lower!"); 13206 13207 // First cast everything to the right type. 13208 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 13209 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 13210 13211 // Since SSE has no unsigned integer comparisons, we need to flip the sign 13212 // bits of the inputs before performing those operations. The lower 13213 // compare is always unsigned. 
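      // Flipping the sign bit turns an unsigned compare into a signed one:
      // x u< y holds exactly when (x ^ 0x80000000) s< (y ^ 0x80000000), so the
      // signed PCMPGT can be reused for the unsigned halves.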
13214 SDValue SB; 13215 if (FlipSigns) { 13216 SB = DAG.getConstant(0x80000000U, MVT::v4i32); 13217 } else { 13218 SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); 13219 SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); 13220 SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 13221 Sign, Zero, Sign, Zero); 13222 } 13223 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); 13224 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); 13225 13226 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) 13227 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); 13228 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); 13229 13230 // Create masks for only the low parts/high parts of the 64 bit integers. 13231 static const int MaskHi[] = { 1, 1, 3, 3 }; 13232 static const int MaskLo[] = { 0, 0, 2, 2 }; 13233 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); 13234 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); 13235 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); 13236 13237 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); 13238 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); 13239 13240 if (Invert) 13241 Result = DAG.getNOT(dl, Result, MVT::v4i32); 13242 13243 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 13244 } 13245 13246 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { 13247 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with 13248 // pcmpeqd + pshufd + pand. 13249 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); 13250 13251 // First cast everything to the right type. 13252 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 13253 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 13254 13255 // Do the compare. 13256 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); 13257 13258 // Make sure the lower and upper halves are both all-ones. 13259 static const int Mask[] = { 1, 0, 3, 2 }; 13260 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); 13261 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); 13262 13263 if (Invert) 13264 Result = DAG.getNOT(dl, Result, MVT::v4i32); 13265 13266 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 13267 } 13268 } 13269 13270 // Since SSE has no unsigned integer comparisons, we need to flip the sign 13271 // bits of the inputs before performing those operations. 13272 if (FlipSigns) { 13273 EVT EltVT = VT.getVectorElementType(); 13274 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); 13275 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); 13276 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); 13277 } 13278 13279 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 13280 13281 // If the logical-not of the result is required, perform that now. 
13282 if (Invert) 13283 Result = DAG.getNOT(dl, Result, VT); 13284 13285 if (MinMax) 13286 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); 13287 13288 if (Subus) 13289 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, 13290 getZeroVector(VT, Subtarget, DAG, dl)); 13291 13292 return Result; 13293 } 13294 13295 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 13296 13297 MVT VT = Op.getSimpleValueType(); 13298 13299 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); 13300 13301 assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) 13302 && "SetCC type must be 8-bit or 1-bit integer"); 13303 SDValue Op0 = Op.getOperand(0); 13304 SDValue Op1 = Op.getOperand(1); 13305 SDLoc dl(Op); 13306 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 13307 13308 // Optimize to BT if possible. 13309 // Lower (X & (1 << N)) == 0 to BT(X, N). 13310 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 13311 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 13312 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 13313 Op1.getOpcode() == ISD::Constant && 13314 cast<ConstantSDNode>(Op1)->isNullValue() && 13315 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 13316 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 13317 if (NewSetCC.getNode()) { 13318 if (VT == MVT::i1) 13319 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); 13320 return NewSetCC; 13321 } 13322 } 13323 13324 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 13325 // these. 13326 if (Op1.getOpcode() == ISD::Constant && 13327 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 13328 cast<ConstantSDNode>(Op1)->isNullValue()) && 13329 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 13330 13331 // If the input is a setcc, then reuse the input setcc or use a new one with 13332 // the inverted condition. 13333 if (Op0.getOpcode() == X86ISD::SETCC) { 13334 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 13335 bool Invert = (CC == ISD::SETNE) ^ 13336 cast<ConstantSDNode>(Op1)->isNullValue(); 13337 if (!Invert) 13338 return Op0; 13339 13340 CCode = X86::GetOppositeBranchCondition(CCode); 13341 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 13342 DAG.getConstant(CCode, MVT::i8), 13343 Op0.getOperand(1)); 13344 if (VT == MVT::i1) 13345 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); 13346 return SetCC; 13347 } 13348 } 13349 if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && 13350 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) && 13351 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 13352 13353 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); 13354 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC); 13355 } 13356 13357 bool isFP = Op1.getSimpleValueType().isFloatingPoint(); 13358 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 13359 if (X86CC == X86::COND_INVALID) 13360 return SDValue(); 13361 13362 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); 13363 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 13364 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 13365 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 13366 if (VT == MVT::i1) 13367 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); 13368 return SetCC; 13369 } 13370 13371 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
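// That is, a node that produces EFLAGS, either as its only result (CMP, COMI,
// UCOMI, SAHF) or as a secondary result of a flag-setting arithmetic node, so
// a following SETCC, CMOV, or branch can consume those flags directly.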
13372 static bool isX86LogicalCmp(SDValue Op) { 13373 unsigned Opc = Op.getNode()->getOpcode(); 13374 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 13375 Opc == X86ISD::SAHF) 13376 return true; 13377 if (Op.getResNo() == 1 && 13378 (Opc == X86ISD::ADD || 13379 Opc == X86ISD::SUB || 13380 Opc == X86ISD::ADC || 13381 Opc == X86ISD::SBB || 13382 Opc == X86ISD::SMUL || 13383 Opc == X86ISD::UMUL || 13384 Opc == X86ISD::INC || 13385 Opc == X86ISD::DEC || 13386 Opc == X86ISD::OR || 13387 Opc == X86ISD::XOR || 13388 Opc == X86ISD::AND)) 13389 return true; 13390 13391 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 13392 return true; 13393 13394 return false; 13395 } 13396 13397 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 13398 if (V.getOpcode() != ISD::TRUNCATE) 13399 return false; 13400 13401 SDValue VOp0 = V.getOperand(0); 13402 unsigned InBits = VOp0.getValueSizeInBits(); 13403 unsigned Bits = V.getValueSizeInBits(); 13404 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 13405 } 13406 13407 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 13408 bool addTest = true; 13409 SDValue Cond = Op.getOperand(0); 13410 SDValue Op1 = Op.getOperand(1); 13411 SDValue Op2 = Op.getOperand(2); 13412 SDLoc DL(Op); 13413 EVT VT = Op1.getValueType(); 13414 SDValue CC; 13415 13416 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops 13417 // are available or VBLENDV if AVX is available. 13418 // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 13419 if (Cond.getOpcode() == ISD::SETCC && 13420 ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || 13421 (Subtarget->hasSSE1() && VT == MVT::f32)) && 13422 VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { 13423 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); 13424 int SSECC = translateX86FSETCC( 13425 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); 13426 13427 if (SSECC != 8) { 13428 if (Subtarget->hasAVX512()) { 13429 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, 13430 DAG.getConstant(SSECC, MVT::i8)); 13431 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); 13432 } 13433 13434 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, 13435 DAG.getConstant(SSECC, MVT::i8)); 13436 13437 // If we have AVX, we can use a variable vector select (VBLENDV) instead 13438 // of 3 logic instructions for size savings and potentially speed. 13439 // Unfortunately, there is no scalar form of VBLENDV. 13440 13441 // If either operand is a constant, don't try this. We can expect to 13442 // optimize away at least one of the logic instructions later in that 13443 // case, so that sequence would be faster than a variable blend. 13444 13445 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly 13446 // uses XMM0 as the selection register. That may need just as many 13447 // instructions as the AND/ANDN/OR sequence due to register moves, so 13448 // don't bother. 13449 13450 if (Subtarget->hasAVX() && 13451 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { 13452 13453 // Convert to vectors, do a VSELECT, and convert back to scalar. 13454 // All of the conversions should be optimized away. 13455 13456 EVT VecVT = VT == MVT::f32 ? 
MVT::v4f32 : MVT::v2f64; 13457 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); 13458 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); 13459 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); 13460 13461 EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; 13462 VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp); 13463 13464 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); 13465 13466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, 13467 VSel, DAG.getIntPtrConstant(0)); 13468 } 13469 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); 13470 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); 13471 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); 13472 } 13473 } 13474 13475 if (Cond.getOpcode() == ISD::SETCC) { 13476 SDValue NewCond = LowerSETCC(Cond, DAG); 13477 if (NewCond.getNode()) 13478 Cond = NewCond; 13479 } 13480 13481 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 13482 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 13483 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 13484 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 13485 if (Cond.getOpcode() == X86ISD::SETCC && 13486 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 13487 isZero(Cond.getOperand(1).getOperand(1))) { 13488 SDValue Cmp = Cond.getOperand(1); 13489 13490 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 13491 13492 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 13493 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 13494 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 13495 13496 SDValue CmpOp0 = Cmp.getOperand(0); 13497 // Apply further optimizations for special cases 13498 // (select (x != 0), -1, 0) -> neg & sbb 13499 // (select (x == 0), 0, -1) -> neg & sbb 13500 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) 13501 if (YC->isNullValue() && 13502 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { 13503 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 13504 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 13505 DAG.getConstant(0, CmpOp0.getValueType()), 13506 CmpOp0); 13507 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 13508 DAG.getConstant(X86::COND_B, MVT::i8), 13509 SDValue(Neg.getNode(), 1)); 13510 return Res; 13511 } 13512 13513 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 13514 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 13515 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 13516 13517 SDValue Res = // Res = 0 or -1. 13518 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 13519 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 13520 13521 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 13522 Res = DAG.getNOT(DL, Res, Res.getValueType()); 13523 13524 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 13525 if (!N2C || !N2C->isNullValue()) 13526 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 13527 return Res; 13528 } 13529 } 13530 13531 // Look past (and (setcc_carry (cmp ...)), 1). 13532 if (Cond.getOpcode() == ISD::AND && 13533 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 13534 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 13535 if (C && C->getAPIntValue() == 1) 13536 Cond = Cond.getOperand(0); 13537 } 13538 13539 // If condition flag is set by a X86ISD::CMP, then use it as the condition 13540 // setting operand in place of the X86ISD::SETCC. 
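  // For example, (select (x86setcc cc, (x86cmp a, b)), t, f) becomes
  // (x86cmov f, t, cc, (x86cmp a, b)), reusing the flags of the existing
  // compare instead of emitting a separate TEST.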
13541 unsigned CondOpcode = Cond.getOpcode(); 13542 if (CondOpcode == X86ISD::SETCC || 13543 CondOpcode == X86ISD::SETCC_CARRY) { 13544 CC = Cond.getOperand(0); 13545 13546 SDValue Cmp = Cond.getOperand(1); 13547 unsigned Opc = Cmp.getOpcode(); 13548 MVT VT = Op.getSimpleValueType(); 13549 13550 bool IllegalFPCMov = false; 13551 if (VT.isFloatingPoint() && !VT.isVector() && 13552 !isScalarFPTypeInSSEReg(VT)) // FPStack? 13553 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 13554 13555 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 13556 Opc == X86ISD::BT) { // FIXME 13557 Cond = Cmp; 13558 addTest = false; 13559 } 13560 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 13561 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 13562 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 13563 Cond.getOperand(0).getValueType() != MVT::i8)) { 13564 SDValue LHS = Cond.getOperand(0); 13565 SDValue RHS = Cond.getOperand(1); 13566 unsigned X86Opcode; 13567 unsigned X86Cond; 13568 SDVTList VTs; 13569 switch (CondOpcode) { 13570 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 13571 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 13572 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 13573 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 13574 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 13575 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 13576 default: llvm_unreachable("unexpected overflowing operator"); 13577 } 13578 if (CondOpcode == ISD::UMULO) 13579 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 13580 MVT::i32); 13581 else 13582 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 13583 13584 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 13585 13586 if (CondOpcode == ISD::UMULO) 13587 Cond = X86Op.getValue(2); 13588 else 13589 Cond = X86Op.getValue(1); 13590 13591 CC = DAG.getConstant(X86Cond, MVT::i8); 13592 addTest = false; 13593 } 13594 13595 if (addTest) { 13596 // Look pass the truncate if the high bits are known zero. 13597 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 13598 Cond = Cond.getOperand(0); 13599 13600 // We know the result of AND is compared against zero. Try to match 13601 // it to BT. 13602 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 13603 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 13604 if (NewSetCC.getNode()) { 13605 CC = NewSetCC.getOperand(0); 13606 Cond = NewSetCC.getOperand(1); 13607 addTest = false; 13608 } 13609 } 13610 } 13611 13612 if (addTest) { 13613 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 13614 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); 13615 } 13616 13617 // a < b ? -1 : 0 -> RES = ~setcc_carry 13618 // a < b ? 0 : -1 -> RES = setcc_carry 13619 // a >= b ? -1 : 0 -> RES = setcc_carry 13620 // a >= b ? 
0 : -1 -> RES = ~setcc_carry 13621 if (Cond.getOpcode() == X86ISD::SUB) { 13622 Cond = ConvertCmpIfNecessary(Cond, DAG); 13623 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 13624 13625 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 13626 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 13627 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 13628 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 13629 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 13630 return DAG.getNOT(DL, Res, Res.getValueType()); 13631 return Res; 13632 } 13633 } 13634 13635 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate 13636 // widen the cmov and push the truncate through. This avoids introducing a new 13637 // branch during isel and doesn't add any extensions. 13638 if (Op.getValueType() == MVT::i8 && 13639 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { 13640 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); 13641 if (T1.getValueType() == T2.getValueType() && 13642 // Blacklist CopyFromReg to avoid partial register stalls. 13643 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ 13644 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); 13645 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); 13646 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); 13647 } 13648 } 13649 13650 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 13651 // condition is true. 13652 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 13653 SDValue Ops[] = { Op2, Op1, CC, Cond }; 13654 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); 13655 } 13656 13657 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, 13658 SelectionDAG &DAG) { 13659 MVT VT = Op->getSimpleValueType(0); 13660 SDValue In = Op->getOperand(0); 13661 MVT InVT = In.getSimpleValueType(); 13662 MVT VTElt = VT.getVectorElementType(); 13663 MVT InVTElt = InVT.getVectorElementType(); 13664 SDLoc dl(Op); 13665 13666 // SKX processor 13667 if ((InVTElt == MVT::i1) && 13668 (((Subtarget->hasBWI() && Subtarget->hasVLX() && 13669 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || 13670 13671 ((Subtarget->hasBWI() && VT.is512BitVector() && 13672 VTElt.getSizeInBits() <= 16)) || 13673 13674 ((Subtarget->hasDQI() && Subtarget->hasVLX() && 13675 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || 13676 13677 ((Subtarget->hasDQI() && VT.is512BitVector() && 13678 VTElt.getSizeInBits() >= 32)))) 13679 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 13680 13681 unsigned int NumElts = VT.getVectorNumElements(); 13682 13683 if (NumElts != 8 && NumElts != 16) 13684 return SDValue(); 13685 13686 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { 13687 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) 13688 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); 13689 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 13690 } 13691 13692 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13693 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 13694 13695 MVT ExtVT = (NumElts == 8) ? 
MVT::v8i64 : MVT::v16i32; 13696 Constant *C = ConstantInt::get(*DAG.getContext(), 13697 APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); 13698 13699 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 13700 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 13701 SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, 13702 MachinePointerInfo::getConstantPool(), 13703 false, false, false, Alignment); 13704 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); 13705 if (VT.is512BitVector()) 13706 return Brcst; 13707 return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); 13708 } 13709 13710 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 13711 SelectionDAG &DAG) { 13712 MVT VT = Op->getSimpleValueType(0); 13713 SDValue In = Op->getOperand(0); 13714 MVT InVT = In.getSimpleValueType(); 13715 SDLoc dl(Op); 13716 13717 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) 13718 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); 13719 13720 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && 13721 (VT != MVT::v8i32 || InVT != MVT::v8i16) && 13722 (VT != MVT::v16i16 || InVT != MVT::v16i8)) 13723 return SDValue(); 13724 13725 if (Subtarget->hasInt256()) 13726 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 13727 13728 // Optimize vectors in AVX mode 13729 // Sign extend v8i16 to v8i32 and 13730 // v4i32 to v4i64 13731 // 13732 // Divide input vector into two parts 13733 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 13734 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 13735 // concat the vectors to original VT 13736 13737 unsigned NumElems = InVT.getVectorNumElements(); 13738 SDValue Undef = DAG.getUNDEF(InVT); 13739 13740 SmallVector<int,8> ShufMask1(NumElems, -1); 13741 for (unsigned i = 0; i != NumElems/2; ++i) 13742 ShufMask1[i] = i; 13743 13744 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); 13745 13746 SmallVector<int,8> ShufMask2(NumElems, -1); 13747 for (unsigned i = 0; i != NumElems/2; ++i) 13748 ShufMask2[i] = i + NumElems/2; 13749 13750 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); 13751 13752 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), 13753 VT.getVectorNumElements()/2); 13754 13755 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); 13756 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); 13757 13758 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 13759 } 13760 13761 // Lower vector extended loads using a shuffle. If SSSE3 is not available we 13762 // may emit an illegal shuffle but the expansion is still better than scalar 13763 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise 13764 // we'll emit a shuffle and a arithmetic shift. 13765 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. 13766 // TODO: It is possible to support ZExt by zeroing the undef values during 13767 // the shuffle phase or after the shuffle. 13768 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, 13769 SelectionDAG &DAG) { 13770 MVT RegVT = Op.getSimpleValueType(); 13771 assert(RegVT.isVector() && "We only custom lower vector sext loads."); 13772 assert(RegVT.isInteger() && 13773 "We only custom lower integer vector sext loads."); 13774 13775 // Nothing useful we can do without SSE2 shuffles. 
13776 assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); 13777 13778 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); 13779 SDLoc dl(Ld); 13780 EVT MemVT = Ld->getMemoryVT(); 13781 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13782 unsigned RegSz = RegVT.getSizeInBits(); 13783 13784 ISD::LoadExtType Ext = Ld->getExtensionType(); 13785 13786 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) 13787 && "Only anyext and sext are currently implemented."); 13788 assert(MemVT != RegVT && "Cannot extend to the same type"); 13789 assert(MemVT.isVector() && "Must load a vector from memory"); 13790 13791 unsigned NumElems = RegVT.getVectorNumElements(); 13792 unsigned MemSz = MemVT.getSizeInBits(); 13793 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 13794 13795 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { 13796 // The only way in which we have a legal 256-bit vector result but not the 13797 // integer 256-bit operations needed to directly lower a sextload is if we 13798 // have AVX1 but not AVX2. In that case, we can always emit a sextload to 13799 // a 128-bit vector and a normal sign_extend to 256-bits that should get 13800 // correctly legalized. We do this late to allow the canonical form of 13801 // sextload to persist throughout the rest of the DAG combiner -- it wants 13802 // to fold together any extensions it can, and so will fuse a sign_extend 13803 // of an sextload into a sextload targeting a wider value. 13804 SDValue Load; 13805 if (MemSz == 128) { 13806 // Just switch this to a normal load. 13807 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " 13808 "it must be a legal 128-bit vector " 13809 "type!"); 13810 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), 13811 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), 13812 Ld->isInvariant(), Ld->getAlignment()); 13813 } else { 13814 assert(MemSz < 128 && 13815 "Can't extend a type wider than 128 bits to a 256 bit vector!"); 13816 // Do an sext load to a 128-bit vector type. We want to use the same 13817 // number of elements, but elements half as wide. This will end up being 13818 // recursively lowered by this routine, but will succeed as we definitely 13819 // have all the necessary features if we're using AVX1. 13820 EVT HalfEltVT = 13821 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); 13822 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); 13823 Load = 13824 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), 13825 Ld->getPointerInfo(), MemVT, Ld->isVolatile(), 13826 Ld->isNonTemporal(), Ld->isInvariant(), 13827 Ld->getAlignment()); 13828 } 13829 13830 // Replace chain users with the new chain. 13831 assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); 13832 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); 13833 13834 // Finally, do a normal sign-extend to the desired register. 13835 return DAG.getSExtOrTrunc(Load, dl, RegVT); 13836 } 13837 13838 // All sizes must be a power of two. 13839 assert(isPowerOf2_32(RegSz * MemSz * NumElems) && 13840 "Non-power-of-two elements are not custom lowered!"); 13841 13842 // Attempt to load the original value using scalar loads. 13843 // Find the largest scalar type that divides the total loaded size. 
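  // For example, a sextload of v4i8 (MemSz == 32) on a target where i32 is
  // legal picks SclrLoadTy == i32, so the whole memory value is fetched with a
  // single 32-bit scalar load and then widened below.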
13844 MVT SclrLoadTy = MVT::i8; 13845 for (MVT Tp : MVT::integer_valuetypes()) { 13846 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 13847 SclrLoadTy = Tp; 13848 } 13849 } 13850 13851 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 13852 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 13853 (64 <= MemSz)) 13854 SclrLoadTy = MVT::f64; 13855 13856 // Calculate the number of scalar loads that we need to perform 13857 // in order to load our vector from memory. 13858 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 13859 13860 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && 13861 "Can only lower sext loads with a single scalar load!"); 13862 13863 unsigned loadRegZize = RegSz; 13864 if (Ext == ISD::SEXTLOAD && RegSz == 256) 13865 loadRegZize /= 2; 13866 13867 // Represent our vector as a sequence of elements which are the 13868 // largest scalar that we can load. 13869 EVT LoadUnitVecVT = EVT::getVectorVT( 13870 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); 13871 13872 // Represent the data using the same element type that is stored in 13873 // memory. In practice, we ''widen'' MemVT. 13874 EVT WideVecVT = 13875 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 13876 loadRegZize / MemVT.getScalarType().getSizeInBits()); 13877 13878 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 13879 "Invalid vector type"); 13880 13881 // We can't shuffle using an illegal type. 13882 assert(TLI.isTypeLegal(WideVecVT) && 13883 "We only lower types that form legal widened vector types"); 13884 13885 SmallVector<SDValue, 8> Chains; 13886 SDValue Ptr = Ld->getBasePtr(); 13887 SDValue Increment = 13888 DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); 13889 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 13890 13891 for (unsigned i = 0; i < NumLoads; ++i) { 13892 // Perform a single load. 13893 SDValue ScalarLoad = 13894 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 13895 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), 13896 Ld->getAlignment()); 13897 Chains.push_back(ScalarLoad.getValue(1)); 13898 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 13899 // another round of DAGCombining. 13900 if (i == 0) 13901 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 13902 else 13903 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 13904 ScalarLoad, DAG.getIntPtrConstant(i)); 13905 13906 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 13907 } 13908 13909 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 13910 13911 // Bitcast the loaded value to a vector of the original element type, in 13912 // the size of the target vector type. 13913 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); 13914 unsigned SizeRatio = RegSz / MemSz; 13915 13916 if (Ext == ISD::SEXTLOAD) { 13917 // If we have SSE4.1, we can directly emit a VSEXT node. 13918 if (Subtarget->hasSSE41()) { 13919 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); 13920 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 13921 return Sext; 13922 } 13923 13924 // Otherwise we'll shuffle the small elements in the high bits of the 13925 // larger type and perform an arithmetic shift. If the shift is not legal 13926 // it's better to scalarize. 
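    // For example, a v4i8 -> v4i32 sextload is handled here by shuffling each
    // loaded byte into the top byte of its i32 lane and then arithmetic-
    // shifting each lane right by 24 to sign-extend it.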
13927 assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && 13928 "We can't implement a sext load without an arithmetic right shift!"); 13929 13930 // Redistribute the loaded elements into the different locations. 13931 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 13932 for (unsigned i = 0; i != NumElems; ++i) 13933 ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; 13934 13935 SDValue Shuff = DAG.getVectorShuffle( 13936 WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); 13937 13938 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 13939 13940 // Build the arithmetic shift. 13941 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - 13942 MemVT.getVectorElementType().getSizeInBits(); 13943 Shuff = 13944 DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); 13945 13946 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 13947 return Shuff; 13948 } 13949 13950 // Redistribute the loaded elements into the different locations. 13951 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 13952 for (unsigned i = 0; i != NumElems; ++i) 13953 ShuffleVec[i * SizeRatio] = i; 13954 13955 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 13956 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); 13957 13958 // Bitcast to the requested type. 13959 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 13960 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 13961 return Shuff; 13962 } 13963 13964 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 13965 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 13966 // from the AND / OR. 13967 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 13968 Opc = Op.getOpcode(); 13969 if (Opc != ISD::OR && Opc != ISD::AND) 13970 return false; 13971 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 13972 Op.getOperand(0).hasOneUse() && 13973 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 13974 Op.getOperand(1).hasOneUse()); 13975 } 13976 13977 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 13978 // 1 and that the SETCC node has a single use. 13979 static bool isXor1OfSetCC(SDValue Op) { 13980 if (Op.getOpcode() != ISD::XOR) 13981 return false; 13982 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 13983 if (N1C && N1C->getAPIntValue() == 1) { 13984 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 13985 Op.getOperand(0).hasOneUse(); 13986 } 13987 return false; 13988 } 13989 13990 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 13991 bool addTest = true; 13992 SDValue Chain = Op.getOperand(0); 13993 SDValue Cond = Op.getOperand(1); 13994 SDValue Dest = Op.getOperand(2); 13995 SDLoc dl(Op); 13996 SDValue CC; 13997 bool Inverted = false; 13998 13999 if (Cond.getOpcode() == ISD::SETCC) { 14000 // Check for setcc([su]{add,sub,mul}o == 0). 
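    // That is, a branch on "the op did not overflow", e.g.
    // (brcond (setcc (saddo x, y):1, 0, seteq), dest); we strip the setcc here
    // and invert the X86 overflow condition further below instead of emitting
    // a separate test.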
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isa<ConstantSDNode>(Cond.getOperand(1)) &&
        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      SDValue NewCond = LowerSETCC(Cond, DAG);
      if (NewCond.getNode())
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
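    // For example, (saddo x, 1) is lowered to X86ISD::INC below; if LowerXALUO
    // produced X86ISD::ADD for the same node, the two forms would not CSE and
    // the redundant one could not be removed afterwards.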
14075 switch (CondOpcode) { 14076 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 14077 case ISD::SADDO: 14078 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 14079 if (C->isOne()) { 14080 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; 14081 break; 14082 } 14083 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 14084 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 14085 case ISD::SSUBO: 14086 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 14087 if (C->isOne()) { 14088 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; 14089 break; 14090 } 14091 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 14092 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 14093 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 14094 default: llvm_unreachable("unexpected overflowing operator"); 14095 } 14096 if (Inverted) 14097 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 14098 if (CondOpcode == ISD::UMULO) 14099 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 14100 MVT::i32); 14101 else 14102 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 14103 14104 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 14105 14106 if (CondOpcode == ISD::UMULO) 14107 Cond = X86Op.getValue(2); 14108 else 14109 Cond = X86Op.getValue(1); 14110 14111 CC = DAG.getConstant(X86Cond, MVT::i8); 14112 addTest = false; 14113 } else { 14114 unsigned CondOpc; 14115 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 14116 SDValue Cmp = Cond.getOperand(0).getOperand(1); 14117 if (CondOpc == ISD::OR) { 14118 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 14119 // two branches instead of an explicit OR instruction with a 14120 // separate test. 14121 if (Cmp == Cond.getOperand(1).getOperand(1) && 14122 isX86LogicalCmp(Cmp)) { 14123 CC = Cond.getOperand(0).getOperand(0); 14124 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14125 Chain, Dest, CC, Cmp); 14126 CC = Cond.getOperand(1).getOperand(0); 14127 Cond = Cmp; 14128 addTest = false; 14129 } 14130 } else { // ISD::AND 14131 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 14132 // two branches instead of an explicit AND instruction with a 14133 // separate test. However, we only do this if this block doesn't 14134 // have a fall-through edge, because this requires an explicit 14135 // jmp when the condition is false. 14136 if (Cmp == Cond.getOperand(1).getOperand(1) && 14137 isX86LogicalCmp(Cmp) && 14138 Op.getNode()->hasOneUse()) { 14139 X86::CondCode CCode = 14140 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 14141 CCode = X86::GetOppositeBranchCondition(CCode); 14142 CC = DAG.getConstant(CCode, MVT::i8); 14143 SDNode *User = *Op.getNode()->use_begin(); 14144 // Look for an unconditional branch following this conditional branch. 14145 // We need this because we need to reverse the successors in order 14146 // to implement FCMP_OEQ. 
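          // Shape of the rewrite performed below (informational): given
          //   brcond (and (setcc cc0, %cmp), (setcc cc1, %cmp)), %dest
          // followed by an unconditional branch to %fallthrough, we emit
          //   jcc !cc0 -> %fallthrough
          //   jcc !cc1 -> %fallthrough
          //   jmp      -> %dest
          // i.e. both conditions are negated and target the original false
          // successor, so no explicit AND of the two setcc results is needed.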
14147 if (User->getOpcode() == ISD::BR) { 14148 SDValue FalseBB = User->getOperand(1); 14149 SDNode *NewBR = 14150 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 14151 assert(NewBR == User); 14152 (void)NewBR; 14153 Dest = FalseBB; 14154 14155 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14156 Chain, Dest, CC, Cmp); 14157 X86::CondCode CCode = 14158 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 14159 CCode = X86::GetOppositeBranchCondition(CCode); 14160 CC = DAG.getConstant(CCode, MVT::i8); 14161 Cond = Cmp; 14162 addTest = false; 14163 } 14164 } 14165 } 14166 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 14167 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 14168 // It should be transformed during dag combiner except when the condition 14169 // is set by a arithmetics with overflow node. 14170 X86::CondCode CCode = 14171 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 14172 CCode = X86::GetOppositeBranchCondition(CCode); 14173 CC = DAG.getConstant(CCode, MVT::i8); 14174 Cond = Cond.getOperand(0).getOperand(1); 14175 addTest = false; 14176 } else if (Cond.getOpcode() == ISD::SETCC && 14177 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 14178 // For FCMP_OEQ, we can emit 14179 // two branches instead of an explicit AND instruction with a 14180 // separate test. However, we only do this if this block doesn't 14181 // have a fall-through edge, because this requires an explicit 14182 // jmp when the condition is false. 14183 if (Op.getNode()->hasOneUse()) { 14184 SDNode *User = *Op.getNode()->use_begin(); 14185 // Look for an unconditional branch following this conditional branch. 14186 // We need this because we need to reverse the successors in order 14187 // to implement FCMP_OEQ. 14188 if (User->getOpcode() == ISD::BR) { 14189 SDValue FalseBB = User->getOperand(1); 14190 SDNode *NewBR = 14191 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 14192 assert(NewBR == User); 14193 (void)NewBR; 14194 Dest = FalseBB; 14195 14196 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 14197 Cond.getOperand(0), Cond.getOperand(1)); 14198 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 14199 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 14200 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14201 Chain, Dest, CC, Cmp); 14202 CC = DAG.getConstant(X86::COND_P, MVT::i8); 14203 Cond = Cmp; 14204 addTest = false; 14205 } 14206 } 14207 } else if (Cond.getOpcode() == ISD::SETCC && 14208 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 14209 // For FCMP_UNE, we can emit 14210 // two branches instead of an explicit AND instruction with a 14211 // separate test. However, we only do this if this block doesn't 14212 // have a fall-through edge, because this requires an explicit 14213 // jmp when the condition is false. 14214 if (Op.getNode()->hasOneUse()) { 14215 SDNode *User = *Op.getNode()->use_begin(); 14216 // Look for an unconditional branch following this conditional branch. 14217 // We need this because we need to reverse the successors in order 14218 // to implement FCMP_UNE. 
14219 if (User->getOpcode() == ISD::BR) { 14220 SDValue FalseBB = User->getOperand(1); 14221 SDNode *NewBR = 14222 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 14223 assert(NewBR == User); 14224 (void)NewBR; 14225 14226 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 14227 Cond.getOperand(0), Cond.getOperand(1)); 14228 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 14229 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 14230 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14231 Chain, Dest, CC, Cmp); 14232 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 14233 Cond = Cmp; 14234 addTest = false; 14235 Dest = FalseBB; 14236 } 14237 } 14238 } 14239 } 14240 14241 if (addTest) { 14242 // Look pass the truncate if the high bits are known zero. 14243 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 14244 Cond = Cond.getOperand(0); 14245 14246 // We know the result of AND is compared against zero. Try to match 14247 // it to BT. 14248 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 14249 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 14250 if (NewSetCC.getNode()) { 14251 CC = NewSetCC.getOperand(0); 14252 Cond = NewSetCC.getOperand(1); 14253 addTest = false; 14254 } 14255 } 14256 } 14257 14258 if (addTest) { 14259 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; 14260 CC = DAG.getConstant(X86Cond, MVT::i8); 14261 Cond = EmitTest(Cond, X86Cond, dl, DAG); 14262 } 14263 Cond = ConvertCmpIfNecessary(Cond, DAG); 14264 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14265 Chain, Dest, CC, Cond); 14266 } 14267 14268 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 14269 // Calls to _alloca are needed to probe the stack when allocating more than 4k 14270 // bytes in one go. Touching the stack at 4K increments is necessary to ensure 14271 // that the guard pages used by the OS virtual memory manager are allocated in 14272 // correct sequence. 14273 SDValue 14274 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 14275 SelectionDAG &DAG) const { 14276 MachineFunction &MF = DAG.getMachineFunction(); 14277 bool SplitStack = MF.shouldSplitStack(); 14278 bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || 14279 SplitStack; 14280 SDLoc dl(Op); 14281 14282 if (!Lower) { 14283 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14284 SDNode* Node = Op.getNode(); 14285 14286 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); 14287 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" 14288 " not tell us which reg is the stack pointer!"); 14289 EVT VT = Node->getValueType(0); 14290 SDValue Tmp1 = SDValue(Node, 0); 14291 SDValue Tmp2 = SDValue(Node, 1); 14292 SDValue Tmp3 = Node->getOperand(2); 14293 SDValue Chain = Tmp1.getOperand(0); 14294 14295 // Chain the dynamic stack allocation so that it doesn't modify the stack 14296 // pointer when other instructions are using the stack. 
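    // Generic expansion built below when no stack probing is required
    // (informational sketch):
    //   callseq_start
    //   %sp  = copyfromreg SP
    //   %new = sub %sp, %size
    //   %new = and %new, -align     ; only when align > the TFI stack alignment
    //   copytoreg SP, %new
    //   callseq_end
    // The node's results are then { %new, chain }.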
14297     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
14298                                  SDLoc(Node));
14299 
14300     SDValue Size = Tmp2.getOperand(1);
14301     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
14302     Chain = SP.getValue(1);
14303     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
14304     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
14305     unsigned StackAlign = TFI.getStackAlignment();
14306     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
14307     if (Align > StackAlign)
14308       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
14309                          DAG.getConstant(-(uint64_t)Align, VT));
14310     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
14311 
14312     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
14313                               DAG.getIntPtrConstant(0, true), SDValue(),
14314                               SDLoc(Node));
14315 
14316     SDValue Ops[2] = { Tmp1, Tmp2 };
14317     return DAG.getMergeValues(Ops, dl);
14318   }
14319 
14320   // Get the inputs.
14321   SDValue Chain = Op.getOperand(0);
14322   SDValue Size = Op.getOperand(1);
14323   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
14324   EVT VT = Op.getNode()->getValueType(0);
14325 
14326   bool Is64Bit = Subtarget->is64Bit();
14327   EVT SPTy = getPointerTy();
14328 
14329   if (SplitStack) {
14330     MachineRegisterInfo &MRI = MF.getRegInfo();
14331 
14332     if (Is64Bit) {
14333       // The 64-bit implementation of segmented stacks needs to clobber both r10 and
14334       // r11. This makes it impossible to use it along with nested parameters.
14335       const Function *F = MF.getFunction();
14336 
14337       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
14338            I != E; ++I)
14339         if (I->hasNestAttr())
14340           report_fatal_error("Cannot use segmented stacks with functions that "
14341                              "have nested arguments.");
14342     }
14343 
14344     const TargetRegisterClass *AddrRegClass =
14345       getRegClassFor(getPointerTy());
14346     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
14347     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
14348     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
14349                                 DAG.getRegister(Vreg, SPTy));
14350     SDValue Ops1[2] = { Value, Chain };
14351     return DAG.getMergeValues(Ops1, dl);
14352   } else {
14353     SDValue Flag;
14354     const unsigned Reg = (Subtarget->isTarget64BitLP64() ?
X86::RAX : X86::EAX); 14355 14356 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 14357 Flag = Chain.getValue(1); 14358 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 14359 14360 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 14361 14362 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 14363 unsigned SPReg = RegInfo->getStackRegister(); 14364 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); 14365 Chain = SP.getValue(1); 14366 14367 if (Align) { 14368 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 14369 DAG.getConstant(-(uint64_t)Align, VT)); 14370 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); 14371 } 14372 14373 SDValue Ops1[2] = { SP, Chain }; 14374 return DAG.getMergeValues(Ops1, dl); 14375 } 14376 } 14377 14378 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 14379 MachineFunction &MF = DAG.getMachineFunction(); 14380 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 14381 14382 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 14383 SDLoc DL(Op); 14384 14385 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 14386 // vastart just stores the address of the VarArgsFrameIndex slot into the 14387 // memory location argument. 14388 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 14389 getPointerTy()); 14390 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 14391 MachinePointerInfo(SV), false, false, 0); 14392 } 14393 14394 // __va_list_tag: 14395 // gp_offset (0 - 6 * 8) 14396 // fp_offset (48 - 48 + 8 * 16) 14397 // overflow_arg_area (point to parameters coming in memory). 14398 // reg_save_area 14399 SmallVector<SDValue, 8> MemOps; 14400 SDValue FIN = Op.getOperand(1); 14401 // Store gp_offset 14402 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 14403 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 14404 MVT::i32), 14405 FIN, MachinePointerInfo(SV), false, false, 0); 14406 MemOps.push_back(Store); 14407 14408 // Store fp_offset 14409 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 14410 FIN, DAG.getIntPtrConstant(4)); 14411 Store = DAG.getStore(Op.getOperand(0), DL, 14412 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 14413 MVT::i32), 14414 FIN, MachinePointerInfo(SV, 4), false, false, 0); 14415 MemOps.push_back(Store); 14416 14417 // Store ptr to overflow_arg_area 14418 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 14419 FIN, DAG.getIntPtrConstant(4)); 14420 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 14421 getPointerTy()); 14422 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 14423 MachinePointerInfo(SV, 8), 14424 false, false, 0); 14425 MemOps.push_back(Store); 14426 14427 // Store ptr to reg_save_area. 
14428 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 14429 FIN, DAG.getIntPtrConstant(8)); 14430 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 14431 getPointerTy()); 14432 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 14433 MachinePointerInfo(SV, 16), false, false, 0); 14434 MemOps.push_back(Store); 14435 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 14436 } 14437 14438 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 14439 assert(Subtarget->is64Bit() && 14440 "LowerVAARG only handles 64-bit va_arg!"); 14441 assert((Subtarget->isTargetLinux() || 14442 Subtarget->isTargetDarwin()) && 14443 "Unhandled target in LowerVAARG"); 14444 assert(Op.getNode()->getNumOperands() == 4); 14445 SDValue Chain = Op.getOperand(0); 14446 SDValue SrcPtr = Op.getOperand(1); 14447 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 14448 unsigned Align = Op.getConstantOperandVal(3); 14449 SDLoc dl(Op); 14450 14451 EVT ArgVT = Op.getNode()->getValueType(0); 14452 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 14453 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 14454 uint8_t ArgMode; 14455 14456 // Decide which area this value should be read from. 14457 // TODO: Implement the AMD64 ABI in its entirety. This simple 14458 // selection mechanism works only for the basic types. 14459 if (ArgVT == MVT::f80) { 14460 llvm_unreachable("va_arg for f80 not yet implemented"); 14461 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 14462 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 14463 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 14464 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 14465 } else { 14466 llvm_unreachable("Unhandled argument type in LowerVAARG"); 14467 } 14468 14469 if (ArgMode == 2) { 14470 // Sanity Check: Make sure using fp_offset makes sense. 14471 assert(!DAG.getTarget().Options.UseSoftFloat && 14472 !(DAG.getMachineFunction().getFunction()->hasFnAttribute( 14473 Attribute::NoImplicitFloat)) && 14474 Subtarget->hasSSE1()); 14475 } 14476 14477 // Insert VAARG_64 node into the DAG 14478 // VAARG_64 returns two values: Variable Argument Address, Chain 14479 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, MVT::i32), 14480 DAG.getConstant(ArgMode, MVT::i8), 14481 DAG.getConstant(Align, MVT::i32)}; 14482 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 14483 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 14484 VTs, InstOps, MVT::i64, 14485 MachinePointerInfo(SV), 14486 /*Align=*/0, 14487 /*Volatile=*/false, 14488 /*ReadMem=*/true, 14489 /*WriteMem=*/true); 14490 Chain = VAARG.getValue(1); 14491 14492 // Load the next argument and return it 14493 return DAG.getLoad(ArgVT, dl, 14494 Chain, 14495 VAARG, 14496 MachinePointerInfo(), 14497 false, false, false, 0); 14498 } 14499 14500 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 14501 SelectionDAG &DAG) { 14502 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
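  // Equivalent C-level view of that struct (informational; offsets match the
  // stores emitted by LowerVASTART above):
  //   struct __va_list_tag {
  //     unsigned gp_offset;          // offset 0
  //     unsigned fp_offset;          // offset 4
  //     void    *overflow_arg_area;  // offset 8
  //     void    *reg_save_area;      // offset 16
  //   };                             // 24 bytes, hence the memcpy of 24 below.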
14503 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 14504 SDValue Chain = Op.getOperand(0); 14505 SDValue DstPtr = Op.getOperand(1); 14506 SDValue SrcPtr = Op.getOperand(2); 14507 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 14508 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 14509 SDLoc DL(Op); 14510 14511 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 14512 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 14513 false, false, 14514 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 14515 } 14516 14517 // getTargetVShiftByConstNode - Handle vector element shifts where the shift 14518 // amount is a constant. Takes immediate version of shift as input. 14519 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, 14520 SDValue SrcOp, uint64_t ShiftAmt, 14521 SelectionDAG &DAG) { 14522 MVT ElementType = VT.getVectorElementType(); 14523 14524 // Fold this packed shift into its first operand if ShiftAmt is 0. 14525 if (ShiftAmt == 0) 14526 return SrcOp; 14527 14528 // Check for ShiftAmt >= element width 14529 if (ShiftAmt >= ElementType.getSizeInBits()) { 14530 if (Opc == X86ISD::VSRAI) 14531 ShiftAmt = ElementType.getSizeInBits() - 1; 14532 else 14533 return DAG.getConstant(0, VT); 14534 } 14535 14536 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) 14537 && "Unknown target vector shift-by-constant node"); 14538 14539 // Fold this packed vector shift into a build vector if SrcOp is a 14540 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 14541 if (VT == SrcOp.getSimpleValueType() && 14542 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { 14543 SmallVector<SDValue, 8> Elts; 14544 unsigned NumElts = SrcOp->getNumOperands(); 14545 ConstantSDNode *ND; 14546 14547 switch(Opc) { 14548 default: llvm_unreachable(nullptr); 14549 case X86ISD::VSHLI: 14550 for (unsigned i=0; i!=NumElts; ++i) { 14551 SDValue CurrentOp = SrcOp->getOperand(i); 14552 if (CurrentOp->getOpcode() == ISD::UNDEF) { 14553 Elts.push_back(CurrentOp); 14554 continue; 14555 } 14556 ND = cast<ConstantSDNode>(CurrentOp); 14557 const APInt &C = ND->getAPIntValue(); 14558 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType)); 14559 } 14560 break; 14561 case X86ISD::VSRLI: 14562 for (unsigned i=0; i!=NumElts; ++i) { 14563 SDValue CurrentOp = SrcOp->getOperand(i); 14564 if (CurrentOp->getOpcode() == ISD::UNDEF) { 14565 Elts.push_back(CurrentOp); 14566 continue; 14567 } 14568 ND = cast<ConstantSDNode>(CurrentOp); 14569 const APInt &C = ND->getAPIntValue(); 14570 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType)); 14571 } 14572 break; 14573 case X86ISD::VSRAI: 14574 for (unsigned i=0; i!=NumElts; ++i) { 14575 SDValue CurrentOp = SrcOp->getOperand(i); 14576 if (CurrentOp->getOpcode() == ISD::UNDEF) { 14577 Elts.push_back(CurrentOp); 14578 continue; 14579 } 14580 ND = cast<ConstantSDNode>(CurrentOp); 14581 const APInt &C = ND->getAPIntValue(); 14582 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType)); 14583 } 14584 break; 14585 } 14586 14587 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); 14588 } 14589 14590 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); 14591 } 14592 14593 // getTargetVShiftNode - Handle vector element shifts where the shift amount 14594 // may or may not be a constant. Takes immediate version of shift as input. 
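// For example (informational): a non-constant shift amount such as
//   x86isd::vshli v4i32 %v, i32 %amt
// is rewritten below into
//   %vamt = build_vector %amt, 0, undef, undef    ; count in the low 64 bits
//   x86isd::vshl v4i32 %v, (bitcast v4i32 %vamt)
// because the SSE/AVX packed shifts take their count from the low 64 bits of
// an XMM register rather than from a GPR.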
14595 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, 14596 SDValue SrcOp, SDValue ShAmt, 14597 SelectionDAG &DAG) { 14598 MVT SVT = ShAmt.getSimpleValueType(); 14599 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); 14600 14601 // Catch shift-by-constant. 14602 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) 14603 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, 14604 CShAmt->getZExtValue(), DAG); 14605 14606 // Change opcode to non-immediate version 14607 switch (Opc) { 14608 default: llvm_unreachable("Unknown target vector shift node"); 14609 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 14610 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 14611 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 14612 } 14613 14614 const X86Subtarget &Subtarget = 14615 static_cast<const X86Subtarget &>(DAG.getSubtarget()); 14616 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && 14617 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { 14618 // Let the shuffle legalizer expand this shift amount node. 14619 SDValue Op0 = ShAmt.getOperand(0); 14620 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); 14621 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); 14622 } else { 14623 // Need to build a vector containing shift amount. 14624 // SSE/AVX packed shifts only use the lower 64-bit of the shift count. 14625 SmallVector<SDValue, 4> ShOps; 14626 ShOps.push_back(ShAmt); 14627 if (SVT == MVT::i32) { 14628 ShOps.push_back(DAG.getConstant(0, SVT)); 14629 ShOps.push_back(DAG.getUNDEF(SVT)); 14630 } 14631 ShOps.push_back(DAG.getUNDEF(SVT)); 14632 14633 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; 14634 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); 14635 } 14636 14637 // The return type has to be a 128-bit type with the same element 14638 // type as the input type. 14639 MVT EltVT = VT.getVectorElementType(); 14640 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 14641 14642 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 14643 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 14644 } 14645 14646 /// \brief Return (and \p Op, \p Mask) for compare instructions or 14647 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the 14648 /// necessary casting for \p Mask when lowering masking intrinsics. 14649 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, 14650 SDValue PreservedSrc, 14651 const X86Subtarget *Subtarget, 14652 SelectionDAG &DAG) { 14653 EVT VT = Op.getValueType(); 14654 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), 14655 MVT::i1, VT.getVectorNumElements()); 14656 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14657 Mask.getValueType().getSizeInBits()); 14658 SDLoc dl(Op); 14659 14660 assert(MaskVT.isSimple() && "invalid mask type"); 14661 14662 if (isAllOnes(Mask)) 14663 return Op; 14664 14665 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements 14666 // are extracted by EXTRACT_SUBVECTOR. 
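  // Example (informational): with an i8 mask driving a 4-element operation,
  //   VMask  = extract_subvector (v8i1 (bitcast i8 %mask)), 0   ; low v4i1
  //   result = and %cmp, VMask         ; for the *CMPM*/PCMP*M compare nodes
  // while every other opcode becomes
  //   result = vselect VMask, %op, %passthru
  // with a zero vector substituted when the pass-through value is undef.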
14667   SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
14668                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
14669                               DAG.getIntPtrConstant(0));
14670 
14671   switch (Op.getOpcode()) {
14672     default: break;
14673     case X86ISD::PCMPEQM:
14674     case X86ISD::PCMPGTM:
14675     case X86ISD::CMPM:
14676     case X86ISD::CMPMU:
14677       return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
14678   }
14679   if (PreservedSrc.getOpcode() == ISD::UNDEF)
14680     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
14681   return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
14682 }
14683 
14684 /// \brief Creates an SDNode for a predicated scalar operation.
14685 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
14686 /// The mask comes in as MVT::i8 and it should be truncated
14687 /// to MVT::i1 while lowering masking intrinsics.
14688 /// The main difference between ScalarMaskingNode and VectorMaskingNode is
14689 /// the use of "X86select" instead of "vselect"; a "vselect" node cannot be
14690 /// created for a scalar instruction.
14691 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
14692                                     SDValue PreservedSrc,
14693                                     const X86Subtarget *Subtarget,
14694                                     SelectionDAG &DAG) {
14695   if (isAllOnes(Mask))
14696     return Op;
14697 
14698   EVT VT = Op.getValueType();
14699   SDLoc dl(Op);
14700   // The mask should be of type MVT::i1
14701   SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
14702 
14703   if (PreservedSrc.getOpcode() == ISD::UNDEF)
14704     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
14705   return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
14706 }
14707 
14708 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
14709                                        SelectionDAG &DAG) {
14710   SDLoc dl(Op);
14711   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
14712   EVT VT = Op.getValueType();
14713   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
14714   if (IntrData) {
14715     switch(IntrData->Type) {
14716     case INTR_TYPE_1OP:
14717       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
14718     case INTR_TYPE_2OP:
14719       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
14720                          Op.getOperand(2));
14721     case INTR_TYPE_3OP:
14722       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
14723                          Op.getOperand(2), Op.getOperand(3));
14724     case INTR_TYPE_1OP_MASK_RM: {
14725       SDValue Src = Op.getOperand(1);
14726       SDValue Src0 = Op.getOperand(2);
14727       SDValue Mask = Op.getOperand(3);
14728       SDValue RoundingMode = Op.getOperand(4);
14729       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
14730                                               RoundingMode),
14731                                   Mask, Src0, Subtarget, DAG);
14732     }
14733     case INTR_TYPE_SCALAR_MASK_RM: {
14734       SDValue Src1 = Op.getOperand(1);
14735       SDValue Src2 = Op.getOperand(2);
14736       SDValue Src0 = Op.getOperand(3);
14737       SDValue Mask = Op.getOperand(4);
14738       // There are 2 kinds of intrinsics in this group:
14739       // (1) With suppress-all-exceptions (sae) - 6 operands
14740       // (2) With rounding mode and sae - 7 operands.
14741 if (Op.getNumOperands() == 6) { 14742 SDValue Sae = Op.getOperand(5); 14743 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, 14744 Sae), 14745 Mask, Src0, Subtarget, DAG); 14746 } 14747 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); 14748 SDValue RoundingMode = Op.getOperand(5); 14749 SDValue Sae = Op.getOperand(6); 14750 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, 14751 RoundingMode, Sae), 14752 Mask, Src0, Subtarget, DAG); 14753 } 14754 case INTR_TYPE_2OP_MASK: { 14755 SDValue Src1 = Op.getOperand(1); 14756 SDValue Src2 = Op.getOperand(2); 14757 SDValue PassThru = Op.getOperand(3); 14758 SDValue Mask = Op.getOperand(4); 14759 // We specify 2 possible opcodes for intrinsics with rounding modes. 14760 // First, we check if the intrinsic may have non-default rounding mode, 14761 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 14762 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 14763 if (IntrWithRoundingModeOpcode != 0) { 14764 SDValue Rnd = Op.getOperand(5); 14765 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); 14766 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { 14767 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 14768 dl, Op.getValueType(), 14769 Src1, Src2, Rnd), 14770 Mask, PassThru, Subtarget, DAG); 14771 } 14772 } 14773 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 14774 Src1,Src2), 14775 Mask, PassThru, Subtarget, DAG); 14776 } 14777 case FMA_OP_MASK: { 14778 SDValue Src1 = Op.getOperand(1); 14779 SDValue Src2 = Op.getOperand(2); 14780 SDValue Src3 = Op.getOperand(3); 14781 SDValue Mask = Op.getOperand(4); 14782 // We specify 2 possible opcodes for intrinsics with rounding modes. 14783 // First, we check if the intrinsic may have non-default rounding mode, 14784 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 14785 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 14786 if (IntrWithRoundingModeOpcode != 0) { 14787 SDValue Rnd = Op.getOperand(5); 14788 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 14789 X86::STATIC_ROUNDING::CUR_DIRECTION) 14790 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 14791 dl, Op.getValueType(), 14792 Src1, Src2, Src3, Rnd), 14793 Mask, Src1, Subtarget, DAG); 14794 } 14795 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, 14796 dl, Op.getValueType(), 14797 Src1, Src2, Src3), 14798 Mask, Src1, Subtarget, DAG); 14799 } 14800 case CMP_MASK: 14801 case CMP_MASK_CC: { 14802 // Comparison intrinsics with masks. 14803 // Example of transformation: 14804 // (i8 (int_x86_avx512_mask_pcmpeq_q_128 14805 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> 14806 // (i8 (bitcast 14807 // (v8i1 (insert_subvector undef, 14808 // (v2i1 (and (PCMPEQM %a, %b), 14809 // (extract_subvector 14810 // (v8i1 (bitcast %mask)), 0))), 0)))) 14811 EVT VT = Op.getOperand(1).getValueType(); 14812 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14813 VT.getVectorNumElements()); 14814 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 
4 : 3); 14815 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14816 Mask.getValueType().getSizeInBits()); 14817 SDValue Cmp; 14818 if (IntrData->Type == CMP_MASK_CC) { 14819 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), 14820 Op.getOperand(2), Op.getOperand(3)); 14821 } else { 14822 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); 14823 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), 14824 Op.getOperand(2)); 14825 } 14826 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, 14827 DAG.getTargetConstant(0, MaskVT), 14828 Subtarget, DAG); 14829 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, 14830 DAG.getUNDEF(BitcastVT), CmpMask, 14831 DAG.getIntPtrConstant(0)); 14832 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 14833 } 14834 case COMI: { // Comparison intrinsics 14835 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; 14836 SDValue LHS = Op.getOperand(1); 14837 SDValue RHS = Op.getOperand(2); 14838 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 14839 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 14840 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); 14841 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 14842 DAG.getConstant(X86CC, MVT::i8), Cond); 14843 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 14844 } 14845 case VSHIFT: 14846 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), 14847 Op.getOperand(1), Op.getOperand(2), DAG); 14848 case VSHIFT_MASK: 14849 return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, 14850 Op.getSimpleValueType(), 14851 Op.getOperand(1), 14852 Op.getOperand(2), DAG), 14853 Op.getOperand(4), Op.getOperand(3), Subtarget, 14854 DAG); 14855 case COMPRESS_EXPAND_IN_REG: { 14856 SDValue Mask = Op.getOperand(3); 14857 SDValue DataToCompress = Op.getOperand(1); 14858 SDValue PassThru = Op.getOperand(2); 14859 if (isAllOnes(Mask)) // return data as is 14860 return Op.getOperand(1); 14861 EVT VT = Op.getValueType(); 14862 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14863 VT.getVectorNumElements()); 14864 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14865 Mask.getValueType().getSizeInBits()); 14866 SDLoc dl(Op); 14867 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 14868 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), 14869 DAG.getIntPtrConstant(0)); 14870 14871 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, 14872 PassThru); 14873 } 14874 case BLEND: { 14875 SDValue Mask = Op.getOperand(3); 14876 EVT VT = Op.getValueType(); 14877 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14878 VT.getVectorNumElements()); 14879 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 14880 Mask.getValueType().getSizeInBits()); 14881 SDLoc dl(Op); 14882 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 14883 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), 14884 DAG.getIntPtrConstant(0)); 14885 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), 14886 Op.getOperand(2)); 14887 } 14888 default: 14889 break; 14890 } 14891 } 14892 14893 switch (IntNo) { 14894 default: return SDValue(); // Don't custom lower most intrinsics. 14895 14896 case Intrinsic::x86_avx2_permd: 14897 case Intrinsic::x86_avx2_permps: 14898 // Operands intentionally swapped. Mask is last operand to intrinsic, 14899 // but second operand for node/instruction. 
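  // e.g. (informational) int_x86_avx2_permd(%data, %idx) becomes
  //   (X86ISD::VPERMV %idx, %data)
  // matching VPERMD, where the permutation indices are the first source.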
14900 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), 14901 Op.getOperand(2), Op.getOperand(1)); 14902 14903 case Intrinsic::x86_avx512_mask_valign_q_512: 14904 case Intrinsic::x86_avx512_mask_valign_d_512: 14905 // Vector source operands are swapped. 14906 return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, 14907 Op.getValueType(), Op.getOperand(2), 14908 Op.getOperand(1), 14909 Op.getOperand(3)), 14910 Op.getOperand(5), Op.getOperand(4), 14911 Subtarget, DAG); 14912 14913 // ptest and testp intrinsics. The intrinsic these come from are designed to 14914 // return an integer value, not just an instruction so lower it to the ptest 14915 // or testp pattern and a setcc for the result. 14916 case Intrinsic::x86_sse41_ptestz: 14917 case Intrinsic::x86_sse41_ptestc: 14918 case Intrinsic::x86_sse41_ptestnzc: 14919 case Intrinsic::x86_avx_ptestz_256: 14920 case Intrinsic::x86_avx_ptestc_256: 14921 case Intrinsic::x86_avx_ptestnzc_256: 14922 case Intrinsic::x86_avx_vtestz_ps: 14923 case Intrinsic::x86_avx_vtestc_ps: 14924 case Intrinsic::x86_avx_vtestnzc_ps: 14925 case Intrinsic::x86_avx_vtestz_pd: 14926 case Intrinsic::x86_avx_vtestc_pd: 14927 case Intrinsic::x86_avx_vtestnzc_pd: 14928 case Intrinsic::x86_avx_vtestz_ps_256: 14929 case Intrinsic::x86_avx_vtestc_ps_256: 14930 case Intrinsic::x86_avx_vtestnzc_ps_256: 14931 case Intrinsic::x86_avx_vtestz_pd_256: 14932 case Intrinsic::x86_avx_vtestc_pd_256: 14933 case Intrinsic::x86_avx_vtestnzc_pd_256: { 14934 bool IsTestPacked = false; 14935 unsigned X86CC; 14936 switch (IntNo) { 14937 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 14938 case Intrinsic::x86_avx_vtestz_ps: 14939 case Intrinsic::x86_avx_vtestz_pd: 14940 case Intrinsic::x86_avx_vtestz_ps_256: 14941 case Intrinsic::x86_avx_vtestz_pd_256: 14942 IsTestPacked = true; // Fallthrough 14943 case Intrinsic::x86_sse41_ptestz: 14944 case Intrinsic::x86_avx_ptestz_256: 14945 // ZF = 1 14946 X86CC = X86::COND_E; 14947 break; 14948 case Intrinsic::x86_avx_vtestc_ps: 14949 case Intrinsic::x86_avx_vtestc_pd: 14950 case Intrinsic::x86_avx_vtestc_ps_256: 14951 case Intrinsic::x86_avx_vtestc_pd_256: 14952 IsTestPacked = true; // Fallthrough 14953 case Intrinsic::x86_sse41_ptestc: 14954 case Intrinsic::x86_avx_ptestc_256: 14955 // CF = 1 14956 X86CC = X86::COND_B; 14957 break; 14958 case Intrinsic::x86_avx_vtestnzc_ps: 14959 case Intrinsic::x86_avx_vtestnzc_pd: 14960 case Intrinsic::x86_avx_vtestnzc_ps_256: 14961 case Intrinsic::x86_avx_vtestnzc_pd_256: 14962 IsTestPacked = true; // Fallthrough 14963 case Intrinsic::x86_sse41_ptestnzc: 14964 case Intrinsic::x86_avx_ptestnzc_256: 14965 // ZF and CF = 0 14966 X86CC = X86::COND_A; 14967 break; 14968 } 14969 14970 SDValue LHS = Op.getOperand(1); 14971 SDValue RHS = Op.getOperand(2); 14972 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 14973 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 14974 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 14975 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 14976 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 14977 } 14978 case Intrinsic::x86_avx512_kortestz_w: 14979 case Intrinsic::x86_avx512_kortestc_w: { 14980 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? 
X86::COND_E: X86::COND_B; 14981 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); 14982 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); 14983 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 14984 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); 14985 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); 14986 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 14987 } 14988 14989 case Intrinsic::x86_sse42_pcmpistria128: 14990 case Intrinsic::x86_sse42_pcmpestria128: 14991 case Intrinsic::x86_sse42_pcmpistric128: 14992 case Intrinsic::x86_sse42_pcmpestric128: 14993 case Intrinsic::x86_sse42_pcmpistrio128: 14994 case Intrinsic::x86_sse42_pcmpestrio128: 14995 case Intrinsic::x86_sse42_pcmpistris128: 14996 case Intrinsic::x86_sse42_pcmpestris128: 14997 case Intrinsic::x86_sse42_pcmpistriz128: 14998 case Intrinsic::x86_sse42_pcmpestriz128: { 14999 unsigned Opcode; 15000 unsigned X86CC; 15001 switch (IntNo) { 15002 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 15003 case Intrinsic::x86_sse42_pcmpistria128: 15004 Opcode = X86ISD::PCMPISTRI; 15005 X86CC = X86::COND_A; 15006 break; 15007 case Intrinsic::x86_sse42_pcmpestria128: 15008 Opcode = X86ISD::PCMPESTRI; 15009 X86CC = X86::COND_A; 15010 break; 15011 case Intrinsic::x86_sse42_pcmpistric128: 15012 Opcode = X86ISD::PCMPISTRI; 15013 X86CC = X86::COND_B; 15014 break; 15015 case Intrinsic::x86_sse42_pcmpestric128: 15016 Opcode = X86ISD::PCMPESTRI; 15017 X86CC = X86::COND_B; 15018 break; 15019 case Intrinsic::x86_sse42_pcmpistrio128: 15020 Opcode = X86ISD::PCMPISTRI; 15021 X86CC = X86::COND_O; 15022 break; 15023 case Intrinsic::x86_sse42_pcmpestrio128: 15024 Opcode = X86ISD::PCMPESTRI; 15025 X86CC = X86::COND_O; 15026 break; 15027 case Intrinsic::x86_sse42_pcmpistris128: 15028 Opcode = X86ISD::PCMPISTRI; 15029 X86CC = X86::COND_S; 15030 break; 15031 case Intrinsic::x86_sse42_pcmpestris128: 15032 Opcode = X86ISD::PCMPESTRI; 15033 X86CC = X86::COND_S; 15034 break; 15035 case Intrinsic::x86_sse42_pcmpistriz128: 15036 Opcode = X86ISD::PCMPISTRI; 15037 X86CC = X86::COND_E; 15038 break; 15039 case Intrinsic::x86_sse42_pcmpestriz128: 15040 Opcode = X86ISD::PCMPESTRI; 15041 X86CC = X86::COND_E; 15042 break; 15043 } 15044 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 15045 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 15046 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); 15047 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15048 DAG.getConstant(X86CC, MVT::i8), 15049 SDValue(PCMP.getNode(), 1)); 15050 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 15051 } 15052 15053 case Intrinsic::x86_sse42_pcmpistri128: 15054 case Intrinsic::x86_sse42_pcmpestri128: { 15055 unsigned Opcode; 15056 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 15057 Opcode = X86ISD::PCMPISTRI; 15058 else 15059 Opcode = X86ISD::PCMPESTRI; 15060 15061 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 15062 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 15063 return DAG.getNode(Opcode, dl, VTs, NewOps); 15064 } 15065 } 15066 } 15067 15068 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 15069 SDValue Src, SDValue Mask, SDValue Base, 15070 SDValue Index, SDValue ScaleOp, SDValue Chain, 15071 const X86Subtarget * Subtarget) { 15072 SDLoc dl(Op); 15073 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 15074 assert(C && "Invalid scale type"); 15075 SDValue Scale = 
DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 15076 EVT MaskVT = MVT::getVectorVT(MVT::i1, 15077 Index.getSimpleValueType().getVectorNumElements()); 15078 SDValue MaskInReg; 15079 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 15080 if (MaskC) 15081 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); 15082 else 15083 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 15084 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 15085 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 15086 SDValue Segment = DAG.getRegister(0, MVT::i32); 15087 if (Src.getOpcode() == ISD::UNDEF) 15088 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 15089 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 15090 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 15091 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 15092 return DAG.getMergeValues(RetOps, dl); 15093 } 15094 15095 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 15096 SDValue Src, SDValue Mask, SDValue Base, 15097 SDValue Index, SDValue ScaleOp, SDValue Chain) { 15098 SDLoc dl(Op); 15099 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 15100 assert(C && "Invalid scale type"); 15101 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 15102 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 15103 SDValue Segment = DAG.getRegister(0, MVT::i32); 15104 EVT MaskVT = MVT::getVectorVT(MVT::i1, 15105 Index.getSimpleValueType().getVectorNumElements()); 15106 SDValue MaskInReg; 15107 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 15108 if (MaskC) 15109 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); 15110 else 15111 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 15112 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 15113 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 15114 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 15115 return SDValue(Res, 1); 15116 } 15117 15118 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 15119 SDValue Mask, SDValue Base, SDValue Index, 15120 SDValue ScaleOp, SDValue Chain) { 15121 SDLoc dl(Op); 15122 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 15123 assert(C && "Invalid scale type"); 15124 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 15125 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 15126 SDValue Segment = DAG.getRegister(0, MVT::i32); 15127 EVT MaskVT = 15128 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); 15129 SDValue MaskInReg; 15130 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 15131 if (MaskC) 15132 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); 15133 else 15134 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 15135 //SDVTList VTs = DAG.getVTList(MVT::Other); 15136 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 15137 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); 15138 return SDValue(Res, 0); 15139 } 15140 15141 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that 15142 // read performance monitor counters (x86_rdpmc). 
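// Sketch of the sequence built below (informational):
//   copytoreg ECX, <counter index>
//   rdpmc                                 ; counter value returned in EDX:EAX
//   64-bit: result = (HI << 32) | LO      ; via ISD::SHL + ISD::OR on i64
//   32-bit: result = build_pair LO, HI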
15143 static void getReadPerformanceCounter(SDNode *N, SDLoc DL, 15144 SelectionDAG &DAG, const X86Subtarget *Subtarget, 15145 SmallVectorImpl<SDValue> &Results) { 15146 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 15147 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 15148 SDValue LO, HI; 15149 15150 // The ECX register is used to select the index of the performance counter 15151 // to read. 15152 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, 15153 N->getOperand(2)); 15154 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); 15155 15156 // Reads the content of a 64-bit performance counter and returns it in the 15157 // registers EDX:EAX. 15158 if (Subtarget->is64Bit()) { 15159 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 15160 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 15161 LO.getValue(2)); 15162 } else { 15163 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 15164 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 15165 LO.getValue(2)); 15166 } 15167 Chain = HI.getValue(1); 15168 15169 if (Subtarget->is64Bit()) { 15170 // The EAX register is loaded with the low-order 32 bits. The EDX register 15171 // is loaded with the supported high-order bits of the counter. 15172 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 15173 DAG.getConstant(32, MVT::i8)); 15174 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 15175 Results.push_back(Chain); 15176 return; 15177 } 15178 15179 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 15180 SDValue Ops[] = { LO, HI }; 15181 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 15182 Results.push_back(Pair); 15183 Results.push_back(Chain); 15184 } 15185 15186 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that 15187 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is 15188 // also used to custom lower READCYCLECOUNTER nodes. 15189 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, 15190 SelectionDAG &DAG, const X86Subtarget *Subtarget, 15191 SmallVectorImpl<SDValue> &Results) { 15192 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 15193 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); 15194 SDValue LO, HI; 15195 15196 // The processor's time-stamp counter (a 64-bit MSR) is stored into the 15197 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR 15198 // and the EAX register is loaded with the low-order 32 bits. 15199 if (Subtarget->is64Bit()) { 15200 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 15201 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 15202 LO.getValue(2)); 15203 } else { 15204 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 15205 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 15206 LO.getValue(2)); 15207 } 15208 SDValue Chain = HI.getValue(1); 15209 15210 if (Opcode == X86ISD::RDTSCP_DAG) { 15211 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 15212 15213 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into 15214 // the ECX register. Add 'ecx' explicitly to the chain. 15215 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, 15216 HI.getValue(2)); 15217 // Explicitly store the content of ECX at the location passed in input 15218 // to the 'rdtscp' intrinsic. 
15219 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), 15220 MachinePointerInfo(), false, false, 0); 15221 } 15222 15223 if (Subtarget->is64Bit()) { 15224 // The EDX register is loaded with the high-order 32 bits of the MSR, and 15225 // the EAX register is loaded with the low-order 32 bits. 15226 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 15227 DAG.getConstant(32, MVT::i8)); 15228 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 15229 Results.push_back(Chain); 15230 return; 15231 } 15232 15233 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 15234 SDValue Ops[] = { LO, HI }; 15235 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 15236 Results.push_back(Pair); 15237 Results.push_back(Chain); 15238 } 15239 15240 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 15241 SelectionDAG &DAG) { 15242 SmallVector<SDValue, 2> Results; 15243 SDLoc DL(Op); 15244 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, 15245 Results); 15246 return DAG.getMergeValues(Results, DL); 15247 } 15248 15249 15250 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, 15251 SelectionDAG &DAG) { 15252 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 15253 15254 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); 15255 if (!IntrData) 15256 return SDValue(); 15257 15258 SDLoc dl(Op); 15259 switch(IntrData->Type) { 15260 default: 15261 llvm_unreachable("Unknown Intrinsic Type"); 15262 break; 15263 case RDSEED: 15264 case RDRAND: { 15265 // Emit the node with the right value type. 15266 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 15267 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 15268 15269 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. 15270 // Otherwise return the value from Rand, which is always 0, casted to i32. 15271 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 15272 DAG.getConstant(1, Op->getValueType(1)), 15273 DAG.getConstant(X86::COND_B, MVT::i32), 15274 SDValue(Result.getNode(), 1) }; 15275 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 15276 DAG.getVTList(Op->getValueType(1), MVT::Glue), 15277 Ops); 15278 15279 // Return { result, isValid, chain }. 
15280 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 15281 SDValue(Result.getNode(), 2)); 15282 } 15283 case GATHER: { 15284 //gather(v1, mask, index, base, scale); 15285 SDValue Chain = Op.getOperand(0); 15286 SDValue Src = Op.getOperand(2); 15287 SDValue Base = Op.getOperand(3); 15288 SDValue Index = Op.getOperand(4); 15289 SDValue Mask = Op.getOperand(5); 15290 SDValue Scale = Op.getOperand(6); 15291 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, 15292 Subtarget); 15293 } 15294 case SCATTER: { 15295 //scatter(base, mask, index, v1, scale); 15296 SDValue Chain = Op.getOperand(0); 15297 SDValue Base = Op.getOperand(2); 15298 SDValue Mask = Op.getOperand(3); 15299 SDValue Index = Op.getOperand(4); 15300 SDValue Src = Op.getOperand(5); 15301 SDValue Scale = Op.getOperand(6); 15302 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); 15303 } 15304 case PREFETCH: { 15305 SDValue Hint = Op.getOperand(6); 15306 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); 15307 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); 15308 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); 15309 SDValue Chain = Op.getOperand(0); 15310 SDValue Mask = Op.getOperand(2); 15311 SDValue Index = Op.getOperand(3); 15312 SDValue Base = Op.getOperand(4); 15313 SDValue Scale = Op.getOperand(5); 15314 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); 15315 } 15316 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). 15317 case RDTSC: { 15318 SmallVector<SDValue, 2> Results; 15319 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); 15320 return DAG.getMergeValues(Results, dl); 15321 } 15322 // Read Performance Monitoring Counters. 15323 case RDPMC: { 15324 SmallVector<SDValue, 2> Results; 15325 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); 15326 return DAG.getMergeValues(Results, dl); 15327 } 15328 // XTEST intrinsics. 
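  // XTEST clears ZF when executed inside an RTM/HLE transactional region and
  // sets ZF otherwise, so the lowering below returns (zext (setcc ne, flags)),
  // i.e. a non-zero value exactly when a transaction is active.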
15329 case XTEST: { 15330 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 15331 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 15332 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15333 DAG.getConstant(X86::COND_NE, MVT::i8), 15334 InTrans); 15335 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); 15336 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), 15337 Ret, SDValue(InTrans.getNode(), 1)); 15338 } 15339 // ADC/ADCX/SBB 15340 case ADX: { 15341 SmallVector<SDValue, 2> Results; 15342 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 15343 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); 15344 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), 15345 DAG.getConstant(-1, MVT::i8)); 15346 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), 15347 Op.getOperand(4), GenCF.getValue(1)); 15348 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), 15349 Op.getOperand(5), MachinePointerInfo(), 15350 false, false, 0); 15351 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15352 DAG.getConstant(X86::COND_B, MVT::i8), 15353 Res.getValue(1)); 15354 Results.push_back(SetCC); 15355 Results.push_back(Store); 15356 return DAG.getMergeValues(Results, dl); 15357 } 15358 case COMPRESS_TO_MEM: { 15359 SDLoc dl(Op); 15360 SDValue Mask = Op.getOperand(4); 15361 SDValue DataToCompress = Op.getOperand(3); 15362 SDValue Addr = Op.getOperand(2); 15363 SDValue Chain = Op.getOperand(0); 15364 15365 if (isAllOnes(Mask)) // return just a store 15366 return DAG.getStore(Chain, dl, DataToCompress, Addr, 15367 MachinePointerInfo(), false, false, 0); 15368 15369 EVT VT = DataToCompress.getValueType(); 15370 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 15371 VT.getVectorNumElements()); 15372 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 15373 Mask.getValueType().getSizeInBits()); 15374 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 15375 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), 15376 DAG.getIntPtrConstant(0)); 15377 15378 SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, 15379 DataToCompress, DAG.getUNDEF(VT)); 15380 return DAG.getStore(Chain, dl, Compressed, Addr, 15381 MachinePointerInfo(), false, false, 0); 15382 } 15383 case EXPAND_FROM_MEM: { 15384 SDLoc dl(Op); 15385 SDValue Mask = Op.getOperand(4); 15386 SDValue PathThru = Op.getOperand(3); 15387 SDValue Addr = Op.getOperand(2); 15388 SDValue Chain = Op.getOperand(0); 15389 EVT VT = Op.getValueType(); 15390 15391 if (isAllOnes(Mask)) // return just a load 15392 return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, 15393 false, 0); 15394 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 15395 VT.getVectorNumElements()); 15396 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 15397 Mask.getValueType().getSizeInBits()); 15398 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 15399 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), 15400 DAG.getIntPtrConstant(0)); 15401 15402 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), 15403 false, false, false, 0); 15404 15405 SDValue Results[] = { 15406 DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru), 15407 Chain}; 15408 return DAG.getMergeValues(Results, dl); 15409 } 15410 } 15411 } 15412 15413 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 15414 SelectionDAG &DAG) const { 
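  // llvm.returnaddress(depth): depth 0 loads straight from the return-address
  // frame index; depth > 0 first resolves the corresponding frame address via
  // LowerFRAMEADDR (walking the saved frame-pointer chain) and then loads the
  // slot just above it, FrameAddr + SlotSize, which holds that frame's saved
  // return address.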
15415 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 15416 MFI->setReturnAddressIsTaken(true); 15417 15418 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 15419 return SDValue(); 15420 15421 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 15422 SDLoc dl(Op); 15423 EVT PtrVT = getPointerTy(); 15424 15425 if (Depth > 0) { 15426 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 15427 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 15428 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); 15429 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 15430 DAG.getNode(ISD::ADD, dl, PtrVT, 15431 FrameAddr, Offset), 15432 MachinePointerInfo(), false, false, false, 0); 15433 } 15434 15435 // Just load the return address. 15436 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 15437 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 15438 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 15439 } 15440 15441 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 15442 MachineFunction &MF = DAG.getMachineFunction(); 15443 MachineFrameInfo *MFI = MF.getFrameInfo(); 15444 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 15445 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 15446 EVT VT = Op.getValueType(); 15447 15448 MFI->setFrameAddressIsTaken(true); 15449 15450 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { 15451 // Depth > 0 makes no sense on targets which use Windows unwind codes. It 15452 // is not possible to crawl up the stack without looking at the unwind codes 15453 // simultaneously. 15454 int FrameAddrIndex = FuncInfo->getFAIndex(); 15455 if (!FrameAddrIndex) { 15456 // Set up a frame object for the return address. 15457 unsigned SlotSize = RegInfo->getSlotSize(); 15458 FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( 15459 SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false); 15460 FuncInfo->setFAIndex(FrameAddrIndex); 15461 } 15462 return DAG.getFrameIndex(FrameAddrIndex, VT); 15463 } 15464 15465 unsigned FrameReg = 15466 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); 15467 SDLoc dl(Op); // FIXME probably not meaningful 15468 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 15469 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 15470 (FrameReg == X86::EBP && VT == MVT::i32)) && 15471 "Invalid Frame Register!"); 15472 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 15473 while (Depth--) 15474 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 15475 MachinePointerInfo(), 15476 false, false, false, 0); 15477 return FrameAddr; 15478 } 15479 15480 // FIXME? Maybe this could be a TableGen attribute on some registers and 15481 // this table could be generated automatically from RegInfo. 
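// getRegisterByName backs the llvm.read_register / llvm.write_register
// intrinsics; e.g. (informational, hypothetical IR):
//   %sp = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"rsp"}
// Only the stack-pointer names are recognized here; any other register name
// is a fatal error.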
15482 unsigned X86TargetLowering::getRegisterByName(const char* RegName, 15483 EVT VT) const { 15484 unsigned Reg = StringSwitch<unsigned>(RegName) 15485 .Case("esp", X86::ESP) 15486 .Case("rsp", X86::RSP) 15487 .Default(0); 15488 if (Reg) 15489 return Reg; 15490 report_fatal_error("Invalid register name global variable"); 15491 } 15492 15493 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 15494 SelectionDAG &DAG) const { 15495 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 15496 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); 15497 } 15498 15499 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 15500 SDValue Chain = Op.getOperand(0); 15501 SDValue Offset = Op.getOperand(1); 15502 SDValue Handler = Op.getOperand(2); 15503 SDLoc dl (Op); 15504 15505 EVT PtrVT = getPointerTy(); 15506 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 15507 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 15508 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 15509 (FrameReg == X86::EBP && PtrVT == MVT::i32)) && 15510 "Invalid Frame Register!"); 15511 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); 15512 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; 15513 15514 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, 15515 DAG.getIntPtrConstant(RegInfo->getSlotSize())); 15516 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); 15517 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 15518 false, false, 0); 15519 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 15520 15521 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, 15522 DAG.getRegister(StoreAddrReg, PtrVT)); 15523 } 15524 15525 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 15526 SelectionDAG &DAG) const { 15527 SDLoc DL(Op); 15528 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 15529 DAG.getVTList(MVT::i32, MVT::Other), 15530 Op.getOperand(0), Op.getOperand(1)); 15531 } 15532 15533 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 15534 SelectionDAG &DAG) const { 15535 SDLoc DL(Op); 15536 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 15537 Op.getOperand(0), Op.getOperand(1)); 15538 } 15539 15540 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 15541 return Op.getOperand(0); 15542 } 15543 15544 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 15545 SelectionDAG &DAG) const { 15546 SDValue Root = Op.getOperand(0); 15547 SDValue Trmp = Op.getOperand(1); // trampoline 15548 SDValue FPtr = Op.getOperand(2); // nested function 15549 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 15550 SDLoc dl (Op); 15551 15552 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 15553 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 15554 15555 if (Subtarget->is64Bit()) { 15556 SDValue OutChains[6]; 15557 15558 // Large code-model. 15559 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 15560 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 15561 15562 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 15563 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 15564 15565 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 15566 15567 // Load the pointer to the nested function into R11. 
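      // Taken together, the six stores below assemble this stub in place:
      //   movabsq $<nested fn>,  %r11      // bytes 0-9
      //   movabsq $<nest value>, %r10      // bytes 10-19
      //   jmpq    *%r11                    // bytes 20-22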
15568 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 15569 SDValue Addr = Trmp; 15570 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 15571 Addr, MachinePointerInfo(TrmpAddr), 15572 false, false, 0); 15573 15574 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15575 DAG.getConstant(2, MVT::i64)); 15576 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 15577 MachinePointerInfo(TrmpAddr, 2), 15578 false, false, 2); 15579 15580 // Load the 'nest' parameter value into R10. 15581 // R10 is specified in X86CallingConv.td 15582 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 15583 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15584 DAG.getConstant(10, MVT::i64)); 15585 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 15586 Addr, MachinePointerInfo(TrmpAddr, 10), 15587 false, false, 0); 15588 15589 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15590 DAG.getConstant(12, MVT::i64)); 15591 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 15592 MachinePointerInfo(TrmpAddr, 12), 15593 false, false, 2); 15594 15595 // Jump to the nested function. 15596 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 15597 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15598 DAG.getConstant(20, MVT::i64)); 15599 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 15600 Addr, MachinePointerInfo(TrmpAddr, 20), 15601 false, false, 0); 15602 15603 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 15604 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15605 DAG.getConstant(22, MVT::i64)); 15606 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 15607 MachinePointerInfo(TrmpAddr, 22), 15608 false, false, 0); 15609 15610 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 15611 } else { 15612 const Function *Func = 15613 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 15614 CallingConv::ID CC = Func->getCallingConv(); 15615 unsigned NestReg; 15616 15617 switch (CC) { 15618 default: 15619 llvm_unreachable("Unsupported calling convention"); 15620 case CallingConv::C: 15621 case CallingConv::X86_StdCall: { 15622 // Pass 'nest' parameter in ECX. 15623 // Must be kept in sync with X86CallingConv.td 15624 NestReg = X86::ECX; 15625 15626 // Check that ECX wasn't needed by an 'inreg' parameter. 15627 FunctionType *FTy = Func->getFunctionType(); 15628 const AttributeSet &Attrs = Func->getAttributes(); 15629 15630 if (!Attrs.isEmpty() && !Func->isVarArg()) { 15631 unsigned InRegCount = 0; 15632 unsigned Idx = 1; 15633 15634 for (FunctionType::param_iterator I = FTy->param_begin(), 15635 E = FTy->param_end(); I != E; ++I, ++Idx) 15636 if (Attrs.hasAttribute(Idx, Attribute::InReg)) 15637 // FIXME: should only count parameters that are lowered to integers. 15638 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 15639 15640 if (InRegCount > 2) { 15641 report_fatal_error("Nest register in use - reduce number of inreg" 15642 " parameters!"); 15643 } 15644 } 15645 break; 15646 } 15647 case CallingConv::X86_FastCall: 15648 case CallingConv::X86_ThisCall: 15649 case CallingConv::Fast: 15650 // Pass 'nest' parameter in EAX. 
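      // (fastcall and thiscall already claim ECX for argument passing, so EAX
      // is the register left free to carry 'nest' for these conventions.)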
15651 // Must be kept in sync with X86CallingConv.td 15652 NestReg = X86::EAX; 15653 break; 15654 } 15655 15656 SDValue OutChains[4]; 15657 SDValue Addr, Disp; 15658 15659 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15660 DAG.getConstant(10, MVT::i32)); 15661 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 15662 15663 // This is storing the opcode for MOV32ri. 15664 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 15665 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 15666 OutChains[0] = DAG.getStore(Root, dl, 15667 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 15668 Trmp, MachinePointerInfo(TrmpAddr), 15669 false, false, 0); 15670 15671 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15672 DAG.getConstant(1, MVT::i32)); 15673 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 15674 MachinePointerInfo(TrmpAddr, 1), 15675 false, false, 1); 15676 15677 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 15678 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15679 DAG.getConstant(5, MVT::i32)); 15680 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 15681 MachinePointerInfo(TrmpAddr, 5), 15682 false, false, 1); 15683 15684 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15685 DAG.getConstant(6, MVT::i32)); 15686 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 15687 MachinePointerInfo(TrmpAddr, 6), 15688 false, false, 1); 15689 15690 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 15691 } 15692 } 15693 15694 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 15695 SelectionDAG &DAG) const { 15696 /* 15697 The rounding mode is in bits 11:10 of FPSR, and has the following 15698 settings: 15699 00 Round to nearest 15700 01 Round to -inf 15701 10 Round to +inf 15702 11 Round to 0 15703 15704 FLT_ROUNDS, on the other hand, expects the following: 15705 -1 Undefined 15706 0 Round to 0 15707 1 Round to nearest 15708 2 Round to +inf 15709 3 Round to -inf 15710 15711 To perform the conversion, we do: 15712 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 15713 */ 15714 15715 MachineFunction &MF = DAG.getMachineFunction(); 15716 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); 15717 unsigned StackAlignment = TFI.getStackAlignment(); 15718 MVT VT = Op.getSimpleValueType(); 15719 SDLoc DL(Op); 15720 15721 // Save FP Control Word to stack slot 15722 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 15723 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 15724 15725 MachineMemOperand *MMO = 15726 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 15727 MachineMemOperand::MOStore, 2, 2); 15728 15729 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 15730 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 15731 DAG.getVTList(MVT::Other), 15732 Ops, MVT::i16, MMO); 15733 15734 // Load FP Control Word from stack slot 15735 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 15736 MachinePointerInfo(), false, false, false, 0); 15737 15738 // Transform as necessary 15739 SDValue CWD1 = 15740 DAG.getNode(ISD::SRL, DL, MVT::i16, 15741 DAG.getNode(ISD::AND, DL, MVT::i16, 15742 CWD, DAG.getConstant(0x800, MVT::i16)), 15743 DAG.getConstant(11, MVT::i8)); 15744 SDValue CWD2 = 15745 DAG.getNode(ISD::SRL, DL, MVT::i16, 15746 DAG.getNode(ISD::AND, DL, MVT::i16, 15747 CWD, DAG.getConstant(0x400, MVT::i16)), 15748 DAG.getConstant(9, MVT::i8)); 15749 15750 SDValue RetVal = 15751 DAG.getNode(ISD::AND, DL, MVT::i16, 15752 
DAG.getNode(ISD::ADD, DL, MVT::i16, 15753 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 15754 DAG.getConstant(1, MVT::i16)), 15755 DAG.getConstant(3, MVT::i16)); 15756 15757 return DAG.getNode((VT.getSizeInBits() < 16 ? 15758 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 15759 } 15760 15761 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 15762 MVT VT = Op.getSimpleValueType(); 15763 EVT OpVT = VT; 15764 unsigned NumBits = VT.getSizeInBits(); 15765 SDLoc dl(Op); 15766 15767 Op = Op.getOperand(0); 15768 if (VT == MVT::i8) { 15769 // Zero extend to i32 since there is not an i8 bsr. 15770 OpVT = MVT::i32; 15771 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 15772 } 15773 15774 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 15775 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 15776 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 15777 15778 // If src is zero (i.e. bsr sets ZF), returns NumBits. 15779 SDValue Ops[] = { 15780 Op, 15781 DAG.getConstant(NumBits+NumBits-1, OpVT), 15782 DAG.getConstant(X86::COND_E, MVT::i8), 15783 Op.getValue(1) 15784 }; 15785 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); 15786 15787 // Finally xor with NumBits-1. 15788 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 15789 15790 if (VT == MVT::i8) 15791 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 15792 return Op; 15793 } 15794 15795 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { 15796 MVT VT = Op.getSimpleValueType(); 15797 EVT OpVT = VT; 15798 unsigned NumBits = VT.getSizeInBits(); 15799 SDLoc dl(Op); 15800 15801 Op = Op.getOperand(0); 15802 if (VT == MVT::i8) { 15803 // Zero extend to i32 since there is not an i8 bsr. 15804 OpVT = MVT::i32; 15805 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 15806 } 15807 15808 // Issue a bsr (scan bits in reverse). 15809 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 15810 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 15811 15812 // And xor with NumBits-1. 15813 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 15814 15815 if (VT == MVT::i8) 15816 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 15817 return Op; 15818 } 15819 15820 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 15821 MVT VT = Op.getSimpleValueType(); 15822 unsigned NumBits = VT.getSizeInBits(); 15823 SDLoc dl(Op); 15824 Op = Op.getOperand(0); 15825 15826 // Issue a bsf (scan bits forward) which also sets EFLAGS. 15827 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 15828 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 15829 15830 // If src is zero (i.e. bsf sets ZF), returns NumBits. 15831 SDValue Ops[] = { 15832 Op, 15833 DAG.getConstant(NumBits, VT), 15834 DAG.getConstant(X86::COND_E, MVT::i8), 15835 Op.getValue(1) 15836 }; 15837 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); 15838 } 15839 15840 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 15841 // ones, and then concatenate the result back. 
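// For example, a v8i32 add becomes, roughly,
//   concat_vectors((add v4i32 lo(A), lo(B)), (add v4i32 hi(A), hi(B))).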
15842 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 15843 MVT VT = Op.getSimpleValueType(); 15844 15845 assert(VT.is256BitVector() && VT.isInteger() && 15846 "Unsupported value type for operation"); 15847 15848 unsigned NumElems = VT.getVectorNumElements(); 15849 SDLoc dl(Op); 15850 15851 // Extract the LHS vectors 15852 SDValue LHS = Op.getOperand(0); 15853 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 15854 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 15855 15856 // Extract the RHS vectors 15857 SDValue RHS = Op.getOperand(1); 15858 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 15859 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 15860 15861 MVT EltVT = VT.getVectorElementType(); 15862 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 15863 15864 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 15865 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 15866 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 15867 } 15868 15869 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 15870 assert(Op.getSimpleValueType().is256BitVector() && 15871 Op.getSimpleValueType().isInteger() && 15872 "Only handle AVX 256-bit vector integer operation"); 15873 return Lower256IntArith(Op, DAG); 15874 } 15875 15876 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 15877 assert(Op.getSimpleValueType().is256BitVector() && 15878 Op.getSimpleValueType().isInteger() && 15879 "Only handle AVX 256-bit vector integer operation"); 15880 return Lower256IntArith(Op, DAG); 15881 } 15882 15883 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 15884 SelectionDAG &DAG) { 15885 SDLoc dl(Op); 15886 MVT VT = Op.getSimpleValueType(); 15887 15888 // Decompose 256-bit ops into smaller 128-bit ops. 15889 if (VT.is256BitVector() && !Subtarget->hasInt256()) 15890 return Lower256IntArith(Op, DAG); 15891 15892 SDValue A = Op.getOperand(0); 15893 SDValue B = Op.getOperand(1); 15894 15895 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 15896 if (VT == MVT::v4i32) { 15897 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 15898 "Should not custom lower when pmuldq is available!"); 15899 15900 // Extract the odd parts. 15901 static const int UnpackMask[] = { 1, -1, 3, -1 }; 15902 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 15903 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 15904 15905 // Multiply the even parts. 15906 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 15907 // Now multiply odd parts. 15908 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 15909 15910 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); 15911 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); 15912 15913 // Merge the two vectors back together with a shuffle. This expands into 2 15914 // shuffles. 
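    // After the bitcasts, Evens holds the 64-bit products a0*b0 and a2*b2 and
    // Odds holds a1*b1 and a3*b3, each viewed as a pair of i32 halves. The
    // {0, 4, 2, 6} mask below keeps the low 32 bits of every product, giving
    // <a0*b0, a1*b1, a2*b2, a3*b3> (mod 2^32), i.e. the desired v4i32 multiply.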
15915 static const int ShufMask[] = { 0, 4, 2, 6 }; 15916 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 15917 } 15918 15919 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 15920 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 15921 15922 // Ahi = psrlqi(a, 32); 15923 // Bhi = psrlqi(b, 32); 15924 // 15925 // AloBlo = pmuludq(a, b); 15926 // AloBhi = pmuludq(a, Bhi); 15927 // AhiBlo = pmuludq(Ahi, b); 15928 15929 // AloBhi = psllqi(AloBhi, 32); 15930 // AhiBlo = psllqi(AhiBlo, 32); 15931 // return AloBlo + AloBhi + AhiBlo; 15932 15933 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 15934 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 15935 15936 // Bit cast to 32-bit vectors for MULUDQ 15937 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 15938 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; 15939 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 15940 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 15941 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 15942 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 15943 15944 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 15945 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 15946 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 15947 15948 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); 15949 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); 15950 15951 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 15952 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 15953 } 15954 15955 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { 15956 assert(Subtarget->isTargetWin64() && "Unexpected target"); 15957 EVT VT = Op.getValueType(); 15958 assert(VT.isInteger() && VT.getSizeInBits() == 128 && 15959 "Unexpected return type for lowering"); 15960 15961 RTLIB::Libcall LC; 15962 bool isSigned; 15963 switch (Op->getOpcode()) { 15964 default: llvm_unreachable("Unexpected request for libcall!"); 15965 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; 15966 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; 15967 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; 15968 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; 15969 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; 15970 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; 15971 } 15972 15973 SDLoc dl(Op); 15974 SDValue InChain = DAG.getEntryNode(); 15975 15976 TargetLowering::ArgListTy Args; 15977 TargetLowering::ArgListEntry Entry; 15978 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 15979 EVT ArgVT = Op->getOperand(i).getValueType(); 15980 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && 15981 "Unexpected argument type for lowering"); 15982 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); 15983 Entry.Node = StackPtr; 15984 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), 15985 false, false, 16); 15986 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 15987 Entry.Ty = PointerType::get(ArgTy,0); 15988 Entry.isSExt = false; 15989 Entry.isZExt = false; 15990 Args.push_back(Entry); 15991 } 15992 15993 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 15994 getPointerTy()); 15995 15996 TargetLowering::CallLoweringInfo CLI(DAG); 15997 CLI.setDebugLoc(dl).setChain(InChain) 15998 
.setCallee(getLibcallCallingConv(LC), 15999 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), 16000 Callee, std::move(Args), 0) 16001 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 16002 16003 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 16004 return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); 16005 } 16006 16007 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, 16008 SelectionDAG &DAG) { 16009 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); 16010 EVT VT = Op0.getValueType(); 16011 SDLoc dl(Op); 16012 16013 assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || 16014 (VT == MVT::v8i32 && Subtarget->hasInt256())); 16015 16016 // PMULxD operations multiply each even value (starting at 0) of LHS with 16017 // the related value of RHS and produce a widen result. 16018 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> 16019 // => <2 x i64> <ae|cg> 16020 // 16021 // In other word, to have all the results, we need to perform two PMULxD: 16022 // 1. one with the even values. 16023 // 2. one with the odd values. 16024 // To achieve #2, with need to place the odd values at an even position. 16025 // 16026 // Place the odd value at an even position (basically, shift all values 1 16027 // step to the left): 16028 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; 16029 // <a|b|c|d> => <b|undef|d|undef> 16030 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); 16031 // <e|f|g|h> => <f|undef|h|undef> 16032 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); 16033 16034 // Emit two multiplies, one for the lower 2 ints and one for the higher 2 16035 // ints. 16036 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; 16037 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; 16038 unsigned Opcode = 16039 (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; 16040 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> 16041 // => <2 x i64> <ae|cg> 16042 SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, 16043 DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); 16044 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> 16045 // => <2 x i64> <bf|dh> 16046 SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, 16047 DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); 16048 16049 // Shuffle it back into the right order. 16050 SDValue Highs, Lows; 16051 if (VT == MVT::v8i32) { 16052 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; 16053 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); 16054 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; 16055 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); 16056 } else { 16057 const int HighMask[] = {1, 5, 3, 7}; 16058 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); 16059 const int LowMask[] = {0, 4, 2, 6}; 16060 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); 16061 } 16062 16063 // If we have a signed multiply but no PMULDQ fix up the high parts of a 16064 // unsigned multiply. 
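  // The correction below uses the standard identity
  //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
  // with each (x < 0 ? y : 0) term computed as ((x >>s 31) & y) using an
  // arithmetic shift.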
16065 if (IsSigned && !Subtarget->hasSSE41()) { 16066 SDValue ShAmt = 16067 DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); 16068 SDValue T1 = DAG.getNode(ISD::AND, dl, VT, 16069 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); 16070 SDValue T2 = DAG.getNode(ISD::AND, dl, VT, 16071 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); 16072 16073 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); 16074 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); 16075 } 16076 16077 // The first result of MUL_LOHI is actually the low value, followed by the 16078 // high value. 16079 SDValue Ops[] = {Lows, Highs}; 16080 return DAG.getMergeValues(Ops, dl); 16081 } 16082 16083 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 16084 const X86Subtarget *Subtarget) { 16085 MVT VT = Op.getSimpleValueType(); 16086 SDLoc dl(Op); 16087 SDValue R = Op.getOperand(0); 16088 SDValue Amt = Op.getOperand(1); 16089 16090 // Optimize shl/srl/sra with constant shift amount. 16091 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { 16092 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { 16093 uint64_t ShiftAmt = ShiftConst->getZExtValue(); 16094 16095 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 16096 (Subtarget->hasInt256() && 16097 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || 16098 (Subtarget->hasAVX512() && 16099 (VT == MVT::v8i64 || VT == MVT::v16i32))) { 16100 if (Op.getOpcode() == ISD::SHL) 16101 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 16102 DAG); 16103 if (Op.getOpcode() == ISD::SRL) 16104 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 16105 DAG); 16106 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 16107 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 16108 DAG); 16109 } 16110 16111 if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { 16112 unsigned NumElts = VT.getVectorNumElements(); 16113 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); 16114 16115 if (Op.getOpcode() == ISD::SHL) { 16116 // Make a large shift. 16117 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, 16118 R, ShiftAmt, DAG); 16119 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 16120 // Zero out the rightmost bits. 16121 SmallVector<SDValue, 32> V( 16122 NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); 16123 return DAG.getNode(ISD::AND, dl, VT, SHL, 16124 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 16125 } 16126 if (Op.getOpcode() == ISD::SRL) { 16127 // Make a large shift. 16128 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, 16129 R, ShiftAmt, DAG); 16130 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 16131 // Zero out the leftmost bits. 
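          // As in the SHL case, there is no byte-granular shift instruction,
          // so the vector is shifted as i16 lanes and the bits that crossed a
          // byte boundary are masked away; e.g. for a right shift by 3 the
          // per-byte mask is 0xFF >> 3 = 0x1F.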
16132 SmallVector<SDValue, 32> V( 16133 NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); 16134 return DAG.getNode(ISD::AND, dl, VT, SRL, 16135 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 16136 } 16137 if (Op.getOpcode() == ISD::SRA) { 16138 if (ShiftAmt == 7) { 16139 // R s>> 7 === R s< 0 16140 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 16141 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 16142 } 16143 16144 // R s>> a === ((R u>> a) ^ m) - m 16145 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 16146 SmallVector<SDValue, 32> V(NumElts, 16147 DAG.getConstant(128 >> ShiftAmt, MVT::i8)); 16148 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); 16149 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 16150 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 16151 return Res; 16152 } 16153 llvm_unreachable("Unknown shift opcode."); 16154 } 16155 } 16156 } 16157 16158 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 16159 if (!Subtarget->is64Bit() && 16160 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && 16161 Amt.getOpcode() == ISD::BITCAST && 16162 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 16163 Amt = Amt.getOperand(0); 16164 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 16165 VT.getVectorNumElements(); 16166 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 16167 uint64_t ShiftAmt = 0; 16168 for (unsigned i = 0; i != Ratio; ++i) { 16169 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); 16170 if (!C) 16171 return SDValue(); 16172 // 6 == Log2(64) 16173 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 16174 } 16175 // Check remaining shift amounts. 16176 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 16177 uint64_t ShAmt = 0; 16178 for (unsigned j = 0; j != Ratio; ++j) { 16179 ConstantSDNode *C = 16180 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); 16181 if (!C) 16182 return SDValue(); 16183 // 6 == Log2(64) 16184 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); 16185 } 16186 if (ShAmt != ShiftAmt) 16187 return SDValue(); 16188 } 16189 switch (Op.getOpcode()) { 16190 default: 16191 llvm_unreachable("Unknown shift opcode!"); 16192 case ISD::SHL: 16193 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 16194 DAG); 16195 case ISD::SRL: 16196 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 16197 DAG); 16198 case ISD::SRA: 16199 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 16200 DAG); 16201 } 16202 } 16203 16204 return SDValue(); 16205 } 16206 16207 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, 16208 const X86Subtarget* Subtarget) { 16209 MVT VT = Op.getSimpleValueType(); 16210 SDLoc dl(Op); 16211 SDValue R = Op.getOperand(0); 16212 SDValue Amt = Op.getOperand(1); 16213 16214 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || 16215 VT == MVT::v4i32 || VT == MVT::v8i16 || 16216 (Subtarget->hasInt256() && 16217 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || 16218 VT == MVT::v8i32 || VT == MVT::v16i16)) || 16219 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { 16220 SDValue BaseShAmt; 16221 EVT EltVT = VT.getVectorElementType(); 16222 16223 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { 16224 // Check if this build_vector node is doing a splat. 16225 // If so, then set BaseShAmt equal to the splat value. 
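      // e.g. a shift whose amount is a splat of a variable scalar,
      //   (shl v4i32 %x, (build_vector %n, %n, %n, %n)),
      // is emitted below as a single vector shift taking the scalar amount %n.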
16226 BaseShAmt = BV->getSplatValue(); 16227 if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) 16228 BaseShAmt = SDValue(); 16229 } else { 16230 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) 16231 Amt = Amt.getOperand(0); 16232 16233 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); 16234 if (SVN && SVN->isSplat()) { 16235 unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); 16236 SDValue InVec = Amt.getOperand(0); 16237 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 16238 assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && 16239 "Unexpected shuffle index found!"); 16240 BaseShAmt = InVec.getOperand(SplatIdx); 16241 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 16242 if (ConstantSDNode *C = 16243 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 16244 if (C->getZExtValue() == SplatIdx) 16245 BaseShAmt = InVec.getOperand(1); 16246 } 16247 } 16248 16249 if (!BaseShAmt) 16250 // Avoid introducing an extract element from a shuffle. 16251 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, 16252 DAG.getIntPtrConstant(SplatIdx)); 16253 } 16254 } 16255 16256 if (BaseShAmt.getNode()) { 16257 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); 16258 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) 16259 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); 16260 else if (EltVT.bitsLT(MVT::i32)) 16261 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); 16262 16263 switch (Op.getOpcode()) { 16264 default: 16265 llvm_unreachable("Unknown shift opcode!"); 16266 case ISD::SHL: 16267 switch (VT.SimpleTy) { 16268 default: return SDValue(); 16269 case MVT::v2i64: 16270 case MVT::v4i32: 16271 case MVT::v8i16: 16272 case MVT::v4i64: 16273 case MVT::v8i32: 16274 case MVT::v16i16: 16275 case MVT::v16i32: 16276 case MVT::v8i64: 16277 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); 16278 } 16279 case ISD::SRA: 16280 switch (VT.SimpleTy) { 16281 default: return SDValue(); 16282 case MVT::v4i32: 16283 case MVT::v8i16: 16284 case MVT::v8i32: 16285 case MVT::v16i16: 16286 case MVT::v16i32: 16287 case MVT::v8i64: 16288 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); 16289 } 16290 case ISD::SRL: 16291 switch (VT.SimpleTy) { 16292 default: return SDValue(); 16293 case MVT::v2i64: 16294 case MVT::v4i32: 16295 case MVT::v8i16: 16296 case MVT::v4i64: 16297 case MVT::v8i32: 16298 case MVT::v16i16: 16299 case MVT::v16i32: 16300 case MVT::v8i64: 16301 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); 16302 } 16303 } 16304 } 16305 } 16306 16307 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 
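  // On 32-bit targets a v2i64 splat of shift amounts typically appears as
  //   (bitcast (build_vector i32 lo0, i32 hi0, i32 lo1, i32 hi1)),
  // so the code below looks through the bitcast and checks that every group
  // of Ratio i32 elements repeats the first group, i.e. all lanes are shifted
  // by the same amount.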
16308 if (!Subtarget->is64Bit() && 16309 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || 16310 (Subtarget->hasAVX512() && VT == MVT::v8i64)) && 16311 Amt.getOpcode() == ISD::BITCAST && 16312 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 16313 Amt = Amt.getOperand(0); 16314 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 16315 VT.getVectorNumElements(); 16316 std::vector<SDValue> Vals(Ratio); 16317 for (unsigned i = 0; i != Ratio; ++i) 16318 Vals[i] = Amt.getOperand(i); 16319 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 16320 for (unsigned j = 0; j != Ratio; ++j) 16321 if (Vals[j] != Amt.getOperand(i + j)) 16322 return SDValue(); 16323 } 16324 switch (Op.getOpcode()) { 16325 default: 16326 llvm_unreachable("Unknown shift opcode!"); 16327 case ISD::SHL: 16328 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); 16329 case ISD::SRL: 16330 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); 16331 case ISD::SRA: 16332 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); 16333 } 16334 } 16335 16336 return SDValue(); 16337 } 16338 16339 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, 16340 SelectionDAG &DAG) { 16341 MVT VT = Op.getSimpleValueType(); 16342 SDLoc dl(Op); 16343 SDValue R = Op.getOperand(0); 16344 SDValue Amt = Op.getOperand(1); 16345 16346 assert(VT.isVector() && "Custom lowering only for vector shifts!"); 16347 assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); 16348 16349 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) 16350 return V; 16351 16352 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) 16353 return V; 16354 16355 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) 16356 return Op; 16357 16358 // AVX2 has VPSLLV/VPSRAV/VPSRLV. 16359 if (Subtarget->hasInt256()) { 16360 if (Op.getOpcode() == ISD::SRL && 16361 (VT == MVT::v2i64 || VT == MVT::v4i32 || 16362 VT == MVT::v4i64 || VT == MVT::v8i32)) 16363 return Op; 16364 if (Op.getOpcode() == ISD::SHL && 16365 (VT == MVT::v2i64 || VT == MVT::v4i32 || 16366 VT == MVT::v4i64 || VT == MVT::v8i32)) 16367 return Op; 16368 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) 16369 return Op; 16370 } 16371 16372 // 2i64 vector logical shifts can efficiently avoid scalarization - do the 16373 // shifts per-lane and then shuffle the partial results back together. 16374 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { 16375 // Splat the shift amounts so the scalar shifts above will catch it. 16376 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); 16377 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); 16378 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); 16379 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); 16380 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); 16381 } 16382 16383 // If possible, lower this packed shift into a vector multiply instead of 16384 // expanding it into a sequence of scalar shifts. 16385 // Do this only if the vector shift count is a constant build_vector. 
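  // e.g. (shl v4i32 %x, <1, 2, 3, 4>) is rewritten as
  //      (mul v4i32 %x, <2, 4, 8, 16>),
  // i.e. a single vector multiply by per-lane powers of two.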
16386 if (Op.getOpcode() == ISD::SHL && 16387 (VT == MVT::v8i16 || VT == MVT::v4i32 || 16388 (Subtarget->hasInt256() && VT == MVT::v16i16)) && 16389 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 16390 SmallVector<SDValue, 8> Elts; 16391 EVT SVT = VT.getScalarType(); 16392 unsigned SVTBits = SVT.getSizeInBits(); 16393 const APInt &One = APInt(SVTBits, 1); 16394 unsigned NumElems = VT.getVectorNumElements(); 16395 16396 for (unsigned i=0; i !=NumElems; ++i) { 16397 SDValue Op = Amt->getOperand(i); 16398 if (Op->getOpcode() == ISD::UNDEF) { 16399 Elts.push_back(Op); 16400 continue; 16401 } 16402 16403 ConstantSDNode *ND = cast<ConstantSDNode>(Op); 16404 const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); 16405 uint64_t ShAmt = C.getZExtValue(); 16406 if (ShAmt >= SVTBits) { 16407 Elts.push_back(DAG.getUNDEF(SVT)); 16408 continue; 16409 } 16410 Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); 16411 } 16412 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); 16413 return DAG.getNode(ISD::MUL, dl, VT, R, BV); 16414 } 16415 16416 // Lower SHL with variable shift amount. 16417 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 16418 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); 16419 16420 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); 16421 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 16422 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 16423 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 16424 } 16425 16426 // If possible, lower this shift as a sequence of two shifts by 16427 // constant plus a MOVSS/MOVSD instead of scalarizing it. 16428 // Example: 16429 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) 16430 // 16431 // Could be rewritten as: 16432 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) 16433 // 16434 // The advantage is that the two shifts from the example would be 16435 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing 16436 // the vector shift into four scalar shifts plus four pairs of vector 16437 // insert/extract. 16438 if ((VT == MVT::v8i16 || VT == MVT::v4i32) && 16439 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 16440 unsigned TargetOpcode = X86ISD::MOVSS; 16441 bool CanBeSimplified; 16442 // The splat value for the first packed shift (the 'X' from the example). 16443 SDValue Amt1 = Amt->getOperand(0); 16444 // The splat value for the second packed shift (the 'Y' from the example). 16445 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : 16446 Amt->getOperand(2); 16447 16448 // See if it is possible to replace this node with a sequence of 16449 // two shifts followed by a MOVSS/MOVSD 16450 if (VT == MVT::v4i32) { 16451 // Check if it is legal to use a MOVSS. 16452 CanBeSimplified = Amt2 == Amt->getOperand(2) && 16453 Amt2 == Amt->getOperand(3); 16454 if (!CanBeSimplified) { 16455 // Otherwise, check if we can still simplify this node using a MOVSD. 16456 CanBeSimplified = Amt1 == Amt->getOperand(1) && 16457 Amt->getOperand(2) == Amt->getOperand(3); 16458 TargetOpcode = X86ISD::MOVSD; 16459 Amt2 = Amt->getOperand(2); 16460 } 16461 } else { 16462 // Do similar checks for the case where the machine value type 16463 // is MVT::v8i16. 
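      // For v8i16 the MOVSS pattern is <X,X,Y,Y,Y,Y,Y,Y> (the low 32 bits,
      // i.e. the first two i16 lanes, come from the shift by X) and the MOVSD
      // fallback is <X,X,X,X,Y,Y,Y,Y> (the low 64 bits come from the shift
      // by X).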
16464 CanBeSimplified = Amt1 == Amt->getOperand(1); 16465 for (unsigned i=3; i != 8 && CanBeSimplified; ++i) 16466 CanBeSimplified = Amt2 == Amt->getOperand(i); 16467 16468 if (!CanBeSimplified) { 16469 TargetOpcode = X86ISD::MOVSD; 16470 CanBeSimplified = true; 16471 Amt2 = Amt->getOperand(4); 16472 for (unsigned i=0; i != 4 && CanBeSimplified; ++i) 16473 CanBeSimplified = Amt1 == Amt->getOperand(i); 16474 for (unsigned j=4; j != 8 && CanBeSimplified; ++j) 16475 CanBeSimplified = Amt2 == Amt->getOperand(j); 16476 } 16477 } 16478 16479 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && 16480 isa<ConstantSDNode>(Amt2)) { 16481 // Replace this node with two shifts followed by a MOVSS/MOVSD. 16482 EVT CastVT = MVT::v4i32; 16483 SDValue Splat1 = 16484 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); 16485 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); 16486 SDValue Splat2 = 16487 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); 16488 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); 16489 if (TargetOpcode == X86ISD::MOVSD) 16490 CastVT = MVT::v2i64; 16491 SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); 16492 SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); 16493 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, 16494 BitCast1, DAG); 16495 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 16496 } 16497 } 16498 16499 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 16500 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 16501 16502 // a = a << 5; 16503 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); 16504 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 16505 16506 // Turn 'a' into a mask suitable for VSELECT 16507 SDValue VSelM = DAG.getConstant(0x80, VT); 16508 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 16509 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 16510 16511 SDValue CM1 = DAG.getConstant(0x0f, VT); 16512 SDValue CM2 = DAG.getConstant(0x3f, VT); 16513 16514 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 16515 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 16516 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); 16517 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 16518 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 16519 16520 // a += a 16521 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 16522 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 16523 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 16524 16525 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 16526 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 16527 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); 16528 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 16529 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 16530 16531 // a += a 16532 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 16533 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 16534 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 16535 16536 // return VSELECT(r, r+r, a); 16537 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 16538 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 16539 return R; 16540 } 16541 16542 // It's worth extending once and using the v8i32 shifts for 16-bit types, but 16543 // the extra overheads to get from v16i8 to v8i32 make the existing SSE 16544 // solution better. 16545 if (Subtarget->hasInt256() && VT == MVT::v8i16) { 16546 MVT NewVT = VT == MVT::v8i16 ? 
MVT::v8i32 : MVT::v16i16; 16547 unsigned ExtOpc = 16548 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 16549 R = DAG.getNode(ExtOpc, dl, NewVT, R); 16550 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); 16551 return DAG.getNode(ISD::TRUNCATE, dl, VT, 16552 DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); 16553 } 16554 16555 // Decompose 256-bit shifts into smaller 128-bit shifts. 16556 if (VT.is256BitVector()) { 16557 unsigned NumElems = VT.getVectorNumElements(); 16558 MVT EltVT = VT.getVectorElementType(); 16559 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 16560 16561 // Extract the two vectors 16562 SDValue V1 = Extract128BitVector(R, 0, DAG, dl); 16563 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); 16564 16565 // Recreate the shift amount vectors 16566 SDValue Amt1, Amt2; 16567 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 16568 // Constant shift amount 16569 SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); 16570 ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); 16571 ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); 16572 16573 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); 16574 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); 16575 } else { 16576 // Variable shift amount 16577 Amt1 = Extract128BitVector(Amt, 0, DAG, dl); 16578 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); 16579 } 16580 16581 // Issue new vector shifts for the smaller types 16582 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 16583 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 16584 16585 // Concatenate the result back 16586 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 16587 } 16588 16589 return SDValue(); 16590 } 16591 16592 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 16593 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 16594 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 16595 // looks for this combo and may remove the "setcc" instruction if the "setcc" 16596 // has only one use. 16597 SDNode *N = Op.getNode(); 16598 SDValue LHS = N->getOperand(0); 16599 SDValue RHS = N->getOperand(1); 16600 unsigned BaseOp = 0; 16601 unsigned Cond = 0; 16602 SDLoc DL(Op); 16603 switch (Op.getOpcode()) { 16604 default: llvm_unreachable("Unknown ovf instruction!"); 16605 case ISD::SADDO: 16606 // A subtract of one will be selected as a INC. Note that INC doesn't 16607 // set CF, so we can't do this for UADDO. 16608 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 16609 if (C->isOne()) { 16610 BaseOp = X86ISD::INC; 16611 Cond = X86::COND_O; 16612 break; 16613 } 16614 BaseOp = X86ISD::ADD; 16615 Cond = X86::COND_O; 16616 break; 16617 case ISD::UADDO: 16618 BaseOp = X86ISD::ADD; 16619 Cond = X86::COND_B; 16620 break; 16621 case ISD::SSUBO: 16622 // A subtract of one will be selected as a DEC. Note that DEC doesn't 16623 // set CF, so we can't do this for USUBO. 16624 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 16625 if (C->isOne()) { 16626 BaseOp = X86ISD::DEC; 16627 Cond = X86::COND_O; 16628 break; 16629 } 16630 BaseOp = X86ISD::SUB; 16631 Cond = X86::COND_O; 16632 break; 16633 case ISD::USUBO: 16634 BaseOp = X86ISD::SUB; 16635 Cond = X86::COND_B; 16636 break; 16637 case ISD::SMULO: 16638 BaseOp = N->getValueType(0) == MVT::i8 ? 
X86ISD::SMUL8 : X86ISD::SMUL; 16639 Cond = X86::COND_O; 16640 break; 16641 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 16642 if (N->getValueType(0) == MVT::i8) { 16643 BaseOp = X86ISD::UMUL8; 16644 Cond = X86::COND_O; 16645 break; 16646 } 16647 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 16648 MVT::i32); 16649 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 16650 16651 SDValue SetCC = 16652 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 16653 DAG.getConstant(X86::COND_O, MVT::i32), 16654 SDValue(Sum.getNode(), 2)); 16655 16656 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 16657 } 16658 } 16659 16660 // Also sets EFLAGS. 16661 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 16662 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 16663 16664 SDValue SetCC = 16665 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 16666 DAG.getConstant(Cond, MVT::i32), 16667 SDValue(Sum.getNode(), 1)); 16668 16669 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 16670 } 16671 16672 /// Returns true if the operand type is exactly twice the native width, and 16673 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. 16674 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations 16675 /// (otherwise we leave them alone to become __sync_fetch_and_... calls). 16676 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { 16677 unsigned OpWidth = MemType->getPrimitiveSizeInBits(); 16678 16679 if (OpWidth == 64) 16680 return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b 16681 else if (OpWidth == 128) 16682 return Subtarget->hasCmpxchg16b(); 16683 else 16684 return false; 16685 } 16686 16687 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 16688 return needsCmpXchgNb(SI->getValueOperand()->getType()); 16689 } 16690 16691 // Note: this turns large loads into lock cmpxchg8b/16b. 16692 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. 16693 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 16694 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); 16695 return needsCmpXchgNb(PTy->getElementType()); 16696 } 16697 16698 TargetLoweringBase::AtomicRMWExpansionKind 16699 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 16700 unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; 16701 const Type *MemType = AI->getType(); 16702 16703 // If the operand is too big, we must see if cmpxchg8/16b is available 16704 // and default to library calls otherwise. 16705 if (MemType->getPrimitiveSizeInBits() > NativeWidth) { 16706 return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg 16707 : AtomicRMWExpansionKind::None; 16708 } 16709 16710 AtomicRMWInst::BinOp Op = AI->getOperation(); 16711 switch (Op) { 16712 default: 16713 llvm_unreachable("Unknown atomic operation"); 16714 case AtomicRMWInst::Xchg: 16715 case AtomicRMWInst::Add: 16716 case AtomicRMWInst::Sub: 16717 // It's better to use xadd, xsub or xchg for these in all cases. 16718 return AtomicRMWExpansionKind::None; 16719 case AtomicRMWInst::Or: 16720 case AtomicRMWInst::And: 16721 case AtomicRMWInst::Xor: 16722 // If the atomicrmw's result isn't actually used, we can just add a "lock" 16723 // prefix to a normal instruction for these operations. 16724 return !AI->use_empty() ? 
AtomicRMWExpansionKind::CmpXChg 16725 : AtomicRMWExpansionKind::None; 16726 case AtomicRMWInst::Nand: 16727 case AtomicRMWInst::Max: 16728 case AtomicRMWInst::Min: 16729 case AtomicRMWInst::UMax: 16730 case AtomicRMWInst::UMin: 16731 // These always require a non-trivial set of data operations on x86. We must 16732 // use a cmpxchg loop. 16733 return AtomicRMWExpansionKind::CmpXChg; 16734 } 16735 } 16736 16737 static bool hasMFENCE(const X86Subtarget& Subtarget) { 16738 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 16739 // no-sse2). There isn't any reason to disable it if the target processor 16740 // supports it. 16741 return Subtarget.hasSSE2() || Subtarget.is64Bit(); 16742 } 16743 16744 LoadInst * 16745 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { 16746 unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; 16747 const Type *MemType = AI->getType(); 16748 // Accesses larger than the native width are turned into cmpxchg/libcalls, so 16749 // there is no benefit in turning such RMWs into loads, and it is actually 16750 // harmful as it introduces a mfence. 16751 if (MemType->getPrimitiveSizeInBits() > NativeWidth) 16752 return nullptr; 16753 16754 auto Builder = IRBuilder<>(AI); 16755 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 16756 auto SynchScope = AI->getSynchScope(); 16757 // We must restrict the ordering to avoid generating loads with Release or 16758 // ReleaseAcquire orderings. 16759 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); 16760 auto Ptr = AI->getPointerOperand(); 16761 16762 // Before the load we need a fence. Here is an example lifted from 16763 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence 16764 // is required: 16765 // Thread 0: 16766 // x.store(1, relaxed); 16767 // r1 = y.fetch_add(0, release); 16768 // Thread 1: 16769 // y.fetch_add(42, acquire); 16770 // r2 = x.load(relaxed); 16771 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is 16772 // lowered to just a load without a fence. A mfence flushes the store buffer, 16773 // making the optimization clearly correct. 16774 // FIXME: it is required if isAtLeastRelease(Order) but it is not clear 16775 // otherwise, we might be able to be more agressive on relaxed idempotent 16776 // rmw. In practice, they do not look useful, so we don't try to be 16777 // especially clever. 16778 if (SynchScope == SingleThread) { 16779 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at 16780 // the IR level, so we must wrap it in an intrinsic. 16781 return nullptr; 16782 } else if (hasMFENCE(*Subtarget)) { 16783 Function *MFence = llvm::Intrinsic::getDeclaration(M, 16784 Intrinsic::x86_sse2_mfence); 16785 Builder.CreateCall(MFence); 16786 } else { 16787 // FIXME: it might make sense to use a locked operation here but on a 16788 // different cache-line to prevent cache-line bouncing. In practice it 16789 // is probably a small win, and x86 processors without mfence are rare 16790 // enough that we do not bother. 16791 return nullptr; 16792 } 16793 16794 // Finally we can emit the atomic load. 
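  // Net effect, roughly: an idempotent RMW such as
  //   %old = atomicrmw add i32* %p, i32 0 seq_cst
  // becomes an mfence followed by an atomic load of %p, and all uses of the
  // original atomicrmw are rewired to that load.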
16795 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, 16796 AI->getType()->getPrimitiveSizeInBits()); 16797 Loaded->setAtomic(Order, SynchScope); 16798 AI->replaceAllUsesWith(Loaded); 16799 AI->eraseFromParent(); 16800 return Loaded; 16801 } 16802 16803 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 16804 SelectionDAG &DAG) { 16805 SDLoc dl(Op); 16806 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 16807 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 16808 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 16809 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 16810 16811 // The only fence that needs an instruction is a sequentially-consistent 16812 // cross-thread fence. 16813 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 16814 if (hasMFENCE(*Subtarget)) 16815 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 16816 16817 SDValue Chain = Op.getOperand(0); 16818 SDValue Zero = DAG.getConstant(0, MVT::i32); 16819 SDValue Ops[] = { 16820 DAG.getRegister(X86::ESP, MVT::i32), // Base 16821 DAG.getTargetConstant(1, MVT::i8), // Scale 16822 DAG.getRegister(0, MVT::i32), // Index 16823 DAG.getTargetConstant(0, MVT::i32), // Disp 16824 DAG.getRegister(0, MVT::i32), // Segment. 16825 Zero, 16826 Chain 16827 }; 16828 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); 16829 return SDValue(Res, 0); 16830 } 16831 16832 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 16833 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 16834 } 16835 16836 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, 16837 SelectionDAG &DAG) { 16838 MVT T = Op.getSimpleValueType(); 16839 SDLoc DL(Op); 16840 unsigned Reg = 0; 16841 unsigned size = 0; 16842 switch(T.SimpleTy) { 16843 default: llvm_unreachable("Invalid value type!"); 16844 case MVT::i8: Reg = X86::AL; size = 1; break; 16845 case MVT::i16: Reg = X86::AX; size = 2; break; 16846 case MVT::i32: Reg = X86::EAX; size = 4; break; 16847 case MVT::i64: 16848 assert(Subtarget->is64Bit() && "Node not type legal!"); 16849 Reg = X86::RAX; size = 8; 16850 break; 16851 } 16852 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 16853 Op.getOperand(2), SDValue()); 16854 SDValue Ops[] = { cpIn.getValue(0), 16855 Op.getOperand(1), 16856 Op.getOperand(3), 16857 DAG.getTargetConstant(size, MVT::i8), 16858 cpIn.getValue(1) }; 16859 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 16860 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 16861 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 16862 Ops, T, MMO); 16863 16864 SDValue cpOut = 16865 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 16866 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, 16867 MVT::i32, cpOut.getValue(2)); 16868 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), 16869 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); 16870 16871 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); 16872 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); 16873 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); 16874 return SDValue(); 16875 } 16876 16877 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, 16878 SelectionDAG &DAG) { 16879 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 16880 MVT DstVT = Op.getSimpleValueType(); 16881 16882 if (SrcVT == MVT::v2i32 || SrcVT 
== MVT::v4i16 || SrcVT == MVT::v8i8) { 16883 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 16884 if (DstVT != MVT::f64) 16885 // This conversion needs to be expanded. 16886 return SDValue(); 16887 16888 SDValue InVec = Op->getOperand(0); 16889 SDLoc dl(Op); 16890 unsigned NumElts = SrcVT.getVectorNumElements(); 16891 EVT SVT = SrcVT.getVectorElementType(); 16892 16893 // Widen the vector in input in the case of MVT::v2i32. 16894 // Example: from MVT::v2i32 to MVT::v4i32. 16895 SmallVector<SDValue, 16> Elts; 16896 for (unsigned i = 0, e = NumElts; i != e; ++i) 16897 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, 16898 DAG.getIntPtrConstant(i))); 16899 16900 // Explicitly mark the extra elements as Undef. 16901 Elts.append(NumElts, DAG.getUNDEF(SVT)); 16902 16903 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); 16904 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); 16905 SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); 16906 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, 16907 DAG.getIntPtrConstant(0)); 16908 } 16909 16910 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 16911 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 16912 assert((DstVT == MVT::i64 || 16913 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 16914 "Unexpected custom BITCAST"); 16915 // i64 <=> MMX conversions are Legal. 16916 if (SrcVT==MVT::i64 && DstVT.isVector()) 16917 return Op; 16918 if (DstVT==MVT::i64 && SrcVT.isVector()) 16919 return Op; 16920 // MMX <=> MMX conversions are Legal. 16921 if (SrcVT.isVector() && DstVT.isVector()) 16922 return Op; 16923 // All other conversions need to be expanded. 16924 return SDValue(); 16925 } 16926 16927 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, 16928 SelectionDAG &DAG) { 16929 SDNode *Node = Op.getNode(); 16930 SDLoc dl(Node); 16931 16932 Op = Op.getOperand(0); 16933 EVT VT = Op.getValueType(); 16934 assert((VT.is128BitVector() || VT.is256BitVector()) && 16935 "CTPOP lowering only implemented for 128/256-bit wide vector types"); 16936 16937 unsigned NumElts = VT.getVectorNumElements(); 16938 EVT EltVT = VT.getVectorElementType(); 16939 unsigned Len = EltVT.getSizeInBits(); 16940 16941 // This is the vectorized version of the "best" algorithm from 16942 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 16943 // with a minor tweak to use a series of adds + shifts instead of vector 16944 // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: 16945 // 16946 // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled 16947 // v8i32 => Always profitable 16948 // 16949 // FIXME: There a couple of possible improvements: 16950 // 16951 // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). 16952 // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html 16953 // 16954 assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && 16955 "CTPOP not implemented for this vector element type."); 16956 16957 // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid 16958 // extra legalization. 16959 bool NeedsBitcast = EltVT == MVT::i32; 16960 MVT BitcastVT = VT.is256BitVector() ? 
MVT::v4i64 : MVT::v2i64; 16961 16962 SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT); 16963 SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT); 16964 SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT); 16965 16966 // v = v - ((v >> 1) & 0x55555555...) 16967 SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT)); 16968 SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); 16969 SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); 16970 if (NeedsBitcast) 16971 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); 16972 16973 SmallVector<SDValue, 8> Mask55(NumElts, Cst55); 16974 SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); 16975 if (NeedsBitcast) 16976 M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); 16977 16978 SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); 16979 if (VT != And.getValueType()) 16980 And = DAG.getNode(ISD::BITCAST, dl, VT, And); 16981 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); 16982 16983 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) 16984 SmallVector<SDValue, 8> Mask33(NumElts, Cst33); 16985 SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); 16986 SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT)); 16987 SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); 16988 16989 Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); 16990 if (NeedsBitcast) { 16991 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); 16992 M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); 16993 Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); 16994 } 16995 16996 SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); 16997 SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); 16998 if (VT != AndRHS.getValueType()) { 16999 AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); 17000 AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); 17001 } 17002 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); 17003 17004 // v = (v + (v >> 4)) & 0x0F0F0F0F... 17005 SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT)); 17006 SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); 17007 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); 17008 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); 17009 17010 SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F); 17011 SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); 17012 if (NeedsBitcast) { 17013 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); 17014 M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); 17015 } 17016 And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); 17017 if (VT != And.getValueType()) 17018 And = DAG.getNode(ISD::BITCAST, dl, VT, And); 17019 17020 // The algorithm mentioned above uses: 17021 // v = (v * 0x01010101...) >> (Len - 8) 17022 // 17023 // Change it to use vector adds + vector shifts which yield faster results on 17024 // Haswell than using vector integer multiplication. 
17025 // 17026 // For i32 elements: 17027 // v = v + (v >> 8) 17028 // v = v + (v >> 16) 17029 // 17030 // For i64 elements: 17031 // v = v + (v >> 8) 17032 // v = v + (v >> 16) 17033 // v = v + (v >> 32) 17034 // 17035 Add = And; 17036 SmallVector<SDValue, 8> Csts; 17037 for (unsigned i = 8; i <= Len/2; i *= 2) { 17038 Csts.assign(NumElts, DAG.getConstant(i, EltVT)); 17039 SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); 17040 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); 17041 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); 17042 Csts.clear(); 17043 } 17044 17045 // The result is on the least significant 6-bits on i32 and 7-bits on i64. 17046 SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT); 17047 SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F); 17048 SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); 17049 if (NeedsBitcast) { 17050 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); 17051 M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); 17052 } 17053 And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); 17054 if (VT != And.getValueType()) 17055 And = DAG.getNode(ISD::BITCAST, dl, VT, And); 17056 17057 return And; 17058 } 17059 17060 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 17061 SDNode *Node = Op.getNode(); 17062 SDLoc dl(Node); 17063 EVT T = Node->getValueType(0); 17064 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 17065 DAG.getConstant(0, T), Node->getOperand(2)); 17066 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 17067 cast<AtomicSDNode>(Node)->getMemoryVT(), 17068 Node->getOperand(0), 17069 Node->getOperand(1), negOp, 17070 cast<AtomicSDNode>(Node)->getMemOperand(), 17071 cast<AtomicSDNode>(Node)->getOrdering(), 17072 cast<AtomicSDNode>(Node)->getSynchScope()); 17073 } 17074 17075 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 17076 SDNode *Node = Op.getNode(); 17077 SDLoc dl(Node); 17078 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 17079 17080 // Convert seq_cst store -> xchg 17081 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 17082 // FIXME: On 32-bit, store -> fist or movq would be more efficient 17083 // (The only way to get a 16-byte store is cmpxchg16b) 17084 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 17085 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 17086 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 17087 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 17088 cast<AtomicSDNode>(Node)->getMemoryVT(), 17089 Node->getOperand(0), 17090 Node->getOperand(1), Node->getOperand(2), 17091 cast<AtomicSDNode>(Node)->getMemOperand(), 17092 cast<AtomicSDNode>(Node)->getOrdering(), 17093 cast<AtomicSDNode>(Node)->getSynchScope()); 17094 return Swap.getValue(1); 17095 } 17096 // Other atomic stores have a simple pattern. 17097 return Op; 17098 } 17099 17100 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 17101 EVT VT = Op.getNode()->getSimpleValueType(0); 17102 17103 // Let legalize expand this if it isn't a legal type yet. 
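// ADDC/ADDE and SUBC/SUBE model a carry chain: the X86 nodes chosen below
// expose EFLAGS as a second i32 result so the carry can be threaded from one
// piece of a wide operation to the next. For illustration only (registers
// are arbitrary), a 128-bit add on x86-64 ends up as roughly:
//   addq %rsi, %rax    (ADDC -> X86ISD::ADD, sets CF)
//   adcq %rdi, %rdx    (ADDE -> X86ISD::ADC, consumes CF)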
17104 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17105 return SDValue();
17106
17107 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17108
17109 unsigned Opc;
17110 bool ExtraOp = false;
17111 switch (Op.getOpcode()) {
17112 default: llvm_unreachable("Invalid code");
17113 case ISD::ADDC: Opc = X86ISD::ADD; break;
17114 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
17115 case ISD::SUBC: Opc = X86ISD::SUB; break;
17116 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
17117 }
17118
17119 if (!ExtraOp)
17120 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
17121 Op.getOperand(1));
17122 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
17123 Op.getOperand(1), Op.getOperand(2));
17124 }
17125
17126 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
17127 SelectionDAG &DAG) {
17128 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
17129
17130 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
17131 // which returns the values as { float, float } (in XMM0) or
17132 // { double, double } (which is returned in XMM0, XMM1).
17133 SDLoc dl(Op);
17134 SDValue Arg = Op.getOperand(0);
17135 EVT ArgVT = Arg.getValueType();
17136 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
17137
17138 TargetLowering::ArgListTy Args;
17139 TargetLowering::ArgListEntry Entry;
17140
17141 Entry.Node = Arg;
17142 Entry.Ty = ArgTy;
17143 Entry.isSExt = false;
17144 Entry.isZExt = false;
17145 Args.push_back(Entry);
17146
17147 bool isF64 = ArgVT == MVT::f64;
17148 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
17149 // the small struct {f32, f32} is returned in (eax, edx). For f64,
17150 // the results are returned via SRet in memory.
17151 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
17152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17153 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
17154
17155 Type *RetTy = isF64
17156 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
17157 : (Type*)VectorType::get(ArgTy, 4);
17158
17159 TargetLowering::CallLoweringInfo CLI(DAG);
17160 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
17161 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
17162
17163 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
17164
17165 if (isF64)
17166 // Returned in xmm0 and xmm1.
17167 return CallResult.first;
17168
17169 // Returned in bits 0:31 and 32:63 of xmm0.
17170 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
17171 CallResult.first, DAG.getIntPtrConstant(0));
17172 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
17173 CallResult.first, DAG.getIntPtrConstant(1));
17174 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
17175 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
17176 }
17177
17178 /// LowerOperation - Provide custom lowering hooks for some operations.
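/// Every opcode that the constructor marked Custom for some type is
/// dispatched here to a dedicated Lower* helper; opcodes that do not appear
/// in this switch are either natively legal or expanded by the generic
/// legalizer.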
17179 /// 17180 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 17181 switch (Op.getOpcode()) { 17182 default: llvm_unreachable("Should not custom lower this!"); 17183 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); 17184 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 17185 return LowerCMP_SWAP(Op, Subtarget, DAG); 17186 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); 17187 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 17188 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 17189 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 17190 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); 17191 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); 17192 case ISD::VSELECT: return LowerVSELECT(Op, DAG); 17193 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 17194 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 17195 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); 17196 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); 17197 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 17198 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 17199 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 17200 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 17201 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 17202 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 17203 case ISD::SHL_PARTS: 17204 case ISD::SRA_PARTS: 17205 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 17206 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 17207 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 17208 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 17209 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); 17210 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); 17211 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); 17212 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 17213 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 17214 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 17215 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); 17216 case ISD::FABS: 17217 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); 17218 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 17219 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 17220 case ISD::SETCC: return LowerSETCC(Op, DAG); 17221 case ISD::SELECT: return LowerSELECT(Op, DAG); 17222 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 17223 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 17224 case ISD::VASTART: return LowerVASTART(Op, DAG); 17225 case ISD::VAARG: return LowerVAARG(Op, DAG); 17226 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 17227 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); 17228 case ISD::INTRINSIC_VOID: 17229 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); 17230 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 17231 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 17232 case ISD::FRAME_TO_ARGS_OFFSET: 17233 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 17234 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 17235 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 17236 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 17237 case 
ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 17238 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 17239 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 17240 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 17241 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 17242 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 17243 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 17244 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 17245 case ISD::UMUL_LOHI: 17246 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); 17247 case ISD::SRA: 17248 case ISD::SRL: 17249 case ISD::SHL: return LowerShift(Op, Subtarget, DAG); 17250 case ISD::SADDO: 17251 case ISD::UADDO: 17252 case ISD::SSUBO: 17253 case ISD::USUBO: 17254 case ISD::SMULO: 17255 case ISD::UMULO: return LowerXALUO(Op, DAG); 17256 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 17257 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); 17258 case ISD::ADDC: 17259 case ISD::ADDE: 17260 case ISD::SUBC: 17261 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 17262 case ISD::ADD: return LowerADD(Op, DAG); 17263 case ISD::SUB: return LowerSUB(Op, DAG); 17264 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); 17265 } 17266 } 17267 17268 /// ReplaceNodeResults - Replace a node with an illegal result type 17269 /// with a new node built out of custom code. 17270 void X86TargetLowering::ReplaceNodeResults(SDNode *N, 17271 SmallVectorImpl<SDValue>&Results, 17272 SelectionDAG &DAG) const { 17273 SDLoc dl(N); 17274 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17275 switch (N->getOpcode()) { 17276 default: 17277 llvm_unreachable("Do not know how to custom type legalize this operation!"); 17278 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 17279 case X86ISD::FMINC: 17280 case X86ISD::FMIN: 17281 case X86ISD::FMAXC: 17282 case X86ISD::FMAX: { 17283 EVT VT = N->getValueType(0); 17284 if (VT != MVT::v2f32) 17285 llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); 17286 SDValue UNDEF = DAG.getUNDEF(VT); 17287 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, 17288 N->getOperand(0), UNDEF); 17289 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, 17290 N->getOperand(1), UNDEF); 17291 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); 17292 return; 17293 } 17294 case ISD::SIGN_EXTEND_INREG: 17295 case ISD::ADDC: 17296 case ISD::ADDE: 17297 case ISD::SUBC: 17298 case ISD::SUBE: 17299 // We don't want to expand or promote these. 17300 return; 17301 case ISD::SDIV: 17302 case ISD::UDIV: 17303 case ISD::SREM: 17304 case ISD::UREM: 17305 case ISD::SDIVREM: 17306 case ISD::UDIVREM: { 17307 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); 17308 Results.push_back(V); 17309 return; 17310 } 17311 case ISD::FP_TO_SINT: 17312 // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert 17313 // (FP_TO_SINT (load f16)) to FP_TO_INT*. 
17314 if (N->getOperand(0).getValueType() == MVT::f16) 17315 break; 17316 // fallthrough 17317 case ISD::FP_TO_UINT: { 17318 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 17319 17320 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 17321 return; 17322 17323 std::pair<SDValue,SDValue> Vals = 17324 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 17325 SDValue FIST = Vals.first, StackSlot = Vals.second; 17326 if (FIST.getNode()) { 17327 EVT VT = N->getValueType(0); 17328 // Return a load from the stack slot. 17329 if (StackSlot.getNode()) 17330 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 17331 MachinePointerInfo(), 17332 false, false, false, 0)); 17333 else 17334 Results.push_back(FIST); 17335 } 17336 return; 17337 } 17338 case ISD::UINT_TO_FP: { 17339 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 17340 if (N->getOperand(0).getValueType() != MVT::v2i32 || 17341 N->getValueType(0) != MVT::v2f32) 17342 return; 17343 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, 17344 N->getOperand(0)); 17345 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 17346 MVT::f64); 17347 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); 17348 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, 17349 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); 17350 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); 17351 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); 17352 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); 17353 return; 17354 } 17355 case ISD::FP_ROUND: { 17356 if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) 17357 return; 17358 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); 17359 Results.push_back(V); 17360 return; 17361 } 17362 case ISD::FP_EXTEND: { 17363 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. 17364 // No other ValueType for FP_EXTEND should reach this point. 17365 assert(N->getValueType(0) == MVT::v2f32 && 17366 "Do not know how to legalize this Node"); 17367 return; 17368 } 17369 case ISD::INTRINSIC_W_CHAIN: { 17370 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 17371 switch (IntNo) { 17372 default : llvm_unreachable("Do not know how to custom type " 17373 "legalize this intrinsic operation!"); 17374 case Intrinsic::x86_rdtsc: 17375 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 17376 Results); 17377 case Intrinsic::x86_rdtscp: 17378 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, 17379 Results); 17380 case Intrinsic::x86_rdpmc: 17381 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); 17382 } 17383 } 17384 case ISD::READCYCLECOUNTER: { 17385 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 17386 Results); 17387 } 17388 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { 17389 EVT T = N->getValueType(0); 17390 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 17391 bool Regs64bit = T == MVT::i128; 17392 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 17393 SDValue cpInL, cpInH; 17394 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 17395 DAG.getConstant(0, HalfT)); 17396 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 17397 DAG.getConstant(1, HalfT)); 17398 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 17399 Regs64bit ? 
X86::RAX : X86::EAX, 17400 cpInL, SDValue()); 17401 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 17402 Regs64bit ? X86::RDX : X86::EDX, 17403 cpInH, cpInL.getValue(1)); 17404 SDValue swapInL, swapInH; 17405 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 17406 DAG.getConstant(0, HalfT)); 17407 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 17408 DAG.getConstant(1, HalfT)); 17409 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 17410 Regs64bit ? X86::RBX : X86::EBX, 17411 swapInL, cpInH.getValue(1)); 17412 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 17413 Regs64bit ? X86::RCX : X86::ECX, 17414 swapInH, swapInL.getValue(1)); 17415 SDValue Ops[] = { swapInH.getValue(0), 17416 N->getOperand(1), 17417 swapInH.getValue(1) }; 17418 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 17419 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 17420 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 17421 X86ISD::LCMPXCHG8_DAG; 17422 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); 17423 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 17424 Regs64bit ? X86::RAX : X86::EAX, 17425 HalfT, Result.getValue(1)); 17426 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 17427 Regs64bit ? X86::RDX : X86::EDX, 17428 HalfT, cpOutL.getValue(2)); 17429 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 17430 17431 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, 17432 MVT::i32, cpOutH.getValue(2)); 17433 SDValue Success = 17434 DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17435 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); 17436 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); 17437 17438 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); 17439 Results.push_back(Success); 17440 Results.push_back(EFLAGS.getValue(1)); 17441 return; 17442 } 17443 case ISD::ATOMIC_SWAP: 17444 case ISD::ATOMIC_LOAD_ADD: 17445 case ISD::ATOMIC_LOAD_SUB: 17446 case ISD::ATOMIC_LOAD_AND: 17447 case ISD::ATOMIC_LOAD_OR: 17448 case ISD::ATOMIC_LOAD_XOR: 17449 case ISD::ATOMIC_LOAD_NAND: 17450 case ISD::ATOMIC_LOAD_MIN: 17451 case ISD::ATOMIC_LOAD_MAX: 17452 case ISD::ATOMIC_LOAD_UMIN: 17453 case ISD::ATOMIC_LOAD_UMAX: 17454 case ISD::ATOMIC_LOAD: { 17455 // Delegate to generic TypeLegalization. Situations we can really handle 17456 // should have already been dealt with by AtomicExpandPass.cpp. 17457 break; 17458 } 17459 case ISD::BITCAST: { 17460 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 17461 EVT DstVT = N->getValueType(0); 17462 EVT SrcVT = N->getOperand(0)->getValueType(0); 17463 17464 if (SrcVT != MVT::f64 || 17465 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) 17466 return; 17467 17468 unsigned NumElts = DstVT.getVectorNumElements(); 17469 EVT SVT = DstVT.getVectorElementType(); 17470 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); 17471 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 17472 MVT::v2f64, N->getOperand(0)); 17473 SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); 17474 17475 if (ExperimentalVectorWideningLegalization) { 17476 // If we are legalizing vectors by widening, we already have the desired 17477 // legal vector type, just return it. 
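// (For example, under widening legalization a v2i32 result is represented
// directly by the v4i32 built above, so ToVecInt is already the final,
// legal value.)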
17478 Results.push_back(ToVecInt); 17479 return; 17480 } 17481 17482 SmallVector<SDValue, 8> Elts; 17483 for (unsigned i = 0, e = NumElts; i != e; ++i) 17484 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, 17485 ToVecInt, DAG.getIntPtrConstant(i))); 17486 17487 Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); 17488 } 17489 } 17490 } 17491 17492 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 17493 switch (Opcode) { 17494 default: return nullptr; 17495 case X86ISD::BSF: return "X86ISD::BSF"; 17496 case X86ISD::BSR: return "X86ISD::BSR"; 17497 case X86ISD::SHLD: return "X86ISD::SHLD"; 17498 case X86ISD::SHRD: return "X86ISD::SHRD"; 17499 case X86ISD::FAND: return "X86ISD::FAND"; 17500 case X86ISD::FANDN: return "X86ISD::FANDN"; 17501 case X86ISD::FOR: return "X86ISD::FOR"; 17502 case X86ISD::FXOR: return "X86ISD::FXOR"; 17503 case X86ISD::FSRL: return "X86ISD::FSRL"; 17504 case X86ISD::FILD: return "X86ISD::FILD"; 17505 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 17506 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 17507 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 17508 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 17509 case X86ISD::FLD: return "X86ISD::FLD"; 17510 case X86ISD::FST: return "X86ISD::FST"; 17511 case X86ISD::CALL: return "X86ISD::CALL"; 17512 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 17513 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; 17514 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; 17515 case X86ISD::BT: return "X86ISD::BT"; 17516 case X86ISD::CMP: return "X86ISD::CMP"; 17517 case X86ISD::COMI: return "X86ISD::COMI"; 17518 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 17519 case X86ISD::CMPM: return "X86ISD::CMPM"; 17520 case X86ISD::CMPMU: return "X86ISD::CMPMU"; 17521 case X86ISD::SETCC: return "X86ISD::SETCC"; 17522 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 17523 case X86ISD::FSETCC: return "X86ISD::FSETCC"; 17524 case X86ISD::CMOV: return "X86ISD::CMOV"; 17525 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 17526 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 17527 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 17528 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 17529 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 17530 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 17531 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 17532 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 17533 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 17534 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 17535 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 17536 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 17537 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 17538 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 17539 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 17540 case X86ISD::BLENDI: return "X86ISD::BLENDI"; 17541 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; 17542 case X86ISD::SUBUS: return "X86ISD::SUBUS"; 17543 case X86ISD::HADD: return "X86ISD::HADD"; 17544 case X86ISD::HSUB: return "X86ISD::HSUB"; 17545 case X86ISD::FHADD: return "X86ISD::FHADD"; 17546 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 17547 case X86ISD::UMAX: return "X86ISD::UMAX"; 17548 case X86ISD::UMIN: return "X86ISD::UMIN"; 17549 case X86ISD::SMAX: return "X86ISD::SMAX"; 17550 case X86ISD::SMIN: return "X86ISD::SMIN"; 17551 case X86ISD::FMAX: return "X86ISD::FMAX"; 17552 case X86ISD::FMIN: return "X86ISD::FMIN"; 17553 
case X86ISD::FMAXC: return "X86ISD::FMAXC"; 17554 case X86ISD::FMINC: return "X86ISD::FMINC"; 17555 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 17556 case X86ISD::FRCP: return "X86ISD::FRCP"; 17557 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 17558 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 17559 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 17560 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; 17561 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; 17562 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 17563 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 17564 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 17565 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 17566 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 17567 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 17568 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; 17569 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 17570 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 17571 case X86ISD::VZEXT: return "X86ISD::VZEXT"; 17572 case X86ISD::VSEXT: return "X86ISD::VSEXT"; 17573 case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; 17574 case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; 17575 case X86ISD::VINSERT: return "X86ISD::VINSERT"; 17576 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 17577 case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; 17578 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 17579 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 17580 case X86ISD::VSHL: return "X86ISD::VSHL"; 17581 case X86ISD::VSRL: return "X86ISD::VSRL"; 17582 case X86ISD::VSRA: return "X86ISD::VSRA"; 17583 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 17584 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 17585 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 17586 case X86ISD::CMPP: return "X86ISD::CMPP"; 17587 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 17588 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 17589 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; 17590 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; 17591 case X86ISD::ADD: return "X86ISD::ADD"; 17592 case X86ISD::SUB: return "X86ISD::SUB"; 17593 case X86ISD::ADC: return "X86ISD::ADC"; 17594 case X86ISD::SBB: return "X86ISD::SBB"; 17595 case X86ISD::SMUL: return "X86ISD::SMUL"; 17596 case X86ISD::UMUL: return "X86ISD::UMUL"; 17597 case X86ISD::SMUL8: return "X86ISD::SMUL8"; 17598 case X86ISD::UMUL8: return "X86ISD::UMUL8"; 17599 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; 17600 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; 17601 case X86ISD::INC: return "X86ISD::INC"; 17602 case X86ISD::DEC: return "X86ISD::DEC"; 17603 case X86ISD::OR: return "X86ISD::OR"; 17604 case X86ISD::XOR: return "X86ISD::XOR"; 17605 case X86ISD::AND: return "X86ISD::AND"; 17606 case X86ISD::BEXTR: return "X86ISD::BEXTR"; 17607 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 17608 case X86ISD::PTEST: return "X86ISD::PTEST"; 17609 case X86ISD::TESTP: return "X86ISD::TESTP"; 17610 case X86ISD::TESTM: return "X86ISD::TESTM"; 17611 case X86ISD::TESTNM: return "X86ISD::TESTNM"; 17612 case X86ISD::KORTEST: return "X86ISD::KORTEST"; 17613 case X86ISD::PACKSS: return "X86ISD::PACKSS"; 17614 case X86ISD::PACKUS: return "X86ISD::PACKUS"; 17615 case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; 17616 case X86ISD::VALIGN: return "X86ISD::VALIGN"; 17617 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 17618 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 17619 case X86ISD::PSHUFLW: return 
"X86ISD::PSHUFLW"; 17620 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 17621 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 17622 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 17623 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 17624 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 17625 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 17626 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 17627 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 17628 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 17629 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 17630 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 17631 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 17632 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 17633 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 17634 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; 17635 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; 17636 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; 17637 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 17638 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 17639 case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; 17640 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; 17641 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 17642 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 17643 case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; 17644 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 17645 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 17646 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 17647 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 17648 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 17649 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 17650 case X86ISD::SAHF: return "X86ISD::SAHF"; 17651 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 17652 case X86ISD::RDSEED: return "X86ISD::RDSEED"; 17653 case X86ISD::FMADD: return "X86ISD::FMADD"; 17654 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 17655 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 17656 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 17657 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 17658 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 17659 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; 17660 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; 17661 case X86ISD::XTEST: return "X86ISD::XTEST"; 17662 case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; 17663 case X86ISD::EXPAND: return "X86ISD::EXPAND"; 17664 case X86ISD::SELECT: return "X86ISD::SELECT"; 17665 case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; 17666 case X86ISD::RCP28: return "X86ISD::RCP28"; 17667 case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; 17668 case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; 17669 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; 17670 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; 17671 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; 17672 } 17673 } 17674 17675 // isLegalAddressingMode - Return true if the addressing mode represented 17676 // by AM is legal for this target, for a load/store of the specified type. 17677 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 17678 Type *Ty) const { 17679 // X86 supports extremely general addressing modes. 17680 CodeModel::Model M = getTargetMachine().getCodeModel(); 17681 Reloc::Model R = getTargetMachine().getRelocationModel(); 17682 17683 // X86 allows a sign-extended 32-bit immediate field as a displacement. 
17684 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) 17685 return false; 17686 17687 if (AM.BaseGV) { 17688 unsigned GVFlags = 17689 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 17690 17691 // If a reference to this global requires an extra load, we can't fold it. 17692 if (isGlobalStubReference(GVFlags)) 17693 return false; 17694 17695 // If BaseGV requires a register for the PIC base, we cannot also have a 17696 // BaseReg specified. 17697 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 17698 return false; 17699 17700 // If lower 4G is not available, then we must use rip-relative addressing. 17701 if ((M != CodeModel::Small || R != Reloc::Static) && 17702 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 17703 return false; 17704 } 17705 17706 switch (AM.Scale) { 17707 case 0: 17708 case 1: 17709 case 2: 17710 case 4: 17711 case 8: 17712 // These scales always work. 17713 break; 17714 case 3: 17715 case 5: 17716 case 9: 17717 // These scales are formed with basereg+scalereg. Only accept if there is 17718 // no basereg yet. 17719 if (AM.HasBaseReg) 17720 return false; 17721 break; 17722 default: // Other stuff never works. 17723 return false; 17724 } 17725 17726 return true; 17727 } 17728 17729 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { 17730 unsigned Bits = Ty->getScalarSizeInBits(); 17731 17732 // 8-bit shifts are always expensive, but versions with a scalar amount aren't 17733 // particularly cheaper than those without. 17734 if (Bits == 8) 17735 return false; 17736 17737 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make 17738 // variable shifts just as cheap as scalar ones. 17739 if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) 17740 return false; 17741 17742 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a 17743 // fully general vector. 17744 return true; 17745 } 17746 17747 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 17748 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 17749 return false; 17750 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 17751 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 17752 return NumBits1 > NumBits2; 17753 } 17754 17755 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 17756 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 17757 return false; 17758 17759 if (!isTypeLegal(EVT::getEVT(Ty1))) 17760 return false; 17761 17762 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 17763 17764 // Assuming the caller doesn't have a zeroext or signext return parameter, 17765 // truncation all the way down to i1 is valid. 17766 return true; 17767 } 17768 17769 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 17770 return isInt<32>(Imm); 17771 } 17772 17773 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 17774 // Can also use sub to handle negated immediates. 17775 return isInt<32>(Imm); 17776 } 17777 17778 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 17779 if (!VT1.isInteger() || !VT2.isInteger()) 17780 return false; 17781 unsigned NumBits1 = VT1.getSizeInBits(); 17782 unsigned NumBits2 = VT2.getSizeInBits(); 17783 return NumBits1 > NumBits2; 17784 } 17785 17786 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 17787 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
17788 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 17789 } 17790 17791 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 17792 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 17793 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 17794 } 17795 17796 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 17797 EVT VT1 = Val.getValueType(); 17798 if (isZExtFree(VT1, VT2)) 17799 return true; 17800 17801 if (Val.getOpcode() != ISD::LOAD) 17802 return false; 17803 17804 if (!VT1.isSimple() || !VT1.isInteger() || 17805 !VT2.isSimple() || !VT2.isInteger()) 17806 return false; 17807 17808 switch (VT1.getSimpleVT().SimpleTy) { 17809 default: break; 17810 case MVT::i8: 17811 case MVT::i16: 17812 case MVT::i32: 17813 // X86 has 8, 16, and 32-bit zero-extending loads. 17814 return true; 17815 } 17816 17817 return false; 17818 } 17819 17820 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } 17821 17822 bool 17823 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 17824 if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) 17825 return false; 17826 17827 VT = VT.getScalarType(); 17828 17829 if (!VT.isSimple()) 17830 return false; 17831 17832 switch (VT.getSimpleVT().SimpleTy) { 17833 case MVT::f32: 17834 case MVT::f64: 17835 return true; 17836 default: 17837 break; 17838 } 17839 17840 return false; 17841 } 17842 17843 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 17844 // i16 instructions are longer (0x66 prefix) and potentially slower. 17845 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 17846 } 17847 17848 /// isShuffleMaskLegal - Targets can use this to indicate that they only 17849 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 17850 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 17851 /// are assumed to be legal. 17852 bool 17853 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 17854 EVT VT) const { 17855 if (!VT.isSimple()) 17856 return false; 17857 17858 // Very little shuffling can be done for 64-bit vectors right now. 17859 if (VT.getSizeInBits() == 64) 17860 return false; 17861 17862 // We only care that the types being shuffled are legal. The lowering can 17863 // handle any possible shuffle mask that results. 17864 return isTypeLegal(VT.getSimpleVT()); 17865 } 17866 17867 bool 17868 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 17869 EVT VT) const { 17870 // Just delegate to the generic legality, clear masks aren't special. 17871 return isShuffleMaskLegal(Mask, VT); 17872 } 17873 17874 //===----------------------------------------------------------------------===// 17875 // X86 Scheduler Hooks 17876 //===----------------------------------------------------------------------===// 17877 17878 /// Utility function to emit xbegin specifying the start of an RTM region. 
17879 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, 17880 const TargetInstrInfo *TII) { 17881 DebugLoc DL = MI->getDebugLoc(); 17882 17883 const BasicBlock *BB = MBB->getBasicBlock(); 17884 MachineFunction::iterator I = MBB; 17885 ++I; 17886 17887 // For the v = xbegin(), we generate 17888 // 17889 // thisMBB: 17890 // xbegin sinkMBB 17891 // 17892 // mainMBB: 17893 // eax = -1 17894 // 17895 // sinkMBB: 17896 // v = eax 17897 17898 MachineBasicBlock *thisMBB = MBB; 17899 MachineFunction *MF = MBB->getParent(); 17900 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 17901 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 17902 MF->insert(I, mainMBB); 17903 MF->insert(I, sinkMBB); 17904 17905 // Transfer the remainder of BB and its successor edges to sinkMBB. 17906 sinkMBB->splice(sinkMBB->begin(), MBB, 17907 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 17908 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 17909 17910 // thisMBB: 17911 // xbegin sinkMBB 17912 // # fallthrough to mainMBB 17913 // # abortion to sinkMBB 17914 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); 17915 thisMBB->addSuccessor(mainMBB); 17916 thisMBB->addSuccessor(sinkMBB); 17917 17918 // mainMBB: 17919 // EAX = -1 17920 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); 17921 mainMBB->addSuccessor(sinkMBB); 17922 17923 // sinkMBB: 17924 // EAX is live into the sinkMBB 17925 sinkMBB->addLiveIn(X86::EAX); 17926 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 17927 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 17928 .addReg(X86::EAX); 17929 17930 MI->eraseFromParent(); 17931 return sinkMBB; 17932 } 17933 17934 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 17935 // or XMM0_V32I8 in AVX all of this code can be replaced with that 17936 // in the .td file. 
17937 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, 17938 const TargetInstrInfo *TII) { 17939 unsigned Opc; 17940 switch (MI->getOpcode()) { 17941 default: llvm_unreachable("illegal opcode!"); 17942 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 17943 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 17944 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 17945 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 17946 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 17947 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 17948 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 17949 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 17950 } 17951 17952 DebugLoc dl = MI->getDebugLoc(); 17953 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 17954 17955 unsigned NumArgs = MI->getNumOperands(); 17956 for (unsigned i = 1; i < NumArgs; ++i) { 17957 MachineOperand &Op = MI->getOperand(i); 17958 if (!(Op.isReg() && Op.isImplicit())) 17959 MIB.addOperand(Op); 17960 } 17961 if (MI->hasOneMemOperand()) 17962 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 17963 17964 BuildMI(*BB, MI, dl, 17965 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 17966 .addReg(X86::XMM0); 17967 17968 MI->eraseFromParent(); 17969 return BB; 17970 } 17971 17972 // FIXME: Custom handling because TableGen doesn't support multiple implicit 17973 // defs in an instruction pattern 17974 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, 17975 const TargetInstrInfo *TII) { 17976 unsigned Opc; 17977 switch (MI->getOpcode()) { 17978 default: llvm_unreachable("illegal opcode!"); 17979 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; 17980 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; 17981 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; 17982 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; 17983 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; 17984 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; 17985 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; 17986 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; 17987 } 17988 17989 DebugLoc dl = MI->getDebugLoc(); 17990 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 17991 17992 unsigned NumArgs = MI->getNumOperands(); // remove the results 17993 for (unsigned i = 1; i < NumArgs; ++i) { 17994 MachineOperand &Op = MI->getOperand(i); 17995 if (!(Op.isReg() && Op.isImplicit())) 17996 MIB.addOperand(Op); 17997 } 17998 if (MI->hasOneMemOperand()) 17999 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 18000 18001 BuildMI(*BB, MI, dl, 18002 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 18003 .addReg(X86::ECX); 18004 18005 MI->eraseFromParent(); 18006 return BB; 18007 } 18008 18009 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, 18010 const X86Subtarget *Subtarget) { 18011 DebugLoc dl = MI->getDebugLoc(); 18012 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18013 // Address into RAX/EAX, other two args into ECX, EDX. 18014 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 18015 unsigned MemReg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; 18016 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 18017 for (int i = 0; i < X86::AddrNumOperands; ++i) 18018 MIB.addOperand(MI->getOperand(i)); 18019 18020 unsigned ValOps = X86::AddrNumOperands; 18021 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 18022 .addReg(MI->getOperand(ValOps).getReg()); 18023 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 18024 .addReg(MI->getOperand(ValOps+1).getReg()); 18025 18026 // The instruction doesn't actually take any operands though. 18027 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 18028 18029 MI->eraseFromParent(); // The pseudo is gone now. 18030 return BB; 18031 } 18032 18033 MachineBasicBlock * 18034 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, 18035 MachineBasicBlock *MBB) const { 18036 // Emit va_arg instruction on X86-64. 18037 18038 // Operands to this pseudo-instruction: 18039 // 0 ) Output : destination address (reg) 18040 // 1-5) Input : va_list address (addr, i64mem) 18041 // 6 ) ArgSize : Size (in bytes) of vararg type 18042 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 18043 // 8 ) Align : Alignment of type 18044 // 9 ) EFLAGS (implicit-def) 18045 18046 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 18047 static_assert(X86::AddrNumOperands == 5, 18048 "VAARG_64 assumes 5 address operands"); 18049 18050 unsigned DestReg = MI->getOperand(0).getReg(); 18051 MachineOperand &Base = MI->getOperand(1); 18052 MachineOperand &Scale = MI->getOperand(2); 18053 MachineOperand &Index = MI->getOperand(3); 18054 MachineOperand &Disp = MI->getOperand(4); 18055 MachineOperand &Segment = MI->getOperand(5); 18056 unsigned ArgSize = MI->getOperand(6).getImm(); 18057 unsigned ArgMode = MI->getOperand(7).getImm(); 18058 unsigned Align = MI->getOperand(8).getImm(); 18059 18060 // Memory Reference 18061 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 18062 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 18063 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 18064 18065 // Machine Information 18066 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18067 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 18068 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 18069 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 18070 DebugLoc DL = MI->getDebugLoc(); 18071 18072 // struct va_list { 18073 // i32 gp_offset 18074 // i32 fp_offset 18075 // i64 overflow_area (address) 18076 // i64 reg_save_area (address) 18077 // } 18078 // sizeof(va_list) = 24 18079 // alignment(va_list) = 8 18080 18081 unsigned TotalNumIntRegs = 6; 18082 unsigned TotalNumXMMRegs = 8; 18083 bool UseGPOffset = (ArgMode == 1); 18084 bool UseFPOffset = (ArgMode == 2); 18085 unsigned MaxOffset = TotalNumIntRegs * 8 + 18086 (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); 18087 18088 /* Align ArgSize to a multiple of 8 */ 18089 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 18090 bool NeedsAlign = (Align > 8); 18091 18092 MachineBasicBlock *thisMBB = MBB; 18093 MachineBasicBlock *overflowMBB; 18094 MachineBasicBlock *offsetMBB; 18095 MachineBasicBlock *endMBB; 18096 18097 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 18098 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 18099 unsigned OffsetReg = 0; 18100 18101 if (!UseGPOffset && !UseFPOffset) { 18102 // If we only pull from the overflow region, we don't create a branch. 18103 // We don't need to alter control flow. 18104 OffsetDestReg = 0; // unused 18105 OverflowDestReg = DestReg; 18106 18107 offsetMBB = nullptr; 18108 overflowMBB = thisMBB; 18109 endMBB = thisMBB; 18110 } else { 18111 // First emit code to check if gp_offset (or fp_offset) is below the bound. 18112 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 18113 // If not, pull from overflow_area. (branch to overflowMBB) 18114 // 18115 // thisMBB 18116 // | . 18117 // | . 18118 // offsetMBB overflowMBB 18119 // | . 18120 // | . 18121 // endMBB 18122 18123 // Registers for the PHI in endMBB 18124 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 18125 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 18126 18127 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 18128 MachineFunction *MF = MBB->getParent(); 18129 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18130 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18131 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18132 18133 MachineFunction::iterator MBBIter = MBB; 18134 ++MBBIter; 18135 18136 // Insert the new basic blocks 18137 MF->insert(MBBIter, offsetMBB); 18138 MF->insert(MBBIter, overflowMBB); 18139 MF->insert(MBBIter, endMBB); 18140 18141 // Transfer the remainder of MBB and its successor edges to endMBB. 18142 endMBB->splice(endMBB->begin(), thisMBB, 18143 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); 18144 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 18145 18146 // Make offsetMBB and overflowMBB successors of thisMBB 18147 thisMBB->addSuccessor(offsetMBB); 18148 thisMBB->addSuccessor(overflowMBB); 18149 18150 // endMBB is a successor of both offsetMBB and overflowMBB 18151 offsetMBB->addSuccessor(endMBB); 18152 overflowMBB->addSuccessor(endMBB); 18153 18154 // Load the offset value into a register 18155 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 18156 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 18157 .addOperand(Base) 18158 .addOperand(Scale) 18159 .addOperand(Index) 18160 .addDisp(Disp, UseFPOffset ? 4 : 0) 18161 .addOperand(Segment) 18162 .setMemRefs(MMOBegin, MMOEnd); 18163 18164 // Check if there is enough room left to pull this argument. 18165 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 18166 .addReg(OffsetReg) 18167 .addImm(MaxOffset + 8 - ArgSizeA8); 18168 18169 // Branch to "overflowMBB" if offset >= max 18170 // Fall through to "offsetMBB" otherwise 18171 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 18172 .addMBB(overflowMBB); 18173 } 18174 18175 // In offsetMBB, emit code to use the reg_save_area. 18176 if (offsetMBB) { 18177 assert(OffsetReg != 0); 18178 18179 // Read the reg_save_area address. 
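// reg_save_area lives at offset 16 within the va_list (after the two i32
// offset fields and the overflow_area pointer; see the struct layout above).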
18180 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 18181 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 18182 .addOperand(Base) 18183 .addOperand(Scale) 18184 .addOperand(Index) 18185 .addDisp(Disp, 16) 18186 .addOperand(Segment) 18187 .setMemRefs(MMOBegin, MMOEnd); 18188 18189 // Zero-extend the offset 18190 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 18191 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 18192 .addImm(0) 18193 .addReg(OffsetReg) 18194 .addImm(X86::sub_32bit); 18195 18196 // Add the offset to the reg_save_area to get the final address. 18197 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 18198 .addReg(OffsetReg64) 18199 .addReg(RegSaveReg); 18200 18201 // Compute the offset for the next argument 18202 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 18203 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 18204 .addReg(OffsetReg) 18205 .addImm(UseFPOffset ? 16 : 8); 18206 18207 // Store it back into the va_list. 18208 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 18209 .addOperand(Base) 18210 .addOperand(Scale) 18211 .addOperand(Index) 18212 .addDisp(Disp, UseFPOffset ? 4 : 0) 18213 .addOperand(Segment) 18214 .addReg(NextOffsetReg) 18215 .setMemRefs(MMOBegin, MMOEnd); 18216 18217 // Jump to endMBB 18218 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) 18219 .addMBB(endMBB); 18220 } 18221 18222 // 18223 // Emit code to use overflow area 18224 // 18225 18226 // Load the overflow_area address into a register. 18227 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 18228 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 18229 .addOperand(Base) 18230 .addOperand(Scale) 18231 .addOperand(Index) 18232 .addDisp(Disp, 8) 18233 .addOperand(Segment) 18234 .setMemRefs(MMOBegin, MMOEnd); 18235 18236 // If we need to align it, do so. Otherwise, just copy the address 18237 // to OverflowDestReg. 18238 if (NeedsAlign) { 18239 // Align the overflow address 18240 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 18241 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 18242 18243 // aligned_addr = (addr + (align-1)) & ~(align-1) 18244 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 18245 .addReg(OverflowAddrReg) 18246 .addImm(Align-1); 18247 18248 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 18249 .addReg(TmpReg) 18250 .addImm(~(uint64_t)(Align-1)); 18251 } else { 18252 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 18253 .addReg(OverflowAddrReg); 18254 } 18255 18256 // Compute the next overflow address after this argument. 18257 // (the overflow address should be kept 8-byte aligned) 18258 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 18259 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 18260 .addReg(OverflowDestReg) 18261 .addImm(ArgSizeA8); 18262 18263 // Store the new overflow address. 18264 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 18265 .addOperand(Base) 18266 .addOperand(Scale) 18267 .addOperand(Index) 18268 .addDisp(Disp, 8) 18269 .addOperand(Segment) 18270 .addReg(NextAddrReg) 18271 .setMemRefs(MMOBegin, MMOEnd); 18272 18273 // If we branched, emit the PHI to the front of endMBB. 
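// The PHI merges the two possible argument addresses:
//   DestReg = PHI [OffsetDestReg, offsetMBB], [OverflowDestReg, overflowMBB]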
18274 if (offsetMBB) { 18275 BuildMI(*endMBB, endMBB->begin(), DL, 18276 TII->get(X86::PHI), DestReg) 18277 .addReg(OffsetDestReg).addMBB(offsetMBB) 18278 .addReg(OverflowDestReg).addMBB(overflowMBB); 18279 } 18280 18281 // Erase the pseudo instruction 18282 MI->eraseFromParent(); 18283 18284 return endMBB; 18285 } 18286 18287 MachineBasicBlock * 18288 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 18289 MachineInstr *MI, 18290 MachineBasicBlock *MBB) const { 18291 // Emit code to save XMM registers to the stack. The ABI says that the 18292 // number of registers to save is given in %al, so it's theoretically 18293 // possible to do an indirect jump trick to avoid saving all of them, 18294 // however this code takes a simpler approach and just executes all 18295 // of the stores if %al is non-zero. It's less code, and it's probably 18296 // easier on the hardware branch predictor, and stores aren't all that 18297 // expensive anyway. 18298 18299 // Create the new basic blocks. One block contains all the XMM stores, 18300 // and one block is the final destination regardless of whether any 18301 // stores were performed. 18302 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 18303 MachineFunction *F = MBB->getParent(); 18304 MachineFunction::iterator MBBIter = MBB; 18305 ++MBBIter; 18306 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 18307 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 18308 F->insert(MBBIter, XMMSaveMBB); 18309 F->insert(MBBIter, EndMBB); 18310 18311 // Transfer the remainder of MBB and its successor edges to EndMBB. 18312 EndMBB->splice(EndMBB->begin(), MBB, 18313 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 18314 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 18315 18316 // The original block will now fall through to the XMM save block. 18317 MBB->addSuccessor(XMMSaveMBB); 18318 // The XMMSaveMBB will fall through to the end block. 18319 XMMSaveMBB->addSuccessor(EndMBB); 18320 18321 // Now add the instructions. 18322 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18323 DebugLoc DL = MI->getDebugLoc(); 18324 18325 unsigned CountReg = MI->getOperand(0).getReg(); 18326 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 18327 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 18328 18329 if (!Subtarget->isTargetWin64()) { 18330 // If %al is 0, branch around the XMM save block. 18331 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 18332 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); 18333 MBB->addSuccessor(EndMBB); 18334 } 18335 18336 // Make sure the last operand is EFLAGS, which gets clobbered by the branch 18337 // that was just emitted, but clearly shouldn't be "saved". 18338 assert((MI->getNumOperands() <= 3 || 18339 !MI->getOperand(MI->getNumOperands() - 1).isReg() || 18340 MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) 18341 && "Expected last argument to be EFLAGS"); 18342 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; 18343 // In the XMM save block, save all the XMM argument registers. 
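// Operands 3 .. getNumOperands()-2 of the pseudo are the XMM argument
// registers that may be live; each is stored 16 bytes apart starting at
// VarArgsFPOffset within the register save frame
// (Offset = (i - 3) * 16 + VarArgsFPOffset below).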
18344 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { 18345 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 18346 MachineMemOperand *MMO = 18347 F->getMachineMemOperand( 18348 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 18349 MachineMemOperand::MOStore, 18350 /*Size=*/16, /*Align=*/16); 18351 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 18352 .addFrameIndex(RegSaveFrameIndex) 18353 .addImm(/*Scale=*/1) 18354 .addReg(/*IndexReg=*/0) 18355 .addImm(/*Disp=*/Offset) 18356 .addReg(/*Segment=*/0) 18357 .addReg(MI->getOperand(i).getReg()) 18358 .addMemOperand(MMO); 18359 } 18360 18361 MI->eraseFromParent(); // The pseudo instruction is gone now. 18362 18363 return EndMBB; 18364 } 18365 18366 // The EFLAGS operand of SelectItr might be missing a kill marker 18367 // because there were multiple uses of EFLAGS, and ISel didn't know 18368 // which to mark. Figure out whether SelectItr should have had a 18369 // kill marker, and set it if it should. Returns the correct kill 18370 // marker value. 18371 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 18372 MachineBasicBlock* BB, 18373 const TargetRegisterInfo* TRI) { 18374 // Scan forward through BB for a use/def of EFLAGS. 18375 MachineBasicBlock::iterator miI(std::next(SelectItr)); 18376 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 18377 const MachineInstr& mi = *miI; 18378 if (mi.readsRegister(X86::EFLAGS)) 18379 return false; 18380 if (mi.definesRegister(X86::EFLAGS)) 18381 break; // Should have kill-flag - update below. 18382 } 18383 18384 // If we hit the end of the block, check whether EFLAGS is live into a 18385 // successor. 18386 if (miI == BB->end()) { 18387 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 18388 sEnd = BB->succ_end(); 18389 sItr != sEnd; ++sItr) { 18390 MachineBasicBlock* succ = *sItr; 18391 if (succ->isLiveIn(X86::EFLAGS)) 18392 return false; 18393 } 18394 } 18395 18396 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 18397 // out. SelectMI should have a kill flag on EFLAGS. 18398 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 18399 return true; 18400 } 18401 18402 MachineBasicBlock * 18403 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 18404 MachineBasicBlock *BB) const { 18405 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18406 DebugLoc DL = MI->getDebugLoc(); 18407 18408 // To "insert" a SELECT_CC instruction, we actually have to insert the 18409 // diamond control-flow pattern. The incoming instruction knows the 18410 // destination vreg to set, the condition code register to branch on, the 18411 // true/false values to select between, and a branch opcode to use. 18412 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 18413 MachineFunction::iterator It = BB; 18414 ++It; 18415 18416 // thisMBB: 18417 // ... 18418 // TrueVal = ... 18419 // cmpTY ccX, r1, r2 18420 // bCC copy1MBB 18421 // fallthrough --> copy0MBB 18422 MachineBasicBlock *thisMBB = BB; 18423 MachineFunction *F = BB->getParent(); 18424 18425 // We also lower double CMOVs: 18426 // (CMOV (CMOV F, T, cc1), T, cc2) 18427 // to two successives branches. For that, we look for another CMOV as the 18428 // following instruction. 18429 // 18430 // Without this, we would add a PHI between the two jumps, which ends up 18431 // creating a few copies all around. 
For instance, for 18432 // 18433 // (sitofp (zext (fcmp une))) 18434 // 18435 // we would generate: 18436 // 18437 // ucomiss %xmm1, %xmm0 18438 // movss <1.0f>, %xmm0 18439 // movaps %xmm0, %xmm1 18440 // jne .LBB5_2 18441 // xorps %xmm1, %xmm1 18442 // .LBB5_2: 18443 // jp .LBB5_4 18444 // movaps %xmm1, %xmm0 18445 // .LBB5_4: 18446 // retq 18447 // 18448 // because this custom-inserter would have generated: 18449 // 18450 // A 18451 // | \ 18452 // | B 18453 // | / 18454 // C 18455 // | \ 18456 // | D 18457 // | / 18458 // E 18459 // 18460 // A: X = ...; Y = ... 18461 // B: empty 18462 // C: Z = PHI [X, A], [Y, B] 18463 // D: empty 18464 // E: PHI [X, C], [Z, D] 18465 // 18466 // If we lower both CMOVs in a single step, we can instead generate: 18467 // 18468 // A 18469 // | \ 18470 // | C 18471 // | /| 18472 // |/ | 18473 // | | 18474 // | D 18475 // | / 18476 // E 18477 // 18478 // A: X = ...; Y = ... 18479 // D: empty 18480 // E: PHI [X, A], [X, C], [Y, D] 18481 // 18482 // Which, in our sitofp/fcmp example, gives us something like: 18483 // 18484 // ucomiss %xmm1, %xmm0 18485 // movss <1.0f>, %xmm0 18486 // jne .LBB5_4 18487 // jp .LBB5_4 18488 // xorps %xmm0, %xmm0 18489 // .LBB5_4: 18490 // retq 18491 // 18492 MachineInstr *NextCMOV = nullptr; 18493 MachineBasicBlock::iterator NextMIIt = 18494 std::next(MachineBasicBlock::iterator(MI)); 18495 if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && 18496 NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && 18497 NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) 18498 NextCMOV = &*NextMIIt; 18499 18500 MachineBasicBlock *jcc1MBB = nullptr; 18501 18502 // If we have a double CMOV, we lower it to two successive branches to 18503 // the same block. EFLAGS is used by both, so mark it as live in the second. 18504 if (NextCMOV) { 18505 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); 18506 F->insert(It, jcc1MBB); 18507 jcc1MBB->addLiveIn(X86::EFLAGS); 18508 } 18509 18510 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 18511 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 18512 F->insert(It, copy0MBB); 18513 F->insert(It, sinkMBB); 18514 18515 // If the EFLAGS register isn't dead in the terminator, then claim that it's 18516 // live into the sink and copy blocks. 18517 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 18518 18519 MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; 18520 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && 18521 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { 18522 copy0MBB->addLiveIn(X86::EFLAGS); 18523 sinkMBB->addLiveIn(X86::EFLAGS); 18524 } 18525 18526 // Transfer the remainder of BB and its successor edges to sinkMBB. 18527 sinkMBB->splice(sinkMBB->begin(), BB, 18528 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 18529 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 18530 18531 // Add the true and fallthrough blocks as its successors. 18532 if (NextCMOV) { 18533 // The fallthrough block may be jcc1MBB, if we have a double CMOV. 18534 BB->addSuccessor(jcc1MBB); 18535 18536 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and 18537 // jump to the sinkMBB. 18538 jcc1MBB->addSuccessor(copy0MBB); 18539 jcc1MBB->addSuccessor(sinkMBB); 18540 } else { 18541 BB->addSuccessor(copy0MBB); 18542 } 18543 18544 // The true block target of the first (or only) branch is always sinkMBB. 18545 BB->addSuccessor(sinkMBB); 18546 18547 // Create the conditional branch instruction. 
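// Operand 3 of the CMOV pseudo holds the X86 condition code; map it to the
// corresponding Jcc opcode for the branch to sinkMBB.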
18548 unsigned Opc = 18549 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 18550 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 18551 18552 if (NextCMOV) { 18553 unsigned Opc2 = X86::GetCondBranchFromCond( 18554 (X86::CondCode)NextCMOV->getOperand(3).getImm()); 18555 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); 18556 } 18557 18558 // copy0MBB: 18559 // %FalseValue = ... 18560 // # fallthrough to sinkMBB 18561 copy0MBB->addSuccessor(sinkMBB); 18562 18563 // sinkMBB: 18564 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 18565 // ... 18566 MachineInstrBuilder MIB = 18567 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), 18568 MI->getOperand(0).getReg()) 18569 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 18570 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 18571 18572 // If we have a double CMOV, the second Jcc provides the same incoming 18573 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). 18574 if (NextCMOV) { 18575 MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); 18576 // Copy the PHI result to the register defined by the second CMOV. 18577 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), 18578 DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) 18579 .addReg(MI->getOperand(0).getReg()); 18580 NextCMOV->eraseFromParent(); 18581 } 18582 18583 MI->eraseFromParent(); // The pseudo instruction is gone now. 18584 return sinkMBB; 18585 } 18586 18587 MachineBasicBlock * 18588 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, 18589 MachineBasicBlock *BB) const { 18590 MachineFunction *MF = BB->getParent(); 18591 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18592 DebugLoc DL = MI->getDebugLoc(); 18593 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 18594 18595 assert(MF->shouldSplitStack()); 18596 18597 const bool Is64Bit = Subtarget->is64Bit(); 18598 const bool IsLP64 = Subtarget->isTarget64BitLP64(); 18599 18600 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 18601 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; 18602 18603 // BB: 18604 // ... [Till the alloca] 18605 // If stacklet is not large enough, jump to mallocMBB 18606 // 18607 // bumpMBB: 18608 // Allocate by subtracting from RSP 18609 // Jump to continueMBB 18610 // 18611 // mallocMBB: 18612 // Allocate by call to runtime 18613 // 18614 // continueMBB: 18615 // ... 18616 // [rest of original BB] 18617 // 18618 18619 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18620 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18621 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18622 18623 MachineRegisterInfo &MRI = MF->getRegInfo(); 18624 const TargetRegisterClass *AddrRegClass = 18625 getRegClassFor(getPointerTy()); 18626 18627 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 18628 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 18629 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 18630 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 18631 sizeVReg = MI->getOperand(1).getReg(), 18632 physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? 
X86::RSP : X86::ESP; 18633 18634 MachineFunction::iterator MBBIter = BB; 18635 ++MBBIter; 18636 18637 MF->insert(MBBIter, bumpMBB); 18638 MF->insert(MBBIter, mallocMBB); 18639 MF->insert(MBBIter, continueMBB); 18640 18641 continueMBB->splice(continueMBB->begin(), BB, 18642 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 18643 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 18644 18645 // Add code to the main basic block to check if the stack limit has been hit, 18646 // and if so, jump to mallocMBB otherwise to bumpMBB. 18647 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 18648 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 18649 .addReg(tmpSPVReg).addReg(sizeVReg); 18650 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) 18651 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 18652 .addReg(SPLimitVReg); 18653 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); 18654 18655 // bumpMBB simply decreases the stack pointer, since we know the current 18656 // stacklet has enough space. 18657 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 18658 .addReg(SPLimitVReg); 18659 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 18660 .addReg(SPLimitVReg); 18661 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); 18662 18663 // Calls into a routine in libgcc to allocate more space from the heap. 18664 const uint32_t *RegMask = 18665 Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); 18666 if (IsLP64) { 18667 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 18668 .addReg(sizeVReg); 18669 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 18670 .addExternalSymbol("__morestack_allocate_stack_space") 18671 .addRegMask(RegMask) 18672 .addReg(X86::RDI, RegState::Implicit) 18673 .addReg(X86::RAX, RegState::ImplicitDefine); 18674 } else if (Is64Bit) { 18675 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) 18676 .addReg(sizeVReg); 18677 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 18678 .addExternalSymbol("__morestack_allocate_stack_space") 18679 .addRegMask(RegMask) 18680 .addReg(X86::EDI, RegState::Implicit) 18681 .addReg(X86::EAX, RegState::ImplicitDefine); 18682 } else { 18683 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 18684 .addImm(12); 18685 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 18686 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 18687 .addExternalSymbol("__morestack_allocate_stack_space") 18688 .addRegMask(RegMask) 18689 .addReg(X86::EAX, RegState::ImplicitDefine); 18690 } 18691 18692 if (!Is64Bit) 18693 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 18694 .addImm(16); 18695 18696 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 18697 .addReg(IsLP64 ? X86::RAX : X86::EAX); 18698 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); 18699 18700 // Set up the CFG correctly. 18701 BB->addSuccessor(bumpMBB); 18702 BB->addSuccessor(mallocMBB); 18703 mallocMBB->addSuccessor(continueMBB); 18704 bumpMBB->addSuccessor(continueMBB); 18705 18706 // Take care of the PHI nodes. 18707 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 18708 MI->getOperand(0).getReg()) 18709 .addReg(mallocPtrVReg).addMBB(mallocMBB) 18710 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 18711 18712 // Delete the original pseudo instruction. 18713 MI->eraseFromParent(); 18714 18715 // And we're done. 
18716 return continueMBB; 18717 } 18718 18719 MachineBasicBlock * 18720 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 18721 MachineBasicBlock *BB) const { 18722 DebugLoc DL = MI->getDebugLoc(); 18723 18724 assert(!Subtarget->isTargetMachO()); 18725 18726 X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL); 18727 18728 MI->eraseFromParent(); // The pseudo instruction is gone now. 18729 return BB; 18730 } 18731 18732 MachineBasicBlock * 18733 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 18734 MachineBasicBlock *BB) const { 18735 // This is pretty easy. We're taking the value that we received from 18736 // our load from the relocation, sticking it in either RDI (x86-64) 18737 // or EAX and doing an indirect call. The return value will then 18738 // be in the normal return register. 18739 MachineFunction *F = BB->getParent(); 18740 const X86InstrInfo *TII = Subtarget->getInstrInfo(); 18741 DebugLoc DL = MI->getDebugLoc(); 18742 18743 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 18744 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 18745 18746 // Get a register mask for the lowered call. 18747 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 18748 // proper register mask. 18749 const uint32_t *RegMask = 18750 Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); 18751 if (Subtarget->is64Bit()) { 18752 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 18753 TII->get(X86::MOV64rm), X86::RDI) 18754 .addReg(X86::RIP) 18755 .addImm(0).addReg(0) 18756 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 18757 MI->getOperand(3).getTargetFlags()) 18758 .addReg(0); 18759 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 18760 addDirectMem(MIB, X86::RDI); 18761 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 18762 } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { 18763 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 18764 TII->get(X86::MOV32rm), X86::EAX) 18765 .addReg(0) 18766 .addImm(0).addReg(0) 18767 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 18768 MI->getOperand(3).getTargetFlags()) 18769 .addReg(0); 18770 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 18771 addDirectMem(MIB, X86::EAX); 18772 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 18773 } else { 18774 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 18775 TII->get(X86::MOV32rm), X86::EAX) 18776 .addReg(TII->getGlobalBaseReg(F)) 18777 .addImm(0).addReg(0) 18778 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 18779 MI->getOperand(3).getTargetFlags()) 18780 .addReg(0); 18781 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 18782 addDirectMem(MIB, X86::EAX); 18783 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 18784 } 18785 18786 MI->eraseFromParent(); // The pseudo instruction is gone now. 
18787 return BB; 18788 } 18789 18790 MachineBasicBlock * 18791 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, 18792 MachineBasicBlock *MBB) const { 18793 DebugLoc DL = MI->getDebugLoc(); 18794 MachineFunction *MF = MBB->getParent(); 18795 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18796 MachineRegisterInfo &MRI = MF->getRegInfo(); 18797 18798 const BasicBlock *BB = MBB->getBasicBlock(); 18799 MachineFunction::iterator I = MBB; 18800 ++I; 18801 18802 // Memory Reference 18803 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 18804 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 18805 18806 unsigned DstReg; 18807 unsigned MemOpndSlot = 0; 18808 18809 unsigned CurOp = 0; 18810 18811 DstReg = MI->getOperand(CurOp++).getReg(); 18812 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 18813 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 18814 unsigned mainDstReg = MRI.createVirtualRegister(RC); 18815 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 18816 18817 MemOpndSlot = CurOp; 18818 18819 MVT PVT = getPointerTy(); 18820 assert((PVT == MVT::i64 || PVT == MVT::i32) && 18821 "Invalid Pointer Size!"); 18822 18823 // For v = setjmp(buf), we generate 18824 // 18825 // thisMBB: 18826 // buf[LabelOffset] = restoreMBB 18827 // SjLjSetup restoreMBB 18828 // 18829 // mainMBB: 18830 // v_main = 0 18831 // 18832 // sinkMBB: 18833 // v = phi(main, restore) 18834 // 18835 // restoreMBB: 18836 // if base pointer being used, load it from frame 18837 // v_restore = 1 18838 18839 MachineBasicBlock *thisMBB = MBB; 18840 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 18841 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 18842 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 18843 MF->insert(I, mainMBB); 18844 MF->insert(I, sinkMBB); 18845 MF->push_back(restoreMBB); 18846 18847 MachineInstrBuilder MIB; 18848 18849 // Transfer the remainder of BB and its successor edges to sinkMBB. 18850 sinkMBB->splice(sinkMBB->begin(), MBB, 18851 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 18852 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 18853 18854 // thisMBB: 18855 unsigned PtrStoreOpc = 0; 18856 unsigned LabelReg = 0; 18857 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 18858 Reloc::Model RM = MF->getTarget().getRelocationModel(); 18859 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && 18860 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); 18861 18862 // Prepare IP either in reg or imm. 18863 if (!UseImmLabel) { 18864 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 18865 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 18866 LabelReg = MRI.createVirtualRegister(PtrRC); 18867 if (Subtarget->is64Bit()) { 18868 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 18869 .addReg(X86::RIP) 18870 .addImm(0) 18871 .addReg(0) 18872 .addMBB(restoreMBB) 18873 .addReg(0); 18874 } else { 18875 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 18876 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 18877 .addReg(XII->getGlobalBaseReg(MF)) 18878 .addImm(0) 18879 .addReg(0) 18880 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) 18881 .addReg(0); 18882 } 18883 } else 18884 PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mi32 : X86::MOV32mi; 18885 // Store IP 18886 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 18887 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 18888 if (i == X86::AddrDisp) 18889 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); 18890 else 18891 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 18892 } 18893 if (!UseImmLabel) 18894 MIB.addReg(LabelReg); 18895 else 18896 MIB.addMBB(restoreMBB); 18897 MIB.setMemRefs(MMOBegin, MMOEnd); 18898 // Setup 18899 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 18900 .addMBB(restoreMBB); 18901 18902 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 18903 MIB.addRegMask(RegInfo->getNoPreservedMask()); 18904 thisMBB->addSuccessor(mainMBB); 18905 thisMBB->addSuccessor(restoreMBB); 18906 18907 // mainMBB: 18908 // EAX = 0 18909 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 18910 mainMBB->addSuccessor(sinkMBB); 18911 18912 // sinkMBB: 18913 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 18914 TII->get(X86::PHI), DstReg) 18915 .addReg(mainDstReg).addMBB(mainMBB) 18916 .addReg(restoreDstReg).addMBB(restoreMBB); 18917 18918 // restoreMBB: 18919 if (RegInfo->hasBasePointer(*MF)) { 18920 const bool Uses64BitFramePtr = 18921 Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); 18922 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); 18923 X86FI->setRestoreBasePointer(MF); 18924 unsigned FramePtr = RegInfo->getFrameRegister(*MF); 18925 unsigned BasePtr = RegInfo->getBaseRegister(); 18926 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; 18927 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), 18928 FramePtr, true, X86FI->getRestoreBasePointerOffset()) 18929 .setMIFlag(MachineInstr::FrameSetup); 18930 } 18931 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 18932 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); 18933 restoreMBB->addSuccessor(sinkMBB); 18934 18935 MI->eraseFromParent(); 18936 return sinkMBB; 18937 } 18938 18939 MachineBasicBlock * 18940 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, 18941 MachineBasicBlock *MBB) const { 18942 DebugLoc DL = MI->getDebugLoc(); 18943 MachineFunction *MF = MBB->getParent(); 18944 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18945 MachineRegisterInfo &MRI = MF->getRegInfo(); 18946 18947 // Memory Reference 18948 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 18949 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 18950 18951 MVT PVT = getPointerTy(); 18952 assert((PVT == MVT::i64 || PVT == MVT::i32) && 18953 "Invalid Pointer Size!"); 18954 18955 const TargetRegisterClass *RC = 18956 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; 18957 unsigned Tmp = MRI.createVirtualRegister(RC); 18958 // Since FP is only updated here but NOT referenced, it's treated as GPR. 18959 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 18960 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 18961 unsigned SP = RegInfo->getStackRegister(); 18962 18963 MachineInstrBuilder MIB; 18964 18965 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 18966 const int64_t SPOffset = 2 * PVT.getStoreSize(); 18967 18968 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 18969 unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; 18970 18971 // Reload FP 18972 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 18973 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 18974 MIB.addOperand(MI->getOperand(i)); 18975 MIB.setMemRefs(MMOBegin, MMOEnd); 18976 // Reload IP 18977 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 18978 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 18979 if (i == X86::AddrDisp) 18980 MIB.addDisp(MI->getOperand(i), LabelOffset); 18981 else 18982 MIB.addOperand(MI->getOperand(i)); 18983 } 18984 MIB.setMemRefs(MMOBegin, MMOEnd); 18985 // Reload SP 18986 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 18987 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 18988 if (i == X86::AddrDisp) 18989 MIB.addDisp(MI->getOperand(i), SPOffset); 18990 else 18991 MIB.addOperand(MI->getOperand(i)); 18992 } 18993 MIB.setMemRefs(MMOBegin, MMOEnd); 18994 // Jump 18995 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 18996 18997 MI->eraseFromParent(); 18998 return MBB; 18999 } 19000 19001 // Replace 213-type (isel default) FMA3 instructions with 231-type for 19002 // accumulator loops. Writing back to the accumulator allows the coalescer 19003 // to remove extra copies in the loop. 19004 MachineBasicBlock * 19005 X86TargetLowering::emitFMA3Instr(MachineInstr *MI, 19006 MachineBasicBlock *MBB) const { 19007 MachineOperand &AddendOp = MI->getOperand(3); 19008 19009 // Bail out early if the addend isn't a register - we can't switch these. 19010 if (!AddendOp.isReg()) 19011 return MBB; 19012 19013 MachineFunction &MF = *MBB->getParent(); 19014 MachineRegisterInfo &MRI = MF.getRegInfo(); 19015 19016 // Check whether the addend is defined by a PHI: 19017 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); 19018 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); 19019 if (!AddendDef.isPHI()) 19020 return MBB; 19021 19022 // Look for the following pattern: 19023 // loop: 19024 // %addend = phi [%entry, 0], [%loop, %result] 19025 // ... 19026 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend 19027 19028 // Replace with: 19029 // loop: 19030 // %addend = phi [%entry, 0], [%loop, %result] 19031 // ... 19032 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 19033 19034 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { 19035 assert(AddendDef.getOperand(i).isReg()); 19036 MachineOperand PHISrcOp = AddendDef.getOperand(i); 19037 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); 19038 if (&PHISrcInst == MI) { 19039 // Found a matching instruction. 
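// This FMA feeds the PHI that defines its own addend, so switch to the 231
// form, which writes its result back over the accumulator operand. The
// operand order is swapped accordingly when the replacement is built below.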
19040 unsigned NewFMAOpc = 0; 19041 switch (MI->getOpcode()) { 19042 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; 19043 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; 19044 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; 19045 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; 19046 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; 19047 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; 19048 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; 19049 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; 19050 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; 19051 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; 19052 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; 19053 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; 19054 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; 19055 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; 19056 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; 19057 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; 19058 case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; 19059 case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; 19060 case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; 19061 case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; 19062 19063 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; 19064 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; 19065 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; 19066 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; 19067 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; 19068 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; 19069 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; 19070 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; 19071 case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; 19072 case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; 19073 case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; 19074 case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; 19075 default: llvm_unreachable("Unrecognized FMA variant."); 19076 } 19077 19078 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 19079 MachineInstrBuilder MIB = 19080 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) 19081 .addOperand(MI->getOperand(0)) 19082 .addOperand(MI->getOperand(3)) 19083 .addOperand(MI->getOperand(2)) 19084 .addOperand(MI->getOperand(1)); 19085 MBB->insert(MachineBasicBlock::iterator(MI), MIB); 19086 MI->eraseFromParent(); 19087 } 19088 } 19089 19090 return MBB; 19091 } 19092 19093 MachineBasicBlock * 19094 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 19095 MachineBasicBlock *BB) const { 19096 switch (MI->getOpcode()) { 19097 default: llvm_unreachable("Unexpected instr type to insert"); 19098 case X86::TAILJMPd64: 19099 case X86::TAILJMPr64: 19100 case X86::TAILJMPm64: 19101 case X86::TAILJMPd64_REX: 19102 case X86::TAILJMPr64_REX: 19103 case X86::TAILJMPm64_REX: 19104 llvm_unreachable("TAILJMP64 would not be touched here."); 19105 case X86::TCRETURNdi64: 19106 case X86::TCRETURNri64: 19107 case X86::TCRETURNmi64: 19108 return BB; 19109 case X86::WIN_ALLOCA: 19110 return 
EmitLoweredWinAlloca(MI, BB); 19111 case X86::SEG_ALLOCA_32: 19112 case X86::SEG_ALLOCA_64: 19113 return EmitLoweredSegAlloca(MI, BB); 19114 case X86::TLSCall_32: 19115 case X86::TLSCall_64: 19116 return EmitLoweredTLSCall(MI, BB); 19117 case X86::CMOV_GR8: 19118 case X86::CMOV_FR32: 19119 case X86::CMOV_FR64: 19120 case X86::CMOV_V4F32: 19121 case X86::CMOV_V2F64: 19122 case X86::CMOV_V2I64: 19123 case X86::CMOV_V8F32: 19124 case X86::CMOV_V4F64: 19125 case X86::CMOV_V4I64: 19126 case X86::CMOV_V16F32: 19127 case X86::CMOV_V8F64: 19128 case X86::CMOV_V8I64: 19129 case X86::CMOV_GR16: 19130 case X86::CMOV_GR32: 19131 case X86::CMOV_RFP32: 19132 case X86::CMOV_RFP64: 19133 case X86::CMOV_RFP80: 19134 return EmitLoweredSelect(MI, BB); 19135 19136 case X86::FP32_TO_INT16_IN_MEM: 19137 case X86::FP32_TO_INT32_IN_MEM: 19138 case X86::FP32_TO_INT64_IN_MEM: 19139 case X86::FP64_TO_INT16_IN_MEM: 19140 case X86::FP64_TO_INT32_IN_MEM: 19141 case X86::FP64_TO_INT64_IN_MEM: 19142 case X86::FP80_TO_INT16_IN_MEM: 19143 case X86::FP80_TO_INT32_IN_MEM: 19144 case X86::FP80_TO_INT64_IN_MEM: { 19145 MachineFunction *F = BB->getParent(); 19146 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 19147 DebugLoc DL = MI->getDebugLoc(); 19148 19149 // Change the floating point control register to use "round towards zero" 19150 // mode when truncating to an integer value. 19151 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 19152 addFrameReference(BuildMI(*BB, MI, DL, 19153 TII->get(X86::FNSTCW16m)), CWFrameIdx); 19154 19155 // Load the old value of the high byte of the control word... 19156 unsigned OldCW = 19157 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 19158 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 19159 CWFrameIdx); 19160 19161 // Set the high part to be round to zero... 19162 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 19163 .addImm(0xC7F); 19164 19165 // Reload the modified control word now... 19166 addFrameReference(BuildMI(*BB, MI, DL, 19167 TII->get(X86::FLDCW16m)), CWFrameIdx); 19168 19169 // Restore the memory image of control word to original value 19170 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 19171 .addReg(OldCW); 19172 19173 // Get the X86 opcode to use. 
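// IST_Fp<N>m<W> stores the <W>-bit x87 operand to memory as an <N>-bit
// integer; with the control word set above, the conversion truncates
// toward zero.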
19174 unsigned Opc; 19175 switch (MI->getOpcode()) { 19176 default: llvm_unreachable("illegal opcode!"); 19177 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 19178 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 19179 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 19180 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 19181 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 19182 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 19183 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 19184 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 19185 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 19186 } 19187 19188 X86AddressMode AM; 19189 MachineOperand &Op = MI->getOperand(0); 19190 if (Op.isReg()) { 19191 AM.BaseType = X86AddressMode::RegBase; 19192 AM.Base.Reg = Op.getReg(); 19193 } else { 19194 AM.BaseType = X86AddressMode::FrameIndexBase; 19195 AM.Base.FrameIndex = Op.getIndex(); 19196 } 19197 Op = MI->getOperand(1); 19198 if (Op.isImm()) 19199 AM.Scale = Op.getImm(); 19200 Op = MI->getOperand(2); 19201 if (Op.isImm()) 19202 AM.IndexReg = Op.getImm(); 19203 Op = MI->getOperand(3); 19204 if (Op.isGlobal()) { 19205 AM.GV = Op.getGlobal(); 19206 } else { 19207 AM.Disp = Op.getImm(); 19208 } 19209 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 19210 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 19211 19212 // Reload the original control word now. 19213 addFrameReference(BuildMI(*BB, MI, DL, 19214 TII->get(X86::FLDCW16m)), CWFrameIdx); 19215 19216 MI->eraseFromParent(); // The pseudo instruction is gone now. 19217 return BB; 19218 } 19219 // String/text processing lowering. 19220 case X86::PCMPISTRM128REG: 19221 case X86::VPCMPISTRM128REG: 19222 case X86::PCMPISTRM128MEM: 19223 case X86::VPCMPISTRM128MEM: 19224 case X86::PCMPESTRM128REG: 19225 case X86::VPCMPESTRM128REG: 19226 case X86::PCMPESTRM128MEM: 19227 case X86::VPCMPESTRM128MEM: 19228 assert(Subtarget->hasSSE42() && 19229 "Target must have SSE4.2 or AVX features enabled"); 19230 return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); 19231 19232 // String/text processing lowering. 19233 case X86::PCMPISTRIREG: 19234 case X86::VPCMPISTRIREG: 19235 case X86::PCMPISTRIMEM: 19236 case X86::VPCMPISTRIMEM: 19237 case X86::PCMPESTRIREG: 19238 case X86::VPCMPESTRIREG: 19239 case X86::PCMPESTRIMEM: 19240 case X86::VPCMPESTRIMEM: 19241 assert(Subtarget->hasSSE42() && 19242 "Target must have SSE4.2 or AVX features enabled"); 19243 return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); 19244 19245 // Thread synchronization. 19246 case X86::MONITOR: 19247 return EmitMonitor(MI, BB, Subtarget); 19248 19249 // xbegin 19250 case X86::XBEGIN: 19251 return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); 19252 19253 case X86::VASTART_SAVE_XMM_REGS: 19254 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 19255 19256 case X86::VAARG_64: 19257 return EmitVAARG64WithCustomInserter(MI, BB); 19258 19259 case X86::EH_SjLj_SetJmp32: 19260 case X86::EH_SjLj_SetJmp64: 19261 return emitEHSjLjSetJmp(MI, BB); 19262 19263 case X86::EH_SjLj_LongJmp32: 19264 case X86::EH_SjLj_LongJmp64: 19265 return emitEHSjLjLongJmp(MI, BB); 19266 19267 case TargetOpcode::STATEPOINT: 19268 // As an implementation detail, STATEPOINT shares the STACKMAP format at 19269 // this point in the process. We diverge later. 
19270 return emitPatchPoint(MI, BB); 19271 19272 case TargetOpcode::STACKMAP: 19273 case TargetOpcode::PATCHPOINT: 19274 return emitPatchPoint(MI, BB); 19275 19276 case X86::VFMADDPDr213r: 19277 case X86::VFMADDPSr213r: 19278 case X86::VFMADDSDr213r: 19279 case X86::VFMADDSSr213r: 19280 case X86::VFMSUBPDr213r: 19281 case X86::VFMSUBPSr213r: 19282 case X86::VFMSUBSDr213r: 19283 case X86::VFMSUBSSr213r: 19284 case X86::VFNMADDPDr213r: 19285 case X86::VFNMADDPSr213r: 19286 case X86::VFNMADDSDr213r: 19287 case X86::VFNMADDSSr213r: 19288 case X86::VFNMSUBPDr213r: 19289 case X86::VFNMSUBPSr213r: 19290 case X86::VFNMSUBSDr213r: 19291 case X86::VFNMSUBSSr213r: 19292 case X86::VFMADDSUBPDr213r: 19293 case X86::VFMADDSUBPSr213r: 19294 case X86::VFMSUBADDPDr213r: 19295 case X86::VFMSUBADDPSr213r: 19296 case X86::VFMADDPDr213rY: 19297 case X86::VFMADDPSr213rY: 19298 case X86::VFMSUBPDr213rY: 19299 case X86::VFMSUBPSr213rY: 19300 case X86::VFNMADDPDr213rY: 19301 case X86::VFNMADDPSr213rY: 19302 case X86::VFNMSUBPDr213rY: 19303 case X86::VFNMSUBPSr213rY: 19304 case X86::VFMADDSUBPDr213rY: 19305 case X86::VFMADDSUBPSr213rY: 19306 case X86::VFMSUBADDPDr213rY: 19307 case X86::VFMSUBADDPSr213rY: 19308 return emitFMA3Instr(MI, BB); 19309 } 19310 } 19311 19312 //===----------------------------------------------------------------------===// 19313 // X86 Optimization Hooks 19314 //===----------------------------------------------------------------------===// 19315 19316 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 19317 APInt &KnownZero, 19318 APInt &KnownOne, 19319 const SelectionDAG &DAG, 19320 unsigned Depth) const { 19321 unsigned BitWidth = KnownZero.getBitWidth(); 19322 unsigned Opc = Op.getOpcode(); 19323 assert((Opc >= ISD::BUILTIN_OP_END || 19324 Opc == ISD::INTRINSIC_WO_CHAIN || 19325 Opc == ISD::INTRINSIC_W_CHAIN || 19326 Opc == ISD::INTRINSIC_VOID) && 19327 "Should use MaskedValueIsZero if you don't know whether Op" 19328 " is a target node!"); 19329 19330 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 19331 switch (Opc) { 19332 default: break; 19333 case X86ISD::ADD: 19334 case X86ISD::SUB: 19335 case X86ISD::ADC: 19336 case X86ISD::SBB: 19337 case X86ISD::SMUL: 19338 case X86ISD::UMUL: 19339 case X86ISD::INC: 19340 case X86ISD::DEC: 19341 case X86ISD::OR: 19342 case X86ISD::XOR: 19343 case X86ISD::AND: 19344 // These nodes' second result is a boolean. 19345 if (Op.getResNo() == 0) 19346 break; 19347 // Fallthrough 19348 case X86ISD::SETCC: 19349 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 19350 break; 19351 case ISD::INTRINSIC_WO_CHAIN: { 19352 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 19353 unsigned NumLoBits = 0; 19354 switch (IntId) { 19355 default: break; 19356 case Intrinsic::x86_sse_movmsk_ps: 19357 case Intrinsic::x86_avx_movmsk_ps_256: 19358 case Intrinsic::x86_sse2_movmsk_pd: 19359 case Intrinsic::x86_avx_movmsk_pd_256: 19360 case Intrinsic::x86_mmx_pmovmskb: 19361 case Intrinsic::x86_sse2_pmovmskb_128: 19362 case Intrinsic::x86_avx2_pmovmskb: { 19363 // High bits of movmskp{s|d}, pmovmskb are known zero. 19364 switch (IntId) { 19365 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
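// One mask bit is produced per vector element: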
19366 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 19367 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 19368 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 19369 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 19370 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 19371 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 19372 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 19373 } 19374 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 19375 break; 19376 } 19377 } 19378 break; 19379 } 19380 } 19381 } 19382 19383 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( 19384 SDValue Op, 19385 const SelectionDAG &, 19386 unsigned Depth) const { 19387 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 19388 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 19389 return Op.getValueType().getScalarType().getSizeInBits(); 19390 19391 // Fallback case. 19392 return 1; 19393 } 19394 19395 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 19396 /// node is a GlobalAddress + offset. 19397 bool X86TargetLowering::isGAPlusOffset(SDNode *N, 19398 const GlobalValue* &GA, 19399 int64_t &Offset) const { 19400 if (N->getOpcode() == X86ISD::Wrapper) { 19401 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 19402 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 19403 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 19404 return true; 19405 } 19406 } 19407 return TargetLowering::isGAPlusOffset(N, GA, Offset); 19408 } 19409 19410 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 19411 /// same as extracting the high 128-bit part of 256-bit vector and then 19412 /// inserting the result into the low part of a new 256-bit vector 19413 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 19414 EVT VT = SVOp->getValueType(0); 19415 unsigned NumElems = VT.getVectorNumElements(); 19416 19417 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 19418 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 19419 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 19420 SVOp->getMaskElt(j) >= 0) 19421 return false; 19422 19423 return true; 19424 } 19425 19426 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 19427 /// same as extracting the low 128-bit part of 256-bit vector and then 19428 /// inserting the result into the high part of a new 256-bit vector 19429 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 19430 EVT VT = SVOp->getValueType(0); 19431 unsigned NumElems = VT.getVectorNumElements(); 19432 19433 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 19434 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 19435 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 19436 SVOp->getMaskElt(j) >= 0) 19437 return false; 19438 19439 return true; 19440 } 19441 19442 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 
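/// Folds shuffles of CONCAT_VECTORS nodes into a VZEXT_LOAD or an insert into
/// a zero vector, and recognizes shuffles that are really 128-bit subvector
/// extract/insert patterns.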
19443 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 19444 TargetLowering::DAGCombinerInfo &DCI, 19445 const X86Subtarget* Subtarget) { 19446 SDLoc dl(N); 19447 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 19448 SDValue V1 = SVOp->getOperand(0); 19449 SDValue V2 = SVOp->getOperand(1); 19450 EVT VT = SVOp->getValueType(0); 19451 unsigned NumElems = VT.getVectorNumElements(); 19452 19453 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 19454 V2.getOpcode() == ISD::CONCAT_VECTORS) { 19455 // 19456 // 0,0,0,... 19457 // | 19458 // V UNDEF BUILD_VECTOR UNDEF 19459 // \ / \ / 19460 // CONCAT_VECTOR CONCAT_VECTOR 19461 // \ / 19462 // \ / 19463 // RESULT: V + zero extended 19464 // 19465 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 19466 V2.getOperand(1).getOpcode() != ISD::UNDEF || 19467 V1.getOperand(1).getOpcode() != ISD::UNDEF) 19468 return SDValue(); 19469 19470 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 19471 return SDValue(); 19472 19473 // To match the shuffle mask, the first half of the mask should 19474 // be exactly the first vector, and all the rest a splat with the 19475 // first element of the second one. 19476 for (unsigned i = 0; i != NumElems/2; ++i) 19477 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 19478 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 19479 return SDValue(); 19480 19481 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 19482 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 19483 if (Ld->hasNUsesOfValue(1, 0)) { 19484 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 19485 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 19486 SDValue ResNode = 19487 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 19488 Ld->getMemoryVT(), 19489 Ld->getPointerInfo(), 19490 Ld->getAlignment(), 19491 false/*isVolatile*/, true/*ReadMem*/, 19492 false/*WriteMem*/); 19493 19494 // Make sure the newly-created LOAD is in the same position as Ld in 19495 // terms of dependency. We create a TokenFactor for Ld and ResNode, 19496 // and update uses of Ld's output chain to use the TokenFactor. 19497 if (Ld->hasAnyUseOfValue(1)) { 19498 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 19499 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); 19500 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 19501 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), 19502 SDValue(ResNode.getNode(), 1)); 19503 } 19504 19505 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 19506 } 19507 } 19508 19509 // Emit a zeroed vector and insert the desired subvector on its 19510 // first half. 
19511 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
19512 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
19513 return DCI.CombineTo(N, InsV);
19514 }
19515
19516 //===--------------------------------------------------------------------===//
19517 // Combine some shuffles into subvector extracts and inserts:
19518 //
19519
19520 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
19521 if (isShuffleHigh128VectorInsertLow(SVOp)) {
19522 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
19523 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
19524 return DCI.CombineTo(N, InsV);
19525 }
19526
19527 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
19528 if (isShuffleLow128VectorInsertHigh(SVOp)) {
19529 SDValue V = Extract128BitVector(V1, 0, DAG, dl);
19530 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
19531 return DCI.CombineTo(N, InsV);
19532 }
19533
19534 return SDValue();
19535 }
19536
19537 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
19538 /// possible.
19539 ///
19540 /// This is the leaf of the recursive combine below. When we have found some
19541 /// chain of single-use x86 shuffle instructions and accumulated the combined
19542 /// shuffle mask represented by them, this will try to pattern match that mask
19543 /// into either a single instruction if there is a special purpose instruction
19544 /// for this operation, or into a PSHUFB instruction which is a fully general
19545 /// instruction but should only be used to replace chains over a certain depth.
19546 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
19547 int Depth, bool HasPSHUFB, SelectionDAG &DAG,
19548 TargetLowering::DAGCombinerInfo &DCI,
19549 const X86Subtarget *Subtarget) {
19550 assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
19551
19552 // Find the operand that enters the chain. Note that multiple uses are OK
19553 // here; we're not going to remove the operand we find.
19554 SDValue Input = Op.getOperand(0);
19555 while (Input.getOpcode() == ISD::BITCAST)
19556 Input = Input.getOperand(0);
19557
19558 MVT VT = Input.getSimpleValueType();
19559 MVT RootVT = Root.getSimpleValueType();
19560 SDLoc DL(Root);
19561
19562 // Just remove no-op shuffle masks.
19563 if (Mask.size() == 1) {
19564 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
19565 /*AddTo*/ true);
19566 return true;
19567 }
19568
19569 // Use the float domain if the operand type is a floating point type.
19570 bool FloatDomain = VT.isFloatingPoint();
19571
19572 // For floating point shuffles, we don't have free copies in the shuffle
19573 // instructions or the ability to load as part of the instruction, so
19574 // canonicalize their shuffles to UNPCK or MOV variants.
19575 //
19576 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
19577 // vectors because it can have a load folded into it that UNPCK cannot. This
19578 // doesn't preclude something switching to the shorter encoding post-RA.
19579 //
19580 // FIXME: Should teach these routines about AVX vector widths.
19581 if (FloatDomain && VT.getSizeInBits() == 128) {
19582 if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
19583 bool Lo = Mask.equals({0, 0});
19584 unsigned Shuffle;
19585 MVT ShuffleVT;
19586 // Check if we have SSE3 which will let us use MOVDDUP.
That instruction 19587 // is no slower than UNPCKLPD but has the option to fold the input operand 19588 // into even an unaligned memory load. 19589 if (Lo && Subtarget->hasSSE3()) { 19590 Shuffle = X86ISD::MOVDDUP; 19591 ShuffleVT = MVT::v2f64; 19592 } else { 19593 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller 19594 // than the UNPCK variants. 19595 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; 19596 ShuffleVT = MVT::v4f32; 19597 } 19598 if (Depth == 1 && Root->getOpcode() == Shuffle) 19599 return false; // Nothing to do! 19600 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19601 DCI.AddToWorklist(Op.getNode()); 19602 if (Shuffle == X86ISD::MOVDDUP) 19603 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); 19604 else 19605 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); 19606 DCI.AddToWorklist(Op.getNode()); 19607 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19608 /*AddTo*/ true); 19609 return true; 19610 } 19611 if (Subtarget->hasSSE3() && 19612 (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { 19613 bool Lo = Mask.equals({0, 0, 2, 2}); 19614 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; 19615 MVT ShuffleVT = MVT::v4f32; 19616 if (Depth == 1 && Root->getOpcode() == Shuffle) 19617 return false; // Nothing to do! 19618 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19619 DCI.AddToWorklist(Op.getNode()); 19620 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); 19621 DCI.AddToWorklist(Op.getNode()); 19622 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19623 /*AddTo*/ true); 19624 return true; 19625 } 19626 if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { 19627 bool Lo = Mask.equals({0, 0, 1, 1}); 19628 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; 19629 MVT ShuffleVT = MVT::v4f32; 19630 if (Depth == 1 && Root->getOpcode() == Shuffle) 19631 return false; // Nothing to do! 19632 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19633 DCI.AddToWorklist(Op.getNode()); 19634 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); 19635 DCI.AddToWorklist(Op.getNode()); 19636 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19637 /*AddTo*/ true); 19638 return true; 19639 } 19640 } 19641 19642 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK 19643 // variants as none of these have single-instruction variants that are 19644 // superior to the UNPCK formulation. 19645 if (!FloatDomain && VT.getSizeInBits() == 128 && 19646 (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || 19647 Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || 19648 Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || 19649 Mask.equals( 19650 {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { 19651 bool Lo = Mask[0] == 0; 19652 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; 19653 if (Depth == 1 && Root->getOpcode() == Shuffle) 19654 return false; // Nothing to do! 
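// Pick the integer vector type whose element count matches the accumulated
// mask so the UNPCK operates at the original element width.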
19655 MVT ShuffleVT;
19656 switch (Mask.size()) {
19657 case 8:
19658 ShuffleVT = MVT::v8i16;
19659 break;
19660 case 16:
19661 ShuffleVT = MVT::v16i8;
19662 break;
19663 default:
19664 llvm_unreachable("Impossible mask size!");
19665 }
19666 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
19667 DCI.AddToWorklist(Op.getNode());
19668 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
19669 DCI.AddToWorklist(Op.getNode());
19670 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
19671 /*AddTo*/ true);
19672 return true;
19673 }
19674
19675 // Don't try to re-form single instruction chains under any circumstances now
19676 // that we've done encoding canonicalization for them.
19677 if (Depth < 2)
19678 return false;
19679
19680 // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
19681 // can replace them with a single PSHUFB instruction profitably. Intel's
19682 // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
19683 // in practice PSHUFB tends to be *very* fast so we're more aggressive.
19684 if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
19685 SmallVector<SDValue, 16> PSHUFBMask;
19686 int NumBytes = VT.getSizeInBits() / 8;
19687 int Ratio = NumBytes / Mask.size();
19688 for (int i = 0; i < NumBytes; ++i) {
19689 if (Mask[i / Ratio] == SM_SentinelUndef) {
19690 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
19691 continue;
19692 }
19693 int M = Mask[i / Ratio] != SM_SentinelZero
19694 ? Ratio * Mask[i / Ratio] + i % Ratio
19695 : 255;
19696 PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
19697 }
19698 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
19699 Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
19700 DCI.AddToWorklist(Op.getNode());
19701 SDValue PSHUFBMaskOp =
19702 DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
19703 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
19704 Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
19705 DCI.AddToWorklist(Op.getNode());
19706 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
19707 /*AddTo*/ true);
19708 return true;
19709 }
19710
19711 // Failed to find any combines.
19712 return false;
19713 }
19714
19715 /// \brief Fully generic combining of x86 shuffle instructions.
19716 ///
19717 /// This should be the last combine run over the x86 shuffle instructions. Once
19718 /// they have been fully optimized, this will recursively consider all chains
19719 /// of single-use shuffle instructions, build a generic model of the cumulative
19720 /// shuffle operation, and check for simpler instructions which implement this
19721 /// operation. We use this primarily for two purposes:
19722 ///
19723 /// 1) Collapse generic shuffles to specialized single instructions when
19724 /// equivalent. In most cases, this is just an encoding size win, but
19725 /// sometimes we will collapse multiple generic shuffles into a single
19726 /// special-purpose shuffle.
19727 /// 2) Look for sequences of shuffle instructions with 3 or more total
19728 /// instructions, and replace them with the slightly more expensive SSSE3
19729 /// PSHUFB instruction if available. We do this as the last combining step
19730 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
19731 /// a suitable short sequence of other instructions. The PSHUFB will either
19732 /// use a register or have to read from memory and so is slightly (but only
19733 /// slightly) more expensive than the other shuffle instructions.
19734 /// 19735 /// Because this is inherently a quadratic operation (for each shuffle in 19736 /// a chain, we recurse up the chain), the depth is limited to 8 instructions. 19737 /// This should never be an issue in practice as the shuffle lowering doesn't 19738 /// produce sequences of more than 8 instructions. 19739 /// 19740 /// FIXME: We will currently miss some cases where the redundant shuffling 19741 /// would simplify under the threshold for PSHUFB formation because of 19742 /// combine-ordering. To fix this, we should do the redundant instruction 19743 /// combining in this recursive walk. 19744 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, 19745 ArrayRef<int> RootMask, 19746 int Depth, bool HasPSHUFB, 19747 SelectionDAG &DAG, 19748 TargetLowering::DAGCombinerInfo &DCI, 19749 const X86Subtarget *Subtarget) { 19750 // Bound the depth of our recursive combine because this is ultimately 19751 // quadratic in nature. 19752 if (Depth > 8) 19753 return false; 19754 19755 // Directly rip through bitcasts to find the underlying operand. 19756 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) 19757 Op = Op.getOperand(0); 19758 19759 MVT VT = Op.getSimpleValueType(); 19760 if (!VT.isVector()) 19761 return false; // Bail if we hit a non-vector. 19762 19763 assert(Root.getSimpleValueType().isVector() && 19764 "Shuffles operate on vector types!"); 19765 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && 19766 "Can only combine shuffles of the same vector register size."); 19767 19768 if (!isTargetShuffle(Op.getOpcode())) 19769 return false; 19770 SmallVector<int, 16> OpMask; 19771 bool IsUnary; 19772 bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); 19773 // We only can combine unary shuffles which we can decode the mask for. 19774 if (!HaveMask || !IsUnary) 19775 return false; 19776 19777 assert(VT.getVectorNumElements() == OpMask.size() && 19778 "Different mask size from vector size!"); 19779 assert(((RootMask.size() > OpMask.size() && 19780 RootMask.size() % OpMask.size() == 0) || 19781 (OpMask.size() > RootMask.size() && 19782 OpMask.size() % RootMask.size() == 0) || 19783 OpMask.size() == RootMask.size()) && 19784 "The smaller number of elements must divide the larger."); 19785 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); 19786 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); 19787 assert(((RootRatio == 1 && OpRatio == 1) || 19788 (RootRatio == 1) != (OpRatio == 1)) && 19789 "Must not have a ratio for both incoming and op masks!"); 19790 19791 SmallVector<int, 16> Mask; 19792 Mask.reserve(std::max(OpMask.size(), RootMask.size())); 19793 19794 // Merge this shuffle operation's mask into our accumulated mask. Note that 19795 // this shuffle's mask will be the first applied to the input, followed by the 19796 // root mask to get us all the way to the root value arrangement. The reason 19797 // for this order is that we are recursing up the operation chain. 19798 for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { 19799 int RootIdx = i / RootRatio; 19800 if (RootMask[RootIdx] < 0) { 19801 // This is a zero or undef lane, we're done. 
19802 Mask.push_back(RootMask[RootIdx]); 19803 continue; 19804 } 19805 19806 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; 19807 int OpIdx = RootMaskedIdx / OpRatio; 19808 if (OpMask[OpIdx] < 0) { 19809 // The incoming lanes are zero or undef, it doesn't matter which ones we 19810 // are using. 19811 Mask.push_back(OpMask[OpIdx]); 19812 continue; 19813 } 19814 19815 // Ok, we have non-zero lanes, map them through. 19816 Mask.push_back(OpMask[OpIdx] * OpRatio + 19817 RootMaskedIdx % OpRatio); 19818 } 19819 19820 // See if we can recurse into the operand to combine more things. 19821 switch (Op.getOpcode()) { 19822 case X86ISD::PSHUFB: 19823 HasPSHUFB = true; 19824 case X86ISD::PSHUFD: 19825 case X86ISD::PSHUFHW: 19826 case X86ISD::PSHUFLW: 19827 if (Op.getOperand(0).hasOneUse() && 19828 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, 19829 HasPSHUFB, DAG, DCI, Subtarget)) 19830 return true; 19831 break; 19832 19833 case X86ISD::UNPCKL: 19834 case X86ISD::UNPCKH: 19835 assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); 19836 // We can't check for single use; we have to check that this shuffle is the only user. 19837 if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && 19838 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, 19839 HasPSHUFB, DAG, DCI, Subtarget)) 19840 return true; 19841 break; 19842 } 19843 19844 // Minor canonicalization of the accumulated shuffle mask to make it easier 19845 // to match below. All this does is detect masks with sequential pairs of 19846 // elements, and shrink them to the half-width mask. It does this in a loop 19847 // so it will reduce the size of the mask to the minimal width mask which 19848 // performs an equivalent shuffle. 19849 SmallVector<int, 16> WidenedMask; 19850 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { 19851 Mask = std::move(WidenedMask); 19852 WidenedMask.clear(); 19853 } 19854 19855 return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, 19856 Subtarget); 19857 } 19858 19859 /// \brief Get the PSHUF-style mask from a PSHUF node. 19860 /// 19861 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4 19862 /// PSHUF-style masks that can be reused with such instructions. 19863 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { 19864 MVT VT = N.getSimpleValueType(); 19865 SmallVector<int, 4> Mask; 19866 bool IsUnary; 19867 bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); 19868 (void)HaveMask; 19869 assert(HaveMask); 19870 19871 // If we have more than 128 bits, only the low 128 bits of the shuffle mask 19872 // matter. Check that the upper masks are repeats and remove them.
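  // For example (values chosen purely for illustration): a 256-bit shuffle
  // of i32 elements whose mask is <1, 0, 3, 2, 5, 4, 7, 6> repeats
  // <1, 0, 3, 2> in the high lane (offset by the lane width of 4), so only
  // the low-lane portion <1, 0, 3, 2> is kept below.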
19873 if (VT.getSizeInBits() > 128) { 19874 int LaneElts = 128 / VT.getScalarSizeInBits(); 19875 #ifndef NDEBUG 19876 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) 19877 for (int j = 0; j < LaneElts; ++j) 19878 assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts && 19879 "Mask doesn't repeat in high 128-bit lanes!"); 19880 #endif 19881 Mask.resize(LaneElts); 19882 } 19883 19884 switch (N.getOpcode()) { 19885 case X86ISD::PSHUFD: 19886 return Mask; 19887 case X86ISD::PSHUFLW: 19888 Mask.resize(4); 19889 return Mask; 19890 case X86ISD::PSHUFHW: 19891 Mask.erase(Mask.begin(), Mask.begin() + 4); 19892 for (int &M : Mask) 19893 M -= 4; 19894 return Mask; 19895 default: 19896 llvm_unreachable("No valid shuffle instruction found!"); 19897 } 19898 } 19899 19900 /// \brief Search for a combinable shuffle across a chain ending in pshufd. 19901 /// 19902 /// We walk up the chain and look for a combinable shuffle, skipping over 19903 /// shuffles that we could hoist this shuffle's transformation past without 19904 /// altering anything. 19905 static SDValue 19906 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, 19907 SelectionDAG &DAG, 19908 TargetLowering::DAGCombinerInfo &DCI) { 19909 assert(N.getOpcode() == X86ISD::PSHUFD && 19910 "Called with something other than an x86 128-bit half shuffle!"); 19911 SDLoc DL(N); 19912 19913 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack 19914 // of the shuffles in the chain so that we can form a fresh chain to replace 19915 // this one. 19916 SmallVector<SDValue, 8> Chain; 19917 SDValue V = N.getOperand(0); 19918 for (; V.hasOneUse(); V = V.getOperand(0)) { 19919 switch (V.getOpcode()) { 19920 default: 19921 return SDValue(); // Nothing combined! 19922 19923 case ISD::BITCAST: 19924 // Skip bitcasts as we always know the type for the target specific 19925 // instructions. 19926 continue; 19927 19928 case X86ISD::PSHUFD: 19929 // Found another dword shuffle. 19930 break; 19931 19932 case X86ISD::PSHUFLW: 19933 // Check that the low words (being shuffled) are the identity in the 19934 // dword shuffle, and the high words are self-contained. 19935 if (Mask[0] != 0 || Mask[1] != 1 || 19936 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) 19937 return SDValue(); 19938 19939 Chain.push_back(V); 19940 continue; 19941 19942 case X86ISD::PSHUFHW: 19943 // Check that the high words (being shuffled) are the identity in the 19944 // dword shuffle, and the low words are self-contained. 19945 if (Mask[2] != 2 || Mask[3] != 3 || 19946 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) 19947 return SDValue(); 19948 19949 Chain.push_back(V); 19950 continue; 19951 19952 case X86ISD::UNPCKL: 19953 case X86ISD::UNPCKH: 19954 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword 19955 // shuffle into a preceding word shuffle. 19956 if (V.getSimpleValueType().getScalarType() != MVT::i8 && 19957 V.getSimpleValueType().getScalarType() != MVT::i16) 19958 return SDValue(); 19959 19960 // Search for a half-shuffle which we can combine with. 19961 unsigned CombineOp = 19962 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; 19963 if (V.getOperand(0) != V.getOperand(1) || 19964 !V->isOnlyUserOf(V.getOperand(0).getNode())) 19965 return SDValue(); 19966 Chain.push_back(V); 19967 V = V.getOperand(0); 19968 do { 19969 switch (V.getOpcode()) { 19970 default: 19971 return SDValue(); // Nothing to combine. 
19972 19973 case X86ISD::PSHUFLW: 19974 case X86ISD::PSHUFHW: 19975 if (V.getOpcode() == CombineOp) 19976 break; 19977 19978 Chain.push_back(V); 19979 19980 // Fallthrough! 19981 case ISD::BITCAST: 19982 V = V.getOperand(0); 19983 continue; 19984 } 19985 break; 19986 } while (V.hasOneUse()); 19987 break; 19988 } 19989 // Break out of the loop if we break out of the switch. 19990 break; 19991 } 19992 19993 if (!V.hasOneUse()) 19994 // We fell out of the loop without finding a viable combining instruction. 19995 return SDValue(); 19996 19997 // Merge this node's mask and our incoming mask. 19998 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 19999 for (int &M : Mask) 20000 M = VMask[M]; 20001 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), 20002 getV4X86ShuffleImm8ForMask(Mask, DAG)); 20003 20004 // Rebuild the chain around this new shuffle. 20005 while (!Chain.empty()) { 20006 SDValue W = Chain.pop_back_val(); 20007 20008 if (V.getValueType() != W.getOperand(0).getValueType()) 20009 V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); 20010 20011 switch (W.getOpcode()) { 20012 default: 20013 llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); 20014 20015 case X86ISD::UNPCKL: 20016 case X86ISD::UNPCKH: 20017 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); 20018 break; 20019 20020 case X86ISD::PSHUFD: 20021 case X86ISD::PSHUFLW: 20022 case X86ISD::PSHUFHW: 20023 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); 20024 break; 20025 } 20026 } 20027 if (V.getValueType() != N.getValueType()) 20028 V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); 20029 20030 // Return the new chain to replace N. 20031 return V; 20032 } 20033 20034 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. 20035 /// 20036 /// We walk up the chain, skipping shuffles of the other half and looking 20037 /// through shuffles which switch halves trying to find a shuffle of the same 20038 /// pair of dwords. 20039 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, 20040 SelectionDAG &DAG, 20041 TargetLowering::DAGCombinerInfo &DCI) { 20042 assert( 20043 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && 20044 "Called with something other than an x86 128-bit half shuffle!"); 20045 SDLoc DL(N); 20046 unsigned CombineOpcode = N.getOpcode(); 20047 20048 // Walk up a single-use chain looking for a combinable shuffle. 20049 SDValue V = N.getOperand(0); 20050 for (; V.hasOneUse(); V = V.getOperand(0)) { 20051 switch (V.getOpcode()) { 20052 default: 20053 return false; // Nothing combined! 20054 20055 case ISD::BITCAST: 20056 // Skip bitcasts as we always know the type for the target specific 20057 // instructions. 20058 continue; 20059 20060 case X86ISD::PSHUFLW: 20061 case X86ISD::PSHUFHW: 20062 if (V.getOpcode() == CombineOpcode) 20063 break; 20064 20065 // Other-half shuffles are no-ops. 20066 continue; 20067 } 20068 // Break out of the loop if we break out of the switch. 20069 break; 20070 } 20071 20072 if (!V.hasOneUse()) 20073 // We fell out of the loop without finding a viable combining instruction. 20074 return false; 20075 20076 // Combine away the bottom node as its shuffle will be accumulated into 20077 // a preceding shuffle. 20078 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); 20079 20080 // Record the old value. 
20081 SDValue Old = V; 20082 20083 // Merge this node's mask and our incoming mask (adjusted to account for all 20084 // the pshufd instructions encountered). 20085 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 20086 for (int &M : Mask) 20087 M = VMask[M]; 20088 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), 20089 getV4X86ShuffleImm8ForMask(Mask, DAG)); 20090 20091 // Check that the shuffles didn't cancel each other out. If not, we need to 20092 // combine to the new one. 20093 if (Old != V) 20094 // Replace the combinable shuffle with the combined one, updating all users 20095 // so that we re-evaluate the chain here. 20096 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); 20097 20098 return true; 20099 } 20100 20101 /// \brief Try to combine x86 target specific shuffles. 20102 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, 20103 TargetLowering::DAGCombinerInfo &DCI, 20104 const X86Subtarget *Subtarget) { 20105 SDLoc DL(N); 20106 MVT VT = N.getSimpleValueType(); 20107 SmallVector<int, 4> Mask; 20108 20109 switch (N.getOpcode()) { 20110 case X86ISD::PSHUFD: 20111 case X86ISD::PSHUFLW: 20112 case X86ISD::PSHUFHW: 20113 Mask = getPSHUFShuffleMask(N); 20114 assert(Mask.size() == 4); 20115 break; 20116 default: 20117 return SDValue(); 20118 } 20119 20120 // Nuke no-op shuffles that show up after combining. 20121 if (isNoopShuffleMask(Mask)) 20122 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); 20123 20124 // Look for simplifications involving one or two shuffle instructions. 20125 SDValue V = N.getOperand(0); 20126 switch (N.getOpcode()) { 20127 default: 20128 break; 20129 case X86ISD::PSHUFLW: 20130 case X86ISD::PSHUFHW: 20131 assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!"); 20132 20133 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) 20134 return SDValue(); // We combined away this shuffle, so we're done. 20135 20136 // See if this reduces to a PSHUFD which is no more expensive and can 20137 // combine with more operations. Note that it has to at least flip the 20138 // dwords as otherwise it would have been removed as a no-op. 20139 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { 20140 int DMask[] = {0, 1, 2, 3}; 20141 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; 20142 DMask[DOffset + 0] = DOffset + 1; 20143 DMask[DOffset + 1] = DOffset + 0; 20144 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); 20145 V = DAG.getNode(ISD::BITCAST, DL, DVT, V); 20146 DCI.AddToWorklist(V.getNode()); 20147 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, 20148 getV4X86ShuffleImm8ForMask(DMask, DAG)); 20149 DCI.AddToWorklist(V.getNode()); 20150 return DAG.getNode(ISD::BITCAST, DL, VT, V); 20151 } 20152 20153 // Look for shuffle patterns which can be implemented as a single unpack. 20154 // FIXME: This doesn't handle the location of the PSHUFD generically, and 20155 // only works when we have a PSHUFD followed by two half-shuffles. 20156 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && 20157 (V.getOpcode() == X86ISD::PSHUFLW || 20158 V.getOpcode() == X86ISD::PSHUFHW) && 20159 V.getOpcode() != N.getOpcode() && 20160 V.hasOneUse()) { 20161 SDValue D = V.getOperand(0); 20162 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) 20163 D = D.getOperand(0); 20164 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { 20165 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 20166 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); 20167 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 
0 : 4; 20168 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; 20169 int WordMask[8]; 20170 for (int i = 0; i < 4; ++i) { 20171 WordMask[i + NOffset] = Mask[i] + NOffset; 20172 WordMask[i + VOffset] = VMask[i] + VOffset; 20173 } 20174 // Map the word mask through the DWord mask. 20175 int MappedMask[8]; 20176 for (int i = 0; i < 8; ++i) 20177 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; 20178 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || 20179 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { 20180 // We can replace all three shuffles with an unpack. 20181 V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0)); 20182 DCI.AddToWorklist(V.getNode()); 20183 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL 20184 : X86ISD::UNPCKH, 20185 DL, VT, V, V); 20186 } 20187 } 20188 } 20189 20190 break; 20191 20192 case X86ISD::PSHUFD: 20193 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) 20194 return NewN; 20195 20196 break; 20197 } 20198 20199 return SDValue(); 20200 } 20201 20202 /// \brief Try to combine a shuffle into a target-specific add-sub node. 20203 /// 20204 /// We combine this directly on the abstract vector shuffle nodes so it is 20205 /// easier to generically match. We also insert dummy vector shuffle nodes for 20206 /// the operands which explicitly discard the lanes which are unused by this 20207 /// operation to try to flow through the rest of the combiner the fact that 20208 /// they're unused. 20209 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { 20210 SDLoc DL(N); 20211 EVT VT = N->getValueType(0); 20212 20213 // We only handle target-independent shuffles. 20214 // FIXME: It would be easy and harmless to use the target shuffle mask 20215 // extraction tool to support more. 20216 if (N->getOpcode() != ISD::VECTOR_SHUFFLE) 20217 return SDValue(); 20218 20219 auto *SVN = cast<ShuffleVectorSDNode>(N); 20220 ArrayRef<int> Mask = SVN->getMask(); 20221 SDValue V1 = N->getOperand(0); 20222 SDValue V2 = N->getOperand(1); 20223 20224 // We require the first shuffle operand to be the SUB node, and the second to 20225 // be the ADD node. 20226 // FIXME: We should support the commuted patterns. 20227 if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) 20228 return SDValue(); 20229 20230 // If there are other uses of these operations we can't fold them. 20231 if (!V1->hasOneUse() || !V2->hasOneUse()) 20232 return SDValue(); 20233 20234 // Ensure that both operations have the same operands. Note that we can 20235 // commute the FADD operands. 20236 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); 20237 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && 20238 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) 20239 return SDValue(); 20240 20241 // We're looking for blends between FADD and FSUB nodes. We insist on these 20242 // nodes being lined up in a specific expected pattern. 20243 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || 20244 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || 20245 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) 20246 return SDValue(); 20247 20248 // Only specific types are legal at this point, assert so we notice if and 20249 // when these change. 
20250 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || 20251 VT == MVT::v4f64) && 20252 "Unknown vector type encountered!"); 20253 20254 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); 20255 } 20256 20257 /// PerformShuffleCombine - Performs several different shuffle combines. 20258 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 20259 TargetLowering::DAGCombinerInfo &DCI, 20260 const X86Subtarget *Subtarget) { 20261 SDLoc dl(N); 20262 SDValue N0 = N->getOperand(0); 20263 SDValue N1 = N->getOperand(1); 20264 EVT VT = N->getValueType(0); 20265 20266 // Don't create instructions with illegal types after legalize types has run. 20267 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20268 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 20269 return SDValue(); 20270 20271 // If we have legalized the vector types, look for blends of FADD and FSUB 20272 // nodes that we can fuse into an ADDSUB node. 20273 if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) 20274 if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) 20275 return AddSub; 20276 20277 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 20278 if (Subtarget->hasFp256() && VT.is256BitVector() && 20279 N->getOpcode() == ISD::VECTOR_SHUFFLE) 20280 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 20281 20282 // During Type Legalization, when promoting illegal vector types, 20283 // the backend might introduce new shuffle dag nodes and bitcasts. 20284 // 20285 // This code performs the following transformation: 20286 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> 20287 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) 20288 // 20289 // We do this only if both the bitcast and the BINOP dag nodes have 20290 // one use. Also, perform this transformation only if the new binary 20291 // operation is legal. This is to avoid introducing dag nodes that 20292 // potentially need to be further expanded (or custom lowered) into a 20293 // less optimal sequence of dag nodes. 
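  // As a sketch with concrete types (chosen only for illustration): with
  // A, B : v4i32,
  //   (v8i16 shuffle (bitcast (add A, B)), Undef, <0, 2, 4, 6, u, u, u, u>)
  // is rewritten to
  //   (v8i16 shuffle (add (bitcast A), (bitcast B)), Undef,
  //                  <0, 2, 4, 6, u, u, u, u>)
  // provided ISD::ADD is legal on v8i16 and both the bitcast and the add
  // have a single use.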
20294 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && 20295 N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && 20296 N0.getOpcode() == ISD::BITCAST) { 20297 SDValue BC0 = N0.getOperand(0); 20298 EVT SVT = BC0.getValueType(); 20299 unsigned Opcode = BC0.getOpcode(); 20300 unsigned NumElts = VT.getVectorNumElements(); 20301 20302 if (BC0.hasOneUse() && SVT.isVector() && 20303 SVT.getVectorNumElements() * 2 == NumElts && 20304 TLI.isOperationLegal(Opcode, VT)) { 20305 bool CanFold = false; 20306 switch (Opcode) { 20307 default : break; 20308 case ISD::ADD : 20309 case ISD::FADD : 20310 case ISD::SUB : 20311 case ISD::FSUB : 20312 case ISD::MUL : 20313 case ISD::FMUL : 20314 CanFold = true; 20315 } 20316 20317 unsigned SVTNumElts = SVT.getVectorNumElements(); 20318 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 20319 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) 20320 CanFold = SVOp->getMaskElt(i) == (int)(i * 2); 20321 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) 20322 CanFold = SVOp->getMaskElt(i) < 0; 20323 20324 if (CanFold) { 20325 SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); 20326 SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); 20327 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); 20328 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); 20329 } 20330 } 20331 } 20332 20333 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 20334 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 20335 // consecutive, non-overlapping, and in the right order. 20336 SmallVector<SDValue, 16> Elts; 20337 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 20338 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 20339 20340 SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); 20341 if (LD.getNode()) 20342 return LD; 20343 20344 if (isTargetShuffle(N->getOpcode())) { 20345 SDValue Shuffle = 20346 PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); 20347 if (Shuffle.getNode()) 20348 return Shuffle; 20349 20350 // Try recursively combining arbitrary sequences of x86 shuffle 20351 // instructions into higher-order shuffles. We do this after combining 20352 // specific PSHUF instruction sequences into their minimal form so that we 20353 // can evaluate how many specialized shuffle instructions are involved in 20354 // a particular chain. 20355 SmallVector<int, 1> NonceMask; // Just a placeholder. 20356 NonceMask.push_back(0); 20357 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, 20358 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, 20359 DCI, Subtarget)) 20360 return SDValue(); // This routine will use CombineTo to replace N. 20361 } 20362 20363 return SDValue(); 20364 } 20365 20366 /// PerformTruncateCombine - Converts truncate operation to 20367 /// a sequence of vector shuffle operations. 20368 /// It is possible when we truncate 256-bit vector to 128-bit vector 20369 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 20370 TargetLowering::DAGCombinerInfo &DCI, 20371 const X86Subtarget *Subtarget) { 20372 return SDValue(); 20373 } 20374 20375 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target 20376 /// specific shuffle of a load can be folded into a single element load. 20377 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but 20378 /// shuffles have been custom lowered so we need to handle those here. 
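///
/// A sketch of the shape being targeted (node names are made up):
///
///   t0 = load <4 x i32>
///   t1 = X86ISD::PSHUFD t0, <2, 3, 0, 1>
///   t2 = extract_vector_elt t1, 0
///
/// Lane 0 of the shuffle is really lane 2 of the loaded vector, so when the
/// single-use and alignment checks below pass we re-form a generic
/// VECTOR_SHUFFLE plus extract and let the generic combiner shrink it to a
/// single scalar load.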
20379 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, 20380 TargetLowering::DAGCombinerInfo &DCI) { 20381 if (DCI.isBeforeLegalizeOps()) 20382 return SDValue(); 20383 20384 SDValue InVec = N->getOperand(0); 20385 SDValue EltNo = N->getOperand(1); 20386 20387 if (!isa<ConstantSDNode>(EltNo)) 20388 return SDValue(); 20389 20390 EVT OriginalVT = InVec.getValueType(); 20391 20392 if (InVec.getOpcode() == ISD::BITCAST) { 20393 // Don't duplicate a load with other uses. 20394 if (!InVec.hasOneUse()) 20395 return SDValue(); 20396 EVT BCVT = InVec.getOperand(0).getValueType(); 20397 if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) 20398 return SDValue(); 20399 InVec = InVec.getOperand(0); 20400 } 20401 20402 EVT CurrentVT = InVec.getValueType(); 20403 20404 if (!isTargetShuffle(InVec.getOpcode())) 20405 return SDValue(); 20406 20407 // Don't duplicate a load with other uses. 20408 if (!InVec.hasOneUse()) 20409 return SDValue(); 20410 20411 SmallVector<int, 16> ShuffleMask; 20412 bool UnaryShuffle; 20413 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), 20414 ShuffleMask, UnaryShuffle)) 20415 return SDValue(); 20416 20417 // Select the input vector, guarding against out of range extract vector. 20418 unsigned NumElems = CurrentVT.getVectorNumElements(); 20419 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 20420 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; 20421 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) 20422 : InVec.getOperand(1); 20423 20424 // If inputs to shuffle are the same for both ops, then allow 2 uses 20425 unsigned AllowedUses = InVec.getNumOperands() > 1 && 20426 InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; 20427 20428 if (LdNode.getOpcode() == ISD::BITCAST) { 20429 // Don't duplicate a load with other uses. 20430 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) 20431 return SDValue(); 20432 20433 AllowedUses = 1; // only allow 1 load use if we have a bitcast 20434 LdNode = LdNode.getOperand(0); 20435 } 20436 20437 if (!ISD::isNormalLoad(LdNode.getNode())) 20438 return SDValue(); 20439 20440 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); 20441 20442 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) 20443 return SDValue(); 20444 20445 EVT EltVT = N->getValueType(0); 20446 // If there's a bitcast before the shuffle, check if the load type and 20447 // alignment is valid. 20448 unsigned Align = LN0->getAlignment(); 20449 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20450 unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( 20451 EltVT.getTypeForEVT(*DAG.getContext())); 20452 20453 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) 20454 return SDValue(); 20455 20456 // All checks match so transform back to vector_shuffle so that DAG combiner 20457 // can finish the job 20458 SDLoc dl(N); 20459 20460 // Create shuffle node taking into account the case that its a unary shuffle 20461 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) 20462 : InVec.getOperand(1); 20463 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, 20464 InVec.getOperand(0), Shuffle, 20465 &ShuffleMask[0]); 20466 Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); 20467 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 20468 EltNo); 20469 } 20470 20471 /// \brief Detect bitcasts between i32 to x86mmx low word. 
Since MMX types are 20472 /// special and don't usually play with other vector types, it's better to 20473 /// handle them early to be sure we emit efficient code by avoiding 20474 /// store-load conversions. 20475 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { 20476 if (N->getValueType(0) != MVT::x86mmx || 20477 N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR || 20478 N->getOperand(0)->getValueType(0) != MVT::v2i32) 20479 return SDValue(); 20480 20481 SDValue V = N->getOperand(0); 20482 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1)); 20483 if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32) 20484 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)), 20485 N->getValueType(0), V.getOperand(0)); 20486 20487 return SDValue(); 20488 } 20489 20490 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 20491 /// generation and convert it from being a bunch of shuffles and extracts 20492 /// into a somewhat faster sequence. For i686, the best sequence is apparently 20493 /// storing the value and loading scalars back, while for x64 we should 20494 /// use 64-bit extracts and shifts. 20495 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 20496 TargetLowering::DAGCombinerInfo &DCI) { 20497 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); 20498 if (NewOp.getNode()) 20499 return NewOp; 20500 20501 SDValue InputVector = N->getOperand(0); 20502 20503 // Detect mmx to i32 conversion through a v2i32 elt extract. 20504 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && 20505 N->getValueType(0) == MVT::i32 && 20506 InputVector.getValueType() == MVT::v2i32) { 20507 20508 // The bitcast source is a direct mmx result. 20509 SDValue MMXSrc = InputVector.getNode()->getOperand(0); 20510 if (MMXSrc.getValueType() == MVT::x86mmx) 20511 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), 20512 N->getValueType(0), 20513 InputVector.getNode()->getOperand(0)); 20514 20515 // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). 20516 SDValue MMXSrcOp = MMXSrc.getOperand(0); 20517 if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && 20518 MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() && 20519 MMXSrcOp.getOpcode() == ISD::BITCAST && 20520 MMXSrcOp.getValueType() == MVT::v1i64 && 20521 MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) 20522 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), 20523 N->getValueType(0), 20524 MMXSrcOp.getOperand(0)); 20525 } 20526 20527 // Only operate on vectors of 4 elements, where the alternative shuffling 20528 // gets to be more expensive. 20529 if (InputVector.getValueType() != MVT::v4i32) 20530 return SDValue(); 20531 20532 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 20533 // single use which is a sign-extend or zero-extend, and all elements are 20534 // used. 
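  // If that holds, the extracts are rewritten further down. As a sketch for a
  // v4i32 input (when 64-bit shifts are legal):
  //   c  = bitcast v to v2i64
  //   e0 = trunc (extract_elt c, 0)
  //   e1 = trunc (sra (extract_elt c, 0), 32)
  //   e2 = trunc (extract_elt c, 1)
  //   e3 = trunc (sra (extract_elt c, 1), 32)
  // Otherwise the vector is bounced through a stack slot and the four lanes
  // are loaded back as scalars.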
20535 SmallVector<SDNode *, 4> Uses; 20536 unsigned ExtractedElements = 0; 20537 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 20538 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 20539 if (UI.getUse().getResNo() != InputVector.getResNo()) 20540 return SDValue(); 20541 20542 SDNode *Extract = *UI; 20543 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 20544 return SDValue(); 20545 20546 if (Extract->getValueType(0) != MVT::i32) 20547 return SDValue(); 20548 if (!Extract->hasOneUse()) 20549 return SDValue(); 20550 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 20551 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 20552 return SDValue(); 20553 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 20554 return SDValue(); 20555 20556 // Record which element was extracted. 20557 ExtractedElements |= 20558 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 20559 20560 Uses.push_back(Extract); 20561 } 20562 20563 // If not all the elements were used, this may not be worthwhile. 20564 if (ExtractedElements != 15) 20565 return SDValue(); 20566 20567 // Ok, we've now decided to do the transformation. 20568 // If 64-bit shifts are legal, use the extract-shift sequence, 20569 // otherwise bounce the vector off the cache. 20570 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20571 SDValue Vals[4]; 20572 SDLoc dl(InputVector); 20573 20574 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { 20575 SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); 20576 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); 20577 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, 20578 DAG.getConstant(0, VecIdxTy)); 20579 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, 20580 DAG.getConstant(1, VecIdxTy)); 20581 20582 SDValue ShAmt = DAG.getConstant(32, 20583 DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); 20584 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); 20585 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, 20586 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); 20587 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); 20588 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, 20589 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); 20590 } else { 20591 // Store the value to a temporary stack slot. 20592 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 20593 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 20594 MachinePointerInfo(), false, false, 0); 20595 20596 EVT ElementType = InputVector.getValueType().getVectorElementType(); 20597 unsigned EltSize = ElementType.getSizeInBits() / 8; 20598 20599 // Replace each use (extract) with a load of the appropriate element. 20600 for (unsigned i = 0; i < 4; ++i) { 20601 uint64_t Offset = EltSize * i; 20602 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 20603 20604 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), 20605 StackPtr, OffsetVal); 20606 20607 // Load the scalar. 
20608 Vals[i] = DAG.getLoad(ElementType, dl, Ch, 20609 ScalarAddr, MachinePointerInfo(), 20610 false, false, false, 0); 20611 20612 } 20613 } 20614 20615 // Replace the extracts 20616 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 20617 UE = Uses.end(); UI != UE; ++UI) { 20618 SDNode *Extract = *UI; 20619 20620 SDValue Idx = Extract->getOperand(1); 20621 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 20622 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); 20623 } 20624 20625 // The replacement was made in place; don't return anything. 20626 return SDValue(); 20627 } 20628 20629 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. 20630 static std::pair<unsigned, bool> 20631 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, 20632 SelectionDAG &DAG, const X86Subtarget *Subtarget) { 20633 if (!VT.isVector()) 20634 return std::make_pair(0, false); 20635 20636 bool NeedSplit = false; 20637 switch (VT.getSimpleVT().SimpleTy) { 20638 default: return std::make_pair(0, false); 20639 case MVT::v4i64: 20640 case MVT::v2i64: 20641 if (!Subtarget->hasVLX()) 20642 return std::make_pair(0, false); 20643 break; 20644 case MVT::v64i8: 20645 case MVT::v32i16: 20646 if (!Subtarget->hasBWI()) 20647 return std::make_pair(0, false); 20648 break; 20649 case MVT::v16i32: 20650 case MVT::v8i64: 20651 if (!Subtarget->hasAVX512()) 20652 return std::make_pair(0, false); 20653 break; 20654 case MVT::v32i8: 20655 case MVT::v16i16: 20656 case MVT::v8i32: 20657 if (!Subtarget->hasAVX2()) 20658 NeedSplit = true; 20659 if (!Subtarget->hasAVX()) 20660 return std::make_pair(0, false); 20661 break; 20662 case MVT::v16i8: 20663 case MVT::v8i16: 20664 case MVT::v4i32: 20665 if (!Subtarget->hasSSE2()) 20666 return std::make_pair(0, false); 20667 } 20668 20669 // SSE2 has only a small subset of the operations. 20670 bool hasUnsigned = Subtarget->hasSSE41() || 20671 (Subtarget->hasSSE2() && VT == MVT::v16i8); 20672 bool hasSigned = Subtarget->hasSSE41() || 20673 (Subtarget->hasSSE2() && VT == MVT::v8i16); 20674 20675 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 20676 20677 unsigned Opc = 0; 20678 // Check for x CC y ? x : y. 20679 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 20680 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 20681 switch (CC) { 20682 default: break; 20683 case ISD::SETULT: 20684 case ISD::SETULE: 20685 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 20686 case ISD::SETUGT: 20687 case ISD::SETUGE: 20688 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 20689 case ISD::SETLT: 20690 case ISD::SETLE: 20691 Opc = hasSigned ? X86ISD::SMIN : 0; break; 20692 case ISD::SETGT: 20693 case ISD::SETGE: 20694 Opc = hasSigned ? X86ISD::SMAX : 0; break; 20695 } 20696 // Check for x CC y ? y : x -- a min/max with reversed arms. 20697 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 20698 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 20699 switch (CC) { 20700 default: break; 20701 case ISD::SETULT: 20702 case ISD::SETULE: 20703 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 20704 case ISD::SETUGT: 20705 case ISD::SETUGE: 20706 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 20707 case ISD::SETLT: 20708 case ISD::SETLE: 20709 Opc = hasSigned ? X86ISD::SMAX : 0; break; 20710 case ISD::SETGT: 20711 case ISD::SETGE: 20712 Opc = hasSigned ? 
X86ISD::SMIN : 0; break; 20713 } 20714 } 20715 20716 return std::make_pair(Opc, NeedSplit); 20717 } 20718 20719 static SDValue 20720 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, 20721 const X86Subtarget *Subtarget) { 20722 SDLoc dl(N); 20723 SDValue Cond = N->getOperand(0); 20724 SDValue LHS = N->getOperand(1); 20725 SDValue RHS = N->getOperand(2); 20726 20727 if (Cond.getOpcode() == ISD::SIGN_EXTEND) { 20728 SDValue CondSrc = Cond->getOperand(0); 20729 if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) 20730 Cond = CondSrc->getOperand(0); 20731 } 20732 20733 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) 20734 return SDValue(); 20735 20736 // A vselect where all conditions and data are constants can be optimized into 20737 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 20738 if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && 20739 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) 20740 return SDValue(); 20741 20742 unsigned MaskValue = 0; 20743 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) 20744 return SDValue(); 20745 20746 MVT VT = N->getSimpleValueType(0); 20747 unsigned NumElems = VT.getVectorNumElements(); 20748 SmallVector<int, 8> ShuffleMask(NumElems, -1); 20749 for (unsigned i = 0; i < NumElems; ++i) { 20750 // Be sure we emit undef where we can. 20751 if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) 20752 ShuffleMask[i] = -1; 20753 else 20754 ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); 20755 } 20756 20757 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20758 if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) 20759 return SDValue(); 20760 return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); 20761 } 20762 20763 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT 20764 /// nodes. 20765 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 20766 TargetLowering::DAGCombinerInfo &DCI, 20767 const X86Subtarget *Subtarget) { 20768 SDLoc DL(N); 20769 SDValue Cond = N->getOperand(0); 20770 // Get the LHS/RHS of the select. 20771 SDValue LHS = N->getOperand(1); 20772 SDValue RHS = N->getOperand(2); 20773 EVT VT = LHS.getValueType(); 20774 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20775 20776 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 20777 // instructions match the semantics of the common C idiom x<y?x:y but not 20778 // x<=y?x:y, because of how they handle negative zero (which can be 20779 // ignored in unsafe-math mode). 20780 // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 20781 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 20782 VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && 20783 (Subtarget->hasSSE2() || 20784 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 20785 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 20786 20787 unsigned Opcode = 0; 20788 // Check for x CC y ? x : y. 20789 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 20790 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 20791 switch (CC) { 20792 default: break; 20793 case ISD::SETULT: 20794 // Converting this to a min would handle NaNs incorrectly, and swapping 20795 // the operands would cause it to handle comparisons between positive 20796 // and negative zero incorrectly. 
20797 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 20798 if (!DAG.getTarget().Options.UnsafeFPMath && 20799 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 20800 break; 20801 std::swap(LHS, RHS); 20802 } 20803 Opcode = X86ISD::FMIN; 20804 break; 20805 case ISD::SETOLE: 20806 // Converting this to a min would handle comparisons between positive 20807 // and negative zero incorrectly. 20808 if (!DAG.getTarget().Options.UnsafeFPMath && 20809 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 20810 break; 20811 Opcode = X86ISD::FMIN; 20812 break; 20813 case ISD::SETULE: 20814 // Converting this to a min would handle both negative zeros and NaNs 20815 // incorrectly, but we can swap the operands to fix both. 20816 std::swap(LHS, RHS); 20817 case ISD::SETOLT: 20818 case ISD::SETLT: 20819 case ISD::SETLE: 20820 Opcode = X86ISD::FMIN; 20821 break; 20822 20823 case ISD::SETOGE: 20824 // Converting this to a max would handle comparisons between positive 20825 // and negative zero incorrectly. 20826 if (!DAG.getTarget().Options.UnsafeFPMath && 20827 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 20828 break; 20829 Opcode = X86ISD::FMAX; 20830 break; 20831 case ISD::SETUGT: 20832 // Converting this to a max would handle NaNs incorrectly, and swapping 20833 // the operands would cause it to handle comparisons between positive 20834 // and negative zero incorrectly. 20835 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 20836 if (!DAG.getTarget().Options.UnsafeFPMath && 20837 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 20838 break; 20839 std::swap(LHS, RHS); 20840 } 20841 Opcode = X86ISD::FMAX; 20842 break; 20843 case ISD::SETUGE: 20844 // Converting this to a max would handle both negative zeros and NaNs 20845 // incorrectly, but we can swap the operands to fix both. 20846 std::swap(LHS, RHS); 20847 case ISD::SETOGT: 20848 case ISD::SETGT: 20849 case ISD::SETGE: 20850 Opcode = X86ISD::FMAX; 20851 break; 20852 } 20853 // Check for x CC y ? y : x -- a min/max with reversed arms. 20854 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 20855 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 20856 switch (CC) { 20857 default: break; 20858 case ISD::SETOGE: 20859 // Converting this to a min would handle comparisons between positive 20860 // and negative zero incorrectly, and swapping the operands would 20861 // cause it to handle NaNs incorrectly. 20862 if (!DAG.getTarget().Options.UnsafeFPMath && 20863 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 20864 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 20865 break; 20866 std::swap(LHS, RHS); 20867 } 20868 Opcode = X86ISD::FMIN; 20869 break; 20870 case ISD::SETUGT: 20871 // Converting this to a min would handle NaNs incorrectly. 20872 if (!DAG.getTarget().Options.UnsafeFPMath && 20873 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 20874 break; 20875 Opcode = X86ISD::FMIN; 20876 break; 20877 case ISD::SETUGE: 20878 // Converting this to a min would handle both negative zeros and NaNs 20879 // incorrectly, but we can swap the operands to fix both. 20880 std::swap(LHS, RHS); 20881 case ISD::SETOGT: 20882 case ISD::SETGT: 20883 case ISD::SETGE: 20884 Opcode = X86ISD::FMIN; 20885 break; 20886 20887 case ISD::SETULT: 20888 // Converting this to a max would handle NaNs incorrectly. 
20889 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 20890 break; 20891 Opcode = X86ISD::FMAX; 20892 break; 20893 case ISD::SETOLE: 20894 // Converting this to a max would handle comparisons between positive 20895 // and negative zero incorrectly, and swapping the operands would 20896 // cause it to handle NaNs incorrectly. 20897 if (!DAG.getTarget().Options.UnsafeFPMath && 20898 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 20899 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 20900 break; 20901 std::swap(LHS, RHS); 20902 } 20903 Opcode = X86ISD::FMAX; 20904 break; 20905 case ISD::SETULE: 20906 // Converting this to a max would handle both negative zeros and NaNs 20907 // incorrectly, but we can swap the operands to fix both. 20908 std::swap(LHS, RHS); 20909 case ISD::SETOLT: 20910 case ISD::SETLT: 20911 case ISD::SETLE: 20912 Opcode = X86ISD::FMAX; 20913 break; 20914 } 20915 } 20916 20917 if (Opcode) 20918 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 20919 } 20920 20921 EVT CondVT = Cond.getValueType(); 20922 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && 20923 CondVT.getVectorElementType() == MVT::i1) { 20924 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper 20925 // lowering on KNL. In this case we convert it to 20926 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 20927 // The same situation for all 128 and 256-bit vectors of i8 and i16. 20928 // Since SKX these selects have a proper lowering. 20929 EVT OpVT = LHS.getValueType(); 20930 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && 20931 (OpVT.getVectorElementType() == MVT::i8 || 20932 OpVT.getVectorElementType() == MVT::i16) && 20933 !(Subtarget->hasBWI() && Subtarget->hasVLX())) { 20934 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); 20935 DCI.AddToWorklist(Cond.getNode()); 20936 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); 20937 } 20938 } 20939 // If this is a select between two integer constants, try to do some 20940 // optimizations. 20941 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 20942 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 20943 // Don't do this for crazy integer types. 20944 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 20945 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 20946 // so that TrueC (the true value) is larger than FalseC. 20947 bool NeedsCondInvert = false; 20948 20949 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 20950 // Efficiently invertible. 20951 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 20952 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 20953 isa<ConstantSDNode>(Cond.getOperand(1))))) { 20954 NeedsCondInvert = true; 20955 std::swap(TrueC, FalseC); 20956 } 20957 20958 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 20959 if (FalseC->getAPIntValue() == 0 && 20960 TrueC->getAPIntValue().isPowerOf2()) { 20961 if (NeedsCondInvert) // Invert the condition if needed. 20962 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 20963 DAG.getConstant(1, Cond.getValueType())); 20964 20965 // Zero extend the condition if needed. 20966 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 20967 20968 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 20969 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 20970 DAG.getConstant(ShAmt, MVT::i8)); 20971 } 20972 20973 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. 20974 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 20975 if (NeedsCondInvert) // Invert the condition if needed. 20976 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 20977 DAG.getConstant(1, Cond.getValueType())); 20978 20979 // Zero extend the condition if needed. 20980 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 20981 FalseC->getValueType(0), Cond); 20982 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 20983 SDValue(FalseC, 0)); 20984 } 20985 20986 // Optimize cases that will turn into an LEA instruction. This requires 20987 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 20988 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 20989 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 20990 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 20991 20992 bool isFastMultiplier = false; 20993 if (Diff < 10) { 20994 switch ((unsigned char)Diff) { 20995 default: break; 20996 case 1: // result = add base, cond 20997 case 2: // result = lea base( , cond*2) 20998 case 3: // result = lea base(cond, cond*2) 20999 case 4: // result = lea base( , cond*4) 21000 case 5: // result = lea base(cond, cond*4) 21001 case 8: // result = lea base( , cond*8) 21002 case 9: // result = lea base(cond, cond*8) 21003 isFastMultiplier = true; 21004 break; 21005 } 21006 } 21007 21008 if (isFastMultiplier) { 21009 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 21010 if (NeedsCondInvert) // Invert the condition if needed. 21011 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 21012 DAG.getConstant(1, Cond.getValueType())); 21013 21014 // Zero extend the condition if needed. 21015 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 21016 Cond); 21017 // Scale the condition by the difference. 21018 if (Diff != 1) 21019 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 21020 DAG.getConstant(Diff, Cond.getValueType())); 21021 21022 // Add the base if non-zero. 21023 if (FalseC->getAPIntValue() != 0) 21024 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 21025 SDValue(FalseC, 0)); 21026 return Cond; 21027 } 21028 } 21029 } 21030 } 21031 21032 // Canonicalize max and min: 21033 // (x > y) ? x : y -> (x >= y) ? x : y 21034 // (x < y) ? x : y -> (x <= y) ? x : y 21035 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates 21036 // the need for an extra compare 21037 // against zero. e.g. 21038 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 21039 // subl %esi, %edi 21040 // testl %edi, %edi 21041 // movl $0, %eax 21042 // cmovgl %edi, %eax 21043 // => 21044 // xorl %eax, %eax 21045 // subl %esi, $edi 21046 // cmovsl %eax, %edi 21047 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && 21048 DAG.isEqualTo(LHS, Cond.getOperand(0)) && 21049 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 21050 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 21051 switch (CC) { 21052 default: break; 21053 case ISD::SETLT: 21054 case ISD::SETGT: { 21055 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; 21056 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), 21057 Cond.getOperand(0), Cond.getOperand(1), NewCC); 21058 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); 21059 } 21060 } 21061 } 21062 21063 // Early exit check 21064 if (!TLI.isTypeLegal(VT)) 21065 return SDValue(); 21066 21067 // Match VSELECTs into subs with unsigned saturation. 
21068 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 21069 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. 21070 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || 21071 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { 21072 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 21073 21074 // Check if one of the arms of the VSELECT is a zero vector. If it's on the 21075 // left side invert the predicate to simplify logic below. 21076 SDValue Other; 21077 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 21078 Other = RHS; 21079 CC = ISD::getSetCCInverse(CC, true); 21080 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { 21081 Other = LHS; 21082 } 21083 21084 if (Other.getNode() && Other->getNumOperands() == 2 && 21085 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { 21086 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); 21087 SDValue CondRHS = Cond->getOperand(1); 21088 21089 // Look for a general sub with unsigned saturation first. 21090 // x >= y ? x-y : 0 --> subus x, y 21091 // x > y ? x-y : 0 --> subus x, y 21092 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && 21093 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) 21094 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 21095 21096 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) 21097 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { 21098 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) 21099 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) 21100 // If the RHS is a constant we have to reverse the const 21101 // canonicalization. 21102 // x > C-1 ? x+-C : 0 --> subus x, C 21103 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && 21104 CondRHSConst->getAPIntValue() == 21105 (-OpRHSConst->getAPIntValue() - 1)) 21106 return DAG.getNode( 21107 X86ISD::SUBUS, DL, VT, OpLHS, 21108 DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); 21109 21110 // Another special case: If C was a sign bit, the sub has been 21111 // canonicalized into a xor. 21112 // FIXME: Would it be better to use computeKnownBits to determine 21113 // whether it's safe to decanonicalize the xor? 21114 // x s< 0 ? x^C : 0 --> subus x, C 21115 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && 21116 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 21117 OpRHSConst->getAPIntValue().isSignBit()) 21118 // Note that we have to rebuild the RHS constant here to ensure we 21119 // don't rely on particular values of undef lanes. 21120 return DAG.getNode( 21121 X86ISD::SUBUS, DL, VT, OpLHS, 21122 DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); 21123 } 21124 } 21125 } 21126 21127 // Try to match a min/max vector operation. 
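  // e.g. (illustrative): (vselect (setcc x, y, setult), x, y) becomes
  // (X86ISD::UMIN x, y) when an unsigned vector min exists for this type;
  // matchIntegerMINMAX also reports when a 256-bit operation has to be split
  // into two 128-bit halves on AVX1-only targets.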
21128 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { 21129 std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); 21130 unsigned Opc = ret.first; 21131 bool NeedSplit = ret.second; 21132 21133 if (Opc && NeedSplit) { 21134 unsigned NumElems = VT.getVectorNumElements(); 21135 // Extract the LHS vectors 21136 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); 21137 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); 21138 21139 // Extract the RHS vectors 21140 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); 21141 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); 21142 21143 // Create min/max for each subvector 21144 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); 21145 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); 21146 21147 // Merge the result 21148 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); 21149 } else if (Opc) 21150 return DAG.getNode(Opc, DL, VT, LHS, RHS); 21151 } 21152 21153 // Simplify vector selection if condition value type matches vselect 21154 // operand type 21155 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { 21156 assert(Cond.getValueType().isVector() && 21157 "vector select expects a vector selector!"); 21158 21159 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); 21160 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); 21161 21162 // Try invert the condition if true value is not all 1s and false value 21163 // is not all 0s. 21164 if (!TValIsAllOnes && !FValIsAllZeros && 21165 // Check if the selector will be produced by CMPP*/PCMP* 21166 Cond.getOpcode() == ISD::SETCC && 21167 // Check if SETCC has already been promoted 21168 TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { 21169 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); 21170 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); 21171 21172 if (TValIsAllZeros || FValIsAllOnes) { 21173 SDValue CC = Cond.getOperand(2); 21174 ISD::CondCode NewCC = 21175 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), 21176 Cond.getOperand(0).getValueType().isInteger()); 21177 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); 21178 std::swap(LHS, RHS); 21179 TValIsAllOnes = FValIsAllOnes; 21180 FValIsAllZeros = TValIsAllZeros; 21181 } 21182 } 21183 21184 if (TValIsAllOnes || FValIsAllZeros) { 21185 SDValue Ret; 21186 21187 if (TValIsAllOnes && FValIsAllZeros) 21188 Ret = Cond; 21189 else if (TValIsAllOnes) 21190 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, 21191 DAG.getNode(ISD::BITCAST, DL, CondVT, RHS)); 21192 else if (FValIsAllZeros) 21193 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, 21194 DAG.getNode(ISD::BITCAST, DL, CondVT, LHS)); 21195 21196 return DAG.getNode(ISD::BITCAST, DL, VT, Ret); 21197 } 21198 } 21199 21200 // We should generate an X86ISD::BLENDI from a vselect if its argument 21201 // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of 21202 // constants. This specific pattern gets generated when we split a 21203 // selector for a 512 bit vector in a machine without AVX512 (but with 21204 // 256-bit vectors), during legalization: 21205 // 21206 // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) 21207 // 21208 // Iff we find this pattern and the build_vectors are built from 21209 // constants, we translate the vselect into a shuffle_vector that we 21210 // know will be matched by LowerVECTOR_SHUFFLEtoBlend. 
21211 if ((N->getOpcode() == ISD::VSELECT || 21212 N->getOpcode() == X86ISD::SHRUNKBLEND) && 21213 !DCI.isBeforeLegalize()) { 21214 SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); 21215 if (Shuffle.getNode()) 21216 return Shuffle; 21217 } 21218 21219 // If this is a *dynamic* select (non-constant condition) and we can match 21220 // this node with one of the variable blend instructions, restructure the 21221 // condition so that the blends can use the high bit of each element and use 21222 // SimplifyDemandedBits to simplify the condition operand. 21223 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 21224 !DCI.isBeforeLegalize() && 21225 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { 21226 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); 21227 21228 // Don't optimize vector selects that map to mask-registers. 21229 if (BitWidth == 1) 21230 return SDValue(); 21231 21232 // We can only handle the cases where VSELECT is directly legal on the 21233 // subtarget. We custom lower VSELECT nodes with constant conditions and 21234 // this makes it hard to see whether a dynamic VSELECT will correctly 21235 // lower, so we both check the operation's status and explicitly handle the 21236 // cases where a *dynamic* blend will fail even though a constant-condition 21237 // blend could be custom lowered. 21238 // FIXME: We should find a better way to handle this class of problems. 21239 // Potentially, we should combine constant-condition vselect nodes 21240 // pre-legalization into shuffles and not mark as many types as custom 21241 // lowered. 21242 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) 21243 return SDValue(); 21244 // FIXME: We don't support i16-element blends currently. We could and 21245 // should support them by making *all* the bits in the condition be set 21246 // rather than just the high bit and using an i8-element blend. 21247 if (VT.getScalarType() == MVT::i16) 21248 return SDValue(); 21249 // Dynamic blending was only available from SSE4.1 onward. 21250 if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41()) 21251 return SDValue(); 21252 // Byte blends are only available in AVX2 21253 if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 && 21254 !Subtarget->hasAVX2()) 21255 return SDValue(); 21256 21257 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); 21258 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); 21259 21260 APInt KnownZero, KnownOne; 21261 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 21262 DCI.isBeforeLegalizeOps()); 21263 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || 21264 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, 21265 TLO)) { 21266 // If we changed the computation somewhere in the DAG, this change 21267 // will affect all users of Cond. 21268 // Make sure it is fine and update all the nodes so that we do not 21269 // use the generic VSELECT anymore. Otherwise, we may perform 21270 // wrong optimizations as we messed up with the actual expectation 21271 // for the vector boolean values. 21272 if (Cond != TLO.Old) { 21273 // Check all uses of that condition operand to check whether it will be 21274 // consumed by non-BLEND instructions, which may depend on all bits are 21275 // set properly. 21276 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); 21277 I != E; ++I) 21278 if (I->getOpcode() != ISD::VSELECT) 21279 // TODO: Add other opcodes eventually lowered into BLEND. 
21280           return SDValue();
21281
21282       // Update all the users of the condition, before committing the change,
21283       // so that the VSELECT optimizations that expect the correct vector
21284       // boolean value will not be triggered.
21285       for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
21286            I != E; ++I)
21287         DAG.ReplaceAllUsesOfValueWith(
21288             SDValue(*I, 0),
21289             DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
21290                         Cond, I->getOperand(1), I->getOperand(2)));
21291       DCI.CommitTargetLoweringOpt(TLO);
21292       return SDValue();
21293     }
21294     // At this point, only Cond has been changed. Change the condition
21295     // just for N so that all other users keep the opportunity to be
21296     // optimized in their own way.
21297     DAG.ReplaceAllUsesOfValueWith(
21298         SDValue(N, 0),
21299         DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
21300                     TLO.New, N->getOperand(1), N->getOperand(2)));
21301     return SDValue();
21302   }
21303 }
21304
21305   return SDValue();
21306 }
21307
21308 // Check whether a boolean test is testing a boolean value generated by
21309 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
21310 // code.
21311 //
21312 // Simplify the following patterns:
21313 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
21314 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
21315 // to (Op EFLAGS Cond)
21316 //
21317 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
21318 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
21319 // to (Op EFLAGS !Cond)
21320 //
21321 // where Op could be BRCOND or CMOV.
21322 //
21323 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
21324   // Quit unless Cmp is a CMP, or a SUB whose integer result is unused.
21325   if (Cmp.getOpcode() != X86ISD::CMP &&
21326       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
21327     return SDValue();
21328
21329   // Quit if not used as a boolean value.
21330   if (CC != X86::COND_E && CC != X86::COND_NE)
21331     return SDValue();
21332
21333   // Check CMP operands. One of them should be 0 or 1 and the other should be
21334   // a SetCC or extended from it.
21335   SDValue Op1 = Cmp.getOperand(0);
21336   SDValue Op2 = Cmp.getOperand(1);
21337
21338   SDValue SetCC;
21339   const ConstantSDNode* C = nullptr;
21340   bool needOppositeCond = (CC == X86::COND_E);
21341   bool checkAgainstTrue = false; // Is it a comparison against 1?
21342
21343   if ((C = dyn_cast<ConstantSDNode>(Op1)))
21344     SetCC = Op2;
21345   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
21346     SetCC = Op1;
21347   else // Quit if neither operand is a constant.
21348     return SDValue();
21349
21350   if (C->getZExtValue() == 1) {
21351     needOppositeCond = !needOppositeCond;
21352     checkAgainstTrue = true;
21353   } else if (C->getZExtValue() != 0)
21354     // Quit if the constant is neither 0 nor 1.
21355     return SDValue();
21356
21357   bool truncatedToBoolWithAnd = false;
21358   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
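  // For example (a sketch of the common shape, not an exhaustive list), a
  // boolean produced by an X86ISD::SETCC is frequently widened and masked
  // back down before it reaches the compare:
  //   (and (zext (X86setcc cond, EFLAGS)), 1)
  // The loop below peels off those zext/trunc/and-with-1 wrappers so the
  // underlying SETCC (or SETCC_CARRY / CMOV) can be inspected directly.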
21359 while (SetCC.getOpcode() == ISD::ZERO_EXTEND || 21360 SetCC.getOpcode() == ISD::TRUNCATE || 21361 SetCC.getOpcode() == ISD::AND) { 21362 if (SetCC.getOpcode() == ISD::AND) { 21363 int OpIdx = -1; 21364 ConstantSDNode *CS; 21365 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && 21366 CS->getZExtValue() == 1) 21367 OpIdx = 1; 21368 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && 21369 CS->getZExtValue() == 1) 21370 OpIdx = 0; 21371 if (OpIdx == -1) 21372 break; 21373 SetCC = SetCC.getOperand(OpIdx); 21374 truncatedToBoolWithAnd = true; 21375 } else 21376 SetCC = SetCC.getOperand(0); 21377 } 21378 21379 switch (SetCC.getOpcode()) { 21380 case X86ISD::SETCC_CARRY: 21381 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to 21382 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, 21383 // i.e. it's a comparison against true but the result of SETCC_CARRY is not 21384 // truncated to i1 using 'and'. 21385 if (checkAgainstTrue && !truncatedToBoolWithAnd) 21386 break; 21387 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && 21388 "Invalid use of SETCC_CARRY!"); 21389 // FALL THROUGH 21390 case X86ISD::SETCC: 21391 // Set the condition code or opposite one if necessary. 21392 CC = X86::CondCode(SetCC.getConstantOperandVal(0)); 21393 if (needOppositeCond) 21394 CC = X86::GetOppositeBranchCondition(CC); 21395 return SetCC.getOperand(1); 21396 case X86ISD::CMOV: { 21397 // Check whether false/true value has canonical one, i.e. 0 or 1. 21398 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); 21399 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); 21400 // Quit if true value is not a constant. 21401 if (!TVal) 21402 return SDValue(); 21403 // Quit if false value is not a constant. 21404 if (!FVal) { 21405 SDValue Op = SetCC.getOperand(0); 21406 // Skip 'zext' or 'trunc' node. 21407 if (Op.getOpcode() == ISD::ZERO_EXTEND || 21408 Op.getOpcode() == ISD::TRUNCATE) 21409 Op = Op.getOperand(0); 21410 // A special case for rdrand/rdseed, where 0 is set if false cond is 21411 // found. 21412 if ((Op.getOpcode() != X86ISD::RDRAND && 21413 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) 21414 return SDValue(); 21415 } 21416 // Quit if false value is not the constant 0 or 1. 21417 bool FValIsFalse = true; 21418 if (FVal && FVal->getZExtValue() != 0) { 21419 if (FVal->getZExtValue() != 1) 21420 return SDValue(); 21421 // If FVal is 1, opposite cond is needed. 21422 needOppositeCond = !needOppositeCond; 21423 FValIsFalse = false; 21424 } 21425 // Quit if TVal is not the constant opposite of FVal. 21426 if (FValIsFalse && TVal->getZExtValue() != 1) 21427 return SDValue(); 21428 if (!FValIsFalse && TVal->getZExtValue() != 0) 21429 return SDValue(); 21430 CC = X86::CondCode(SetCC.getConstantOperandVal(2)); 21431 if (needOppositeCond) 21432 CC = X86::GetOppositeBranchCondition(CC); 21433 return SetCC.getOperand(3); 21434 } 21435 } 21436 21437 return SDValue(); 21438 } 21439 21440 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. 
21441 /// Match: 21442 /// (X86or (X86setcc) (X86setcc)) 21443 /// (X86cmp (and (X86setcc) (X86setcc)), 0) 21444 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, 21445 X86::CondCode &CC1, SDValue &Flags, 21446 bool &isAnd) { 21447 if (Cond->getOpcode() == X86ISD::CMP) { 21448 ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); 21449 if (!CondOp1C || !CondOp1C->isNullValue()) 21450 return false; 21451 21452 Cond = Cond->getOperand(0); 21453 } 21454 21455 isAnd = false; 21456 21457 SDValue SetCC0, SetCC1; 21458 switch (Cond->getOpcode()) { 21459 default: return false; 21460 case ISD::AND: 21461 case X86ISD::AND: 21462 isAnd = true; 21463 // fallthru 21464 case ISD::OR: 21465 case X86ISD::OR: 21466 SetCC0 = Cond->getOperand(0); 21467 SetCC1 = Cond->getOperand(1); 21468 break; 21469 }; 21470 21471 // Make sure we have SETCC nodes, using the same flags value. 21472 if (SetCC0.getOpcode() != X86ISD::SETCC || 21473 SetCC1.getOpcode() != X86ISD::SETCC || 21474 SetCC0->getOperand(1) != SetCC1->getOperand(1)) 21475 return false; 21476 21477 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); 21478 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); 21479 Flags = SetCC0->getOperand(1); 21480 return true; 21481 } 21482 21483 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 21484 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 21485 TargetLowering::DAGCombinerInfo &DCI, 21486 const X86Subtarget *Subtarget) { 21487 SDLoc DL(N); 21488 21489 // If the flag operand isn't dead, don't touch this CMOV. 21490 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 21491 return SDValue(); 21492 21493 SDValue FalseOp = N->getOperand(0); 21494 SDValue TrueOp = N->getOperand(1); 21495 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 21496 SDValue Cond = N->getOperand(3); 21497 21498 if (CC == X86::COND_E || CC == X86::COND_NE) { 21499 switch (Cond.getOpcode()) { 21500 default: break; 21501 case X86ISD::BSR: 21502 case X86ISD::BSF: 21503 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 21504 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 21505 return (CC == X86::COND_E) ? FalseOp : TrueOp; 21506 } 21507 } 21508 21509 SDValue Flags; 21510 21511 Flags = checkBoolTestSetCCCombine(Cond, CC); 21512 if (Flags.getNode() && 21513 // Extra check as FCMOV only supports a subset of X86 cond. 21514 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { 21515 SDValue Ops[] = { FalseOp, TrueOp, 21516 DAG.getConstant(CC, MVT::i8), Flags }; 21517 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); 21518 } 21519 21520 // If this is a select between two integer constants, try to do some 21521 // optimizations. Note that the operands are ordered the opposite of SELECT 21522 // operands. 21523 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 21524 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 21525 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 21526 // larger than FalseC (the false value). 21527 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 21528 CC = X86::GetOppositeBranchCondition(CC); 21529 std::swap(TrueC, FalseC); 21530 std::swap(TrueOp, FalseOp); 21531 } 21532 21533 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 21534 // This is efficient for any integer data type (including i8/i16) and 21535 // shift amount. 
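      // For example, "flag ? 8 : 0" on i32 becomes roughly (exact registers
      // depend on allocation):
      //   sete   %al            ; materialize the condition as 0 or 1
      //   movzbl %al, %eax      ; zero-extend to the result width
      //   shll   $3, %eax       ; scale 1 -> 8
      // which avoids materializing both constants and using a CMOV.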
21536 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 21537 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 21538 DAG.getConstant(CC, MVT::i8), Cond); 21539 21540 // Zero extend the condition if needed. 21541 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 21542 21543 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 21544 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 21545 DAG.getConstant(ShAmt, MVT::i8)); 21546 if (N->getNumValues() == 2) // Dead flag value? 21547 return DCI.CombineTo(N, Cond, SDValue()); 21548 return Cond; 21549 } 21550 21551 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 21552 // for any integer data type, including i8/i16. 21553 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 21554 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 21555 DAG.getConstant(CC, MVT::i8), Cond); 21556 21557 // Zero extend the condition if needed. 21558 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 21559 FalseC->getValueType(0), Cond); 21560 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 21561 SDValue(FalseC, 0)); 21562 21563 if (N->getNumValues() == 2) // Dead flag value? 21564 return DCI.CombineTo(N, Cond, SDValue()); 21565 return Cond; 21566 } 21567 21568 // Optimize cases that will turn into an LEA instruction. This requires 21569 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 21570 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 21571 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 21572 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 21573 21574 bool isFastMultiplier = false; 21575 if (Diff < 10) { 21576 switch ((unsigned char)Diff) { 21577 default: break; 21578 case 1: // result = add base, cond 21579 case 2: // result = lea base( , cond*2) 21580 case 3: // result = lea base(cond, cond*2) 21581 case 4: // result = lea base( , cond*4) 21582 case 5: // result = lea base(cond, cond*4) 21583 case 8: // result = lea base( , cond*8) 21584 case 9: // result = lea base(cond, cond*8) 21585 isFastMultiplier = true; 21586 break; 21587 } 21588 } 21589 21590 if (isFastMultiplier) { 21591 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 21592 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 21593 DAG.getConstant(CC, MVT::i8), Cond); 21594 // Zero extend the condition if needed. 21595 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 21596 Cond); 21597 // Scale the condition by the difference. 21598 if (Diff != 1) 21599 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 21600 DAG.getConstant(Diff, Cond.getValueType())); 21601 21602 // Add the base if non-zero. 21603 if (FalseC->getAPIntValue() != 0) 21604 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 21605 SDValue(FalseC, 0)); 21606 if (N->getNumValues() == 2) // Dead flag value? 21607 return DCI.CombineTo(N, Cond, SDValue()); 21608 return Cond; 21609 } 21610 } 21611 } 21612 } 21613 21614 // Handle these cases: 21615 // (select (x != c), e, c) -> select (x != c), e, x), 21616 // (select (x == c), c, e) -> select (x == c), x, e) 21617 // where the c is an integer constant, and the "select" is the combination 21618 // of CMOV and CMP. 21619 // 21620 // The rationale for this change is that the conditional-move from a constant 21621 // needs two instructions, however, conditional-move from a register needs 21622 // only one instruction. 
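  //
  // For example, assuming a constant of 7, "(select (x == 7), 7, e)" can use
  // the register holding x directly, since x is known to equal 7 on that
  // path (register names here are illustrative only):
  //   cmpl  $7, %ecx        ; one compare
  //   cmove %ecx, %eax      ; move the register, no extra "mov $7" needed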
21623 // 21624 // CAVEAT: By replacing a constant with a symbolic value, it may obscure 21625 // some instruction-combining opportunities. This opt needs to be 21626 // postponed as late as possible. 21627 // 21628 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { 21629 // the DCI.xxxx conditions are provided to postpone the optimization as 21630 // late as possible. 21631 21632 ConstantSDNode *CmpAgainst = nullptr; 21633 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && 21634 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && 21635 !isa<ConstantSDNode>(Cond.getOperand(0))) { 21636 21637 if (CC == X86::COND_NE && 21638 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { 21639 CC = X86::GetOppositeBranchCondition(CC); 21640 std::swap(TrueOp, FalseOp); 21641 } 21642 21643 if (CC == X86::COND_E && 21644 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { 21645 SDValue Ops[] = { FalseOp, Cond.getOperand(0), 21646 DAG.getConstant(CC, MVT::i8), Cond }; 21647 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); 21648 } 21649 } 21650 } 21651 21652 // Fold and/or of setcc's to double CMOV: 21653 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) 21654 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) 21655 // 21656 // This combine lets us generate: 21657 // cmovcc1 (jcc1 if we don't have CMOV) 21658 // cmovcc2 (same) 21659 // instead of: 21660 // setcc1 21661 // setcc2 21662 // and/or 21663 // cmovne (jne if we don't have CMOV) 21664 // When we can't use the CMOV instruction, it might increase branch 21665 // mispredicts. 21666 // When we can use CMOV, or when there is no mispredict, this improves 21667 // throughput and reduces register pressure. 21668 // 21669 if (CC == X86::COND_NE) { 21670 SDValue Flags; 21671 X86::CondCode CC0, CC1; 21672 bool isAndSetCC; 21673 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { 21674 if (isAndSetCC) { 21675 std::swap(FalseOp, TrueOp); 21676 CC0 = X86::GetOppositeBranchCondition(CC0); 21677 CC1 = X86::GetOppositeBranchCondition(CC1); 21678 } 21679 21680 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8), 21681 Flags}; 21682 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); 21683 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags}; 21684 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); 21685 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); 21686 return CMOV; 21687 } 21688 } 21689 21690 return SDValue(); 21691 } 21692 21693 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, 21694 const X86Subtarget *Subtarget) { 21695 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 21696 switch (IntNo) { 21697 default: return SDValue(); 21698 // SSE/AVX/AVX2 blend intrinsics. 21699 case Intrinsic::x86_avx2_pblendvb: 21700 // Don't try to simplify this intrinsic if we don't have AVX2. 21701 if (!Subtarget->hasAVX2()) 21702 return SDValue(); 21703 // FALL-THROUGH 21704 case Intrinsic::x86_avx_blendv_pd_256: 21705 case Intrinsic::x86_avx_blendv_ps_256: 21706 // Don't try to simplify this intrinsic if we don't have AVX. 
21707 if (!Subtarget->hasAVX()) 21708 return SDValue(); 21709 // FALL-THROUGH 21710 case Intrinsic::x86_sse41_blendvps: 21711 case Intrinsic::x86_sse41_blendvpd: 21712 case Intrinsic::x86_sse41_pblendvb: { 21713 SDValue Op0 = N->getOperand(1); 21714 SDValue Op1 = N->getOperand(2); 21715 SDValue Mask = N->getOperand(3); 21716 21717 // Don't try to simplify this intrinsic if we don't have SSE4.1. 21718 if (!Subtarget->hasSSE41()) 21719 return SDValue(); 21720 21721 // fold (blend A, A, Mask) -> A 21722 if (Op0 == Op1) 21723 return Op0; 21724 // fold (blend A, B, allZeros) -> A 21725 if (ISD::isBuildVectorAllZeros(Mask.getNode())) 21726 return Op0; 21727 // fold (blend A, B, allOnes) -> B 21728 if (ISD::isBuildVectorAllOnes(Mask.getNode())) 21729 return Op1; 21730 21731 // Simplify the case where the mask is a constant i32 value. 21732 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { 21733 if (C->isNullValue()) 21734 return Op0; 21735 if (C->isAllOnesValue()) 21736 return Op1; 21737 } 21738 21739 return SDValue(); 21740 } 21741 21742 // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. 21743 case Intrinsic::x86_sse2_psrai_w: 21744 case Intrinsic::x86_sse2_psrai_d: 21745 case Intrinsic::x86_avx2_psrai_w: 21746 case Intrinsic::x86_avx2_psrai_d: 21747 case Intrinsic::x86_sse2_psra_w: 21748 case Intrinsic::x86_sse2_psra_d: 21749 case Intrinsic::x86_avx2_psra_w: 21750 case Intrinsic::x86_avx2_psra_d: { 21751 SDValue Op0 = N->getOperand(1); 21752 SDValue Op1 = N->getOperand(2); 21753 EVT VT = Op0.getValueType(); 21754 assert(VT.isVector() && "Expected a vector type!"); 21755 21756 if (isa<BuildVectorSDNode>(Op1)) 21757 Op1 = Op1.getOperand(0); 21758 21759 if (!isa<ConstantSDNode>(Op1)) 21760 return SDValue(); 21761 21762 EVT SVT = VT.getVectorElementType(); 21763 unsigned SVTBits = SVT.getSizeInBits(); 21764 21765 ConstantSDNode *CND = cast<ConstantSDNode>(Op1); 21766 const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); 21767 uint64_t ShAmt = C.getZExtValue(); 21768 21769 // Don't try to convert this shift into a ISD::SRA if the shift 21770 // count is bigger than or equal to the element size. 21771 if (ShAmt >= SVTBits) 21772 return SDValue(); 21773 21774 // Trivial case: if the shift count is zero, then fold this 21775 // into the first operand. 21776 if (ShAmt == 0) 21777 return Op0; 21778 21779 // Replace this packed shift intrinsic with a target independent 21780 // shift dag node. 21781 SDValue Splat = DAG.getConstant(C, VT); 21782 return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); 21783 } 21784 } 21785 } 21786 21787 /// PerformMulCombine - Optimize a single multiply with constant into two 21788 /// in order to implement it with two cheaper instructions, e.g. 21789 /// LEA + SHL, LEA + LEA. 
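/// For example, a multiply by 45 can be decomposed as 45 = 9 * 5:
///   t = (x << 3) + x   ; x * 9  (one LEA)
///   r = (t << 2) + t   ; t * 5  (one LEA)
/// and a multiply by 24 as 24 = 3 * 8: one LEA for x * 3 followed by a
/// left shift by 3.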
21790 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 21791 TargetLowering::DAGCombinerInfo &DCI) { 21792 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 21793 return SDValue(); 21794 21795 EVT VT = N->getValueType(0); 21796 if (VT != MVT::i64 && VT != MVT::i32) 21797 return SDValue(); 21798 21799 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 21800 if (!C) 21801 return SDValue(); 21802 uint64_t MulAmt = C->getZExtValue(); 21803 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 21804 return SDValue(); 21805 21806 uint64_t MulAmt1 = 0; 21807 uint64_t MulAmt2 = 0; 21808 if ((MulAmt % 9) == 0) { 21809 MulAmt1 = 9; 21810 MulAmt2 = MulAmt / 9; 21811 } else if ((MulAmt % 5) == 0) { 21812 MulAmt1 = 5; 21813 MulAmt2 = MulAmt / 5; 21814 } else if ((MulAmt % 3) == 0) { 21815 MulAmt1 = 3; 21816 MulAmt2 = MulAmt / 3; 21817 } 21818 if (MulAmt2 && 21819 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 21820 SDLoc DL(N); 21821 21822 if (isPowerOf2_64(MulAmt2) && 21823 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 21824 // If second multiplifer is pow2, issue it first. We want the multiply by 21825 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 21826 // is an add. 21827 std::swap(MulAmt1, MulAmt2); 21828 21829 SDValue NewMul; 21830 if (isPowerOf2_64(MulAmt1)) 21831 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 21832 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 21833 else 21834 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 21835 DAG.getConstant(MulAmt1, VT)); 21836 21837 if (isPowerOf2_64(MulAmt2)) 21838 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 21839 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 21840 else 21841 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 21842 DAG.getConstant(MulAmt2, VT)); 21843 21844 // Do not add new nodes to DAG combiner worklist. 21845 DCI.CombineTo(N, NewMul, false); 21846 } 21847 return SDValue(); 21848 } 21849 21850 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 21851 SDValue N0 = N->getOperand(0); 21852 SDValue N1 = N->getOperand(1); 21853 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 21854 EVT VT = N0.getValueType(); 21855 21856 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 21857 // since the result of setcc_c is all zero's or all ones. 21858 if (VT.isInteger() && !VT.isVector() && 21859 N1C && N0.getOpcode() == ISD::AND && 21860 N0.getOperand(1).getOpcode() == ISD::Constant) { 21861 SDValue N00 = N0.getOperand(0); 21862 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 21863 ((N00.getOpcode() == ISD::ANY_EXTEND || 21864 N00.getOpcode() == ISD::ZERO_EXTEND) && 21865 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 21866 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 21867 APInt ShAmt = N1C->getAPIntValue(); 21868 Mask = Mask.shl(ShAmt); 21869 if (Mask != 0) 21870 return DAG.getNode(ISD::AND, SDLoc(N), VT, 21871 N00, DAG.getConstant(Mask, VT)); 21872 } 21873 } 21874 21875 // Hardware support for vector shifts is sparse which makes us scalarize the 21876 // vector operations in many cases. Also, on sandybridge ADD is faster than 21877 // shl. 21878 // (shl V, 1) -> add V,V 21879 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) 21880 if (auto *N1SplatC = N1BV->getConstantSplatNode()) { 21881 assert(N0.getValueType().isVector() && "Invalid vector shift type"); 21882 // We shift all of the values by one. 
In many cases we do not have 21883 // hardware support for this operation. This is better expressed as an ADD 21884 // of two values. 21885 if (N1SplatC->getZExtValue() == 1) 21886 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); 21887 } 21888 21889 return SDValue(); 21890 } 21891 21892 /// \brief Returns a vector of 0s if the node in input is a vector logical 21893 /// shift by a constant amount which is known to be bigger than or equal 21894 /// to the vector element size in bits. 21895 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, 21896 const X86Subtarget *Subtarget) { 21897 EVT VT = N->getValueType(0); 21898 21899 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 21900 (!Subtarget->hasInt256() || 21901 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 21902 return SDValue(); 21903 21904 SDValue Amt = N->getOperand(1); 21905 SDLoc DL(N); 21906 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) 21907 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { 21908 APInt ShiftAmt = AmtSplat->getAPIntValue(); 21909 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); 21910 21911 // SSE2/AVX2 logical shifts always return a vector of 0s 21912 // if the shift amount is bigger than or equal to 21913 // the element size. The constant shift amount will be 21914 // encoded as a 8-bit immediate. 21915 if (ShiftAmt.trunc(8).uge(MaxAmount)) 21916 return getZeroVector(VT, Subtarget, DAG, DL); 21917 } 21918 21919 return SDValue(); 21920 } 21921 21922 /// PerformShiftCombine - Combine shifts. 21923 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 21924 TargetLowering::DAGCombinerInfo &DCI, 21925 const X86Subtarget *Subtarget) { 21926 if (N->getOpcode() == ISD::SHL) { 21927 SDValue V = PerformSHLCombine(N, DAG); 21928 if (V.getNode()) return V; 21929 } 21930 21931 if (N->getOpcode() != ISD::SRA) { 21932 // Try to fold this logical shift into a zero vector. 21933 SDValue V = performShiftToAllZeros(N, DAG, Subtarget); 21934 if (V.getNode()) return V; 21935 } 21936 21937 return SDValue(); 21938 } 21939 21940 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 21941 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS 21942 // and friends. Likewise for OR -> CMPNEQSS. 21943 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 21944 TargetLowering::DAGCombinerInfo &DCI, 21945 const X86Subtarget *Subtarget) { 21946 unsigned opcode; 21947 21948 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 21949 // we're requiring SSE2 for both. 21950 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 21951 SDValue N0 = N->getOperand(0); 21952 SDValue N1 = N->getOperand(1); 21953 SDValue CMP0 = N0->getOperand(1); 21954 SDValue CMP1 = N1->getOperand(1); 21955 SDLoc DL(N); 21956 21957 // The SETCCs should both refer to the same CMP. 
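    // For example, IR such as
    //   %c = fcmp oeq float %a, %b
    // is lowered to a UCOMISS/CMP followed by two setccs, because ordered
    // equality needs ZF = 1 *and* PF = 0 (to rule out NaNs). When the
    // combined value is only used as a 0/1 result rather than as EFLAGS,
    // the pair can be folded into a single CMPEQSS/CMPNEQSS that produces
    // an all-ones or all-zeroes mask, from which one bit is extracted below.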
21958 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 21959 return SDValue(); 21960 21961 SDValue CMP00 = CMP0->getOperand(0); 21962 SDValue CMP01 = CMP0->getOperand(1); 21963 EVT VT = CMP00.getValueType(); 21964 21965 if (VT == MVT::f32 || VT == MVT::f64) { 21966 bool ExpectingFlags = false; 21967 // Check for any users that want flags: 21968 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 21969 !ExpectingFlags && UI != UE; ++UI) 21970 switch (UI->getOpcode()) { 21971 default: 21972 case ISD::BR_CC: 21973 case ISD::BRCOND: 21974 case ISD::SELECT: 21975 ExpectingFlags = true; 21976 break; 21977 case ISD::CopyToReg: 21978 case ISD::SIGN_EXTEND: 21979 case ISD::ZERO_EXTEND: 21980 case ISD::ANY_EXTEND: 21981 break; 21982 } 21983 21984 if (!ExpectingFlags) { 21985 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 21986 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 21987 21988 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 21989 X86::CondCode tmp = cc0; 21990 cc0 = cc1; 21991 cc1 = tmp; 21992 } 21993 21994 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 21995 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 21996 // FIXME: need symbolic constants for these magic numbers. 21997 // See X86ATTInstPrinter.cpp:printSSECC(). 21998 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 21999 if (Subtarget->hasAVX512()) { 22000 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, 22001 CMP01, DAG.getConstant(x86cc, MVT::i8)); 22002 if (N->getValueType(0) != MVT::i1) 22003 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), 22004 FSetCC); 22005 return FSetCC; 22006 } 22007 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, 22008 CMP00.getValueType(), CMP00, CMP01, 22009 DAG.getConstant(x86cc, MVT::i8)); 22010 22011 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 22012 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; 22013 22014 if (is64BitFP && !Subtarget->is64Bit()) { 22015 // On a 32-bit target, we cannot bitcast the 64-bit float to a 22016 // 64-bit integer, since that's not a legal type. Since 22017 // OnesOrZeroesF is all ones of all zeroes, we don't need all the 22018 // bits, but can do this little dance to extract the lowest 32 bits 22019 // and work with those going forward. 22020 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, 22021 OnesOrZeroesF); 22022 SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, 22023 Vector64); 22024 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, 22025 Vector32, DAG.getIntPtrConstant(0)); 22026 IntVT = MVT::i32; 22027 } 22028 22029 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF); 22030 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, 22031 DAG.getConstant(1, IntVT)); 22032 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 22033 return OneBitOfTruth; 22034 } 22035 } 22036 } 22037 } 22038 return SDValue(); 22039 } 22040 22041 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 22042 /// so it can be folded inside ANDNP. 22043 static bool CanFoldXORWithAllOnes(const SDNode *N) { 22044 EVT VT = N->getValueType(0); 22045 22046 // Match direct AllOnes for 128 and 256-bit vectors 22047 if (ISD::isBuildVectorAllOnes(N)) 22048 return true; 22049 22050 // Look through a bit convert. 
22051 if (N->getOpcode() == ISD::BITCAST) 22052 N = N->getOperand(0).getNode(); 22053 22054 // Sometimes the operand may come from a insert_subvector building a 256-bit 22055 // allones vector 22056 if (VT.is256BitVector() && 22057 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 22058 SDValue V1 = N->getOperand(0); 22059 SDValue V2 = N->getOperand(1); 22060 22061 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 22062 V1.getOperand(0).getOpcode() == ISD::UNDEF && 22063 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 22064 ISD::isBuildVectorAllOnes(V2.getNode())) 22065 return true; 22066 } 22067 22068 return false; 22069 } 22070 22071 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized 22072 // register. In most cases we actually compare or select YMM-sized registers 22073 // and mixing the two types creates horrible code. This method optimizes 22074 // some of the transition sequences. 22075 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, 22076 TargetLowering::DAGCombinerInfo &DCI, 22077 const X86Subtarget *Subtarget) { 22078 EVT VT = N->getValueType(0); 22079 if (!VT.is256BitVector()) 22080 return SDValue(); 22081 22082 assert((N->getOpcode() == ISD::ANY_EXTEND || 22083 N->getOpcode() == ISD::ZERO_EXTEND || 22084 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); 22085 22086 SDValue Narrow = N->getOperand(0); 22087 EVT NarrowVT = Narrow->getValueType(0); 22088 if (!NarrowVT.is128BitVector()) 22089 return SDValue(); 22090 22091 if (Narrow->getOpcode() != ISD::XOR && 22092 Narrow->getOpcode() != ISD::AND && 22093 Narrow->getOpcode() != ISD::OR) 22094 return SDValue(); 22095 22096 SDValue N0 = Narrow->getOperand(0); 22097 SDValue N1 = Narrow->getOperand(1); 22098 SDLoc DL(Narrow); 22099 22100 // The Left side has to be a trunc. 22101 if (N0.getOpcode() != ISD::TRUNCATE) 22102 return SDValue(); 22103 22104 // The type of the truncated inputs. 22105 EVT WideVT = N0->getOperand(0)->getValueType(0); 22106 if (WideVT != VT) 22107 return SDValue(); 22108 22109 // The right side has to be a 'trunc' or a constant vector. 22110 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; 22111 ConstantSDNode *RHSConstSplat = nullptr; 22112 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) 22113 RHSConstSplat = RHSBV->getConstantSplatNode(); 22114 if (!RHSTrunc && !RHSConstSplat) 22115 return SDValue(); 22116 22117 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22118 22119 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) 22120 return SDValue(); 22121 22122 // Set N0 and N1 to hold the inputs to the new wide operation. 22123 N0 = N0->getOperand(0); 22124 if (RHSConstSplat) { 22125 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), 22126 SDValue(RHSConstSplat, 0)); 22127 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); 22128 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); 22129 } else if (RHSTrunc) { 22130 N1 = N1->getOperand(0); 22131 } 22132 22133 // Generate the wide operation. 
22134 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); 22135 unsigned Opcode = N->getOpcode(); 22136 switch (Opcode) { 22137 case ISD::ANY_EXTEND: 22138 return Op; 22139 case ISD::ZERO_EXTEND: { 22140 unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); 22141 APInt Mask = APInt::getAllOnesValue(InBits); 22142 Mask = Mask.zext(VT.getScalarType().getSizeInBits()); 22143 return DAG.getNode(ISD::AND, DL, VT, 22144 Op, DAG.getConstant(Mask, VT)); 22145 } 22146 case ISD::SIGN_EXTEND: 22147 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, 22148 Op, DAG.getValueType(NarrowVT)); 22149 default: 22150 llvm_unreachable("Unexpected opcode"); 22151 } 22152 } 22153 22154 static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, 22155 TargetLowering::DAGCombinerInfo &DCI, 22156 const X86Subtarget *Subtarget) { 22157 SDValue N0 = N->getOperand(0); 22158 SDValue N1 = N->getOperand(1); 22159 SDLoc DL(N); 22160 22161 // A vector zext_in_reg may be represented as a shuffle, 22162 // feeding into a bitcast (this represents anyext) feeding into 22163 // an and with a mask. 22164 // We'd like to try to combine that into a shuffle with zero 22165 // plus a bitcast, removing the and. 22166 if (N0.getOpcode() != ISD::BITCAST || 22167 N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) 22168 return SDValue(); 22169 22170 // The other side of the AND should be a splat of 2^C, where C 22171 // is the number of bits in the source type. 22172 if (N1.getOpcode() == ISD::BITCAST) 22173 N1 = N1.getOperand(0); 22174 if (N1.getOpcode() != ISD::BUILD_VECTOR) 22175 return SDValue(); 22176 BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); 22177 22178 ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); 22179 EVT SrcType = Shuffle->getValueType(0); 22180 22181 // We expect a single-source shuffle 22182 if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) 22183 return SDValue(); 22184 22185 unsigned SrcSize = SrcType.getScalarSizeInBits(); 22186 22187 APInt SplatValue, SplatUndef; 22188 unsigned SplatBitSize; 22189 bool HasAnyUndefs; 22190 if (!Vector->isConstantSplat(SplatValue, SplatUndef, 22191 SplatBitSize, HasAnyUndefs)) 22192 return SDValue(); 22193 22194 unsigned ResSize = N1.getValueType().getScalarSizeInBits(); 22195 // Make sure the splat matches the mask we expect 22196 if (SplatBitSize > ResSize || 22197 (SplatValue + 1).exactLogBase2() != (int)SrcSize) 22198 return SDValue(); 22199 22200 // Make sure the input and output size make sense 22201 if (SrcSize >= ResSize || ResSize % SrcSize) 22202 return SDValue(); 22203 22204 // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> 22205 // The number of u's between each two values depends on the ratio between 22206 // the source and dest type. 22207 unsigned ZextRatio = ResSize / SrcSize; 22208 bool IsZext = true; 22209 for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { 22210 if (i % ZextRatio) { 22211 if (Shuffle->getMaskElt(i) > 0) { 22212 // Expected undef 22213 IsZext = false; 22214 break; 22215 } 22216 } else { 22217 if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { 22218 // Expected element number 22219 IsZext = false; 22220 break; 22221 } 22222 } 22223 } 22224 22225 if (!IsZext) 22226 return SDValue(); 22227 22228 // Ok, perform the transformation - replace the shuffle with 22229 // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero 22230 // (instead of undef) where the k elements come from the zero vector. 
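  // For example, when 8-bit elements are zero-extended in-register into the
  // 32-bit lanes of a 128-bit vector (v16i8 source viewed as v4i32), we have
  // SrcSize = 8, ResSize = 32, ZextRatio = 4, and the mask built below is
  //   <0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, 3, 16, 16, 16>
  // where indices >= 16 pick elements from the second shuffle operand, the
  // zero vector.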
22231 SmallVector<int, 8> Mask; 22232 unsigned NumElems = SrcType.getVectorNumElements(); 22233 for (unsigned i = 0; i < NumElems; ++i) 22234 if (i % ZextRatio) 22235 Mask.push_back(NumElems); 22236 else 22237 Mask.push_back(i / ZextRatio); 22238 22239 SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, 22240 Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask); 22241 return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle); 22242 } 22243 22244 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 22245 TargetLowering::DAGCombinerInfo &DCI, 22246 const X86Subtarget *Subtarget) { 22247 if (DCI.isBeforeLegalizeOps()) 22248 return SDValue(); 22249 22250 if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) 22251 return Zext; 22252 22253 if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) 22254 return R; 22255 22256 EVT VT = N->getValueType(0); 22257 SDValue N0 = N->getOperand(0); 22258 SDValue N1 = N->getOperand(1); 22259 SDLoc DL(N); 22260 22261 // Create BEXTR instructions 22262 // BEXTR is ((X >> imm) & (2**size-1)) 22263 if (VT == MVT::i32 || VT == MVT::i64) { 22264 // Check for BEXTR. 22265 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && 22266 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { 22267 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); 22268 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 22269 if (MaskNode && ShiftNode) { 22270 uint64_t Mask = MaskNode->getZExtValue(); 22271 uint64_t Shift = ShiftNode->getZExtValue(); 22272 if (isMask_64(Mask)) { 22273 uint64_t MaskSize = countPopulation(Mask); 22274 if (Shift + MaskSize <= VT.getSizeInBits()) 22275 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), 22276 DAG.getConstant(Shift | (MaskSize << 8), VT)); 22277 } 22278 } 22279 } // BEXTR 22280 22281 return SDValue(); 22282 } 22283 22284 // Want to form ANDNP nodes: 22285 // 1) In the hopes of then easily combining them with OR and AND nodes 22286 // to form PBLEND/PSIGN. 
22287 // 2) To match ANDN packed intrinsics 22288 if (VT != MVT::v2i64 && VT != MVT::v4i64) 22289 return SDValue(); 22290 22291 // Check LHS for vnot 22292 if (N0.getOpcode() == ISD::XOR && 22293 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 22294 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 22295 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 22296 22297 // Check RHS for vnot 22298 if (N1.getOpcode() == ISD::XOR && 22299 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 22300 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 22301 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 22302 22303 return SDValue(); 22304 } 22305 22306 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 22307 TargetLowering::DAGCombinerInfo &DCI, 22308 const X86Subtarget *Subtarget) { 22309 if (DCI.isBeforeLegalizeOps()) 22310 return SDValue(); 22311 22312 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 22313 if (R.getNode()) 22314 return R; 22315 22316 SDValue N0 = N->getOperand(0); 22317 SDValue N1 = N->getOperand(1); 22318 EVT VT = N->getValueType(0); 22319 22320 // look for psign/blend 22321 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 22322 if (!Subtarget->hasSSSE3() || 22323 (VT == MVT::v4i64 && !Subtarget->hasInt256())) 22324 return SDValue(); 22325 22326 // Canonicalize pandn to RHS 22327 if (N0.getOpcode() == X86ISD::ANDNP) 22328 std::swap(N0, N1); 22329 // or (and (m, y), (pandn m, x)) 22330 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 22331 SDValue Mask = N1.getOperand(0); 22332 SDValue X = N1.getOperand(1); 22333 SDValue Y; 22334 if (N0.getOperand(0) == Mask) 22335 Y = N0.getOperand(1); 22336 if (N0.getOperand(1) == Mask) 22337 Y = N0.getOperand(0); 22338 22339 // Check to see if the mask appeared in both the AND and ANDNP and 22340 if (!Y.getNode()) 22341 return SDValue(); 22342 22343 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 22344 // Look through mask bitcast. 22345 if (Mask.getOpcode() == ISD::BITCAST) 22346 Mask = Mask.getOperand(0); 22347 if (X.getOpcode() == ISD::BITCAST) 22348 X = X.getOperand(0); 22349 if (Y.getOpcode() == ISD::BITCAST) 22350 Y = Y.getOperand(0); 22351 22352 EVT MaskVT = Mask.getValueType(); 22353 22354 // Validate that the Mask operand is a vector sra node. 22355 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 22356 // there is no psrai.b 22357 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 22358 unsigned SraAmt = ~0; 22359 if (Mask.getOpcode() == ISD::SRA) { 22360 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) 22361 if (auto *AmtConst = AmtBV->getConstantSplatNode()) 22362 SraAmt = AmtConst->getZExtValue(); 22363 } else if (Mask.getOpcode() == X86ISD::VSRAI) { 22364 SDValue SraC = Mask.getOperand(1); 22365 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 22366 } 22367 if ((SraAmt + 1) != EltBits) 22368 return SDValue(); 22369 22370 SDLoc DL(N); 22371 22372 // Now we know we at least have a plendvb with the mask val. See if 22373 // we can form a psignb/w/d. 
22374 // psign = x.type == y.type == mask.type && y = sub(0, x); 22375 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 22376 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 22377 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 22378 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 22379 "Unsupported VT for PSIGN"); 22380 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); 22381 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 22382 } 22383 // PBLENDVB only available on SSE 4.1 22384 if (!Subtarget->hasSSE41()) 22385 return SDValue(); 22386 22387 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 22388 22389 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 22390 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 22391 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 22392 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 22393 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 22394 } 22395 } 22396 22397 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 22398 return SDValue(); 22399 22400 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 22401 MachineFunction &MF = DAG.getMachineFunction(); 22402 bool OptForSize = 22403 MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); 22404 22405 // SHLD/SHRD instructions have lower register pressure, but on some 22406 // platforms they have higher latency than the equivalent 22407 // series of shifts/or that would otherwise be generated. 22408 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions 22409 // have higher latencies and we are not optimizing for size. 22410 if (!OptForSize && Subtarget->isSHLDSlow()) 22411 return SDValue(); 22412 22413 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 22414 std::swap(N0, N1); 22415 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 22416 return SDValue(); 22417 if (!N0.hasOneUse() || !N1.hasOneUse()) 22418 return SDValue(); 22419 22420 SDValue ShAmt0 = N0.getOperand(1); 22421 if (ShAmt0.getValueType() != MVT::i8) 22422 return SDValue(); 22423 SDValue ShAmt1 = N1.getOperand(1); 22424 if (ShAmt1.getValueType() != MVT::i8) 22425 return SDValue(); 22426 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 22427 ShAmt0 = ShAmt0.getOperand(0); 22428 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 22429 ShAmt1 = ShAmt1.getOperand(0); 22430 22431 SDLoc DL(N); 22432 unsigned Opc = X86ISD::SHLD; 22433 SDValue Op0 = N0.getOperand(0); 22434 SDValue Op1 = N1.getOperand(0); 22435 if (ShAmt0.getOpcode() == ISD::SUB) { 22436 Opc = X86ISD::SHRD; 22437 std::swap(Op0, Op1); 22438 std::swap(ShAmt0, ShAmt1); 22439 } 22440 22441 unsigned Bits = VT.getSizeInBits(); 22442 if (ShAmt1.getOpcode() == ISD::SUB) { 22443 SDValue Sum = ShAmt1.getOperand(0); 22444 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 22445 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 22446 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 22447 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 22448 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 22449 return DAG.getNode(Opc, DL, VT, 22450 Op0, Op1, 22451 DAG.getNode(ISD::TRUNCATE, DL, 22452 MVT::i8, ShAmt0)); 22453 } 22454 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 22455 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 22456 if (ShAmt0C && 22457 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 22458 return DAG.getNode(Opc, DL, VT, 22459 N0.getOperand(0), N1.getOperand(0), 22460 DAG.getNode(ISD::TRUNCATE, DL, 22461 MVT::i8, ShAmt0)); 
22462 } 22463 22464 return SDValue(); 22465 } 22466 22467 // Generate NEG and CMOV for integer abs. 22468 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 22469 EVT VT = N->getValueType(0); 22470 22471 // Since X86 does not have CMOV for 8-bit integer, we don't convert 22472 // 8-bit integer abs to NEG and CMOV. 22473 if (VT.isInteger() && VT.getSizeInBits() == 8) 22474 return SDValue(); 22475 22476 SDValue N0 = N->getOperand(0); 22477 SDValue N1 = N->getOperand(1); 22478 SDLoc DL(N); 22479 22480 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 22481 // and change it to SUB and CMOV. 22482 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 22483 N0.getOpcode() == ISD::ADD && 22484 N0.getOperand(1) == N1 && 22485 N1.getOpcode() == ISD::SRA && 22486 N1.getOperand(0) == N0.getOperand(0)) 22487 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 22488 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 22489 // Generate SUB & CMOV. 22490 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 22491 DAG.getConstant(0, VT), N0.getOperand(0)); 22492 22493 SDValue Ops[] = { N0.getOperand(0), Neg, 22494 DAG.getConstant(X86::COND_GE, MVT::i8), 22495 SDValue(Neg.getNode(), 1) }; 22496 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); 22497 } 22498 return SDValue(); 22499 } 22500 22501 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 22502 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 22503 TargetLowering::DAGCombinerInfo &DCI, 22504 const X86Subtarget *Subtarget) { 22505 if (DCI.isBeforeLegalizeOps()) 22506 return SDValue(); 22507 22508 if (Subtarget->hasCMov()) { 22509 SDValue RV = performIntegerAbsCombine(N, DAG); 22510 if (RV.getNode()) 22511 return RV; 22512 } 22513 22514 return SDValue(); 22515 } 22516 22517 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 22518 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 22519 TargetLowering::DAGCombinerInfo &DCI, 22520 const X86Subtarget *Subtarget) { 22521 LoadSDNode *Ld = cast<LoadSDNode>(N); 22522 EVT RegVT = Ld->getValueType(0); 22523 EVT MemVT = Ld->getMemoryVT(); 22524 SDLoc dl(Ld); 22525 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22526 22527 // For chips with slow 32-byte unaligned loads, break the 32-byte operation 22528 // into two 16-byte operations. 
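  // For example, an unaligned 32-byte (v8f32) load on such a subtarget ends
  // up as two 16-byte loads that are stitched back together, roughly
  //   vmovups     (%rax), %xmm0
  //   vinsertf128 $1, 16(%rax), %ymm0, %ymm0
  // (addressing shown only for illustration) instead of a single unaligned
  // 32-byte vmovups.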
22529 ISD::LoadExtType Ext = Ld->getExtensionType(); 22530 unsigned Alignment = Ld->getAlignment(); 22531 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; 22532 if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && 22533 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { 22534 unsigned NumElems = RegVT.getVectorNumElements(); 22535 if (NumElems < 2) 22536 return SDValue(); 22537 22538 SDValue Ptr = Ld->getBasePtr(); 22539 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); 22540 22541 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 22542 NumElems/2); 22543 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 22544 Ld->getPointerInfo(), Ld->isVolatile(), 22545 Ld->isNonTemporal(), Ld->isInvariant(), 22546 Alignment); 22547 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 22548 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 22549 Ld->getPointerInfo(), Ld->isVolatile(), 22550 Ld->isNonTemporal(), Ld->isInvariant(), 22551 std::min(16U, Alignment)); 22552 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 22553 Load1.getValue(1), 22554 Load2.getValue(1)); 22555 22556 SDValue NewVec = DAG.getUNDEF(RegVT); 22557 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); 22558 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); 22559 return DCI.CombineTo(N, NewVec, TF, true); 22560 } 22561 22562 return SDValue(); 22563 } 22564 22565 /// PerformMLOADCombine - Resolve extending loads 22566 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, 22567 TargetLowering::DAGCombinerInfo &DCI, 22568 const X86Subtarget *Subtarget) { 22569 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); 22570 if (Mld->getExtensionType() != ISD::SEXTLOAD) 22571 return SDValue(); 22572 22573 EVT VT = Mld->getValueType(0); 22574 unsigned NumElems = VT.getVectorNumElements(); 22575 EVT LdVT = Mld->getMemoryVT(); 22576 SDLoc dl(Mld); 22577 22578 assert(LdVT != VT && "Cannot extend to the same type"); 22579 unsigned ToSz = VT.getVectorElementType().getSizeInBits(); 22580 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); 22581 // From, To sizes and ElemCount must be pow of two 22582 assert (isPowerOf2_32(NumElems * FromSz * ToSz) && 22583 "Unexpected size for extending masked load"); 22584 22585 unsigned SizeRatio = ToSz / FromSz; 22586 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); 22587 22588 // Create a type on which we perform the shuffle 22589 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 22590 LdVT.getScalarType(), NumElems*SizeRatio); 22591 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 22592 22593 // Convert Src0 value 22594 SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0()); 22595 if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { 22596 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 22597 for (unsigned i = 0; i != NumElems; ++i) 22598 ShuffleVec[i] = i * SizeRatio; 22599 22600 // Can't shuffle using an illegal type. 
22601 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) 22602 && "WideVecVT should be legal"); 22603 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, 22604 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); 22605 } 22606 // Prepare the new mask 22607 SDValue NewMask; 22608 SDValue Mask = Mld->getMask(); 22609 if (Mask.getValueType() == VT) { 22610 // Mask and original value have the same type 22611 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); 22612 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 22613 for (unsigned i = 0; i != NumElems; ++i) 22614 ShuffleVec[i] = i * SizeRatio; 22615 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) 22616 ShuffleVec[i] = NumElems*SizeRatio; 22617 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, 22618 DAG.getConstant(0, WideVecVT), 22619 &ShuffleVec[0]); 22620 } 22621 else { 22622 assert(Mask.getValueType().getVectorElementType() == MVT::i1); 22623 unsigned WidenNumElts = NumElems*SizeRatio; 22624 unsigned MaskNumElts = VT.getVectorNumElements(); 22625 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 22626 WidenNumElts); 22627 22628 unsigned NumConcat = WidenNumElts / MaskNumElts; 22629 SmallVector<SDValue, 16> Ops(NumConcat); 22630 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); 22631 Ops[0] = Mask; 22632 for (unsigned i = 1; i != NumConcat; ++i) 22633 Ops[i] = ZeroVal; 22634 22635 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); 22636 } 22637 22638 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), 22639 Mld->getBasePtr(), NewMask, WideSrc0, 22640 Mld->getMemoryVT(), Mld->getMemOperand(), 22641 ISD::NON_EXTLOAD); 22642 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); 22643 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); 22644 22645 } 22646 /// PerformMSTORECombine - Resolve truncating stores 22647 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, 22648 const X86Subtarget *Subtarget) { 22649 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); 22650 if (!Mst->isTruncatingStore()) 22651 return SDValue(); 22652 22653 EVT VT = Mst->getValue().getValueType(); 22654 unsigned NumElems = VT.getVectorNumElements(); 22655 EVT StVT = Mst->getMemoryVT(); 22656 SDLoc dl(Mst); 22657 22658 assert(StVT != VT && "Cannot truncate to the same type"); 22659 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 22660 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 22661 22662 // From, To sizes and ElemCount must be pow of two 22663 assert (isPowerOf2_32(NumElems * FromSz * ToSz) && 22664 "Unexpected size for truncating masked store"); 22665 // We are going to use the original vector elt for storing. 22666 // Accumulated smaller vector elements must be a multiple of the store size. 22667 assert (((NumElems * FromSz) % ToSz) == 0 && 22668 "Unexpected ratio for truncating masked store"); 22669 22670 unsigned SizeRatio = FromSz / ToSz; 22671 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 22672 22673 // Create a type on which we perform the shuffle 22674 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 22675 StVT.getScalarType(), NumElems*SizeRatio); 22676 22677 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 22678 22679 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue()); 22680 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 22681 for (unsigned i = 0; i != NumElems; ++i) 22682 ShuffleVec[i] = i * SizeRatio; 22683 22684 // Can't shuffle using an illegal type. 
22685 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) 22686 && "WideVecVT should be legal"); 22687 22688 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 22689 DAG.getUNDEF(WideVecVT), 22690 &ShuffleVec[0]); 22691 22692 SDValue NewMask; 22693 SDValue Mask = Mst->getMask(); 22694 if (Mask.getValueType() == VT) { 22695 // Mask and original value have the same type 22696 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); 22697 for (unsigned i = 0; i != NumElems; ++i) 22698 ShuffleVec[i] = i * SizeRatio; 22699 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) 22700 ShuffleVec[i] = NumElems*SizeRatio; 22701 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, 22702 DAG.getConstant(0, WideVecVT), 22703 &ShuffleVec[0]); 22704 } 22705 else { 22706 assert(Mask.getValueType().getVectorElementType() == MVT::i1); 22707 unsigned WidenNumElts = NumElems*SizeRatio; 22708 unsigned MaskNumElts = VT.getVectorNumElements(); 22709 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 22710 WidenNumElts); 22711 22712 unsigned NumConcat = WidenNumElts / MaskNumElts; 22713 SmallVector<SDValue, 16> Ops(NumConcat); 22714 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); 22715 Ops[0] = Mask; 22716 for (unsigned i = 1; i != NumConcat; ++i) 22717 Ops[i] = ZeroVal; 22718 22719 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); 22720 } 22721 22722 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), 22723 NewMask, StVT, Mst->getMemOperand(), false); 22724 } 22725 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 22726 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 22727 const X86Subtarget *Subtarget) { 22728 StoreSDNode *St = cast<StoreSDNode>(N); 22729 EVT VT = St->getValue().getValueType(); 22730 EVT StVT = St->getMemoryVT(); 22731 SDLoc dl(St); 22732 SDValue StoredVal = St->getOperand(1); 22733 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22734 22735 // If we are saving a concatenation of two XMM registers and 32-byte stores 22736 // are slow, such as on Sandy Bridge, perform two 16-byte stores. 22737 unsigned Alignment = St->getAlignment(); 22738 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; 22739 if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && 22740 StVT == VT && !IsAligned) { 22741 unsigned NumElems = VT.getVectorNumElements(); 22742 if (NumElems < 2) 22743 return SDValue(); 22744 22745 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); 22746 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); 22747 22748 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 22749 SDValue Ptr0 = St->getBasePtr(); 22750 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 22751 22752 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 22753 St->getPointerInfo(), St->isVolatile(), 22754 St->isNonTemporal(), Alignment); 22755 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 22756 St->getPointerInfo(), St->isVolatile(), 22757 St->isNonTemporal(), 22758 std::min(16U, Alignment)); 22759 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 22760 } 22761 22762 // Optimize trunc store (of multiple scalars) to shuffle and store. 22763 // First, pack all of the elements in one place. Next, store to memory 22764 // in fewer chunks. 
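  // For example, a truncating store of v8i16 to v8i8 first shuffles the
  // sixteen underlying bytes so the low byte of each element lands at the
  // bottom of the register (mask <0, 2, 4, 6, 8, 10, 12, 14, u, ...>), and
  // the packed result is then written with a single 64-bit store (i64, or
  // f64 on 32-bit targets) instead of eight separate byte stores.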
22765 if (St->isTruncatingStore() && VT.isVector()) { 22766 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22767 unsigned NumElems = VT.getVectorNumElements(); 22768 assert(StVT != VT && "Cannot truncate to the same type"); 22769 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 22770 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 22771 22772 // From, To sizes and ElemCount must be pow of two 22773 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 22774 // We are going to use the original vector elt for storing. 22775 // Accumulated smaller vector elements must be a multiple of the store size. 22776 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 22777 22778 unsigned SizeRatio = FromSz / ToSz; 22779 22780 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 22781 22782 // Create a type on which we perform the shuffle 22783 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 22784 StVT.getScalarType(), NumElems*SizeRatio); 22785 22786 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 22787 22788 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 22789 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 22790 for (unsigned i = 0; i != NumElems; ++i) 22791 ShuffleVec[i] = i * SizeRatio; 22792 22793 // Can't shuffle using an illegal type. 22794 if (!TLI.isTypeLegal(WideVecVT)) 22795 return SDValue(); 22796 22797 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 22798 DAG.getUNDEF(WideVecVT), 22799 &ShuffleVec[0]); 22800 // At this point all of the data is stored at the bottom of the 22801 // register. We now need to save it to mem. 22802 22803 // Find the largest store unit 22804 MVT StoreType = MVT::i8; 22805 for (MVT Tp : MVT::integer_valuetypes()) { 22806 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 22807 StoreType = Tp; 22808 } 22809 22810 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 22811 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 22812 (64 <= NumElems * ToSz)) 22813 StoreType = MVT::f64; 22814 22815 // Bitcast the original vector into a vector of store-size units 22816 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 22817 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 22818 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 22819 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 22820 SmallVector<SDValue, 8> Chains; 22821 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 22822 TLI.getPointerTy()); 22823 SDValue Ptr = St->getBasePtr(); 22824 22825 // Perform one or more big stores into memory. 22826 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 22827 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 22828 StoreType, ShuffWide, 22829 DAG.getIntPtrConstant(i)); 22830 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 22831 St->getPointerInfo(), St->isVolatile(), 22832 St->isNonTemporal(), St->getAlignment()); 22833 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 22834 Chains.push_back(Ch); 22835 } 22836 22837 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 22838 } 22839 22840 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 22841 // the FP state in cases where an emms may be missing. 22842 // A preferable solution to the general problem is to figure out the right 22843 // places to insert EMMS. This qualifies as a quick hack. 
22844 22845 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 22846 if (VT.getSizeInBits() != 64) 22847 return SDValue(); 22848 22849 const Function *F = DAG.getMachineFunction().getFunction(); 22850 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); 22851 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 22852 && Subtarget->hasSSE2(); 22853 if ((VT.isVector() || 22854 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 22855 isa<LoadSDNode>(St->getValue()) && 22856 !cast<LoadSDNode>(St->getValue())->isVolatile() && 22857 St->getChain().hasOneUse() && !St->isVolatile()) { 22858 SDNode* LdVal = St->getValue().getNode(); 22859 LoadSDNode *Ld = nullptr; 22860 int TokenFactorIndex = -1; 22861 SmallVector<SDValue, 8> Ops; 22862 SDNode* ChainVal = St->getChain().getNode(); 22863 // Must be a store of a load. We currently handle two cases: the load 22864 // is a direct child, and it's under an intervening TokenFactor. It is 22865 // possible to dig deeper under nested TokenFactors. 22866 if (ChainVal == LdVal) 22867 Ld = cast<LoadSDNode>(St->getChain()); 22868 else if (St->getValue().hasOneUse() && 22869 ChainVal->getOpcode() == ISD::TokenFactor) { 22870 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 22871 if (ChainVal->getOperand(i).getNode() == LdVal) { 22872 TokenFactorIndex = i; 22873 Ld = cast<LoadSDNode>(St->getValue()); 22874 } else 22875 Ops.push_back(ChainVal->getOperand(i)); 22876 } 22877 } 22878 22879 if (!Ld || !ISD::isNormalLoad(Ld)) 22880 return SDValue(); 22881 22882 // If this is not the MMX case, i.e. we are just turning i64 load/store 22883 // into f64 load/store, avoid the transformation if there are multiple 22884 // uses of the loaded value. 22885 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 22886 return SDValue(); 22887 22888 SDLoc LdDL(Ld); 22889 SDLoc StDL(N); 22890 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 22891 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 22892 // pair instead. 22893 if (Subtarget->is64Bit() || F64IsLegal) { 22894 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 22895 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 22896 Ld->getPointerInfo(), Ld->isVolatile(), 22897 Ld->isNonTemporal(), Ld->isInvariant(), 22898 Ld->getAlignment()); 22899 SDValue NewChain = NewLd.getValue(1); 22900 if (TokenFactorIndex != -1) { 22901 Ops.push_back(NewChain); 22902 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 22903 } 22904 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 22905 St->getPointerInfo(), 22906 St->isVolatile(), St->isNonTemporal(), 22907 St->getAlignment()); 22908 } 22909 22910 // Otherwise, lower to two pairs of 32-bit loads / stores. 
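// The 64-bit value is split at byte offset 4: the low half is loaded and
// stored at the original address, the high half at address + 4, and the
// alignment of the high half is clamped with MinAlign.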
22911 SDValue LoAddr = Ld->getBasePtr(); 22912 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 22913 DAG.getConstant(4, MVT::i32)); 22914 22915 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 22916 Ld->getPointerInfo(), 22917 Ld->isVolatile(), Ld->isNonTemporal(), 22918 Ld->isInvariant(), Ld->getAlignment()); 22919 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 22920 Ld->getPointerInfo().getWithOffset(4), 22921 Ld->isVolatile(), Ld->isNonTemporal(), 22922 Ld->isInvariant(), 22923 MinAlign(Ld->getAlignment(), 4)); 22924 22925 SDValue NewChain = LoLd.getValue(1); 22926 if (TokenFactorIndex != -1) { 22927 Ops.push_back(LoLd); 22928 Ops.push_back(HiLd); 22929 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 22930 } 22931 22932 LoAddr = St->getBasePtr(); 22933 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 22934 DAG.getConstant(4, MVT::i32)); 22935 22936 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 22937 St->getPointerInfo(), 22938 St->isVolatile(), St->isNonTemporal(), 22939 St->getAlignment()); 22940 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 22941 St->getPointerInfo().getWithOffset(4), 22942 St->isVolatile(), 22943 St->isNonTemporal(), 22944 MinAlign(St->getAlignment(), 4)); 22945 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 22946 } 22947 return SDValue(); 22948 } 22949 22950 /// Return 'true' if this vector operation is "horizontal" 22951 /// and return the operands for the horizontal operation in LHS and RHS. A 22952 /// horizontal operation performs the binary operation on successive elements 22953 /// of its first operand, then on successive elements of its second operand, 22954 /// returning the resulting values in a vector. For example, if 22955 /// A = < float a0, float a1, float a2, float a3 > 22956 /// and 22957 /// B = < float b0, float b1, float b2, float b3 > 22958 /// then the result of doing a horizontal operation on A and B is 22959 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 22960 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 22961 /// A horizontal-op B, for some already available A and B, and if so then LHS is 22962 /// set to A, RHS to B, and the routine returns 'true'. 22963 /// Note that the binary operation should have the property that if one of the 22964 /// operands is UNDEF then the result is UNDEF. 22965 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 22966 // Look for the following pattern: if 22967 // A = < float a0, float a1, float a2, float a3 > 22968 // B = < float b0, float b1, float b2, float b3 > 22969 // and 22970 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 22971 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 22972 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 22973 // which is A horizontal-op B. 22974 22975 // At least one of the operands should be a vector shuffle. 22976 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 22977 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 22978 return false; 22979 22980 MVT VT = LHS.getSimpleValueType(); 22981 22982 assert((VT.is128BitVector() || VT.is256BitVector()) && 22983 "Unsupported vector type for horizontal add/sub"); 22984 22985 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 22986 // operate independently on 128-bit lanes. 
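// For example, a 256-bit horizontal add of v8f32 A and B produces
//   < a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 >,
// i.e. each 128-bit lane of the result mixes only the corresponding lanes
// of A and B.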
22987 unsigned NumElts = VT.getVectorNumElements(); 22988 unsigned NumLanes = VT.getSizeInBits()/128; 22989 unsigned NumLaneElts = NumElts / NumLanes; 22990 assert((NumLaneElts % 2 == 0) && 22991 "Vector type should have an even number of elements in each lane"); 22992 unsigned HalfLaneElts = NumLaneElts/2; 22993 22994 // View LHS in the form 22995 // LHS = VECTOR_SHUFFLE A, B, LMask 22996 // If LHS is not a shuffle then pretend it is the shuffle 22997 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 22998 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 22999 // type VT. 23000 SDValue A, B; 23001 SmallVector<int, 16> LMask(NumElts); 23002 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 23003 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 23004 A = LHS.getOperand(0); 23005 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 23006 B = LHS.getOperand(1); 23007 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 23008 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 23009 } else { 23010 if (LHS.getOpcode() != ISD::UNDEF) 23011 A = LHS; 23012 for (unsigned i = 0; i != NumElts; ++i) 23013 LMask[i] = i; 23014 } 23015 23016 // Likewise, view RHS in the form 23017 // RHS = VECTOR_SHUFFLE C, D, RMask 23018 SDValue C, D; 23019 SmallVector<int, 16> RMask(NumElts); 23020 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 23021 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 23022 C = RHS.getOperand(0); 23023 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 23024 D = RHS.getOperand(1); 23025 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 23026 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 23027 } else { 23028 if (RHS.getOpcode() != ISD::UNDEF) 23029 C = RHS; 23030 for (unsigned i = 0; i != NumElts; ++i) 23031 RMask[i] = i; 23032 } 23033 23034 // Check that the shuffles are both shuffling the same vectors. 23035 if (!(A == C && B == D) && !(A == D && B == C)) 23036 return false; 23037 23038 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 23039 if (!A.getNode() && !B.getNode()) 23040 return false; 23041 23042 // If A and B occur in reverse order in RHS, then "swap" them (which means 23043 // rewriting the mask). 23044 if (A != C) 23045 ShuffleVectorSDNode::commuteMask(RMask); 23046 23047 // At this point LHS and RHS are equivalent to 23048 // LHS = VECTOR_SHUFFLE A, B, LMask 23049 // RHS = VECTOR_SHUFFLE A, B, RMask 23050 // Check that the masks correspond to performing a horizontal operation. 23051 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 23052 for (unsigned i = 0; i != NumLaneElts; ++i) { 23053 int LIdx = LMask[i+l], RIdx = RMask[i+l]; 23054 23055 // Ignore any UNDEF components. 23056 if (LIdx < 0 || RIdx < 0 || 23057 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 23058 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 23059 continue; 23060 23061 // Check that successive elements are being operated on. If not, this is 23062 // not a horizontal operation. 23063 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs 23064 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; 23065 if (!(LIdx == Index && RIdx == Index + 1) && 23066 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 23067 return false; 23068 } 23069 } 23070 23071 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 23072 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
23073 return true; 23074 } 23075 23076 /// Do target-specific dag combines on floating point adds. 23077 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 23078 const X86Subtarget *Subtarget) { 23079 EVT VT = N->getValueType(0); 23080 SDValue LHS = N->getOperand(0); 23081 SDValue RHS = N->getOperand(1); 23082 23083 // Try to synthesize horizontal adds from adds of shuffles. 23084 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 23085 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 23086 isHorizontalBinOp(LHS, RHS, true)) 23087 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); 23088 return SDValue(); 23089 } 23090 23091 /// Do target-specific dag combines on floating point subs. 23092 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 23093 const X86Subtarget *Subtarget) { 23094 EVT VT = N->getValueType(0); 23095 SDValue LHS = N->getOperand(0); 23096 SDValue RHS = N->getOperand(1); 23097 23098 // Try to synthesize horizontal subs from subs of shuffles. 23099 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 23100 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 23101 isHorizontalBinOp(LHS, RHS, false)) 23102 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); 23103 return SDValue(); 23104 } 23105 23106 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. 23107 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 23108 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 23109 23110 // F[X]OR(0.0, x) -> x 23111 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 23112 if (C->getValueAPF().isPosZero()) 23113 return N->getOperand(1); 23114 23115 // F[X]OR(x, 0.0) -> x 23116 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 23117 if (C->getValueAPF().isPosZero()) 23118 return N->getOperand(0); 23119 return SDValue(); 23120 } 23121 23122 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. 23123 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 23124 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 23125 23126 // Only perform optimizations if UnsafeMath is used. 23127 if (!DAG.getTarget().Options.UnsafeFPMath) 23128 return SDValue(); 23129 23130 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 23131 // into FMINC and FMAXC, which are Commutative operations. 23132 unsigned NewOp = 0; 23133 switch (N->getOpcode()) { 23134 default: llvm_unreachable("unknown opcode"); 23135 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 23136 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 23137 } 23138 23139 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 23140 N->getOperand(0), N->getOperand(1)); 23141 } 23142 23143 /// Do target-specific dag combines on X86ISD::FAND nodes. 
23144 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
23145 // FAND(0.0, x) -> 0.0
23146 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
23147 if (C->getValueAPF().isPosZero())
23148 return N->getOperand(0);
23149
23150 // FAND(x, 0.0) -> 0.0
23151 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
23152 if (C->getValueAPF().isPosZero())
23153 return N->getOperand(1);
23154
23155 return SDValue();
23156 }
23157
23158 /// Do target-specific dag combines on X86ISD::FANDN nodes.
23159 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
23160 // FANDN(0.0, x) -> x
23161 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
23162 if (C->getValueAPF().isPosZero())
23163 return N->getOperand(1);
23164
23165 // FANDN(x, 0.0) -> 0.0
23166 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
23167 if (C->getValueAPF().isPosZero())
23168 return N->getOperand(1);
23169
23170 return SDValue();
23171 }
23172
23173 static SDValue PerformBTCombine(SDNode *N,
23174 SelectionDAG &DAG,
23175 TargetLowering::DAGCombinerInfo &DCI) {
23176 // BT ignores high bits in the bit index operand.
23177 SDValue Op1 = N->getOperand(1);
23178 if (Op1.hasOneUse()) {
23179 unsigned BitWidth = Op1.getValueSizeInBits();
23180 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
23181 APInt KnownZero, KnownOne;
23182 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23183 !DCI.isBeforeLegalizeOps());
23184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23185 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
23186 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
23187 DCI.CommitTargetLoweringOpt(TLO);
23188 }
23189 return SDValue();
23190 }
23191
23192 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
23193 SDValue Op = N->getOperand(0);
23194 if (Op.getOpcode() == ISD::BITCAST)
23195 Op = Op.getOperand(0);
23196 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
23197 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
23198 VT.getVectorElementType().getSizeInBits() ==
23199 OpVT.getVectorElementType().getSizeInBits()) {
23200 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
23201 }
23202 return SDValue();
23203 }
23204
23205 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
23206 const X86Subtarget *Subtarget) {
23207 EVT VT = N->getValueType(0);
23208 if (!VT.isVector())
23209 return SDValue();
23210
23211 SDValue N0 = N->getOperand(0);
23212 SDValue N1 = N->getOperand(1);
23213 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
23214 SDLoc dl(N);
23215
23216 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
23217 // SSE and AVX2 since there is no sign-extended shift right
23218 // operation on a vector with 64-bit elements.
23219 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
23220 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
23221 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
23222 N0.getOpcode() == ISD::SIGN_EXTEND)) {
23223 SDValue N00 = N0.getOperand(0);
23224
23225 // EXTLOAD has a better solution on AVX2,
23226 // it may be replaced with X86ISD::VSEXT node.
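// So if the operand is an extending load and AVX2 is available, leave the
// node alone and let the extload lowering produce the VSEXT instead.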
23227 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) 23228 if (!ISD::isNormalLoad(N00.getNode())) 23229 return SDValue(); 23230 23231 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { 23232 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 23233 N00, N1); 23234 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); 23235 } 23236 } 23237 return SDValue(); 23238 } 23239 23240 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 23241 TargetLowering::DAGCombinerInfo &DCI, 23242 const X86Subtarget *Subtarget) { 23243 SDValue N0 = N->getOperand(0); 23244 EVT VT = N->getValueType(0); 23245 23246 // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> 23247 // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) 23248 // This exposes the sext to the sdivrem lowering, so that it directly extends 23249 // from AH (which we otherwise need to do contortions to access). 23250 if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && 23251 N0.getValueType() == MVT::i8 && VT == MVT::i32) { 23252 SDLoc dl(N); 23253 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); 23254 SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, 23255 N0.getOperand(0), N0.getOperand(1)); 23256 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); 23257 return R.getValue(1); 23258 } 23259 23260 if (!DCI.isBeforeLegalizeOps()) 23261 return SDValue(); 23262 23263 if (!Subtarget->hasFp256()) 23264 return SDValue(); 23265 23266 if (VT.isVector() && VT.getSizeInBits() == 256) { 23267 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 23268 if (R.getNode()) 23269 return R; 23270 } 23271 23272 return SDValue(); 23273 } 23274 23275 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 23276 const X86Subtarget* Subtarget) { 23277 SDLoc dl(N); 23278 EVT VT = N->getValueType(0); 23279 23280 // Let legalize expand this if it isn't a legal type yet. 23281 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 23282 return SDValue(); 23283 23284 EVT ScalarVT = VT.getScalarType(); 23285 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || 23286 (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) 23287 return SDValue(); 23288 23289 SDValue A = N->getOperand(0); 23290 SDValue B = N->getOperand(1); 23291 SDValue C = N->getOperand(2); 23292 23293 bool NegA = (A.getOpcode() == ISD::FNEG); 23294 bool NegB = (B.getOpcode() == ISD::FNEG); 23295 bool NegC = (C.getOpcode() == ISD::FNEG); 23296 23297 // Negative multiplication when NegA xor NegB 23298 bool NegMul = (NegA != NegB); 23299 if (NegA) 23300 A = A.getOperand(0); 23301 if (NegB) 23302 B = B.getOperand(0); 23303 if (NegC) 23304 C = C.getOperand(0); 23305 23306 unsigned Opcode; 23307 if (!NegMul) 23308 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 23309 else 23310 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 23311 23312 return DAG.getNode(Opcode, dl, VT, A, B, C); 23313 } 23314 23315 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 23316 TargetLowering::DAGCombinerInfo &DCI, 23317 const X86Subtarget *Subtarget) { 23318 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 23319 // (and (i32 x86isd::setcc_carry), 1) 23320 // This eliminates the zext. This transformation is necessary because 23321 // ISD::SETCC is always legalized to i8. 
23322 SDLoc dl(N); 23323 SDValue N0 = N->getOperand(0); 23324 EVT VT = N->getValueType(0); 23325 23326 if (N0.getOpcode() == ISD::AND && 23327 N0.hasOneUse() && 23328 N0.getOperand(0).hasOneUse()) { 23329 SDValue N00 = N0.getOperand(0); 23330 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 23331 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 23332 if (!C || C->getZExtValue() != 1) 23333 return SDValue(); 23334 return DAG.getNode(ISD::AND, dl, VT, 23335 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 23336 N00.getOperand(0), N00.getOperand(1)), 23337 DAG.getConstant(1, VT)); 23338 } 23339 } 23340 23341 if (N0.getOpcode() == ISD::TRUNCATE && 23342 N0.hasOneUse() && 23343 N0.getOperand(0).hasOneUse()) { 23344 SDValue N00 = N0.getOperand(0); 23345 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 23346 return DAG.getNode(ISD::AND, dl, VT, 23347 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 23348 N00.getOperand(0), N00.getOperand(1)), 23349 DAG.getConstant(1, VT)); 23350 } 23351 } 23352 if (VT.is256BitVector()) { 23353 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 23354 if (R.getNode()) 23355 return R; 23356 } 23357 23358 // (i8,i32 zext (udivrem (i8 x, i8 y)) -> 23359 // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) 23360 // This exposes the zext to the udivrem lowering, so that it directly extends 23361 // from AH (which we otherwise need to do contortions to access). 23362 if (N0.getOpcode() == ISD::UDIVREM && 23363 N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && 23364 (VT == MVT::i32 || VT == MVT::i64)) { 23365 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); 23366 SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, 23367 N0.getOperand(0), N0.getOperand(1)); 23368 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); 23369 return R.getValue(1); 23370 } 23371 23372 return SDValue(); 23373 } 23374 23375 // Optimize x == -y --> x+y == 0 23376 // x != -y --> x+y != 0 23377 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, 23378 const X86Subtarget* Subtarget) { 23379 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 23380 SDValue LHS = N->getOperand(0); 23381 SDValue RHS = N->getOperand(1); 23382 EVT VT = N->getValueType(0); 23383 SDLoc DL(N); 23384 23385 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 23386 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) 23387 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { 23388 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, 23389 LHS.getOperand(1)); 23390 return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, 23391 DAG.getConstant(0, addV.getValueType()), CC); 23392 } 23393 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 23394 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) 23395 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { 23396 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, 23397 RHS.getOperand(1)); 23398 return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, 23399 DAG.getConstant(0, addV.getValueType()), CC); 23400 } 23401 23402 if (VT.getScalarType() == MVT::i1 && 23403 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { 23404 bool IsSEXT0 = 23405 (LHS.getOpcode() == ISD::SIGN_EXTEND) && 23406 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); 23407 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); 23408 23409 if (!IsSEXT0 || !IsVZero1) { 23410 // Swap the operands and update the condition code. 
23411 std::swap(LHS, RHS);
23412 CC = ISD::getSetCCSwappedOperands(CC);
23413
23414 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
23415 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
23416 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
23417 }
23418
23419 if (IsSEXT0 && IsVZero1) {
23420 assert(VT == LHS.getOperand(0).getValueType() &&
23421 "Unexpected operand type");
23422 if (CC == ISD::SETGT)
23423 return DAG.getConstant(0, VT);
23424 if (CC == ISD::SETLE)
23425 return DAG.getConstant(1, VT);
23426 if (CC == ISD::SETEQ || CC == ISD::SETGE)
23427 return DAG.getNOT(DL, LHS.getOperand(0), VT);
23428
23429 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
23430 "Unexpected condition code!");
23431 return LHS.getOperand(0);
23432 }
23433 }
23434
23435 return SDValue();
23436 }
23437
23438 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
23439 SelectionDAG &DAG) {
23440 SDLoc dl(Load);
23441 MVT VT = Load->getSimpleValueType(0);
23442 MVT EVT = VT.getVectorElementType();
23443 SDValue Addr = Load->getOperand(1);
23444 SDValue NewAddr = DAG.getNode(
23445 ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
23446 DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
23447
23448 SDValue NewLoad =
23449 DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
23450 DAG.getMachineFunction().getMachineMemOperand(
23451 Load->getMemOperand(), 0, EVT.getStoreSize()));
23452 return NewLoad;
23453 }
23454
23455 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
23456 const X86Subtarget *Subtarget) {
23457 SDLoc dl(N);
23458 MVT VT = N->getOperand(1)->getSimpleValueType(0);
23459 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
23460 "X86insertps is only defined for v4x32");
23461
23462 SDValue Ld = N->getOperand(1);
23463 if (MayFoldLoad(Ld)) {
23464 // Extract the countS bits from the immediate so we can get the proper
23465 // address when narrowing the vector load to a specific element.
23466 // When the second source op is a memory address, insertps doesn't use
23467 // countS and just gets an f32 from that address.
23468 unsigned DestIndex =
23469 cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
23470
23471 Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
23472
23473 // Create this as a scalar to vector to match the instruction pattern.
23474 SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
23475 // countS bits are ignored when loading from memory on insertps, which
23476 // means we don't need to explicitly set them to 0.
23477 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
23478 LoadScalarToVector, N->getOperand(2));
23479 }
23480 return SDValue();
23481 }
23482
23483 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
23484 SDValue V0 = N->getOperand(0);
23485 SDValue V1 = N->getOperand(1);
23486 SDLoc DL(N);
23487 EVT VT = N->getValueType(0);
23488
23489 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
23490 // operands and changing the mask to 1. This saves us a bunch of
23491 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
23492 // x86InstrInfo knows how to commute this back after instruction selection
23493 // if it would help register allocation.
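// For example, (blendi V0, V1, 2) selects < V0[0], V1[1] >; after swapping
// the operands, (blendi V1, V0, 1) produces the same result.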
23494
23495 // TODO: If optimizing for size or a processor that doesn't suffer from
23496 // partial register update stalls, this should be transformed into a MOVSD
23497 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
23498
23499 if (VT == MVT::v2f64)
23500 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
23501 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
23502 SDValue NewMask = DAG.getConstant(1, MVT::i8);
23503 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
23504 }
23505
23506 return SDValue();
23507 }
23508
23509 // Helper function of PerformSETCCCombine. It materializes "setb reg"
23510 // as "sbb reg,reg", since it can be extended without zext and produces
23511 // an all-ones bit which is more useful than 0/1 in some cases.
23512 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
23513 MVT VT) {
23514 if (VT == MVT::i8)
23515 return DAG.getNode(ISD::AND, DL, VT,
23516 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
23517 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
23518 DAG.getConstant(1, VT));
23519 assert (VT == MVT::i1 && "Unexpected type for SETCC node");
23520 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
23521 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
23522 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
23523 }
23524
23525 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
23526 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
23527 TargetLowering::DAGCombinerInfo &DCI,
23528 const X86Subtarget *Subtarget) {
23529 SDLoc DL(N);
23530 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
23531 SDValue EFLAGS = N->getOperand(1);
23532
23533 if (CC == X86::COND_A) {
23534 // Try to convert COND_A into COND_B in an attempt to facilitate
23535 // materializing "setb reg".
23536 //
23537 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
23538 // cannot take an immediate as its first operand.
23539 //
23540 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
23541 EFLAGS.getValueType().isInteger() &&
23542 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
23543 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
23544 EFLAGS.getNode()->getVTList(),
23545 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
23546 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
23547 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
23548 }
23549 }
23550
23551 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
23552 // a zext and produces an all-ones bit which is more useful than 0/1 in some
23553 // cases.
23554 if (CC == X86::COND_B)
23555 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
23556
23557 SDValue Flags;
23558
23559 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
23560 if (Flags.getNode()) {
23561 SDValue Cond = DAG.getConstant(CC, MVT::i8);
23562 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
23563 }
23564
23565 return SDValue();
23566 }
23567
23568 // Optimize branch condition evaluation.
23569 // 23570 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 23571 TargetLowering::DAGCombinerInfo &DCI, 23572 const X86Subtarget *Subtarget) { 23573 SDLoc DL(N); 23574 SDValue Chain = N->getOperand(0); 23575 SDValue Dest = N->getOperand(1); 23576 SDValue EFLAGS = N->getOperand(3); 23577 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 23578 23579 SDValue Flags; 23580 23581 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 23582 if (Flags.getNode()) { 23583 SDValue Cond = DAG.getConstant(CC, MVT::i8); 23584 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 23585 Flags); 23586 } 23587 23588 return SDValue(); 23589 } 23590 23591 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 23592 SelectionDAG &DAG) { 23593 // Take advantage of vector comparisons producing 0 or -1 in each lane to 23594 // optimize away operation when it's from a constant. 23595 // 23596 // The general transformation is: 23597 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 23598 // AND(VECTOR_CMP(x,y), constant2) 23599 // constant2 = UNARYOP(constant) 23600 23601 // Early exit if this isn't a vector operation, the operand of the 23602 // unary operation isn't a bitwise AND, or if the sizes of the operations 23603 // aren't the same. 23604 EVT VT = N->getValueType(0); 23605 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 23606 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 23607 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 23608 return SDValue(); 23609 23610 // Now check that the other operand of the AND is a constant. We could 23611 // make the transformation for non-constant splats as well, but it's unclear 23612 // that would be a benefit as it would not eliminate any operations, just 23613 // perform one more step in scalar code before moving to the vector unit. 23614 if (BuildVectorSDNode *BV = 23615 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 23616 // Bail out if the vector isn't a constant. 23617 if (!BV->isConstant()) 23618 return SDValue(); 23619 23620 // Everything checks out. Build up the new and improved node. 23621 SDLoc DL(N); 23622 EVT IntVT = BV->getValueType(0); 23623 // Create a new constant of the appropriate type for the transformed 23624 // DAG. 23625 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 23626 // The AND node needs bitcasts to/from an integer vector type around it. 23627 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 23628 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 23629 N->getOperand(0)->getOperand(0), MaskConst); 23630 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 23631 return Res; 23632 } 23633 23634 return SDValue(); 23635 } 23636 23637 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 23638 const X86Subtarget *Subtarget) { 23639 // First try to optimize away the conversion entirely when it's 23640 // conditionally from a constant. Vectors only. 23641 SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); 23642 if (Res != SDValue()) 23643 return Res; 23644 23645 // Now move on to more general possibilities. 23646 SDValue Op0 = N->getOperand(0); 23647 EVT InVT = Op0->getValueType(0); 23648 23649 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 23650 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 23651 SDLoc dl(N); 23652 MVT DstVT = InVT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; 23653 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 23654 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 23655 } 23656 23657 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 23658 // a 32-bit target where SSE doesn't support i64->FP operations. 23659 if (Op0.getOpcode() == ISD::LOAD) { 23660 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 23661 EVT VT = Ld->getValueType(0); 23662 23663 // This transformation is not supported if the result type is f16 23664 if (N->getValueType(0) == MVT::f16) 23665 return SDValue(); 23666 23667 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 23668 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 23669 !Subtarget->is64Bit() && VT == MVT::i64) { 23670 SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( 23671 SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); 23672 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 23673 return FILDChain; 23674 } 23675 } 23676 return SDValue(); 23677 } 23678 23679 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 23680 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 23681 X86TargetLowering::DAGCombinerInfo &DCI) { 23682 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 23683 // the result is either zero or one (depending on the input carry bit). 23684 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 23685 if (X86::isZeroNode(N->getOperand(0)) && 23686 X86::isZeroNode(N->getOperand(1)) && 23687 // We don't have a good way to replace an EFLAGS use, so only do this when 23688 // dead right now. 23689 SDValue(N, 1).use_empty()) { 23690 SDLoc DL(N); 23691 EVT VT = N->getValueType(0); 23692 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 23693 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 23694 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 23695 DAG.getConstant(X86::COND_B,MVT::i8), 23696 N->getOperand(2)), 23697 DAG.getConstant(1, VT)); 23698 return DCI.CombineTo(N, Res1, CarryOut); 23699 } 23700 23701 return SDValue(); 23702 } 23703 23704 // fold (add Y, (sete X, 0)) -> adc 0, Y 23705 // (add Y, (setne X, 0)) -> sbb -1, Y 23706 // (sub (sete X, 0), Y) -> sbb 0, Y 23707 // (sub (setne X, 0), Y) -> adc -1, Y 23708 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 23709 SDLoc DL(N); 23710 23711 // Look through ZExts. 23712 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 23713 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 23714 return SDValue(); 23715 23716 SDValue SetCC = Ext.getOperand(0); 23717 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 23718 return SDValue(); 23719 23720 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 23721 if (CC != X86::COND_E && CC != X86::COND_NE) 23722 return SDValue(); 23723 23724 SDValue Cmp = SetCC.getOperand(1); 23725 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 23726 !X86::isZeroNode(Cmp.getOperand(1)) || 23727 !Cmp.getOperand(0).getValueType().isInteger()) 23728 return SDValue(); 23729 23730 SDValue CmpOp0 = Cmp.getOperand(0); 23731 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 23732 DAG.getConstant(1, CmpOp0.getValueType())); 23733 23734 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 23735 if (CC == X86::COND_NE) 23736 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::ADC : X86ISD::SBB, 23737 DL, OtherVal.getValueType(), OtherVal, 23738 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 23739 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 23740 DL, OtherVal.getValueType(), OtherVal, 23741 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 23742 } 23743 23744 /// PerformADDCombine - Do target-specific dag combines on integer adds. 23745 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, 23746 const X86Subtarget *Subtarget) { 23747 EVT VT = N->getValueType(0); 23748 SDValue Op0 = N->getOperand(0); 23749 SDValue Op1 = N->getOperand(1); 23750 23751 // Try to synthesize horizontal adds from adds of shuffles. 23752 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 23753 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 23754 isHorizontalBinOp(Op0, Op1, true)) 23755 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); 23756 23757 return OptimizeConditionalInDecrement(N, DAG); 23758 } 23759 23760 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, 23761 const X86Subtarget *Subtarget) { 23762 SDValue Op0 = N->getOperand(0); 23763 SDValue Op1 = N->getOperand(1); 23764 23765 // X86 can't encode an immediate LHS of a sub. See if we can push the 23766 // negation into a preceding instruction. 23767 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 23768 // If the RHS of the sub is a XOR with one use and a constant, invert the 23769 // immediate. Then add one to the LHS of the sub so we can turn 23770 // X-Y -> X+~Y+1, saving one register. 23771 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 23772 isa<ConstantSDNode>(Op1.getOperand(1))) { 23773 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); 23774 EVT VT = Op0.getValueType(); 23775 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, 23776 Op1.getOperand(0), 23777 DAG.getConstant(~XorC, VT)); 23778 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, 23779 DAG.getConstant(C->getAPIntValue()+1, VT)); 23780 } 23781 } 23782 23783 // Try to synthesize horizontal adds from adds of shuffles. 23784 EVT VT = N->getValueType(0); 23785 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 23786 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 23787 isHorizontalBinOp(Op0, Op1, true)) 23788 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); 23789 23790 return OptimizeConditionalInDecrement(N, DAG); 23791 } 23792 23793 /// performVZEXTCombine - Performs build vector combines 23794 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, 23795 TargetLowering::DAGCombinerInfo &DCI, 23796 const X86Subtarget *Subtarget) { 23797 SDLoc DL(N); 23798 MVT VT = N->getSimpleValueType(0); 23799 SDValue Op = N->getOperand(0); 23800 MVT OpVT = Op.getSimpleValueType(); 23801 MVT OpEltVT = OpVT.getVectorElementType(); 23802 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); 23803 23804 // (vzext (bitcast (vzext (x)) -> (vzext x) 23805 SDValue V = Op; 23806 while (V.getOpcode() == ISD::BITCAST) 23807 V = V.getOperand(0); 23808 23809 if (V != Op && V.getOpcode() == X86ISD::VZEXT) { 23810 MVT InnerVT = V.getSimpleValueType(); 23811 MVT InnerEltVT = InnerVT.getVectorElementType(); 23812 23813 // If the element sizes match exactly, we can just do one larger vzext. This 23814 // is always an exact type match as vzext operates on integer types. 
23815 if (OpEltVT == InnerEltVT) {
23816 assert(OpVT == InnerVT && "Types must match for vzext!");
23817 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
23818 }
23819
23820 // The only other way we can combine them is if only a single element of the
23821 // inner vzext is used in the input to the outer vzext.
23822 if (InnerEltVT.getSizeInBits() < InputBits)
23823 return SDValue();
23824
23825 // In this case, the inner vzext is completely dead because we're going to
23826 // only look at bits inside of the low element. Just do the outer vzext on
23827 // a bitcast of the input to the inner.
23828 return DAG.getNode(X86ISD::VZEXT, DL, VT,
23829 DAG.getNode(ISD::BITCAST, DL, OpVT, V));
23830 }
23831
23832 // Check if we can bypass extracting and re-inserting an element of an input
23833 // vector. Essentially:
23834 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
23835 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
23836 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
23837 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
23838 SDValue ExtractedV = V.getOperand(0);
23839 SDValue OrigV = ExtractedV.getOperand(0);
23840 if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
23841 if (ExtractIdx->getZExtValue() == 0) {
23842 MVT OrigVT = OrigV.getSimpleValueType();
23843 // Extract a subvector if necessary...
23844 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
23845 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
23846 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
23847 OrigVT.getVectorNumElements() / Ratio);
23848 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
23849 DAG.getIntPtrConstant(0));
23850 }
23851 Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
23852 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
23853 }
23854 }
23855
23856 return SDValue();
23857 }
23858
23859 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
23860 DAGCombinerInfo &DCI) const {
23861 SelectionDAG &DAG = DCI.DAG;
23862 switch (N->getOpcode()) {
23863 default: break;
23864 case ISD::EXTRACT_VECTOR_ELT:
23865 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
23866 case ISD::VSELECT:
23867 case ISD::SELECT:
23868 case X86ISD::SHRUNKBLEND:
23869 return PerformSELECTCombine(N, DAG, DCI, Subtarget);
23870 case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
23871 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
23872 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
23873 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
23874 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
23875 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
23876 case ISD::SHL:
23877 case ISD::SRA:
23878 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
23879 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
23880 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
23881 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
23882 case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
23883 case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
23884 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
23885 case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
23886 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
23887 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
23888 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
23889 case X86ISD::FXOR: 23890 case X86ISD::FOR: return PerformFORCombine(N, DAG); 23891 case X86ISD::FMIN: 23892 case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); 23893 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 23894 case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); 23895 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 23896 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 23897 case ISD::ANY_EXTEND: 23898 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); 23899 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); 23900 case ISD::SIGN_EXTEND_INREG: 23901 return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); 23902 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); 23903 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); 23904 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); 23905 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); 23906 case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); 23907 case X86ISD::SHUFP: // Handle all target specific shuffles 23908 case X86ISD::PALIGNR: 23909 case X86ISD::UNPCKH: 23910 case X86ISD::UNPCKL: 23911 case X86ISD::MOVHLPS: 23912 case X86ISD::MOVLHPS: 23913 case X86ISD::PSHUFB: 23914 case X86ISD::PSHUFD: 23915 case X86ISD::PSHUFHW: 23916 case X86ISD::PSHUFLW: 23917 case X86ISD::MOVSS: 23918 case X86ISD::MOVSD: 23919 case X86ISD::VPERMILPI: 23920 case X86ISD::VPERM2X128: 23921 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); 23922 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); 23923 case ISD::INTRINSIC_WO_CHAIN: 23924 return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); 23925 case X86ISD::INSERTPS: { 23926 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) 23927 return PerformINSERTPSCombine(N, DAG, Subtarget); 23928 break; 23929 } 23930 case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); 23931 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); 23932 } 23933 23934 return SDValue(); 23935 } 23936 23937 /// isTypeDesirableForOp - Return true if the target has native support for 23938 /// the specified value type and it is 'desirable' to use the type for the 23939 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 23940 /// instruction encodings are longer and some i16 instructions are slow. 23941 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 23942 if (!isTypeLegal(VT)) 23943 return false; 23944 if (VT != MVT::i16) 23945 return true; 23946 23947 switch (Opc) { 23948 default: 23949 return true; 23950 case ISD::LOAD: 23951 case ISD::SIGN_EXTEND: 23952 case ISD::ZERO_EXTEND: 23953 case ISD::ANY_EXTEND: 23954 case ISD::SHL: 23955 case ISD::SRL: 23956 case ISD::SUB: 23957 case ISD::ADD: 23958 case ISD::MUL: 23959 case ISD::AND: 23960 case ISD::OR: 23961 case ISD::XOR: 23962 return false; 23963 } 23964 } 23965 23966 /// IsDesirableToPromoteOp - This method query the target whether it is 23967 /// beneficial for dag combiner to promote the specified node. If true, it 23968 /// should return the desired promotion type by reference. 
23969 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
23970 EVT VT = Op.getValueType();
23971 if (VT != MVT::i16)
23972 return false;
23973
23974 bool Promote = false;
23975 bool Commute = false;
23976 switch (Op.getOpcode()) {
23977 default: break;
23978 case ISD::LOAD: {
23979 LoadSDNode *LD = cast<LoadSDNode>(Op);
23980 // If the non-extending load has a single use and it's not live out, then it
23981 // might be folded.
23982 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
23983 Op.hasOneUse()*/) {
23984 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
23985 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
23986 // The only case where we'd want to promote LOAD (rather than it being
23987 // promoted as an operand) is when its only use is live out.
23988 if (UI->getOpcode() != ISD::CopyToReg)
23989 return false;
23990 }
23991 }
23992 Promote = true;
23993 break;
23994 }
23995 case ISD::SIGN_EXTEND:
23996 case ISD::ZERO_EXTEND:
23997 case ISD::ANY_EXTEND:
23998 Promote = true;
23999 break;
24000 case ISD::SHL:
24001 case ISD::SRL: {
24002 SDValue N0 = Op.getOperand(0);
24003 // Look out for (store (shl (load), x)).
24004 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
24005 return false;
24006 Promote = true;
24007 break;
24008 }
24009 case ISD::ADD:
24010 case ISD::MUL:
24011 case ISD::AND:
24012 case ISD::OR:
24013 case ISD::XOR:
24014 Commute = true;
24015 // fallthrough
24016 case ISD::SUB: {
24017 SDValue N0 = Op.getOperand(0);
24018 SDValue N1 = Op.getOperand(1);
24019 if (!Commute && MayFoldLoad(N1))
24020 return false;
24021 // Avoid disabling potential load folding opportunities.
24022 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
24023 return false;
24024 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
24025 return false;
24026 Promote = true;
24027 }
24028 }
24029
24030 PVT = MVT::i32;
24031 return Promote;
24032 }
24033
24034 //===----------------------------------------------------------------------===//
24035 // X86 Inline Assembly Support
24036 //===----------------------------------------------------------------------===//
24037
24038 // Helper to match a string against a sequence of whitespace-separated pieces.
24039 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
24040 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
24041
24042 for (StringRef Piece : Pieces) {
24043 if (!S.startswith(Piece)) // Check if the piece matches.
24044 return false;
24045
24046 S = S.substr(Piece.size());
24047 StringRef::size_type Pos = S.find_first_not_of(" \t");
24048 if (Pos == 0) // We matched a prefix.
24049 return false; 24050 24051 S = S.substr(Pos); 24052 } 24053 24054 return S.empty(); 24055 } 24056 24057 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { 24058 24059 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { 24060 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && 24061 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && 24062 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { 24063 24064 if (AsmPieces.size() == 3) 24065 return true; 24066 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) 24067 return true; 24068 } 24069 } 24070 return false; 24071 } 24072 24073 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 24074 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 24075 24076 std::string AsmStr = IA->getAsmString(); 24077 24078 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 24079 if (!Ty || Ty->getBitWidth() % 16 != 0) 24080 return false; 24081 24082 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 24083 SmallVector<StringRef, 4> AsmPieces; 24084 SplitString(AsmStr, AsmPieces, ";\n"); 24085 24086 switch (AsmPieces.size()) { 24087 default: return false; 24088 case 1: 24089 // FIXME: this should verify that we are targeting a 486 or better. If not, 24090 // we will turn this bswap into something that will be lowered to logical 24091 // ops instead of emitting the bswap asm. For now, we don't support 486 or 24092 // lower so don't worry about this. 24093 // bswap $0 24094 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || 24095 matchAsm(AsmPieces[0], {"bswapl", "$0"}) || 24096 matchAsm(AsmPieces[0], {"bswapq", "$0"}) || 24097 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || 24098 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || 24099 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { 24100 // No need to check constraints, nothing other than the equivalent of 24101 // "=r,0" would be valid here. 
24102 return IntrinsicLowering::LowerToByteSwap(CI); 24103 } 24104 24105 // rorw $$8, ${0:w} --> llvm.bswap.i16 24106 if (CI->getType()->isIntegerTy(16) && 24107 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 24108 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || 24109 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { 24110 AsmPieces.clear(); 24111 const std::string &ConstraintsStr = IA->getConstraintString(); 24112 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 24113 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 24114 if (clobbersFlagRegisters(AsmPieces)) 24115 return IntrinsicLowering::LowerToByteSwap(CI); 24116 } 24117 break; 24118 case 3: 24119 if (CI->getType()->isIntegerTy(32) && 24120 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 24121 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && 24122 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && 24123 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { 24124 AsmPieces.clear(); 24125 const std::string &ConstraintsStr = IA->getConstraintString(); 24126 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 24127 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 24128 if (clobbersFlagRegisters(AsmPieces)) 24129 return IntrinsicLowering::LowerToByteSwap(CI); 24130 } 24131 24132 if (CI->getType()->isIntegerTy(64)) { 24133 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 24134 if (Constraints.size() >= 2 && 24135 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 24136 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 24137 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 24138 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && 24139 matchAsm(AsmPieces[1], {"bswap", "%edx"}) && 24140 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) 24141 return IntrinsicLowering::LowerToByteSwap(CI); 24142 } 24143 } 24144 break; 24145 } 24146 return false; 24147 } 24148 24149 /// getConstraintType - Given a constraint letter, return the type of 24150 /// constraint it is for this target. 24151 X86TargetLowering::ConstraintType 24152 X86TargetLowering::getConstraintType(const std::string &Constraint) const { 24153 if (Constraint.size() == 1) { 24154 switch (Constraint[0]) { 24155 case 'R': 24156 case 'q': 24157 case 'Q': 24158 case 'f': 24159 case 't': 24160 case 'u': 24161 case 'y': 24162 case 'x': 24163 case 'Y': 24164 case 'l': 24165 return C_RegisterClass; 24166 case 'a': 24167 case 'b': 24168 case 'c': 24169 case 'd': 24170 case 'S': 24171 case 'D': 24172 case 'A': 24173 return C_Register; 24174 case 'I': 24175 case 'J': 24176 case 'K': 24177 case 'L': 24178 case 'M': 24179 case 'N': 24180 case 'G': 24181 case 'C': 24182 case 'e': 24183 case 'Z': 24184 return C_Other; 24185 default: 24186 break; 24187 } 24188 } 24189 return TargetLowering::getConstraintType(Constraint); 24190 } 24191 24192 /// Examine constraint type and operand type and determine a weight value. 24193 /// This object must already have been set up with the operand type 24194 /// and the current alternative constraint selected. 24195 TargetLowering::ConstraintWeight 24196 X86TargetLowering::getSingleConstraintMatchWeight( 24197 AsmOperandInfo &info, const char *constraint) const { 24198 ConstraintWeight weight = CW_Invalid; 24199 Value *CallOperandVal = info.CallOperandVal; 24200 // If we don't have a value, we can't do a match, 24201 // but allow it at the lowest weight. 
24202 if (!CallOperandVal) 24203 return CW_Default; 24204 Type *type = CallOperandVal->getType(); 24205 // Look at the constraint type. 24206 switch (*constraint) { 24207 default: 24208 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 24209 case 'R': 24210 case 'q': 24211 case 'Q': 24212 case 'a': 24213 case 'b': 24214 case 'c': 24215 case 'd': 24216 case 'S': 24217 case 'D': 24218 case 'A': 24219 if (CallOperandVal->getType()->isIntegerTy()) 24220 weight = CW_SpecificReg; 24221 break; 24222 case 'f': 24223 case 't': 24224 case 'u': 24225 if (type->isFloatingPointTy()) 24226 weight = CW_SpecificReg; 24227 break; 24228 case 'y': 24229 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 24230 weight = CW_SpecificReg; 24231 break; 24232 case 'x': 24233 case 'Y': 24234 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || 24235 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) 24236 weight = CW_Register; 24237 break; 24238 case 'I': 24239 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 24240 if (C->getZExtValue() <= 31) 24241 weight = CW_Constant; 24242 } 24243 break; 24244 case 'J': 24245 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24246 if (C->getZExtValue() <= 63) 24247 weight = CW_Constant; 24248 } 24249 break; 24250 case 'K': 24251 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24252 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 24253 weight = CW_Constant; 24254 } 24255 break; 24256 case 'L': 24257 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24258 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 24259 weight = CW_Constant; 24260 } 24261 break; 24262 case 'M': 24263 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24264 if (C->getZExtValue() <= 3) 24265 weight = CW_Constant; 24266 } 24267 break; 24268 case 'N': 24269 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24270 if (C->getZExtValue() <= 0xff) 24271 weight = CW_Constant; 24272 } 24273 break; 24274 case 'G': 24275 case 'C': 24276 if (isa<ConstantFP>(CallOperandVal)) { 24277 weight = CW_Constant; 24278 } 24279 break; 24280 case 'e': 24281 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24282 if ((C->getSExtValue() >= -0x80000000LL) && 24283 (C->getSExtValue() <= 0x7fffffffLL)) 24284 weight = CW_Constant; 24285 } 24286 break; 24287 case 'Z': 24288 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 24289 if (C->getZExtValue() <= 0xffffffff) 24290 weight = CW_Constant; 24291 } 24292 break; 24293 } 24294 return weight; 24295 } 24296 24297 /// LowerXConstraint - try to replace an X constraint, which matches anything, 24298 /// with another that has more specific requirements based on the type of the 24299 /// corresponding operand. 24300 const char *X86TargetLowering:: 24301 LowerXConstraint(EVT ConstraintVT) const { 24302 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 24303 // 'f' like normal targets. 24304 if (ConstraintVT.isFloatingPoint()) { 24305 if (Subtarget->hasSSE2()) 24306 return "Y"; 24307 if (Subtarget->hasSSE1()) 24308 return "x"; 24309 } 24310 24311 return TargetLowering::LowerXConstraint(ConstraintVT); 24312 } 24313 24314 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 24315 /// vector. If it is invalid, don't add anything to Ops. 
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

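    // Illustration (example assumed, not taken from the original comments):
    //   asm volatile("int %0" : : "i"(0x80));
    // passes a ConstantSDNode, which is accepted above and ends up as the
    // immediate $0x80. Non-constant operands are only accepted below if they
    // resolve to a global address plus an optional displacement.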
    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup.  These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(
            Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
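      // Illustrative note: in 64-bit mode 'q' can name any general-purpose
      // register (they all have byte subregisters), so it maps onto the full
      // GR8/GR16/GR32/GR64 classes below; in 32-bit mode it falls through to
      // 'Q' and is restricted to EAX/EBX/ECX/EDX, matching GCC's behavior.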
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
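  // (Illustration: explicit register names such as "{ax}" or "{xmm0}" take
  // this path; the generic lookup matches the string against the target's
  // register definitions and returns the first register class containing that
  // register, which is why the type-based corrections below are needed.)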
  std::pair<unsigned, const TargetRegisterClass *> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
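  // For example, "{ax}" used with an i32 value should become EAX in GR32, and
  // with an i64 value RAX in GR64, rather than the 16-bit AX produced by the
  // name lookup above.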
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8 || VT == MVT::i1) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32 || VT == MVT::f32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64 || VT == MVT::f64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass ||
             Res.second == &X86::VR256RegClass ||
             Res.second == &X86::FR32XRegClass ||
             Res.second == &X86::FR64XRegClass ||
             Res.second == &X86::VR128XRegClass ||
             Res.second == &X86::VR256XRegClass ||
             Res.second == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
  }

  return Res;
}

int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                            Type *Ty) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // requires just one allocation, i.e., freeing allocations for other
  // operations and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because, for stores for
  // instance, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(AM, Ty))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

bool X86TargetLowering::isTargetFTOL() const {
  return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
}