//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "X86IntrinsicsInfo.h"
#include <bitset>
#include <numeric>
#include <cctype>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget->hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }

  if (Subtarget->isTargetKnownWindowsMSVC()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
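  // For example, (f32 (uint_to_fp i8 %x)) is legalized to
  // (f32 (sint_to_fp (i32 (zext %x)))): a zero-extended i8/i16 always lands
  // in the non-negative range of i32, so the signed conversion is exact.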

  if (Subtarget->is64Bit()) {
    if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
    else
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
  } else if (!Subtarget->useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!Subtarget->useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (!Subtarget->useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT   , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::FP_TO_SINT   , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT   , MVT::i16  , Custom);
      setOperationAction(ISD::FP_TO_SINT   , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
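  // For example, (i16 (fp_to_uint f32 %x)) is legalized to
  // (trunc (i32 (fp_to_sint f32 %x))): every value in the u16 range fits in
  // the positive half of an i32, so the signed conversion loses nothing.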

  if (Subtarget->is64Bit()) {
    if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
    }
  } else if (!Subtarget->useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);

    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
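    // (Expanding SDIV/SREM to the two-result SDIVREM means e.g. x/y and x%y
    // share one idiv, which leaves the quotient in EAX and the remainder in
    // EDX for the i32 case.)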

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::f128,  Expand);
  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::f128,  Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);

  if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
    // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
    // is. We should promote the value to 64-bits to solve this.
    // This is what the CRT headers do - `fmodf` is an inline header
    // function casting to f64 and calling `fmod`.
    setOperationAction(ISD::FREM           , MVT::f32  , Promote);
  } else {
    setOperationAction(ISD::FREM           , MVT::f32  , Expand);
  }

  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
  }
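
  // (With BMI, TZCNT gives cttz a defined result for a zero input - the
  // operand width - so CTTZ_ZERO_UNDEF above simply expands to the plain
  // CTTZ node. The same reasoning applies to LZCNT below.)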

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);

  if (!Subtarget->hasMOVBE())
    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
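  // (The promoted i1 select is then picked up by the custom i8 lowering
  // below.)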
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f128 , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f128 , Custom);
  setOperationAction(ISD::SETCCE          , MVT::i8   , Custom);
  setOperationAction(ISD::SETCCE          , MVT::i16  , Custom);
  setOperationAction(ISD::SETCCE          , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    setOperationAction(ISD::SETCCE        , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP as supported here is NOT intended to
  // support SjLj exception handling, but rather a light-weight
  // setjmp/longjmp replacement for continuations, user-level threading,
  // etc. As a result, no other SjLj exception interfaces are implemented;
  // please don't build your own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE    , MVT::Other, Custom);

  // Expand certain atomics.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART          , MVT::Other, Custom);
  setOperationAction(ISD::VAEND            , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG          , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY         , MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG          , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY         , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
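
  // Scalar floating point is configured in three tiers below: SSE2 (f32 and
  // f64 both live in XMM registers), SSE1 only (f32 in XMM, f64 on the x87
  // stack), and pure x87 (both types on the x87 stack).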
  if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!Subtarget->useSoftFloat()) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (!Subtarget->useSoftFloat()) {
    if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS , MVT::f128, Custom);
      setOperationAction(ISD::FNEG , MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR,     MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,      MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC,     MVT::f80, Expand);
    setOperationAction(ISD::FRINT,      MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA,        MVT::f80, Expand);
  }
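
  // (For scalar FP nodes like FSIN or FPOW that are marked Expand and have
  // no inline expansion, the legalizer falls back to a libcall, e.g.
  // llvm.sin.f64 becomes a call to sin.)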

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG,    MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG10,  MVT::f80, Expand);
  setOperationAction(ISD::FEXP,    MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,   MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ADD , VT, Expand);
    setOperationAction(ISD::SUB , VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL , VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA,  VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
    setOperationAction(ISD::MULHS,            MMXTy, Expand);
    setOperationAction(ISD::AND,              MMXTy, Expand);
    setOperationAction(ISD::OR,               MMXTy, Expand);
    setOperationAction(ISD::XOR,              MMXTy, Expand);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
    setOperationAction(ISD::SELECT,           MMXTy, Expand);
    setOperationAction(ISD::BITCAST,          MMXTy, Expand);
  }
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
  }

  if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD,       MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,       MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,       MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,       MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,       MVT::v16i8, Custom);
    setOperationAction(ISD::MUL,       MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,       MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,     MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,     MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,       MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,       MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,       MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,       MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,       MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,      MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,      MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,      MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,      MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT,     MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,      MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,      MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX,      MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX,      MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN,      MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN,      MVT::v16i8, Legal);

    setOperationAction(ISD::SETCC,     MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC,     MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC,     MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC,     MVT::v4i32, Custom);
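    // (SETCC is custom lowered because SSE2 only compares for eq/gt;
    // e.g. an unsigned compare is synthesized by flipping the sign bits of
    // both operands and then using the signed PCMPGT.)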

    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
    setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
    // ISD::CTTZ v2i64 - scalarization is faster.
    setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v4i32, Custom);
    // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8,  Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v4i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v8i8,  Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
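
    // (The promotions below mean e.g. (and (v4i32 a), b) is rewritten as a
    // bitcast to v2i64, a v2i64 AND, and a bitcast back, so a single PAND
    // pattern covers every 128-bit integer type.)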

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::AND,    VT, Promote);
      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
      setOperationAction(ISD::OR,     VT, Promote);
      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    VT, Promote);
      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   VT, Promote);
      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,   MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,   MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,  MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8,  Custom);
  }

  if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR,     RoundedTy, Legal);
      setOperationAction(ISD::FCEIL,      RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC,     RoundedTy, Legal);
      setOperationAction(ISD::FRINT,      RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8,  Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);
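
    // (Before AVX2, x86 vector shifts only take a uniform count, so shifts
    // with per-element counts have to be synthesized by the custom lowering.)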

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);

    setOperationAction(ISD::SRA, MVT::v2i64, Custom);
    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
  }

  if (Subtarget->hasXOP()) {
    setOperationAction(ISD::ROTL, MVT::v16i8,  Custom);
    setOperationAction(ISD::ROTL, MVT::v8i16,  Custom);
    setOperationAction(ISD::ROTL, MVT::v4i32,  Custom);
    setOperationAction(ISD::ROTL, MVT::v2i64,  Custom);
    setOperationAction(ISD::ROTL, MVT::v32i8,  Custom);
    setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v8i32,  Custom);
    setOperationAction(ISD::ROTL, MVT::v4i64,  Custom);
  }

  if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);

    setOperationAction(ISD::LOAD,       MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD,       MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD,       MVT::v4i64, Legal);

    setOperationAction(ISD::FADD,       MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB,       MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL,       MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV,       MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT,      MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR,     MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL,      MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC,     MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT,      MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG,       MVT::v8f32, Custom);
    setOperationAction(ISD::FABS,       MVT::v8f32, Custom);

    setOperationAction(ISD::FADD,       MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB,       MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL,       MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV,       MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT,      MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR,     MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL,      MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC,     MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT,      MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG,       MVT::v4f64, Custom);
    setOperationAction(ISD::FABS,       MVT::v4f64, Custom);

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND,   MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8,  Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8,  Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8,  Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8,  Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8,  Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32,  Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64,  Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v4i64,  Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i32,  Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v16i8,  Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v8i16,  Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v4i32,  Custom);

    setOperationAction(ISD::CTPOP, MVT::v32i8,  Custom);
    setOperationAction(ISD::CTPOP, MVT::v16i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i32,  Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i64,  Custom);

    setOperationAction(ISD::CTTZ, MVT::v32i8,  Custom);
    setOperationAction(ISD::CTTZ, MVT::v16i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i32,  Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i64,  Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8,  Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32,  Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64,  Custom);

    if (Subtarget->hasAnyFMA()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32,   Legal);
      setOperationAction(ISD::FMA, MVT::f64,   Legal);
    }

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::ADD, MVT::v4i64,  Legal);
      setOperationAction(ISD::ADD, MVT::v8i32,  Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8,  Legal);

      setOperationAction(ISD::SUB, MVT::v4i64,  Legal);
      setOperationAction(ISD::SUB, MVT::v8i32,  Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8,  Legal);

      setOperationAction(ISD::MUL, MVT::v4i64,  Custom);
      setOperationAction(ISD::MUL, MVT::v8i32,  Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      setOperationAction(ISD::MUL, MVT::v32i8,  Custom);

      setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
      setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
      setOperationAction(ISD::MULHU,     MVT::v16i16, Legal);
      setOperationAction(ISD::MULHS,     MVT::v16i16, Legal);

      setOperationAction(ISD::SMAX, MVT::v32i8,  Legal);
      setOperationAction(ISD::SMAX, MVT::v16i16, Legal);
      setOperationAction(ISD::SMAX, MVT::v8i32,  Legal);
      setOperationAction(ISD::UMAX, MVT::v32i8,  Legal);
      setOperationAction(ISD::UMAX, MVT::v16i16, Legal);
      setOperationAction(ISD::UMAX, MVT::v8i32,  Legal);
      setOperationAction(ISD::SMIN, MVT::v32i8,  Legal);
      setOperationAction(ISD::SMIN, MVT::v16i16, Legal);
      setOperationAction(ISD::SMIN, MVT::v8i32,  Legal);
      setOperationAction(ISD::UMIN, MVT::v32i8,  Legal);
      setOperationAction(ISD::UMIN, MVT::v16i16, Legal);
      setOperationAction(ISD::UMIN, MVT::v8i32,  Legal);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X.
      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);

      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
    } else {
      setOperationAction(ISD::ADD, MVT::v4i64,  Custom);
      setOperationAction(ISD::ADD, MVT::v8i32,  Custom);
      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
      setOperationAction(ISD::ADD, MVT::v32i8,  Custom);

      setOperationAction(ISD::SUB, MVT::v4i64,  Custom);
      setOperationAction(ISD::SUB, MVT::v8i32,  Custom);
      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
      setOperationAction(ISD::SUB, MVT::v32i8,  Custom);

      setOperationAction(ISD::MUL, MVT::v4i64,  Custom);
      setOperationAction(ISD::MUL, MVT::v8i32,  Custom);
      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
      setOperationAction(ISD::MUL, MVT::v32i8,  Custom);

      setOperationAction(ISD::SMAX, MVT::v32i8,  Custom);
      setOperationAction(ISD::SMAX, MVT::v16i16, Custom);
      setOperationAction(ISD::SMAX, MVT::v8i32,  Custom);
      setOperationAction(ISD::UMAX, MVT::v32i8,  Custom);
      setOperationAction(ISD::UMAX, MVT::v16i16, Custom);
      setOperationAction(ISD::UMAX, MVT::v8i32,  Custom);
      setOperationAction(ISD::SMIN, MVT::v32i8,  Custom);
      setOperationAction(ISD::SMIN, MVT::v16i16, Custom);
      setOperationAction(ISD::SMIN, MVT::v8i32,  Custom);
setOperationAction(ISD::SMIN, MVT::v8i32, Custom); 1249 setOperationAction(ISD::UMIN, MVT::v32i8, Custom); 1250 setOperationAction(ISD::UMIN, MVT::v16i16, Custom); 1251 setOperationAction(ISD::UMIN, MVT::v8i32, Custom); 1252 } 1253 1254 // In the customized shift lowering, the legal cases in AVX2 will be 1255 // recognized. 1256 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 1257 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 1258 1259 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 1260 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 1261 1262 setOperationAction(ISD::SRA, MVT::v4i64, Custom); 1263 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 1264 1265 // Custom lower several nodes for 256-bit types. 1266 for (MVT VT : MVT::vector_valuetypes()) { 1267 if (VT.getScalarSizeInBits() >= 32) { 1268 setOperationAction(ISD::MLOAD, VT, Legal); 1269 setOperationAction(ISD::MSTORE, VT, Legal); 1270 } 1271 // Extract subvector is special because the value type 1272 // (result) is 128-bit but the source is 256-bit wide. 1273 if (VT.is128BitVector()) { 1274 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1275 } 1276 // Do not attempt to custom lower other non-256-bit vectors 1277 if (!VT.is256BitVector()) 1278 continue; 1279 1280 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1281 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1282 setOperationAction(ISD::VSELECT, VT, Custom); 1283 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1284 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1285 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1286 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1287 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1288 } 1289 1290 if (Subtarget->hasInt256()) 1291 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); 1292 1293 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
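// As a sketch of what promotion means here (not a literal DAG dump):
// (and (v8i32 x), (v8i32 y)) is rewritten in place as roughly
//   (v8i32 (bitcast (and (v4i64 (bitcast x)), (v4i64 (bitcast y)))))
// so one 256-bit VPAND pattern covers every promoted element width.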
1294 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { 1295 setOperationAction(ISD::AND, VT, Promote); 1296 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 1297 setOperationAction(ISD::OR, VT, Promote); 1298 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 1299 setOperationAction(ISD::XOR, VT, Promote); 1300 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 1301 setOperationAction(ISD::LOAD, VT, Promote); 1302 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 1303 setOperationAction(ISD::SELECT, VT, Promote); 1304 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 1305 } 1306 } 1307 1308 if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { 1309 addRegisterClass(MVT::v16i32, &X86::VR512RegClass); 1310 addRegisterClass(MVT::v16f32, &X86::VR512RegClass); 1311 addRegisterClass(MVT::v8i64, &X86::VR512RegClass); 1312 addRegisterClass(MVT::v8f64, &X86::VR512RegClass); 1313 1314 addRegisterClass(MVT::i1, &X86::VK1RegClass); 1315 addRegisterClass(MVT::v8i1, &X86::VK8RegClass); 1316 addRegisterClass(MVT::v16i1, &X86::VK16RegClass); 1317 1318 for (MVT VT : MVT::fp_vector_valuetypes()) 1319 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); 1320 1321 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); 1322 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); 1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); 1324 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); 1325 setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); 1326 setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); 1327 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); 1328 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); 1329 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); 1330 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); 1331 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); 1332 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); 1333 1334 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 1335 setOperationAction(ISD::SETCC, MVT::i1, Custom); 1336 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 1337 setOperationAction(ISD::XOR, MVT::i1, Legal); 1338 setOperationAction(ISD::OR, MVT::i1, Legal); 1339 setOperationAction(ISD::AND, MVT::i1, Legal); 1340 setOperationAction(ISD::SUB, MVT::i1, Custom); 1341 setOperationAction(ISD::ADD, MVT::i1, Custom); 1342 setOperationAction(ISD::MUL, MVT::i1, Custom); 1343 setOperationAction(ISD::LOAD, MVT::v16f32, Legal); 1344 setOperationAction(ISD::LOAD, MVT::v8f64, Legal); 1345 setOperationAction(ISD::LOAD, MVT::v8i64, Legal); 1346 setOperationAction(ISD::LOAD, MVT::v16i32, Legal); 1347 setOperationAction(ISD::LOAD, MVT::v16i1, Legal); 1348 1349 setOperationAction(ISD::FADD, MVT::v16f32, Legal); 1350 setOperationAction(ISD::FSUB, MVT::v16f32, Legal); 1351 setOperationAction(ISD::FMUL, MVT::v16f32, Legal); 1352 setOperationAction(ISD::FDIV, MVT::v16f32, Legal); 1353 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); 1354 setOperationAction(ISD::FNEG, MVT::v16f32, Custom); 1355 setOperationAction(ISD::FABS, MVT::v16f32, Custom); 1356 1357 setOperationAction(ISD::FADD, MVT::v8f64, Legal); 1358 setOperationAction(ISD::FSUB, MVT::v8f64, Legal); 1359 setOperationAction(ISD::FMUL, MVT::v8f64, Legal); 1360 setOperationAction(ISD::FDIV, MVT::v8f64, Legal); 1361 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); 1362 setOperationAction(ISD::FNEG, MVT::v8f64, Custom); 1363 setOperationAction(ISD::FABS, MVT::v8f64, Custom); 1364 
setOperationAction(ISD::FMA, MVT::v8f64, Legal); 1365 setOperationAction(ISD::FMA, MVT::v16f32, Legal); 1366 1367 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); 1368 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); 1369 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); 1370 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); 1371 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); 1372 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); 1373 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); 1374 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); 1375 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); 1376 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); 1377 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); 1378 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); 1379 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); 1380 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); 1381 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); 1382 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); 1383 1384 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); 1385 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); 1386 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); 1387 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); 1388 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); 1389 if (Subtarget->hasVLX()){ 1390 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); 1391 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); 1392 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); 1393 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); 1394 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); 1395 1396 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); 1397 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); 1398 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); 1399 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); 1400 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); 1401 } else { 1402 setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); 1403 setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); 1404 setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); 1405 setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); 1406 } 1407 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); 1408 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); 1409 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); 1410 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); 1411 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); 1412 if (Subtarget->hasDQI()) { 1413 setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); 1414 setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); 1415 1416 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); 1417 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); 1418 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); 1419 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); 1420 if (Subtarget->hasVLX()) { 1421 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); 1422 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); 1423 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); 1424 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); 1425 setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); 1426 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); 1427 setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); 1428 setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); 1429 } 1430 } 1431 if (Subtarget->hasVLX()) { 1432 setOperationAction(ISD::SINT_TO_FP, 
MVT::v8i32, Legal); 1433 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); 1434 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 1435 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); 1436 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 1437 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); 1438 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 1439 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); 1440 } 1441 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); 1442 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); 1443 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); 1444 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 1445 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); 1446 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); 1447 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); 1448 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); 1450 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); 1451 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); 1452 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1453 if (Subtarget->hasDQI()) { 1454 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); 1455 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); 1456 } 1457 setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); 1458 setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); 1459 setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); 1460 setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); 1461 setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); 1462 setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); 1463 setOperationAction(ISD::FRINT, MVT::v16f32, Legal); 1464 setOperationAction(ISD::FRINT, MVT::v8f64, Legal); 1465 setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); 1466 setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); 1467 1468 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); 1469 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); 1470 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); 1471 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); 1472 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); 1473 1474 setOperationAction(ISD::SETCC, MVT::v16i1, Custom); 1475 setOperationAction(ISD::SETCC, MVT::v8i1, Custom); 1476 1477 setOperationAction(ISD::MUL, MVT::v8i64, Custom); 1478 1479 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); 1480 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); 1481 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); 1482 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); 1483 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); 1484 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); 1485 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); 1486 setOperationAction(ISD::SELECT, MVT::v8f64, Custom); 1487 setOperationAction(ISD::SELECT, MVT::v8i64, Custom); 1488 setOperationAction(ISD::SELECT, MVT::v16f32, Custom); 1489 setOperationAction(ISD::SELECT, MVT::v16i1, Custom); 1490 setOperationAction(ISD::SELECT, MVT::v8i1, Custom); 1491 1492 setOperationAction(ISD::SMAX, MVT::v16i32, Legal); 1493 setOperationAction(ISD::SMAX, MVT::v8i64, Legal); 1494 setOperationAction(ISD::UMAX, MVT::v16i32, Legal); 1495 setOperationAction(ISD::UMAX, MVT::v8i64, Legal); 1496 setOperationAction(ISD::SMIN, MVT::v16i32, Legal); 1497 setOperationAction(ISD::SMIN, 
MVT::v8i64, Legal); 1498 setOperationAction(ISD::UMIN, MVT::v16i32, Legal); 1499 setOperationAction(ISD::UMIN, MVT::v8i64, Legal); 1500 1501 setOperationAction(ISD::ADD, MVT::v8i64, Legal); 1502 setOperationAction(ISD::ADD, MVT::v16i32, Legal); 1503 1504 setOperationAction(ISD::SUB, MVT::v8i64, Legal); 1505 setOperationAction(ISD::SUB, MVT::v16i32, Legal); 1506 1507 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 1508 1509 setOperationAction(ISD::SRL, MVT::v8i64, Custom); 1510 setOperationAction(ISD::SRL, MVT::v16i32, Custom); 1511 1512 setOperationAction(ISD::SHL, MVT::v8i64, Custom); 1513 setOperationAction(ISD::SHL, MVT::v16i32, Custom); 1514 1515 setOperationAction(ISD::SRA, MVT::v8i64, Custom); 1516 setOperationAction(ISD::SRA, MVT::v16i32, Custom); 1517 1518 setOperationAction(ISD::AND, MVT::v8i64, Legal); 1519 setOperationAction(ISD::OR, MVT::v8i64, Legal); 1520 setOperationAction(ISD::XOR, MVT::v8i64, Legal); 1521 setOperationAction(ISD::AND, MVT::v16i32, Legal); 1522 setOperationAction(ISD::OR, MVT::v16i32, Legal); 1523 setOperationAction(ISD::XOR, MVT::v16i32, Legal); 1524 1525 if (Subtarget->hasCDI()) { 1526 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); 1527 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); 1528 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal); 1529 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal); 1530 1531 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); 1532 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); 1533 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); 1534 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); 1535 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Custom); 1536 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Custom); 1537 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Custom); 1538 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Custom); 1539 1540 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); 1541 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); 1542 1543 if (Subtarget->hasVLX()) { 1544 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); 1545 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); 1546 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); 1547 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); 1548 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal); 1549 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal); 1550 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal); 1551 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal); 1552 1553 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); 1554 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); 1555 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 1556 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 1557 } else { 1558 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); 1559 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); 1560 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); 1561 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); 1562 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Custom); 1563 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Custom); 1564 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Custom); 1565 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom); 1566 } 1567 } // Subtarget->hasCDI() 1568 1569 if (Subtarget->hasDQI()) { 1570 setOperationAction(ISD::MUL, MVT::v2i64, Legal); 1571 setOperationAction(ISD::MUL, MVT::v4i64, Legal); 1572 setOperationAction(ISD::MUL, MVT::v8i64, Legal); 
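// With AVX-512DQ the 64-bit element multiplies above map onto VPMULLQ
// (the 128/256-bit forms additionally require VLX); e.g., roughly:
//   %p = mul <8 x i64> %a, %b   ; -> vpmullq %zmm1, %zmm0, %zmm0
// The IR/asm pairing is illustrative, not captured compiler output.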
1573 } 1574 // Custom lower several nodes. 1575 for (MVT VT : MVT::vector_valuetypes()) { 1576 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1577 if (EltSize == 1) { 1578 setOperationAction(ISD::AND, VT, Legal); 1579 setOperationAction(ISD::OR, VT, Legal); 1580 setOperationAction(ISD::XOR, VT, Legal); 1581 } 1582 if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { 1583 setOperationAction(ISD::MGATHER, VT, Custom); 1584 setOperationAction(ISD::MSCATTER, VT, Custom); 1585 } 1586 // Extract subvector is special because the value type 1587 // (result) is 256/128-bit but the source is 512-bit wide. 1588 if (VT.is128BitVector() || VT.is256BitVector()) { 1589 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1590 } 1591 if (VT.getVectorElementType() == MVT::i1) 1592 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 1593 1594 // Do not attempt to custom lower other non-512-bit vectors 1595 if (!VT.is512BitVector()) 1596 continue; 1597 1598 if (EltSize >= 32) { 1599 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1600 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1601 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1602 setOperationAction(ISD::VSELECT, VT, Legal); 1603 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1604 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1605 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1606 setOperationAction(ISD::MLOAD, VT, Legal); 1607 setOperationAction(ISD::MSTORE, VT, Legal); 1608 setOperationAction(ISD::MGATHER, VT, Legal); 1609 setOperationAction(ISD::MSCATTER, VT, Custom); 1610 } 1611 } 1612 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { 1613 setOperationAction(ISD::SELECT, VT, Promote); 1614 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); 1615 } 1616 }// has AVX-512 1617 1618 if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { 1619 addRegisterClass(MVT::v32i16, &X86::VR512RegClass); 1620 addRegisterClass(MVT::v64i8, &X86::VR512RegClass); 1621 1622 addRegisterClass(MVT::v32i1, &X86::VK32RegClass); 1623 addRegisterClass(MVT::v64i1, &X86::VK64RegClass); 1624 1625 setOperationAction(ISD::LOAD, MVT::v32i16, Legal); 1626 setOperationAction(ISD::LOAD, MVT::v64i8, Legal); 1627 setOperationAction(ISD::SETCC, MVT::v32i1, Custom); 1628 setOperationAction(ISD::SETCC, MVT::v64i1, Custom); 1629 setOperationAction(ISD::ADD, MVT::v32i16, Legal); 1630 setOperationAction(ISD::ADD, MVT::v64i8, Legal); 1631 setOperationAction(ISD::SUB, MVT::v32i16, Legal); 1632 setOperationAction(ISD::SUB, MVT::v64i8, Legal); 1633 setOperationAction(ISD::MUL, MVT::v32i16, Legal); 1634 setOperationAction(ISD::MULHS, MVT::v32i16, Legal); 1635 setOperationAction(ISD::MULHU, MVT::v32i16, Legal); 1636 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); 1637 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); 1638 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); 1639 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); 1640 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); 1641 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); 1642 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); 1643 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); 1644 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); 1645 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); 1646 setOperationAction(ISD::SELECT, MVT::v32i1, Custom); 1647 setOperationAction(ISD::SELECT, MVT::v64i1, Custom); 1648 
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); 1649 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); 1650 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); 1651 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); 1652 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); 1653 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); 1654 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); 1655 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); 1656 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); 1657 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); 1658 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); 1659 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); 1660 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); 1661 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); 1662 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); 1663 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); 1664 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); 1665 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); 1666 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); 1667 1668 setOperationAction(ISD::SMAX, MVT::v64i8, Legal); 1669 setOperationAction(ISD::SMAX, MVT::v32i16, Legal); 1670 setOperationAction(ISD::UMAX, MVT::v64i8, Legal); 1671 setOperationAction(ISD::UMAX, MVT::v32i16, Legal); 1672 setOperationAction(ISD::SMIN, MVT::v64i8, Legal); 1673 setOperationAction(ISD::SMIN, MVT::v32i16, Legal); 1674 setOperationAction(ISD::UMIN, MVT::v64i8, Legal); 1675 setOperationAction(ISD::UMIN, MVT::v32i16, Legal); 1676 1677 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); 1678 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); 1679 if (Subtarget->hasVLX()) 1680 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); 1681 1682 if (Subtarget->hasCDI()) { 1683 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); 1684 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); 1685 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom); 1686 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Custom); 1687 } 1688 1689 for (auto VT : { MVT::v64i8, MVT::v32i16 }) { 1690 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1691 setOperationAction(ISD::VSELECT, VT, Legal); 1692 1693 setOperationAction(ISD::AND, VT, Promote); 1694 AddPromotedToType (ISD::AND, VT, MVT::v8i64); 1695 setOperationAction(ISD::OR, VT, Promote); 1696 AddPromotedToType (ISD::OR, VT, MVT::v8i64); 1697 setOperationAction(ISD::XOR, VT, Promote); 1698 AddPromotedToType (ISD::XOR, VT, MVT::v8i64); 1699 } 1700 } 1701 1702 if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { 1703 addRegisterClass(MVT::v4i1, &X86::VK4RegClass); 1704 addRegisterClass(MVT::v2i1, &X86::VK2RegClass); 1705 1706 setOperationAction(ISD::SETCC, MVT::v4i1, Custom); 1707 setOperationAction(ISD::SETCC, MVT::v2i1, Custom); 1708 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); 1709 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 1710 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); 1711 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); 1712 setOperationAction(ISD::SELECT, MVT::v4i1, Custom); 1713 setOperationAction(ISD::SELECT, MVT::v2i1, Custom); 1714 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); 1715 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); 1716 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); 1717 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, 
Custom); 1718 1719 setOperationAction(ISD::AND, MVT::v8i32, Legal); 1720 setOperationAction(ISD::OR, MVT::v8i32, Legal); 1721 setOperationAction(ISD::XOR, MVT::v8i32, Legal); 1722 setOperationAction(ISD::AND, MVT::v4i32, Legal); 1723 setOperationAction(ISD::OR, MVT::v4i32, Legal); 1724 setOperationAction(ISD::XOR, MVT::v4i32, Legal); 1725 setOperationAction(ISD::SRA, MVT::v2i64, Custom); 1726 setOperationAction(ISD::SRA, MVT::v4i64, Custom); 1727 1728 setOperationAction(ISD::SMAX, MVT::v2i64, Legal); 1729 setOperationAction(ISD::SMAX, MVT::v4i64, Legal); 1730 setOperationAction(ISD::UMAX, MVT::v2i64, Legal); 1731 setOperationAction(ISD::UMAX, MVT::v4i64, Legal); 1732 setOperationAction(ISD::SMIN, MVT::v2i64, Legal); 1733 setOperationAction(ISD::SMIN, MVT::v4i64, Legal); 1734 setOperationAction(ISD::UMIN, MVT::v2i64, Legal); 1735 setOperationAction(ISD::UMIN, MVT::v4i64, Legal); 1736 } 1737 1738 // We want to custom lower some of our intrinsics. 1739 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1740 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 1741 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1742 if (!Subtarget->is64Bit()) { 1743 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); 1744 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1745 } 1746 1747 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1748 // handle type legalization for these operations here. 1749 // 1750 // FIXME: We really should do custom legalization for addition and 1751 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1752 // than generic legalization for 64-bit multiplication-with-overflow, though. 1753 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { 1754 if (VT == MVT::i64 && !Subtarget->is64Bit()) 1755 continue; 1756 // Add/Sub/Mul with overflow operations are custom lowered. 1757 setOperationAction(ISD::SADDO, VT, Custom); 1758 setOperationAction(ISD::UADDO, VT, Custom); 1759 setOperationAction(ISD::SSUBO, VT, Custom); 1760 setOperationAction(ISD::USUBO, VT, Custom); 1761 setOperationAction(ISD::SMULO, VT, Custom); 1762 setOperationAction(ISD::UMULO, VT, Custom); 1763 } 1764 1765 if (!Subtarget->is64Bit()) { 1766 // These libcalls are not available in 32-bit. 1767 setLibcallName(RTLIB::SHL_I128, nullptr); 1768 setLibcallName(RTLIB::SRL_I128, nullptr); 1769 setLibcallName(RTLIB::SRA_I128, nullptr); 1770 } 1771 1772 // Combine sin / cos into one node or libcall if possible. 1773 if (Subtarget->hasSinCos()) { 1774 setLibcallName(RTLIB::SINCOS_F32, "sincosf"); 1775 setLibcallName(RTLIB::SINCOS_F64, "sincos"); 1776 if (Subtarget->isTargetDarwin()) { 1777 // For MacOSX, we don't want the normal expansion of a libcall to sincos. 1778 // We want to issue a libcall to __sincos_stret to avoid memory traffic. 
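// As a sketch of the intent (assumed shapes, not generated code):
//   %s = call double @sin(double %x)
//   %c = call double @cos(double %x)
// becomes a single ISD::FSINCOS node, which the custom lowering below
// turns into one __sincos_stret call that returns both results in
// registers rather than through memory.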
1779 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1780 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1781 } 1782 } 1783 1784 if (Subtarget->isTargetWin64()) { 1785 setOperationAction(ISD::SDIV, MVT::i128, Custom); 1786 setOperationAction(ISD::UDIV, MVT::i128, Custom); 1787 setOperationAction(ISD::SREM, MVT::i128, Custom); 1788 setOperationAction(ISD::UREM, MVT::i128, Custom); 1789 setOperationAction(ISD::SDIVREM, MVT::i128, Custom); 1790 setOperationAction(ISD::UDIVREM, MVT::i128, Custom); 1791 } 1792 1793 // We have target-specific dag combine patterns for the following nodes: 1794 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1795 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1796 setTargetDAGCombine(ISD::BITCAST); 1797 setTargetDAGCombine(ISD::VSELECT); 1798 setTargetDAGCombine(ISD::SELECT); 1799 setTargetDAGCombine(ISD::SHL); 1800 setTargetDAGCombine(ISD::SRA); 1801 setTargetDAGCombine(ISD::SRL); 1802 setTargetDAGCombine(ISD::OR); 1803 setTargetDAGCombine(ISD::AND); 1804 setTargetDAGCombine(ISD::ADD); 1805 setTargetDAGCombine(ISD::FADD); 1806 setTargetDAGCombine(ISD::FSUB); 1807 setTargetDAGCombine(ISD::FNEG); 1808 setTargetDAGCombine(ISD::FMA); 1809 setTargetDAGCombine(ISD::FMAXNUM); 1810 setTargetDAGCombine(ISD::SUB); 1811 setTargetDAGCombine(ISD::LOAD); 1812 setTargetDAGCombine(ISD::MLOAD); 1813 setTargetDAGCombine(ISD::STORE); 1814 setTargetDAGCombine(ISD::MSTORE); 1815 setTargetDAGCombine(ISD::TRUNCATE); 1816 setTargetDAGCombine(ISD::ZERO_EXTEND); 1817 setTargetDAGCombine(ISD::ANY_EXTEND); 1818 setTargetDAGCombine(ISD::SIGN_EXTEND); 1819 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 1820 setTargetDAGCombine(ISD::SINT_TO_FP); 1821 setTargetDAGCombine(ISD::UINT_TO_FP); 1822 setTargetDAGCombine(ISD::SETCC); 1823 setTargetDAGCombine(ISD::BUILD_VECTOR); 1824 setTargetDAGCombine(ISD::MUL); 1825 setTargetDAGCombine(ISD::XOR); 1826 setTargetDAGCombine(ISD::MSCATTER); 1827 setTargetDAGCombine(ISD::MGATHER); 1828 1829 computeRegisterProperties(Subtarget->getRegisterInfo()); 1830 1831 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1832 MaxStoresPerMemsetOptSize = 8; 1833 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1834 MaxStoresPerMemcpyOptSize = 4; 1835 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1836 MaxStoresPerMemmoveOptSize = 4; 1837 setPrefLoopAlignment(4); // 2^4 bytes. 1838 1839 // A predictable cmov does not hurt on an in-order CPU. 1840 // FIXME: Use a CPU attribute to trigger this, not a CPU model. 1841 PredictableSelectIsExpensive = !Subtarget->isAtom(); 1842 EnableExtLdPromotion = true; 1843 setPrefFunctionAlignment(4); // 2^4 bytes. 1844 1845 verifyIntrinsicTables(); 1846 } 1847 1848 // This has so far only been implemented for 64-bit MachO. 1849 bool X86TargetLowering::useLoadStackGuardNode() const { 1850 return Subtarget->isTargetMachO() && Subtarget->is64Bit(); 1851 } 1852 1853 TargetLoweringBase::LegalizeTypeAction 1854 X86TargetLowering::getPreferredVectorAction(EVT VT) const { 1855 if (ExperimentalVectorWideningLegalization && 1856 VT.getVectorNumElements() != 1 && 1857 VT.getVectorElementType().getSimpleVT() != MVT::i1) 1858 return TypeWidenVector; 1859 1860 return TargetLoweringBase::getPreferredVectorAction(VT); 1861 } 1862 1863 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1864 EVT VT) const { 1865 if (!VT.isVector()) 1866 return Subtarget->hasAVX512() ? 
MVT::i1 : MVT::i8;
1867
1868 if (VT.isSimple()) {
1869 MVT VVT = VT.getSimpleVT();
1870 const unsigned NumElts = VVT.getVectorNumElements();
1871 const MVT EltVT = VVT.getVectorElementType();
1872 if (VVT.is512BitVector()) {
1873 if (Subtarget->hasAVX512())
1874 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1875 EltVT == MVT::f32 || EltVT == MVT::f64)
1876 switch(NumElts) {
1877 case 8: return MVT::v8i1;
1878 case 16: return MVT::v16i1;
1879 }
1880 if (Subtarget->hasBWI())
1881 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1882 switch(NumElts) {
1883 case 32: return MVT::v32i1;
1884 case 64: return MVT::v64i1;
1885 }
1886 }
1887
1888 if (VVT.is256BitVector() || VVT.is128BitVector()) {
1889 if (Subtarget->hasVLX())
1890 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1891 EltVT == MVT::f32 || EltVT == MVT::f64)
1892 switch(NumElts) {
1893 case 2: return MVT::v2i1;
1894 case 4: return MVT::v4i1;
1895 case 8: return MVT::v8i1;
1896 }
1897 if (Subtarget->hasBWI() && Subtarget->hasVLX())
1898 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1899 switch(NumElts) {
1900 case 8: return MVT::v8i1;
1901 case 16: return MVT::v16i1;
1902 case 32: return MVT::v32i1;
1903 }
1904 }
1905 }
1906
1907 return VT.changeVectorElementTypeToInteger();
1908 }
1909
1910 /// Helper for getByValTypeAlignment to determine
1911 /// the desired ByVal argument alignment.
1912 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1913 if (MaxAlign == 16)
1914 return;
1915 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1916 if (VTy->getBitWidth() == 128)
1917 MaxAlign = 16;
1918 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1919 unsigned EltAlign = 0;
1920 getMaxByValAlign(ATy->getElementType(), EltAlign);
1921 if (EltAlign > MaxAlign)
1922 MaxAlign = EltAlign;
1923 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1924 for (auto *EltTy : STy->elements()) {
1925 unsigned EltAlign = 0;
1926 getMaxByValAlign(EltTy, EltAlign);
1927 if (EltAlign > MaxAlign)
1928 MaxAlign = EltAlign;
1929 if (MaxAlign == 16)
1930 break;
1931 }
1932 }
1933 }
1934
1935 /// Return the desired alignment for ByVal aggregate
1936 /// function arguments in the caller parameter area. For X86, aggregates
1937 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1938 /// are at 4-byte boundaries.
1939 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1940 const DataLayout &DL) const {
1941 if (Subtarget->is64Bit()) {
1942 // Max of 8 and alignment of type.
1943 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1944 if (TyAlign > 8)
1945 return TyAlign;
1946 return 8;
1947 }
1948
1949 unsigned Align = 4;
1950 if (Subtarget->hasSSE1())
1951 getMaxByValAlign(Ty, Align);
1952 return Align;
1953 }
1954
1955 /// Returns the target-specific optimal type for load
1956 /// and store operations as a result of memset, memcpy, and memmove
1957 /// lowering. If DstAlign is zero, it is safe because the destination
1958 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero,
1959 /// there is no need to check it against an alignment requirement,
1960 /// probably because the source does not need to be loaded. If 'IsMemset' is
1961 /// true, this is expanding a memset. If 'ZeroMemset' is true, it is
1962 /// a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1963 /// source is constant, so it does not need to be loaded.
1964 /// It returns EVT::Other if the type should be determined using generic
1965 /// target-independent logic.
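/// For example (an illustrative reading of the code below, not a test
/// vector): a 32-byte memcpy with 16-byte-aligned operands on an AVX2
/// subtarget is expanded with MVT::v8i32 stores, while small copies on a
/// subtarget with slow unaligned 16-byte accesses fall back to MVT::i64 or
/// MVT::i32 scalar stores.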
1966 EVT 1967 X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1968 unsigned DstAlign, unsigned SrcAlign, 1969 bool IsMemset, bool ZeroMemset, 1970 bool MemcpyStrSrc, 1971 MachineFunction &MF) const { 1972 const Function *F = MF.getFunction(); 1973 if ((!IsMemset || ZeroMemset) && 1974 !F->hasFnAttribute(Attribute::NoImplicitFloat)) { 1975 if (Size >= 16 && 1976 (!Subtarget->isUnalignedMem16Slow() || 1977 ((DstAlign == 0 || DstAlign >= 16) && 1978 (SrcAlign == 0 || SrcAlign >= 16)))) { 1979 if (Size >= 32) { 1980 // FIXME: Check if unaligned 32-byte accesses are slow. 1981 if (Subtarget->hasInt256()) 1982 return MVT::v8i32; 1983 if (Subtarget->hasFp256()) 1984 return MVT::v8f32; 1985 } 1986 if (Subtarget->hasSSE2()) 1987 return MVT::v4i32; 1988 if (Subtarget->hasSSE1()) 1989 return MVT::v4f32; 1990 } else if (!MemcpyStrSrc && Size >= 8 && 1991 !Subtarget->is64Bit() && 1992 Subtarget->hasSSE2()) { 1993 // Do not use f64 to lower memcpy if source is string constant. It's 1994 // better to use i32 to avoid the loads. 1995 return MVT::f64; 1996 } 1997 } 1998 // This is a compromise. If we reach here, unaligned accesses may be slow on 1999 // this target. However, creating smaller, aligned accesses could be even 2000 // slower and would certainly be a lot more code. 2001 if (Subtarget->is64Bit() && Size >= 8) 2002 return MVT::i64; 2003 return MVT::i32; 2004 } 2005 2006 bool X86TargetLowering::isSafeMemOpType(MVT VT) const { 2007 if (VT == MVT::f32) 2008 return X86ScalarSSEf32; 2009 else if (VT == MVT::f64) 2010 return X86ScalarSSEf64; 2011 return true; 2012 } 2013 2014 bool 2015 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 2016 unsigned, 2017 unsigned, 2018 bool *Fast) const { 2019 if (Fast) { 2020 switch (VT.getSizeInBits()) { 2021 default: 2022 // 8-byte and under are always assumed to be fast. 2023 *Fast = true; 2024 break; 2025 case 128: 2026 *Fast = !Subtarget->isUnalignedMem16Slow(); 2027 break; 2028 case 256: 2029 *Fast = !Subtarget->isUnalignedMem32Slow(); 2030 break; 2031 // TODO: What about AVX-512 (512-bit) accesses? 2032 } 2033 } 2034 // Misaligned accesses of any size are always allowed. 2035 return true; 2036 } 2037 2038 /// Return the entry encoding for a jump table in the 2039 /// current function. The returned value is a member of the 2040 /// MachineJumpTableInfo::JTEntryKind enum. 2041 unsigned X86TargetLowering::getJumpTableEncoding() const { 2042 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 2043 // symbol. 2044 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2045 Subtarget->isPICStyleGOT()) 2046 return MachineJumpTableInfo::EK_Custom32; 2047 2048 // Otherwise, use the normal jump table encoding heuristics. 2049 return TargetLowering::getJumpTableEncoding(); 2050 } 2051 2052 bool X86TargetLowering::useSoftFloat() const { 2053 return Subtarget->useSoftFloat(); 2054 } 2055 2056 const MCExpr * 2057 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 2058 const MachineBasicBlock *MBB, 2059 unsigned uid,MCContext &Ctx) const{ 2060 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && 2061 Subtarget->isPICStyleGOT()); 2062 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 2063 // entries. 2064 return MCSymbolRefExpr::create(MBB->getSymbol(), 2065 MCSymbolRefExpr::VK_GOTOFF, Ctx); 2066 } 2067 2068 /// Returns relocation base for the given PIC jumptable. 
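/// As a sketch (32-bit GOT PIC; illustrative, not emitted text), each
/// @GOTOFF entry is added to this base to form the branch target:
///   movl .LJTI0_0@GOTOFF(%ebx,%eax,4), %ecx
///   addl %ebx, %ecx
///   jmpl *%ecx
/// where %ebx holds the GlobalBaseReg value returned below.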
2069 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 2070 SelectionDAG &DAG) const { 2071 if (!Subtarget->is64Bit()) 2072 // This doesn't have SDLoc associated with it, but is not really the 2073 // same as a Register. 2074 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), 2075 getPointerTy(DAG.getDataLayout())); 2076 return Table; 2077 } 2078 2079 /// This returns the relocation base for the given PIC jumptable, 2080 /// the same as getPICJumpTableRelocBase, but as an MCExpr. 2081 const MCExpr *X86TargetLowering:: 2082 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 2083 MCContext &Ctx) const { 2084 // X86-64 uses RIP relative addressing based on the jump table label. 2085 if (Subtarget->isPICStyleRIPRel()) 2086 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2087 2088 // Otherwise, the reference is relative to the PIC base. 2089 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); 2090 } 2091 2092 std::pair<const TargetRegisterClass *, uint8_t> 2093 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 2094 MVT VT) const { 2095 const TargetRegisterClass *RRC = nullptr; 2096 uint8_t Cost = 1; 2097 switch (VT.SimpleTy) { 2098 default: 2099 return TargetLowering::findRepresentativeClass(TRI, VT); 2100 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 2101 RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; 2102 break; 2103 case MVT::x86mmx: 2104 RRC = &X86::VR64RegClass; 2105 break; 2106 case MVT::f32: case MVT::f64: 2107 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 2108 case MVT::v4f32: case MVT::v2f64: 2109 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 2110 case MVT::v4f64: 2111 RRC = &X86::VR128RegClass; 2112 break; 2113 } 2114 return std::make_pair(RRC, Cost); 2115 } 2116 2117 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 2118 unsigned &Offset) const { 2119 if (!Subtarget->isTargetLinux()) 2120 return false; 2121 2122 if (Subtarget->is64Bit()) { 2123 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 2124 Offset = 0x28; 2125 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 2126 AddressSpace = 256; 2127 else 2128 AddressSpace = 257; 2129 } else { 2130 // %gs:0x14 on i386 2131 Offset = 0x14; 2132 AddressSpace = 256; 2133 } 2134 return true; 2135 } 2136 2137 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { 2138 if (!Subtarget->isTargetAndroid()) 2139 return TargetLowering::getSafeStackPointerLocation(IRB); 2140 2141 // Android provides a fixed TLS slot for the SafeStack pointer. 
See the 2142 // definition of TLS_SLOT_SAFESTACK in 2143 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 2144 unsigned AddressSpace, Offset; 2145 if (Subtarget->is64Bit()) { 2146 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: 2147 Offset = 0x48; 2148 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 2149 AddressSpace = 256; 2150 else 2151 AddressSpace = 257; 2152 } else { 2153 // %gs:0x24 on i386 2154 Offset = 0x24; 2155 AddressSpace = 256; 2156 } 2157 2158 return ConstantExpr::getIntToPtr( 2159 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), 2160 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); 2161 } 2162 2163 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 2164 unsigned DestAS) const { 2165 assert(SrcAS != DestAS && "Expected different address spaces!"); 2166 2167 return SrcAS < 256 && DestAS < 256; 2168 } 2169 2170 //===----------------------------------------------------------------------===// 2171 // Return Value Calling Convention Implementation 2172 //===----------------------------------------------------------------------===// 2173 2174 #include "X86GenCallingConv.inc" 2175 2176 bool X86TargetLowering::CanLowerReturn( 2177 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 2178 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 2179 SmallVector<CCValAssign, 16> RVLocs; 2180 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2181 return CCInfo.CheckReturn(Outs, RetCC_X86); 2182 } 2183 2184 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 2185 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; 2186 return ScratchRegs; 2187 } 2188 2189 SDValue 2190 X86TargetLowering::LowerReturn(SDValue Chain, 2191 CallingConv::ID CallConv, bool isVarArg, 2192 const SmallVectorImpl<ISD::OutputArg> &Outs, 2193 const SmallVectorImpl<SDValue> &OutVals, 2194 SDLoc dl, SelectionDAG &DAG) const { 2195 MachineFunction &MF = DAG.getMachineFunction(); 2196 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2197 2198 if (CallConv == CallingConv::X86_INTR && !Outs.empty()) 2199 report_fatal_error("X86 interrupts may not return any value"); 2200 2201 SmallVector<CCValAssign, 16> RVLocs; 2202 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); 2203 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 2204 2205 SDValue Flag; 2206 SmallVector<SDValue, 6> RetOps; 2207 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2208 // Operand #1 = Bytes To Pop 2209 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, 2210 MVT::i16)); 2211 2212 // Copy the result values into the output registers. 2213 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2214 CCValAssign &VA = RVLocs[i]; 2215 assert(VA.isRegLoc() && "Can only return in registers!"); 2216 SDValue ValToCopy = OutVals[i]; 2217 EVT ValVT = ValToCopy.getValueType(); 2218 2219 // Promote values to the appropriate types. 
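// For example (a sketch, assuming a calling convention that sign-extends
// small integers): returning an i8 whose location is an i32 register with
// LocInfo SExt wraps the value in (sign_extend i32 ...) here before the
// CopyToReg below.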
2220 if (VA.getLocInfo() == CCValAssign::SExt) 2221 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 2222 else if (VA.getLocInfo() == CCValAssign::ZExt) 2223 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 2224 else if (VA.getLocInfo() == CCValAssign::AExt) { 2225 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) 2226 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 2227 else 2228 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 2229 } 2230 else if (VA.getLocInfo() == CCValAssign::BCvt) 2231 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); 2232 2233 assert(VA.getLocInfo() != CCValAssign::FPExt && 2234 "Unexpected FP-extend for return value."); 2235 2236 // If this is x86-64, and we disabled SSE, we can't return FP values, 2237 // or SSE or MMX vectors. 2238 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 2239 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 2240 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 2241 report_fatal_error("SSE register return with SSE disabled"); 2242 } 2243 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 2244 // llvm-gcc has never done it right and no one has noticed, so this 2245 // should be OK for now. 2246 if (ValVT == MVT::f64 && 2247 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 2248 report_fatal_error("SSE2 register return with SSE2 disabled"); 2249 2250 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 2251 // the RET instruction and handled by the FP Stackifier. 2252 if (VA.getLocReg() == X86::FP0 || 2253 VA.getLocReg() == X86::FP1) { 2254 // If this is a copy from an xmm register to ST(0), use an FPExtend to 2255 // change the value to the FP stack register class. 2256 if (isScalarFPTypeInSSEReg(VA.getValVT())) 2257 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 2258 RetOps.push_back(ValToCopy); 2259 // Don't emit a copytoreg. 2260 continue; 2261 } 2262 2263 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 2264 // which is returned in RAX / RDX. 2265 if (Subtarget->is64Bit()) { 2266 if (ValVT == MVT::x86mmx) { 2267 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 2268 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); 2269 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 2270 ValToCopy); 2271 // If we don't have SSE2 available, convert to v4f32 so the generated 2272 // register is legal. 2273 if (!Subtarget->hasSSE2()) 2274 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); 2275 } 2276 } 2277 } 2278 2279 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 2280 Flag = Chain.getValue(1); 2281 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2282 } 2283 2284 // All x86 ABIs require that for returning structs by value we copy 2285 // the sret argument into %rax/%eax (depending on ABI) for the return. 2286 // We saved the argument into a virtual register in the entry block, 2287 // so now we copy the value out and into %rax/%eax. 2288 // 2289 // Checking Function.hasStructRetAttr() here is insufficient because the IR 2290 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is 2291 // false, then an sret argument may be implicitly inserted in the SelDAG. In 2292 // either case FuncInfo->setSRetReturnReg() will have been called. 
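// For example (hypothetical IR, not from a test):
//   define void @f(%struct.S* sret %out) { ... }
// must leave %out in %rax (x86-64) or %eax (i386) when it returns, which
// is what the CopyFromReg/CopyToReg pair below arranges.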
2293 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2294 SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
2295 getPointerTy(MF.getDataLayout()));
2296
2297 unsigned RetValReg
2298 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2299 X86::RAX : X86::EAX;
2300 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2301 Flag = Chain.getValue(1);
2302
2303 // RAX/EAX now acts like a return value.
2304 RetOps.push_back(
2305 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2306 }
2307
2308 RetOps[0] = Chain; // Update chain.
2309
2310 // Add the flag if we have it.
2311 if (Flag.getNode())
2312 RetOps.push_back(Flag);
2313
2314 X86ISD::NodeType Opcode = X86ISD::RET_FLAG;
2315 if (CallConv == CallingConv::X86_INTR)
2316 Opcode = X86ISD::IRET;
2317 return DAG.getNode(Opcode, dl, MVT::Other, RetOps);
2318 }
2319
2320 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2321 if (N->getNumValues() != 1)
2322 return false;
2323 if (!N->hasNUsesOfValue(1, 0))
2324 return false;
2325
2326 SDValue TCChain = Chain;
2327 SDNode *Copy = *N->use_begin();
2328 if (Copy->getOpcode() == ISD::CopyToReg) {
2329 // If the copy has a glue operand, we conservatively assume it isn't safe to
2330 // perform a tail call.
2331 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2332 return false;
2333 TCChain = Copy->getOperand(0);
2334 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2335 return false;
2336
2337 bool HasRet = false;
2338 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2339 UI != UE; ++UI) {
2340 if (UI->getOpcode() != X86ISD::RET_FLAG)
2341 return false;
2342 // If we are returning more than one value, we definitely cannot make a
2343 // tail call; see PR19530.
2344 if (UI->getNumOperands() > 4)
2345 return false;
2346 if (UI->getNumOperands() == 4 &&
2347 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2348 return false;
2349 HasRet = true;
2350 }
2351
2352 if (!HasRet)
2353 return false;
2354
2355 Chain = TCChain;
2356 return true;
2357 }
2358
2359 EVT
2360 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2361 ISD::NodeType ExtendKind) const {
2362 MVT ReturnMVT;
2363 // TODO: Is this also valid on 32-bit?
2364 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2365 ReturnMVT = MVT::i8;
2366 else
2367 ReturnMVT = MVT::i32;
2368
2369 EVT MinVT = getRegisterType(Context, ReturnMVT);
2370 return VT.bitsLT(MinVT) ? MinVT : VT;
2371 }
2372
2373 /// Lower the result values of a call into the appropriate copies out of
2374 /// the corresponding physical registers.
2375 ///
2376 SDValue
2377 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2378 CallingConv::ID CallConv, bool isVarArg,
2379 const SmallVectorImpl<ISD::InputArg> &Ins,
2380 SDLoc dl, SelectionDAG &DAG,
2381 SmallVectorImpl<SDValue> &InVals) const {
2382
2383 // Assign locations to each value returned by this call.
2384 SmallVector<CCValAssign, 16> RVLocs;
2385 bool Is64Bit = Subtarget->is64Bit();
2386 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2387 *DAG.getContext());
2388 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2389
2390 // Copy all of the result registers out of their specified physreg.
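// For example (illustrative, not generated output): a float returned in
// the x87 register ST(0) under an ABI that prefers SSE is copied out of
// FP0 as f80 and then rounded back down to f32, which is the
// RoundAfterCopy path in the loop below.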
2391 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2392 CCValAssign &VA = RVLocs[i];
2393 EVT CopyVT = VA.getLocVT();
2394
2395 // If this is x86-64, and we disabled SSE, we can't return FP values.
2396 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2397 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2398 report_fatal_error("SSE register return with SSE disabled");
2399 }
2400
2401 // If we prefer to use the value in xmm registers, copy it out as f80 and
2402 // use a truncate to move it from fp stack reg to xmm reg.
2403 bool RoundAfterCopy = false;
2404 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2405 isScalarFPTypeInSSEReg(VA.getValVT())) {
2406 CopyVT = MVT::f80;
2407 RoundAfterCopy = (CopyVT != VA.getLocVT());
2408 }
2409
2410 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2411 CopyVT, InFlag).getValue(1);
2412 SDValue Val = Chain.getValue(0);
2413
2414 if (RoundAfterCopy)
2415 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2416 // This truncation won't change the value.
2417 DAG.getIntPtrConstant(1, dl));
2418
2419 if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2420 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2421
2422 InFlag = Chain.getValue(2);
2423 InVals.push_back(Val);
2424 }
2425
2426 return Chain;
2427 }
2428
2429 //===----------------------------------------------------------------------===//
2430 // C & StdCall & Fast Calling Convention implementation
2431 //===----------------------------------------------------------------------===//
2432 // The StdCall calling convention is the standard for many Windows API
2433 // routines. It differs from the C calling convention only slightly: the
2434 // callee, not the caller, cleans up the stack, and symbols are decorated.
2435 // It doesn't support any vector arguments.
2436 // For info on the fast calling convention, see the fast calling convention
2437 // (tail call) implementation, LowerX86_32FastCCCallTo.
2438
2439 /// CallIsStructReturn - Determines whether a call uses struct return
2440 /// semantics.
2441 enum StructReturnType {
2442 NotStructReturn,
2443 RegStructReturn,
2444 StackStructReturn
2445 };
2446 static StructReturnType
2447 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2448 if (Outs.empty())
2449 return NotStructReturn;
2450
2451 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2452 if (!Flags.isSRet())
2453 return NotStructReturn;
2454 if (Flags.isInReg())
2455 return RegStructReturn;
2456 return StackStructReturn;
2457 }
2458
2459 /// Determines whether a function uses struct return semantics.
2460 static StructReturnType
2461 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2462 if (Ins.empty())
2463 return NotStructReturn;
2464
2465 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2466 if (!Flags.isSRet())
2467 return NotStructReturn;
2468 if (Flags.isInReg())
2469 return RegStructReturn;
2470 return StackStructReturn;
2471 }
2472
2473 /// Make a copy of an aggregate at the address specified by "Src" to the
2474 /// address "Dst", with size and alignment information specified by the
2475 /// corresponding parameter attribute. The copy is passed as a byval function parameter.
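/// For example (a hypothetical byval argument, not from a test): for
/// 'void g(struct S s)' where S is 24 bytes with 8-byte alignment, Flags
/// carries size 24 and alignment 8, and the call below emits a single
/// always-inline, non-volatile 24-byte memcpy from the caller's copy.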
2476 static SDValue
2477 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2478 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2479 SDLoc dl) {
2480 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2481
2482 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2483 /*isVolatile=*/false, /*AlwaysInline=*/true,
2484 /*isTailCall=*/false,
2485 MachinePointerInfo(), MachinePointerInfo());
2486 }
2487
2488 /// Return true if the calling convention is one that we can guarantee TCO for.
2489 static bool canGuaranteeTCO(CallingConv::ID CC) {
2490 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2491 CC == CallingConv::HiPE || CC == CallingConv::HHVM);
2492 }
2493
2494 /// Return true if we might ever do TCO for calls with this calling convention.
2495 static bool mayTailCallThisCC(CallingConv::ID CC) {
2496 switch (CC) {
2497 // C calling conventions:
2498 case CallingConv::C:
2499 case CallingConv::X86_64_Win64:
2500 case CallingConv::X86_64_SysV:
2501 // Callee pop conventions:
2502 case CallingConv::X86_ThisCall:
2503 case CallingConv::X86_StdCall:
2504 case CallingConv::X86_VectorCall:
2505 case CallingConv::X86_FastCall:
2506 return true;
2507 default:
2508 return canGuaranteeTCO(CC);
2509 }
2510 }
2511
2512 /// Return true if the function is being made into a tail-call target by
2513 /// changing its ABI.
2514 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2515 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2516 }
2517
2518 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2519 auto Attr =
2520 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2521 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2522 return false;
2523
2524 CallSite CS(CI);
2525 CallingConv::ID CalleeCC = CS.getCallingConv();
2526 if (!mayTailCallThisCC(CalleeCC))
2527 return false;
2528
2529 return true;
2530 }
2531
2532 SDValue
2533 X86TargetLowering::LowerMemArgument(SDValue Chain,
2534 CallingConv::ID CallConv,
2535 const SmallVectorImpl<ISD::InputArg> &Ins,
2536 SDLoc dl, SelectionDAG &DAG,
2537 const CCValAssign &VA,
2538 MachineFrameInfo *MFI,
2539 unsigned i) const {
2540 // Create the nodes corresponding to a load from this parameter slot.
2541 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2542 bool AlwaysUseMutable = shouldGuaranteeTCO(
2543 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2544 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2545 EVT ValVT;
2546
2547 // If the value is passed by pointer, the address is passed instead of the
2548 // value itself.
2549 bool ExtendedInMem = VA.isExtInLoc() &&
2550 VA.getValVT().getScalarType() == MVT::i1;
2551
2552 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2553 ValVT = VA.getLocVT();
2554 else
2555 ValVT = VA.getValVT();
2556
2557 // Calculate the SP offset of an interrupt parameter; it re-uses the slot
2558 // normally taken by a return address.
2559 int Offset = 0;
2560 if (CallConv == CallingConv::X86_INTR) {
2561 const X86Subtarget& Subtarget =
2562 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2563 // X86 interrupts may take one or two arguments.
2564 // On the stack there is no return address as there is in a regular call,
2565 // so the offset of the last argument needs to be set to -4/-8 bytes,
2566 // while the offset of the first argument (of two) should be set to 0 bytes.
2567 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2568 }
2569
2570 // FIXME: For now, all byval parameter objects are marked mutable. This can
2571 // be changed with more analysis.
2572 // In case of tail call optimization, mark all arguments mutable, since they
2573 // could be overwritten by the lowering of arguments in case of a tail call.
2574 if (Flags.isByVal()) {
2575 unsigned Bytes = Flags.getByValSize();
2576 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2577 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2578 // Adjust the SP offset of an interrupt parameter.
2579 if (CallConv == CallingConv::X86_INTR) {
2580 MFI->setObjectOffset(FI, Offset);
2581 }
2582 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2583 } else {
2584 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2585 VA.getLocMemOffset(), isImmutable);
2586 // Adjust the SP offset of an interrupt parameter.
2587 if (CallConv == CallingConv::X86_INTR) {
2588 MFI->setObjectOffset(FI, Offset);
2589 }
2590
2591 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2592 SDValue Val = DAG.getLoad(
2593 ValVT, dl, Chain, FIN,
2594 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
2595 false, false, 0);
2596 return ExtendedInMem ?
2597 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2598 }
2599 }
2600
2601 // FIXME: Get this from tablegen.
2602 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2603 const X86Subtarget *Subtarget) {
2604 assert(Subtarget->is64Bit());
2605
2606 if (Subtarget->isCallingConvWin64(CallConv)) {
2607 static const MCPhysReg GPR64ArgRegsWin64[] = {
2608 X86::RCX, X86::RDX, X86::R8, X86::R9
2609 };
2610 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2611 }
2612
2613 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2614 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2615 };
2616 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2617 }
2618
2619 // FIXME: Get this from tablegen.
2620 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2621 CallingConv::ID CallConv,
2622 const X86Subtarget *Subtarget) {
2623 assert(Subtarget->is64Bit());
2624 if (Subtarget->isCallingConvWin64(CallConv)) {
2625 // The XMM registers which might contain vararg parameters are shadowed by
2626 // their paired GPRs, so we only need to save the GPRs to their home
2627 // slots.
2628 // TODO: __vectorcall will change this.
2629 return None;
2630 }
2631
2632 const Function *Fn = MF.getFunction();
2633 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2634 bool isSoftFloat = Subtarget->useSoftFloat();
2635 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2636 "SSE register cannot be used when SSE is disabled!");
2637 if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
2638 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2639 // registers.
2640 return None; 2641 2642 static const MCPhysReg XMMArgRegs64Bit[] = { 2643 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2644 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2645 }; 2646 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); 2647 } 2648 2649 SDValue X86TargetLowering::LowerFormalArguments( 2650 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2651 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, 2652 SmallVectorImpl<SDValue> &InVals) const { 2653 MachineFunction &MF = DAG.getMachineFunction(); 2654 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2655 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); 2656 2657 const Function* Fn = MF.getFunction(); 2658 if (Fn->hasExternalLinkage() && 2659 Subtarget->isTargetCygMing() && 2660 Fn->getName() == "main") 2661 FuncInfo->setForceFramePointer(true); 2662 2663 MachineFrameInfo *MFI = MF.getFrameInfo(); 2664 bool Is64Bit = Subtarget->is64Bit(); 2665 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2666 2667 assert(!(isVarArg && canGuaranteeTCO(CallConv)) && 2668 "Var args not supported with calling convention fastcc, ghc or hipe"); 2669 2670 if (CallConv == CallingConv::X86_INTR) { 2671 bool isLegal = Ins.size() == 1 || 2672 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || 2673 (!Is64Bit && Ins[1].VT == MVT::i32))); 2674 if (!isLegal) 2675 report_fatal_error("X86 interrupts may take one or two arguments"); 2676 } 2677 2678 // Assign locations to all of the incoming arguments. 2679 SmallVector<CCValAssign, 16> ArgLocs; 2680 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 2681 2682 // Allocate shadow area for Win64 2683 if (IsWin64) 2684 CCInfo.AllocateStack(32, 8); 2685 2686 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 2687 2688 unsigned LastVal = ~0U; 2689 SDValue ArgValue; 2690 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2691 CCValAssign &VA = ArgLocs[i]; 2692 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 2693 // places. 2694 assert(VA.getValNo() != LastVal && 2695 "Don't support value assigned to multiple locs yet"); 2696 (void)LastVal; 2697 LastVal = VA.getValNo(); 2698 2699 if (VA.isRegLoc()) { 2700 EVT RegVT = VA.getLocVT(); 2701 const TargetRegisterClass *RC; 2702 if (RegVT == MVT::i32) 2703 RC = &X86::GR32RegClass; 2704 else if (Is64Bit && RegVT == MVT::i64) 2705 RC = &X86::GR64RegClass; 2706 else if (RegVT == MVT::f32) 2707 RC = &X86::FR32RegClass; 2708 else if (RegVT == MVT::f64) 2709 RC = &X86::FR64RegClass; 2710 else if (RegVT == MVT::f128) 2711 RC = &X86::FR128RegClass; 2712 else if (RegVT.is512BitVector()) 2713 RC = &X86::VR512RegClass; 2714 else if (RegVT.is256BitVector()) 2715 RC = &X86::VR256RegClass; 2716 else if (RegVT.is128BitVector()) 2717 RC = &X86::VR128RegClass; 2718 else if (RegVT == MVT::x86mmx) 2719 RC = &X86::VR64RegClass; 2720 else if (RegVT == MVT::i1) 2721 RC = &X86::VK1RegClass; 2722 else if (RegVT == MVT::v8i1) 2723 RC = &X86::VK8RegClass; 2724 else if (RegVT == MVT::v16i1) 2725 RC = &X86::VK16RegClass; 2726 else if (RegVT == MVT::v32i1) 2727 RC = &X86::VK32RegClass; 2728 else if (RegVT == MVT::v64i1) 2729 RC = &X86::VK64RegClass; 2730 else 2731 llvm_unreachable("Unknown argument type!"); 2732 2733 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2734 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 2735 2736 // If this is an 8 or 16-bit value, it is really passed promoted to 32 2737 // bits. 
Insert an assert[sz]ext to capture this, then truncate to the 2738 // right size. 2739 if (VA.getLocInfo() == CCValAssign::SExt) 2740 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 2741 DAG.getValueType(VA.getValVT())); 2742 else if (VA.getLocInfo() == CCValAssign::ZExt) 2743 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 2744 DAG.getValueType(VA.getValVT())); 2745 else if (VA.getLocInfo() == CCValAssign::BCvt) 2746 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); 2747 2748 if (VA.isExtInLoc()) { 2749 // Handle MMX values passed in XMM regs. 2750 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) 2751 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 2752 else 2753 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 2754 } 2755 } else { 2756 assert(VA.isMemLoc()); 2757 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 2758 } 2759 2760 // If value is passed via pointer - do a load. 2761 if (VA.getLocInfo() == CCValAssign::Indirect) 2762 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2763 MachinePointerInfo(), false, false, false, 0); 2764 2765 InVals.push_back(ArgValue); 2766 } 2767 2768 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2769 // All x86 ABIs require that for returning structs by value we copy the 2770 // sret argument into %rax/%eax (depending on ABI) for the return. Save 2771 // the argument into a virtual register so that we can access it from the 2772 // return points. 2773 if (Ins[i].Flags.isSRet()) { 2774 unsigned Reg = FuncInfo->getSRetReturnReg(); 2775 if (!Reg) { 2776 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 2777 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 2778 FuncInfo->setSRetReturnReg(Reg); 2779 } 2780 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); 2781 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2782 break; 2783 } 2784 } 2785 2786 unsigned StackSize = CCInfo.getNextStackOffset(); 2787 // Align stack specially for tail calls. 2788 if (shouldGuaranteeTCO(CallConv, 2789 MF.getTarget().Options.GuaranteedTailCallOpt)) 2790 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2791 2792 // If the function takes variable number of arguments, make a frame index for 2793 // the start of the first vararg value... for expansion of llvm.va_start. We 2794 // can skip this if there are no va_start calls. 2795 if (MFI->hasVAStart() && 2796 (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2797 CallConv != CallingConv::X86_ThisCall))) { 2798 FuncInfo->setVarArgsFrameIndex( 2799 MFI->CreateFixedObject(1, StackSize, true)); 2800 } 2801 2802 // Figure out if XMM registers are in use. 2803 assert(!(Subtarget->useSoftFloat() && 2804 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && 2805 "SSE register cannot be used when SSE is disabled!"); 2806 2807 // 64-bit calling conventions support varargs and register parameters, so we 2808 // have to do extra work to spill them in the prologue. 2809 if (Is64Bit && isVarArg && MFI->hasVAStart()) { 2810 // Find the first unallocated argument registers. 
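  // Illustrative example (SysV x86-64, register order as returned by
  // get64BitArgumentGPRs above): for
  //   void f(int a, int b, ...)
  // the fixed arguments occupy RDI and RSI, so NumIntRegs below is 2 and the
  // remaining RDX, RCX, R8, R9 (plus any unused XMM0-XMM7) are spilled into
  // the register save area so va_arg can find them.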
2811     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2812     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2813     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2814     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2815     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2816            "SSE register cannot be used when SSE is disabled!");
2817 
2818     // Gather all the live-in physical registers.
2819     SmallVector<SDValue, 6> LiveGPRs;
2820     SmallVector<SDValue, 8> LiveXMMRegs;
2821     SDValue ALVal;
2822     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2823       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2824       LiveGPRs.push_back(
2825           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2826     }
2827     if (!ArgXMMs.empty()) {
2828       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2829       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2830       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2831         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2832         LiveXMMRegs.push_back(
2833             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2834       }
2835     }
2836 
2837     if (IsWin64) {
2838       // Get to the caller-allocated home save location.  Add 8 to account
2839       // for the return address.
2840       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2841       FuncInfo->setRegSaveFrameIndex(
2842           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2843       // Fix up the vararg frame index to point into the shadow area (4 x i64).
2844       if (NumIntRegs < 4)
2845         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2846     } else {
2847       // For X86-64, if there are vararg parameters that are passed via
2848       // registers, then we must store them to their spots on the stack so
2849       // they may be loaded by dereferencing the result of va_next.
2850       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2851       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2852       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2853           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2854     }
2855 
2856     // Store the integer parameter registers.
2857     SmallVector<SDValue, 8> MemOps;
2858     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2859                                       getPointerTy(DAG.getDataLayout()));
2860     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2861     for (SDValue Val : LiveGPRs) {
2862       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2863                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
2864       SDValue Store =
2865           DAG.getStore(Val.getValue(1), dl, Val, FIN,
2866                        MachinePointerInfo::getFixedStack(
2867                            DAG.getMachineFunction(),
2868                            FuncInfo->getRegSaveFrameIndex(), Offset),
2869                        false, false, 0);
2870       MemOps.push_back(Store);
2871       Offset += 8;
2872     }
2873 
2874     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2875       // Now store the XMM (fp + vector) parameter registers.
2876 SmallVector<SDValue, 12> SaveXMMOps; 2877 SaveXMMOps.push_back(Chain); 2878 SaveXMMOps.push_back(ALVal); 2879 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2880 FuncInfo->getRegSaveFrameIndex(), dl)); 2881 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2882 FuncInfo->getVarArgsFPOffset(), dl)); 2883 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), 2884 LiveXMMRegs.end()); 2885 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2886 MVT::Other, SaveXMMOps)); 2887 } 2888 2889 if (!MemOps.empty()) 2890 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 2891 } 2892 2893 if (isVarArg && MFI->hasMustTailInVarArgFunc()) { 2894 // Find the largest legal vector type. 2895 MVT VecVT = MVT::Other; 2896 // FIXME: Only some x86_32 calling conventions support AVX512. 2897 if (Subtarget->hasAVX512() && 2898 (Is64Bit || (CallConv == CallingConv::X86_VectorCall || 2899 CallConv == CallingConv::Intel_OCL_BI))) 2900 VecVT = MVT::v16f32; 2901 else if (Subtarget->hasAVX()) 2902 VecVT = MVT::v8f32; 2903 else if (Subtarget->hasSSE2()) 2904 VecVT = MVT::v4f32; 2905 2906 // We forward some GPRs and some vector types. 2907 SmallVector<MVT, 2> RegParmTypes; 2908 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; 2909 RegParmTypes.push_back(IntVT); 2910 if (VecVT != MVT::Other) 2911 RegParmTypes.push_back(VecVT); 2912 2913 // Compute the set of forwarded registers. The rest are scratch. 2914 SmallVectorImpl<ForwardedRegister> &Forwards = 2915 FuncInfo->getForwardedMustTailRegParms(); 2916 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); 2917 2918 // Conservatively forward AL on x86_64, since it might be used for varargs. 2919 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { 2920 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2921 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); 2922 } 2923 2924 // Copy all forwards from physical to virtual registers. 2925 for (ForwardedRegister &F : Forwards) { 2926 // FIXME: Can we use a less constrained schedule? 2927 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 2928 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); 2929 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); 2930 } 2931 } 2932 2933 // Some CCs need callee pop. 2934 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2935 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2936 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2937 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { 2938 // X86 interrupts must pop the error code if present 2939 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); 2940 } else { 2941 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2942 // If this is an sret function, the return should pop the hidden pointer. 2943 if (!Is64Bit && !canGuaranteeTCO(CallConv) && 2944 !Subtarget->getTargetTriple().isOSMSVCRT() && 2945 argsAreStructReturn(Ins) == StackStructReturn) 2946 FuncInfo->setBytesToPopOnReturn(4); 2947 } 2948 2949 if (!Is64Bit) { 2950 // RegSaveFrameIndex is X86-64 only. 2951 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2952 if (CallConv == CallingConv::X86_FastCall || 2953 CallConv == CallingConv::X86_ThisCall) 2954 // fastcc functions can't have varargs. 
2955       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2956   }
2957 
2958   FuncInfo->setArgumentStackSize(StackSize);
2959 
2960   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
2961     EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
2962     if (Personality == EHPersonality::CoreCLR) {
2963       assert(Is64Bit);
2964       // TODO: Add a mechanism to frame lowering that will allow us to indicate
2965       // that we'd prefer this slot be allocated towards the bottom of the frame
2966       // (i.e. near the stack pointer after allocating the frame).  Every
2967       // funclet needs a copy of this slot in its (mostly empty) frame, and the
2968       // offset from the bottom of this and each funclet's frame must be the
2969       // same, so the size of funclets' (mostly empty) frames is dictated by
2970       // how far this slot is from the bottom (since they allocate just enough
2971       // space to accommodate holding this slot at the correct offset).
2972       int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2973       EHInfo->PSPSymFrameIdx = PSPSymFI;
2974     }
2975   }
2976 
2977   return Chain;
2978 }
2979 
2980 SDValue
2981 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2982                                     SDValue StackPtr, SDValue Arg,
2983                                     SDLoc dl, SelectionDAG &DAG,
2984                                     const CCValAssign &VA,
2985                                     ISD::ArgFlagsTy Flags) const {
2986   unsigned LocMemOffset = VA.getLocMemOffset();
2987   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2988   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2989                        StackPtr, PtrOff);
2990   if (Flags.isByVal())
2991     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2992 
2993   return DAG.getStore(
2994       Chain, dl, Arg, PtrOff,
2995       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
2996       false, false, 0);
2997 }
2998 
2999 /// Emit a load of the return address if tail call
3000 /// optimization is performed and it is required.
3001 SDValue
3002 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
3003                                            SDValue &OutRetAddr, SDValue Chain,
3004                                            bool IsTailCall, bool Is64Bit,
3005                                            int FPDiff, SDLoc dl) const {
3006   // Adjust the Return address stack slot.
3007   EVT VT = getPointerTy(DAG.getDataLayout());
3008   OutRetAddr = getReturnAddressFrameIndex(DAG);
3009 
3010   // Load the "old" Return address.
3011   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
3012                            false, false, false, 0);
3013   return SDValue(OutRetAddr.getNode(), 1);
3014 }
3015 
3016 /// Emit a store of the return address if tail call
3017 /// optimization is performed and it is required (FPDiff!=0).
3018 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3019                                         SDValue Chain, SDValue RetAddrFrIdx,
3020                                         EVT PtrVT, unsigned SlotSize,
3021                                         int FPDiff, SDLoc dl) {
3022   // Store the return address to the appropriate stack slot.
3023   if (!FPDiff) return Chain;
3024   // Calculate the new stack slot for the return address.
3025   int NewReturnAddrFI =
3026       MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3027                                            false);
3028   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3029   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3030                        MachinePointerInfo::getFixedStack(
3031                            DAG.getMachineFunction(), NewReturnAddrFI),
3032                        false, false, 0);
3033   return Chain;
3034 }
3035 
3036 /// Returns a vector_shuffle node for a movs{s|d} or movd
3037 /// operation of the specified width.
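/// For illustration (derived from the mask construction below): with
/// VT = v4f32 the mask is <4, 1, 2, 3>, i.e. element 0 is taken from V2 and
/// elements 1-3 are taken from V1, matching the MOVSS semantics of merging
/// the low scalar of one vector into another.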
3038 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 3039 SDValue V2) { 3040 unsigned NumElems = VT.getVectorNumElements(); 3041 SmallVector<int, 8> Mask; 3042 Mask.push_back(NumElems); 3043 for (unsigned i = 1; i != NumElems; ++i) 3044 Mask.push_back(i); 3045 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3046 } 3047 3048 SDValue 3049 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 3050 SmallVectorImpl<SDValue> &InVals) const { 3051 SelectionDAG &DAG = CLI.DAG; 3052 SDLoc &dl = CLI.DL; 3053 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 3054 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 3055 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 3056 SDValue Chain = CLI.Chain; 3057 SDValue Callee = CLI.Callee; 3058 CallingConv::ID CallConv = CLI.CallConv; 3059 bool &isTailCall = CLI.IsTailCall; 3060 bool isVarArg = CLI.IsVarArg; 3061 3062 MachineFunction &MF = DAG.getMachineFunction(); 3063 bool Is64Bit = Subtarget->is64Bit(); 3064 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 3065 StructReturnType SR = callIsStructReturn(Outs); 3066 bool IsSibcall = false; 3067 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 3068 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); 3069 3070 if (CallConv == CallingConv::X86_INTR) 3071 report_fatal_error("X86 interrupts may not be called directly"); 3072 3073 if (Attr.getValueAsString() == "true") 3074 isTailCall = false; 3075 3076 if (Subtarget->isPICStyleGOT() && 3077 !MF.getTarget().Options.GuaranteedTailCallOpt) { 3078 // If we are using a GOT, disable tail calls to external symbols with 3079 // default visibility. Tail calling such a symbol requires using a GOT 3080 // relocation, which forces early binding of the symbol. This breaks code 3081 // that require lazy function symbol resolution. Using musttail or 3082 // GuaranteedTailCallOpt will override this. 3083 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 3084 if (!G || (!G->getGlobal()->hasLocalLinkage() && 3085 G->getGlobal()->hasDefaultVisibility())) 3086 isTailCall = false; 3087 } 3088 3089 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); 3090 if (IsMustTail) { 3091 // Force this to be a tail call. The verifier rules are enough to ensure 3092 // that we can lower this successfully without moving the return address 3093 // around. 3094 isTailCall = true; 3095 } else if (isTailCall) { 3096 // Check if it's really possible to do a tail call. 3097 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 3098 isVarArg, SR != NotStructReturn, 3099 MF.getFunction()->hasStructRetAttr(), CLI.RetTy, 3100 Outs, OutVals, Ins, DAG); 3101 3102 // Sibcalls are automatically detected tailcalls which do not require 3103 // ABI changes. 3104 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 3105 IsSibcall = true; 3106 3107 if (isTailCall) 3108 ++NumTailCalls; 3109 } 3110 3111 assert(!(isVarArg && canGuaranteeTCO(CallConv)) && 3112 "Var args not supported with calling convention fastcc, ghc or hipe"); 3113 3114 // Analyze operands of the call, assigning locations to each operand. 3115 SmallVector<CCValAssign, 16> ArgLocs; 3116 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 3117 3118 // Allocate shadow area for Win64 3119 if (IsWin64) 3120 CCInfo.AllocateStack(32, 8); 3121 3122 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3123 3124 // Get a count of how many bytes are to be pushed on the stack. 
3125   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3126   if (IsSibcall)
3127     // This is a sibcall. The memory operands are available in the caller's
3128     // incoming argument space, allocated by the caller's own caller.
3129     NumBytes = 0;
3130   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3131            canGuaranteeTCO(CallConv))
3132     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3133 
3134   int FPDiff = 0;
3135   if (isTailCall && !IsSibcall && !IsMustTail) {
3136     // Lower arguments at fp - stackoffset + fpdiff.
3137     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3138 
3139     FPDiff = NumBytesCallerPushed - NumBytes;
3140 
3141     // Record the delta by which the return address stack slot moves, but only
3142     // if the new delta requires a larger adjustment than the previous one.
3143     if (FPDiff < X86Info->getTCReturnAddrDelta())
3144       X86Info->setTCReturnAddrDelta(FPDiff);
3145   }
3146 
3147   unsigned NumBytesToPush = NumBytes;
3148   unsigned NumBytesToPop = NumBytes;
3149 
3150   // If we have an inalloca argument, all stack space has already been allocated
3151   // for us and is right at the top of the stack.  We don't support multiple
3152   // arguments passed in memory when using inalloca.
3153   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3154     NumBytesToPush = 0;
3155     if (!ArgLocs.back().isMemLoc())
3156       report_fatal_error("cannot use inalloca attribute on a register "
3157                          "parameter");
3158     if (ArgLocs.back().getLocMemOffset() != 0)
3159       report_fatal_error("any parameter with the inalloca attribute must be "
3160                          "the only memory argument");
3161   }
3162 
3163   if (!IsSibcall)
3164     Chain = DAG.getCALLSEQ_START(
3165         Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3166 
3167   SDValue RetAddrFrIdx;
3168   // Load return address for tail calls.
3169   if (isTailCall && FPDiff)
3170     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3171                                     Is64Bit, FPDiff, dl);
3172 
3173   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3174   SmallVector<SDValue, 8> MemOpChains;
3175   SDValue StackPtr;
3176 
3177   // Walk the register/memloc assignments, inserting copies/loads.  In the case
3178   // of tail call optimization, arguments are handled later.
3179   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3180   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3181     // Skip inalloca arguments, they have already been written.
3182     ISD::ArgFlagsTy Flags = Outs[i].Flags;
3183     if (Flags.isInAlloca())
3184       continue;
3185 
3186     CCValAssign &VA = ArgLocs[i];
3187     EVT RegVT = VA.getLocVT();
3188     SDValue Arg = OutVals[i];
3189     bool isByVal = Flags.isByVal();
3190 
3191     // Promote the value if needed.
3192     switch (VA.getLocInfo()) {
3193     default: llvm_unreachable("Unknown loc info!");
3194     case CCValAssign::Full: break;
3195     case CCValAssign::SExt:
3196       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3197       break;
3198     case CCValAssign::ZExt:
3199       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3200       break;
3201     case CCValAssign::AExt:
3202       if (Arg.getValueType().isVector() &&
3203           Arg.getValueType().getVectorElementType() == MVT::i1)
3204         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3205       else if (RegVT.is128BitVector()) {
3206         // Special case: passing MMX values in XMM registers.
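        // Illustrative note: the x86mmx value is bitcast to i64, placed in
        // lane 0 of a v2i64 via SCALAR_TO_VECTOR, and then merged with
        // getMOVL so lane 0 holds the MMX payload and lane 1 is undef.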
3207         Arg = DAG.getBitcast(MVT::i64, Arg);
3208         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3209         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3210       } else
3211         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3212       break;
3213     case CCValAssign::BCvt:
3214       Arg = DAG.getBitcast(RegVT, Arg);
3215       break;
3216     case CCValAssign::Indirect: {
3217       // Store the argument.
3218       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3219       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3220       Chain = DAG.getStore(
3221           Chain, dl, Arg, SpillSlot,
3222           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3223           false, false, 0);
3224       Arg = SpillSlot;
3225       break;
3226     }
3227     }
3228 
3229     if (VA.isRegLoc()) {
3230       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3231       if (isVarArg && IsWin64) {
3232         // The Win64 ABI requires an argument in an XMM register to also be
3233         // copied to its corresponding shadow GPR if the callee is a varargs
3234         // function.
3235         unsigned ShadowReg = 0;
3236         switch (VA.getLocReg()) {
3237         case X86::XMM0: ShadowReg = X86::RCX; break;
3238         case X86::XMM1: ShadowReg = X86::RDX; break;
3239         case X86::XMM2: ShadowReg = X86::R8; break;
3240         case X86::XMM3: ShadowReg = X86::R9; break;
3241         }
3242         if (ShadowReg)
3243           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3244       }
3245     } else if (!IsSibcall && (!isTailCall || isByVal)) {
3246       assert(VA.isMemLoc());
3247       if (!StackPtr.getNode())
3248         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3249                                       getPointerTy(DAG.getDataLayout()));
3250       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3251                                              dl, DAG, VA, Flags));
3252     }
3253   }
3254 
3255   if (!MemOpChains.empty())
3256     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3257 
3258   if (Subtarget->isPICStyleGOT()) {
3259     // ELF / PIC requires the GOT pointer to be live in the EBX register
3260     // before making a function call via the PLT.
3261     if (!isTailCall) {
3262       RegsToPass.push_back(std::make_pair(
3263           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3264                                           getPointerTy(DAG.getDataLayout()))));
3265     } else {
3266       // If we are tail calling and generating PIC/GOT style code, load the
3267       // address of the callee into ECX. The value in ECX is used as the
3268       // target of the tail jump. This is done to circumvent the ebx/
3269       // callee-saved problem for tail calls on PIC/GOT architectures.
3270       // Normally we would just put the address of the GOT into EBX and then
3271       // call target@PLT, but for tail calls EBX would be restored (since EBX
3272       // is callee saved) before jumping to the target@PLT.
3273 
3274       // Note: The actual moving to ECX is done further down.
3275       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3276       if (G && !G->getGlobal()->hasLocalLinkage() &&
3277           G->getGlobal()->hasDefaultVisibility())
3278         Callee = LowerGlobalAddress(Callee, DAG);
3279       else if (isa<ExternalSymbolSDNode>(Callee))
3280         Callee = LowerExternalSymbol(Callee, DAG);
3281     }
3282   }
3283 
3284   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3285     // From the AMD64 ABI document:
3286     //   For calls that may call functions that use varargs or stdargs
3287     //   (prototype-less calls or calls to functions containing ellipsis (...)
3288     //   in the declaration) %al is used as a hidden argument to specify the
3289     //   number of SSE registers used. The contents of %al do not need to match
3290     //   exactly the number of registers, but must be an upper bound on the
3291     //   number of SSE registers used and is in the range 0 - 8 inclusive.
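    // For example (illustrative): a variadic call passing a single double in
    // XMM0 would set AL = 1; any conservative upper bound, such as AL = 8,
    // would also satisfy the ABI.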
3291 3292 // Count the number of XMM registers allocated. 3293 static const MCPhysReg XMMArgRegs[] = { 3294 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 3295 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 3296 }; 3297 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); 3298 assert((Subtarget->hasSSE1() || !NumXMMRegs) 3299 && "SSE registers cannot be used when SSE is disabled"); 3300 3301 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 3302 DAG.getConstant(NumXMMRegs, dl, 3303 MVT::i8))); 3304 } 3305 3306 if (isVarArg && IsMustTail) { 3307 const auto &Forwards = X86Info->getForwardedMustTailRegParms(); 3308 for (const auto &F : Forwards) { 3309 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 3310 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); 3311 } 3312 } 3313 3314 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls 3315 // don't need this because the eligibility check rejects calls that require 3316 // shuffling arguments passed in memory. 3317 if (!IsSibcall && isTailCall) { 3318 // Force all the incoming stack arguments to be loaded from the stack 3319 // before any new outgoing arguments are stored to the stack, because the 3320 // outgoing stack slots may alias the incoming argument stack slots, and 3321 // the alias isn't otherwise explicit. This is slightly more conservative 3322 // than necessary, because it means that each store effectively depends 3323 // on every argument instead of just those arguments it would clobber. 3324 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 3325 3326 SmallVector<SDValue, 8> MemOpChains2; 3327 SDValue FIN; 3328 int FI = 0; 3329 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3330 CCValAssign &VA = ArgLocs[i]; 3331 if (VA.isRegLoc()) 3332 continue; 3333 assert(VA.isMemLoc()); 3334 SDValue Arg = OutVals[i]; 3335 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3336 // Skip inalloca arguments. They don't require any work. 3337 if (Flags.isInAlloca()) 3338 continue; 3339 // Create frame index. 3340 int32_t Offset = VA.getLocMemOffset()+FPDiff; 3341 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 3342 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 3343 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3344 3345 if (Flags.isByVal()) { 3346 // Copy relative to framepointer. 3347 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); 3348 if (!StackPtr.getNode()) 3349 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 3350 getPointerTy(DAG.getDataLayout())); 3351 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 3352 StackPtr, Source); 3353 3354 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 3355 ArgChain, 3356 Flags, DAG, dl)); 3357 } else { 3358 // Store relative to framepointer. 3359 MemOpChains2.push_back(DAG.getStore( 3360 ArgChain, dl, Arg, FIN, 3361 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3362 false, false, 0)); 3363 } 3364 } 3365 3366 if (!MemOpChains2.empty()) 3367 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 3368 3369 // Store the return address to the appropriate stack slot. 3370 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 3371 getPointerTy(DAG.getDataLayout()), 3372 RegInfo->getSlotSize(), FPDiff, dl); 3373 } 3374 3375 // Build a sequence of copy-to-reg nodes chained together with token chain 3376 // and flag operands which copy the outgoing args into registers. 
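// (A note on the mechanism: the glue/flag operand threaded through these
// CopyToReg nodes keeps the copies adjacent to the call node during
// scheduling, so the argument registers cannot be clobbered between the
// copies and the call. This is standard SelectionDAG practice rather than
// anything x86-specific.)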
3377   SDValue InFlag;
3378   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3379     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3380                              RegsToPass[i].second, InFlag);
3381     InFlag = Chain.getValue(1);
3382   }
3383 
3384   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3385     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3386     // In the 64-bit large code model, we have to make all calls
3387     // through a register, since the call instruction's 32-bit
3388     // pc-relative offset may not be large enough to hold the whole
3389     // address.
3390   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3391     // If the callee is a GlobalAddress node (quite common, every direct call
3392     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3393     // it.
3394     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3395 
3396     // We should use an extra load for direct calls to dllimported functions in
3397     // non-JIT mode.
3398     const GlobalValue *GV = G->getGlobal();
3399     if (!GV->hasDLLImportStorageClass()) {
3400       unsigned char OpFlags = 0;
3401       bool ExtraLoad = false;
3402       unsigned WrapperKind = ISD::DELETED_NODE;
3403 
3404       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3405       // external symbols must go through the PLT in PIC mode. If the symbol
3406       // has hidden or protected visibility, or if it is static or local, then
3407       // we don't need to use the PLT - we can directly call it.
3408       if (Subtarget->isTargetELF() &&
3409           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3410           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3411         OpFlags = X86II::MO_PLT;
3412       } else if (Subtarget->isPICStyleStubAny() &&
3413                  !GV->isStrongDefinitionForLinker() &&
3414                  (!Subtarget->getTargetTriple().isMacOSX() ||
3415                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3416         // PC-relative references to external symbols should go through $stub,
3417         // unless we're building with the Leopard linker or later, which
3418         // automatically synthesizes these stubs.
3419         OpFlags = X86II::MO_DARWIN_STUB;
3420       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
3421                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
3422         // If the function is marked as non-lazy, generate an indirect call
3423         // which loads from the GOT directly. This avoids runtime overhead
3424         // at the cost of eager binding (and one extra byte of encoding).
3425         OpFlags = X86II::MO_GOTPCREL;
3426         WrapperKind = X86ISD::WrapperRIP;
3427         ExtraLoad = true;
3428       }
3429 
3430       Callee = DAG.getTargetGlobalAddress(
3431           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3432 
3433       // Add a wrapper if needed.
3434       if (WrapperKind != ISD::DELETED_NODE)
3435         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3436                              getPointerTy(DAG.getDataLayout()), Callee);
3437       // Add extra indirection if needed.
3438       if (ExtraLoad)
3439         Callee = DAG.getLoad(
3440             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3441             MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
3442             false, 0);
3443     }
3444   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3445     unsigned char OpFlags = 0;
3446 
3447     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3448     // external symbols should go through the PLT.
3449 if (Subtarget->isTargetELF() && 3450 DAG.getTarget().getRelocationModel() == Reloc::PIC_) { 3451 OpFlags = X86II::MO_PLT; 3452 } else if (Subtarget->isPICStyleStubAny() && 3453 (!Subtarget->getTargetTriple().isMacOSX() || 3454 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 3455 // PC-relative references to external symbols should go through $stub, 3456 // unless we're building with the leopard linker or later, which 3457 // automatically synthesizes these stubs. 3458 OpFlags = X86II::MO_DARWIN_STUB; 3459 } 3460 3461 Callee = DAG.getTargetExternalSymbol( 3462 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); 3463 } else if (Subtarget->isTarget64BitILP32() && 3464 Callee->getValueType(0) == MVT::i32) { 3465 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI 3466 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); 3467 } 3468 3469 // Returns a chain & a flag for retval copy to use. 3470 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3471 SmallVector<SDValue, 8> Ops; 3472 3473 if (!IsSibcall && isTailCall) { 3474 Chain = DAG.getCALLSEQ_END(Chain, 3475 DAG.getIntPtrConstant(NumBytesToPop, dl, true), 3476 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 3477 InFlag = Chain.getValue(1); 3478 } 3479 3480 Ops.push_back(Chain); 3481 Ops.push_back(Callee); 3482 3483 if (isTailCall) 3484 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); 3485 3486 // Add argument registers to the end of the list so that they are known live 3487 // into the call. 3488 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 3489 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 3490 RegsToPass[i].second.getValueType())); 3491 3492 // Add a register mask operand representing the call-preserved registers. 3493 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); 3494 assert(Mask && "Missing call preserved mask for calling convention"); 3495 3496 // If this is an invoke in a 32-bit function using a funclet-based 3497 // personality, assume the function clobbers all registers. If an exception 3498 // is thrown, the runtime will not restore CSRs. 3499 // FIXME: Model this more precisely so that we can register allocate across 3500 // the normal edge and spill and fill across the exceptional edge. 3501 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { 3502 const Function *CallerFn = MF.getFunction(); 3503 EHPersonality Pers = 3504 CallerFn->hasPersonalityFn() 3505 ? classifyEHPersonality(CallerFn->getPersonalityFn()) 3506 : EHPersonality::Unknown; 3507 if (isFuncletEHPersonality(Pers)) 3508 Mask = RegInfo->getNoPreservedMask(); 3509 } 3510 3511 Ops.push_back(DAG.getRegisterMask(Mask)); 3512 3513 if (InFlag.getNode()) 3514 Ops.push_back(InFlag); 3515 3516 if (isTailCall) { 3517 // We used to do: 3518 //// If this is the first return lowered for this function, add the regs 3519 //// to the liveout set for the function. 3520 // This isn't right, although it's probably harmless on x86; liveouts 3521 // should be computed from returns not tail calls. Consider a void 3522 // function making a tail call to a function returning int. 3523 MF.getFrameInfo()->setHasTailCall(); 3524 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); 3525 } 3526 3527 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); 3528 InFlag = Chain.getValue(1); 3529 3530 // Create the CALLSEQ_END node. 
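  // Worked example (illustrative): for an x86-32 stdcall callee taking two
  // i32 arguments, isCalleePop() returns true, so NumBytesForCalleeToPop
  // below becomes NumBytes (8) and the CALLSEQ_END node records that the
  // callee pops those 8 bytes.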
3531   unsigned NumBytesForCalleeToPop;
3532   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3533                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3534     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3535   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3536            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3537            SR == StackStructReturn)
3538     // If this is a call to a struct-return function, the callee
3539     // pops the hidden struct pointer, so we have to push it back.
3540     // This is common for Darwin/X86, Linux & Mingw32 targets.
3541     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3542     NumBytesForCalleeToPop = 4;
3543   else
3544     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3545 
3546   // Returns a flag for retval copy to use.
3547   if (!IsSibcall) {
3548     Chain = DAG.getCALLSEQ_END(Chain,
3549                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3550                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3551                                                      true),
3552                                InFlag, dl);
3553     InFlag = Chain.getValue(1);
3554   }
3555 
3556   // Handle result values, copying them out of physregs into vregs that we
3557   // return.
3558   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3559                          Ins, dl, DAG, InVals);
3560 }
3561 
3562 //===----------------------------------------------------------------------===//
3563 //                Fast Calling Convention (tail call) implementation
3564 //===----------------------------------------------------------------------===//
3565 
3566 // Like stdcall, the callee cleans up the arguments, except that ECX is
3567 // reserved for storing the address of the tail-called function. Only 2
3568 // registers are free for argument passing (inreg). Tail call optimization is
3569 // performed provided:
3570 // * tailcallopt is enabled
3571 // * caller/callee are fastcc
3572 // On the X86_64 architecture, with GOT-style position-independent code, only
3573 // local (within-module) calls are supported at the moment.
3574 // To keep the stack aligned according to the platform ABI, the function
3575 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3576 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's
3577 // dyld, for example.) If a tail-called callee has more arguments than the
3578 // caller, the caller needs to make sure that there is room to move the
3579 // RETADDR to. This is achieved by reserving an area the size of the argument
3580 // delta right after the original RETADDR, but before the saved frame pointer
3581 // or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2,
3582 // arg3, arg4). Stack layout:
3583 //   arg1
3584 //   arg2
3585 //   RETADDR
3586 //   [ new RETADDR
3587 //     move area ]
3588 //   (possible EBP)
3589 //   ESI
3590 //   EDI
3591 //   local1 ..
3592 
3593 /// Round the stack size up to e.g. 16n + 12 for a 16-byte alignment
3594 /// requirement, so that the slot-sized return address completes the alignment.
3595 unsigned
3596 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3597                                                SelectionDAG& DAG) const {
3598   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3599   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3600   unsigned StackAlignment = TFI.getStackAlignment();
3601   uint64_t AlignMask = StackAlignment - 1;
3602   int64_t Offset = StackSize;
3603   unsigned SlotSize = RegInfo->getSlotSize();
3604   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3605     // The misalignment is at most StackAlignment - SlotSize (e.g. 12), so just add the difference.
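    // Worked example (illustrative, StackAlignment = 16, SlotSize = 4):
    //   StackSize = 8  -> (8 & 15) = 8 <= 12, so Offset = 8 + (12 - 8) = 12
    //   StackSize = 13 -> (13 & 15) = 13 > 12, so Offset = 0 + 16 + 12 = 28
    // Either way the result is congruent to 12 modulo 16.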
3606 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 3607 } else { 3608 // Mask out lower bits, add stackalignment once plus the 12 bytes. 3609 Offset = ((~AlignMask) & Offset) + StackAlignment + 3610 (StackAlignment-SlotSize); 3611 } 3612 return Offset; 3613 } 3614 3615 /// Return true if the given stack call argument is already available in the 3616 /// same position (relatively) of the caller's incoming argument stack. 3617 static 3618 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 3619 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 3620 const X86InstrInfo *TII) { 3621 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 3622 int FI = INT_MAX; 3623 if (Arg.getOpcode() == ISD::CopyFromReg) { 3624 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 3625 if (!TargetRegisterInfo::isVirtualRegister(VR)) 3626 return false; 3627 MachineInstr *Def = MRI->getVRegDef(VR); 3628 if (!Def) 3629 return false; 3630 if (!Flags.isByVal()) { 3631 if (!TII->isLoadFromStackSlot(Def, FI)) 3632 return false; 3633 } else { 3634 unsigned Opcode = Def->getOpcode(); 3635 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || 3636 Opcode == X86::LEA64_32r) && 3637 Def->getOperand(1).isFI()) { 3638 FI = Def->getOperand(1).getIndex(); 3639 Bytes = Flags.getByValSize(); 3640 } else 3641 return false; 3642 } 3643 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 3644 if (Flags.isByVal()) 3645 // ByVal argument is passed in as a pointer but it's now being 3646 // dereferenced. e.g. 3647 // define @foo(%struct.X* %A) { 3648 // tail call @bar(%struct.X* byval %A) 3649 // } 3650 return false; 3651 SDValue Ptr = Ld->getBasePtr(); 3652 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 3653 if (!FINode) 3654 return false; 3655 FI = FINode->getIndex(); 3656 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 3657 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 3658 FI = FINode->getIndex(); 3659 Bytes = Flags.getByValSize(); 3660 } else 3661 return false; 3662 3663 assert(FI != INT_MAX); 3664 if (!MFI->isFixedObjectIndex(FI)) 3665 return false; 3666 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 3667 } 3668 3669 /// Check whether the call is eligible for tail call optimization. Targets 3670 /// that want to do tail call optimization should implement this function. 3671 bool X86TargetLowering::IsEligibleForTailCallOptimization( 3672 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 3673 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, 3674 const SmallVectorImpl<ISD::OutputArg> &Outs, 3675 const SmallVectorImpl<SDValue> &OutVals, 3676 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 3677 if (!mayTailCallThisCC(CalleeCC)) 3678 return false; 3679 3680 // If -tailcallopt is specified, make fastcc functions tail-callable. 3681 MachineFunction &MF = DAG.getMachineFunction(); 3682 const Function *CallerF = MF.getFunction(); 3683 3684 // If the function return type is x86_fp80 and the callee return type is not, 3685 // then the FP_EXTEND of the call result is not a nop. It's not safe to 3686 // perform a tailcall optimization here. 
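// For instance (illustrative): a caller declared to return x86_fp80 that
// tail-calls a callee returning double would need an FP_EXTEND from f64 to
// f80 after the call, so the check below rejects that case.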
3687 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 3688 return false; 3689 3690 CallingConv::ID CallerCC = CallerF->getCallingConv(); 3691 bool CCMatch = CallerCC == CalleeCC; 3692 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 3693 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); 3694 3695 // Win64 functions have extra shadow space for argument homing. Don't do the 3696 // sibcall if the caller and callee have mismatched expectations for this 3697 // space. 3698 if (IsCalleeWin64 != IsCallerWin64) 3699 return false; 3700 3701 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 3702 if (canGuaranteeTCO(CalleeCC) && CCMatch) 3703 return true; 3704 return false; 3705 } 3706 3707 // Look for obvious safe cases to perform tail call optimization that do not 3708 // require ABI changes. This is what gcc calls sibcall. 3709 3710 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 3711 // emit a special epilogue. 3712 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 3713 if (RegInfo->needsStackRealignment(MF)) 3714 return false; 3715 3716 // Also avoid sibcall optimization if either caller or callee uses struct 3717 // return semantics. 3718 if (isCalleeStructRet || isCallerStructRet) 3719 return false; 3720 3721 // Do not sibcall optimize vararg calls unless all arguments are passed via 3722 // registers. 3723 if (isVarArg && !Outs.empty()) { 3724 // Optimizing for varargs on Win64 is unlikely to be safe without 3725 // additional testing. 3726 if (IsCalleeWin64 || IsCallerWin64) 3727 return false; 3728 3729 SmallVector<CCValAssign, 16> ArgLocs; 3730 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 3731 *DAG.getContext()); 3732 3733 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3734 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 3735 if (!ArgLocs[i].isRegLoc()) 3736 return false; 3737 } 3738 3739 // If the call result is in ST0 / ST1, it needs to be popped off the x87 3740 // stack. Therefore, if it's not used by the call it is not safe to optimize 3741 // this into a sibcall. 3742 bool Unused = false; 3743 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3744 if (!Ins[i].Used) { 3745 Unused = true; 3746 break; 3747 } 3748 } 3749 if (Unused) { 3750 SmallVector<CCValAssign, 16> RVLocs; 3751 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, 3752 *DAG.getContext()); 3753 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 3754 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 3755 CCValAssign &VA = RVLocs[i]; 3756 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) 3757 return false; 3758 } 3759 } 3760 3761 // If the calling conventions do not match, then we'd better make sure the 3762 // results are returned in the same way as what the caller expects. 
3763 if (!CCMatch) { 3764 SmallVector<CCValAssign, 16> RVLocs1; 3765 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, 3766 *DAG.getContext()); 3767 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 3768 3769 SmallVector<CCValAssign, 16> RVLocs2; 3770 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, 3771 *DAG.getContext()); 3772 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 3773 3774 if (RVLocs1.size() != RVLocs2.size()) 3775 return false; 3776 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 3777 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 3778 return false; 3779 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 3780 return false; 3781 if (RVLocs1[i].isRegLoc()) { 3782 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 3783 return false; 3784 } else { 3785 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 3786 return false; 3787 } 3788 } 3789 } 3790 3791 unsigned StackArgsSize = 0; 3792 3793 // If the callee takes no arguments then go on to check the results of the 3794 // call. 3795 if (!Outs.empty()) { 3796 // Check if stack adjustment is needed. For now, do not do this if any 3797 // argument is passed on the stack. 3798 SmallVector<CCValAssign, 16> ArgLocs; 3799 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 3800 *DAG.getContext()); 3801 3802 // Allocate shadow area for Win64 3803 if (IsCalleeWin64) 3804 CCInfo.AllocateStack(32, 8); 3805 3806 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3807 StackArgsSize = CCInfo.getNextStackOffset(); 3808 3809 if (CCInfo.getNextStackOffset()) { 3810 // Check if the arguments are already laid out in the right way as 3811 // the caller's fixed stack objects. 3812 MachineFrameInfo *MFI = MF.getFrameInfo(); 3813 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3814 const X86InstrInfo *TII = Subtarget->getInstrInfo(); 3815 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3816 CCValAssign &VA = ArgLocs[i]; 3817 SDValue Arg = OutVals[i]; 3818 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3819 if (VA.getLocInfo() == CCValAssign::Indirect) 3820 return false; 3821 if (!VA.isRegLoc()) { 3822 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3823 MFI, MRI, TII)) 3824 return false; 3825 } 3826 } 3827 } 3828 3829 // If the tailcall address may be in a register, then make sure it's 3830 // possible to register allocate for it. In 32-bit, the call address can 3831 // only target EAX, EDX, or ECX since the tail call must be scheduled after 3832 // callee-saved registers are restored. These happen to be the same 3833 // registers used to pass 'inreg' arguments so watch out for those. 3834 if (!Subtarget->is64Bit() && 3835 ((!isa<GlobalAddressSDNode>(Callee) && 3836 !isa<ExternalSymbolSDNode>(Callee)) || 3837 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { 3838 unsigned NumInRegs = 0; 3839 // In PIC we need an extra register to formulate the address computation 3840 // for the callee. 3841 unsigned MaxInRegs = 3842 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; 3843 3844 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3845 CCValAssign &VA = ArgLocs[i]; 3846 if (!VA.isRegLoc()) 3847 continue; 3848 unsigned Reg = VA.getLocReg(); 3849 switch (Reg) { 3850 default: break; 3851 case X86::EAX: case X86::EDX: case X86::ECX: 3852 if (++NumInRegs == MaxInRegs) 3853 return false; 3854 break; 3855 } 3856 } 3857 } 3858 } 3859 3860 bool CalleeWillPop = 3861 X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, 3862 MF.getTarget().Options.GuaranteedTailCallOpt); 3863 3864 if (unsigned BytesToPop = 3865 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { 3866 // If we have bytes to pop, the callee must pop them. 3867 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; 3868 if (!CalleePopMatches) 3869 return false; 3870 } else if (CalleeWillPop && StackArgsSize > 0) { 3871 // If we don't have bytes to pop, make sure the callee doesn't pop any. 3872 return false; 3873 } 3874 3875 return true; 3876 } 3877 3878 FastISel * 3879 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3880 const TargetLibraryInfo *libInfo) const { 3881 return X86::createFastISel(funcInfo, libInfo); 3882 } 3883 3884 //===----------------------------------------------------------------------===// 3885 // Other Lowering Hooks 3886 //===----------------------------------------------------------------------===// 3887 3888 static bool MayFoldLoad(SDValue Op) { 3889 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3890 } 3891 3892 static bool MayFoldIntoStore(SDValue Op) { 3893 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3894 } 3895 3896 static bool isTargetShuffle(unsigned Opcode) { 3897 switch(Opcode) { 3898 default: return false; 3899 case X86ISD::BLENDI: 3900 case X86ISD::PSHUFB: 3901 case X86ISD::PSHUFD: 3902 case X86ISD::PSHUFHW: 3903 case X86ISD::PSHUFLW: 3904 case X86ISD::SHUFP: 3905 case X86ISD::PALIGNR: 3906 case X86ISD::MOVLHPS: 3907 case X86ISD::MOVLHPD: 3908 case X86ISD::MOVHLPS: 3909 case X86ISD::MOVLPS: 3910 case X86ISD::MOVLPD: 3911 case X86ISD::MOVSHDUP: 3912 case X86ISD::MOVSLDUP: 3913 case X86ISD::MOVDDUP: 3914 case X86ISD::MOVSS: 3915 case X86ISD::MOVSD: 3916 case X86ISD::UNPCKL: 3917 case X86ISD::UNPCKH: 3918 case X86ISD::VPERMILPI: 3919 case X86ISD::VPERM2X128: 3920 case X86ISD::VPERMI: 3921 case X86ISD::VPERMV: 3922 case X86ISD::VPERMV3: 3923 return true; 3924 } 3925 } 3926 3927 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, 3928 SDValue V1, unsigned TargetMask, 3929 SelectionDAG &DAG) { 3930 switch(Opc) { 3931 default: llvm_unreachable("Unknown x86 shuffle node"); 3932 case X86ISD::PSHUFD: 3933 case X86ISD::PSHUFHW: 3934 case X86ISD::PSHUFLW: 3935 case X86ISD::VPERMILPI: 3936 case X86ISD::VPERMI: 3937 return DAG.getNode(Opc, dl, VT, V1, 3938 DAG.getConstant(TargetMask, dl, MVT::i8)); 3939 } 3940 } 3941 3942 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, 3943 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3944 switch(Opc) { 3945 default: llvm_unreachable("Unknown x86 shuffle node"); 3946 case X86ISD::MOVLHPS: 3947 case X86ISD::MOVLHPD: 3948 case X86ISD::MOVHLPS: 3949 case X86ISD::MOVLPS: 3950 case X86ISD::MOVLPD: 3951 case X86ISD::MOVSS: 3952 case X86ISD::MOVSD: 3953 case X86ISD::UNPCKL: 3954 case X86ISD::UNPCKH: 3955 return DAG.getNode(Opc, dl, VT, V1, V2); 3956 } 3957 } 3958 3959 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3960 MachineFunction &MF = DAG.getMachineFunction(); 3961 const 
X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3962   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3963   int ReturnAddrIndex = FuncInfo->getRAIndex();
3964 
3965   if (ReturnAddrIndex == 0) {
3966     // Set up a frame object for the return address.
3967     unsigned SlotSize = RegInfo->getSlotSize();
3968     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3969                                                            -(int64_t)SlotSize,
3970                                                            false);
3971     FuncInfo->setRAIndex(ReturnAddrIndex);
3972   }
3973 
3974   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3975 }
3976 
3977 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3978                                        bool hasSymbolicDisplacement) {
3979   // Offset should fit into 32 bit immediate field.
3980   if (!isInt<32>(Offset))
3981     return false;
3982 
3983   // If we don't have a symbolic displacement - we don't have any extra
3984   // restrictions.
3985   if (!hasSymbolicDisplacement)
3986     return true;
3987 
3988   // FIXME: Some tweaks might be needed for medium code model.
3989   if (M != CodeModel::Small && M != CodeModel::Kernel)
3990     return false;
3991 
3992   // For the small code model, we assume that the last object ends at least
3993   // 16MB before the 31-bit address boundary. We may also accept fairly large
3994   // negative constants, knowing that all objects live in the positive half of
3995   // the address space.
3996   if (M == CodeModel::Small && Offset < 16*1024*1024)
3997     return true;
3998 
3999   // For the kernel code model, we know that all objects reside in the negative
4000   // half of the 32-bit address space. We must not accept negative offsets,
4001   // since they might fall just outside the known objects, but we may accept
4002   // fairly large positive ones.
4003   if (M == CodeModel::Kernel && Offset >= 0)
4004     return true;
4005 
4006   return false;
4007 }
4008 
4009 /// Determines whether the callee is required to pop its own arguments.
4010 /// Callee pop is necessary to support tail calls.
4011 bool X86::isCalleePop(CallingConv::ID CallingConv,
4012                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4013   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4014   // can guarantee TCO.
4015   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4016     return true;
4017 
4018   switch (CallingConv) {
4019   default:
4020     return false;
4021   case CallingConv::X86_StdCall:
4022   case CallingConv::X86_FastCall:
4023   case CallingConv::X86_ThisCall:
4024   case CallingConv::X86_VectorCall:
4025     return !is64Bit;
4026   }
4027 }
4028 
/// \brief Return true if the condition is an unsigned comparison operation.
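/// For example, COND_B ("below") tests CF and corresponds to an unsigned
/// compare, while COND_L ("less") tests SF != OF and is its signed
/// counterpart; compare the mapping in TranslateIntegerX86CC below.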
4028 static bool isX86CCUnsigned(unsigned X86CC) { 4029 switch (X86CC) { 4030 default: llvm_unreachable("Invalid integer condition!"); 4031 case X86::COND_E: return true; 4032 case X86::COND_G: return false; 4033 case X86::COND_GE: return false; 4034 case X86::COND_L: return false; 4035 case X86::COND_LE: return false; 4036 case X86::COND_NE: return true; 4037 case X86::COND_B: return true; 4038 case X86::COND_A: return true; 4039 case X86::COND_BE: return true; 4040 case X86::COND_AE: return true; 4041 } 4042 } 4043 4044 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { 4045 switch (SetCCOpcode) { 4046 default: llvm_unreachable("Invalid integer condition!"); 4047 case ISD::SETEQ: return X86::COND_E; 4048 case ISD::SETGT: return X86::COND_G; 4049 case ISD::SETGE: return X86::COND_GE; 4050 case ISD::SETLT: return X86::COND_L; 4051 case ISD::SETLE: return X86::COND_LE; 4052 case ISD::SETNE: return X86::COND_NE; 4053 case ISD::SETULT: return X86::COND_B; 4054 case ISD::SETUGT: return X86::COND_A; 4055 case ISD::SETULE: return X86::COND_BE; 4056 case ISD::SETUGE: return X86::COND_AE; 4057 } 4058 } 4059 4060 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific 4061 /// condition code, returning the condition code and the LHS/RHS of the 4062 /// comparison to make. 4063 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, 4064 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 4065 if (!isFP) { 4066 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 4067 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 4068 // X > -1 -> X == 0, jump !sign. 4069 RHS = DAG.getConstant(0, DL, RHS.getValueType()); 4070 return X86::COND_NS; 4071 } 4072 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 4073 // X < 0 -> X == 0, jump on sign. 4074 return X86::COND_S; 4075 } 4076 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 4077 // X < 1 -> X <= 0 4078 RHS = DAG.getConstant(0, DL, RHS.getValueType()); 4079 return X86::COND_LE; 4080 } 4081 } 4082 4083 return TranslateIntegerX86CC(SetCCOpcode); 4084 } 4085 4086 // First determine if it is required or is profitable to flip the operands. 4087 4088 // If LHS is a foldable load, but RHS is not, flip the condition. 
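  // e.g. an FP setolt with the load on the LHS becomes setogt with the
  // operands swapped (via getSetCCSwappedOperands), so the load ends up on
  // the RHS where it can fold into the compare's memory operand.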
4089 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4090 !ISD::isNON_EXTLoad(RHS.getNode())) {
4091 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4092 std::swap(LHS, RHS);
4093 }
4094
4095 switch (SetCCOpcode) {
4096 default: break;
4097 case ISD::SETOLT:
4098 case ISD::SETOLE:
4099 case ISD::SETUGT:
4100 case ISD::SETUGE:
4101 std::swap(LHS, RHS);
4102 break;
4103 }
4104
4105 // On a floating point condition, the flags are set as follows:
4106 // ZF PF CF op
4107 // 0 | 0 | 0 | X > Y
4108 // 0 | 0 | 1 | X < Y
4109 // 1 | 0 | 0 | X == Y
4110 // 1 | 1 | 1 | unordered
4111 switch (SetCCOpcode) {
4112 default: llvm_unreachable("Condcode should be pre-legalized away");
4113 case ISD::SETUEQ:
4114 case ISD::SETEQ: return X86::COND_E;
4115 case ISD::SETOLT: // flipped
4116 case ISD::SETOGT:
4117 case ISD::SETGT: return X86::COND_A;
4118 case ISD::SETOLE: // flipped
4119 case ISD::SETOGE:
4120 case ISD::SETGE: return X86::COND_AE;
4121 case ISD::SETUGT: // flipped
4122 case ISD::SETULT:
4123 case ISD::SETLT: return X86::COND_B;
4124 case ISD::SETUGE: // flipped
4125 case ISD::SETULE:
4126 case ISD::SETLE: return X86::COND_BE;
4127 case ISD::SETONE:
4128 case ISD::SETNE: return X86::COND_NE;
4129 case ISD::SETUO: return X86::COND_P;
4130 case ISD::SETO: return X86::COND_NP;
4131 case ISD::SETOEQ:
4132 case ISD::SETUNE: return X86::COND_INVALID;
4133 }
4134 }
4135
4136 /// Is there a floating point cmov for the specific X86 condition code?
4137 /// The current x86 ISA includes the following FP cmov instructions:
4138 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4139 static bool hasFPCMov(unsigned X86CC) {
4140 switch (X86CC) {
4141 default:
4142 return false;
4143 case X86::COND_B:
4144 case X86::COND_BE:
4145 case X86::COND_E:
4146 case X86::COND_P:
4147 case X86::COND_A:
4148 case X86::COND_AE:
4149 case X86::COND_NE:
4150 case X86::COND_NP:
4151 return true;
4152 }
4153 }
4154
4155 /// Returns true if the target can instruction select the
4156 /// specified FP immediate natively. If false, the legalizer will
4157 /// materialize the FP immediate as a load from a constant pool.
4158 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4159 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4160 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4161 return true;
4162 }
4163 return false;
4164 }
4165
4166 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4167 ISD::LoadExtType ExtTy,
4168 EVT NewVT) const {
4169 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4170 // relocation targets a movq or addq instruction: don't let the load shrink.
4171 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4172 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4173 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4174 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4175 return true;
4176 }
4177
4178 /// \brief Returns true if it is beneficial to convert a load of a constant
4179 /// to just the constant itself.
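/// On x86 this is nearly always profitable for legal integer widths, since
/// immediates of up to 64 bits can be materialized directly (e.g. movabsq on
/// x86-64) rather than loaded from the constant pool.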
4180 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4181 Type *Ty) const {
4182 assert(Ty->isIntegerTy());
4183
4184 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4185 if (BitSize == 0 || BitSize > 64)
4186 return false;
4187 return true;
4188 }
4189
4190 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4191 unsigned Index) const {
4192 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4193 return false;
4194
4195 return (Index == 0 || Index == ResVT.getVectorNumElements());
4196 }
4197
4198 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4199 // Speculate cttz only if we can directly use TZCNT.
4200 return Subtarget->hasBMI();
4201 }
4202
4203 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4204 // Speculate ctlz only if we can directly use LZCNT.
4205 return Subtarget->hasLZCNT();
4206 }
4207
4208 /// Return true if every element in Mask, beginning
4209 /// from position Pos and ending in Pos+Size, is undef.
4210 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4211 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4212 if (0 <= Mask[i])
4213 return false;
4214 return true;
4215 }
4216
4217 /// Return true if Val is undef or if its value falls within the
4218 /// specified range [Low, Hi).
4219 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4220 return (Val < 0) || (Val >= Low && Val < Hi);
4221 }
4222
4223 /// Val is either less than zero (undef) or equal to the specified value.
4224 static bool isUndefOrEqual(int Val, int CmpVal) {
4225 return (Val < 0 || Val == CmpVal);
4226 }
4227
4228 /// Return true if every element in Mask, beginning
4229 /// from position Pos and ending in Pos+Size, falls within the specified
4230 /// sequential range [Low, Low+Size) or is undef.
4231 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4232 unsigned Pos, unsigned Size, int Low) {
4233 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4234 if (!isUndefOrEqual(Mask[i], Low))
4235 return false;
4236 return true;
4237 }
4238
4239 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4240 /// extract that is suitable for instructions that extract 128 or 256-bit vectors.
4241 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4242 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4243 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4244 return false;
4245
4246 // The index should be aligned on a vecWidth-bit boundary.
4247 uint64_t Index =
4248 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4249
4250 MVT VT = N->getSimpleValueType(0);
4251 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4252 bool Result = (Index * ElSize) % vecWidth == 0;
4253
4254 return Result;
4255 }
4256
4257 /// Return true if the specified INSERT_SUBVECTOR
4258 /// operand specifies a subvector insert that is suitable for inserting
4259 /// 128 or 256-bit subvectors.
4260 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4261 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4262 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4263 return false;
4264 // The index should be aligned on a vecWidth-bit boundary.
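// For example, a 128-bit insert into a v8i32 (32-bit elements) is only
// suitable at element indices 0 and 4, since Index * 32 must be a multiple
// of 128.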
4265 uint64_t Index =
4266 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4267
4268 MVT VT = N->getSimpleValueType(0);
4269 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4270 bool Result = (Index * ElSize) % vecWidth == 0;
4271
4272 return Result;
4273 }
4274
4275 bool X86::isVINSERT128Index(SDNode *N) {
4276 return isVINSERTIndex(N, 128);
4277 }
4278
4279 bool X86::isVINSERT256Index(SDNode *N) {
4280 return isVINSERTIndex(N, 256);
4281 }
4282
4283 bool X86::isVEXTRACT128Index(SDNode *N) {
4284 return isVEXTRACTIndex(N, 128);
4285 }
4286
4287 bool X86::isVEXTRACT256Index(SDNode *N) {
4288 return isVEXTRACTIndex(N, 256);
4289 }
4290
4291 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4292 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4293 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4294 "Illegal extract subvector for VEXTRACT");
4295
4296 uint64_t Index =
4297 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4298
4299 MVT VecVT = N->getOperand(0).getSimpleValueType();
4300 MVT ElVT = VecVT.getVectorElementType();
4301
4302 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4303 return Index / NumElemsPerChunk;
4304 }
4305
4306 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4307 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4308 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4309 "Illegal insert subvector for VINSERT");
4310
4311 uint64_t Index =
4312 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4313
4314 MVT VecVT = N->getSimpleValueType(0);
4315 MVT ElVT = VecVT.getVectorElementType();
4316
4317 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4318 return Index / NumElemsPerChunk;
4319 }
4320
4321 /// Return the appropriate immediate to extract the specified
4322 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4323 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4324 return getExtractVEXTRACTImmediate(N, 128);
4325 }
4326
4327 /// Return the appropriate immediate to extract the specified
4328 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4329 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4330 return getExtractVEXTRACTImmediate(N, 256);
4331 }
4332
4333 /// Return the appropriate immediate to insert at the specified
4334 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4335 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4336 return getInsertVINSERTImmediate(N, 128);
4337 }
4338
4339 /// Return the appropriate immediate to insert at the specified
4340 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4341 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4342 return getInsertVINSERTImmediate(N, 256);
4343 }
4344
4345 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4346 bool X86::isZeroNode(SDValue Elt) {
4347 return isNullConstant(Elt) || isNullFPConstant(Elt);
4348 }
4349
4350 // Build a vector of constants.
4351 // Use an UNDEF node if MaskElt == -1.
4352 // Split 64-bit constants in 32-bit mode.
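// For example, Values = {1, 2} for v2i64 in 32-bit mode is built as the
// v4i32 vector <1, 0, 2, 0> (low half of each constant first, matching
// little-endian layout) and then bitcast back to v2i64.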
4353 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, 4354 SelectionDAG &DAG, 4355 SDLoc dl, bool IsMask = false) { 4356 4357 SmallVector<SDValue, 32> Ops; 4358 bool Split = false; 4359 4360 MVT ConstVecVT = VT; 4361 unsigned NumElts = VT.getVectorNumElements(); 4362 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); 4363 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { 4364 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); 4365 Split = true; 4366 } 4367 4368 MVT EltVT = ConstVecVT.getVectorElementType(); 4369 for (unsigned i = 0; i < NumElts; ++i) { 4370 bool IsUndef = Values[i] < 0 && IsMask; 4371 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : 4372 DAG.getConstant(Values[i], dl, EltVT); 4373 Ops.push_back(OpNode); 4374 if (Split) 4375 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : 4376 DAG.getConstant(0, dl, EltVT)); 4377 } 4378 SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); 4379 if (Split) 4380 ConstsNode = DAG.getBitcast(VT, ConstsNode); 4381 return ConstsNode; 4382 } 4383 4384 /// Returns a vector of specified type with all zero elements. 4385 static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, 4386 SelectionDAG &DAG, SDLoc dl) { 4387 assert(VT.isVector() && "Expected a vector type"); 4388 4389 // Always build SSE zero vectors as <4 x i32> bitcasted 4390 // to their dest type. This ensures they get CSE'd. 4391 SDValue Vec; 4392 if (VT.is128BitVector()) { // SSE 4393 if (Subtarget->hasSSE2()) { // SSE2 4394 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); 4395 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4396 } else { // SSE1 4397 SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); 4398 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4399 } 4400 } else if (VT.is256BitVector()) { // AVX 4401 if (Subtarget->hasInt256()) { // AVX2 4402 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); 4403 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4404 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); 4405 } else { 4406 // 256-bit logic and arithmetic instructions in AVX are all 4407 // floating-point, no support for integer ops. Emit fp zeroed vectors. 
4408 SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); 4409 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4410 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); 4411 } 4412 } else if (VT.is512BitVector()) { // AVX-512 4413 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); 4414 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 4415 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4416 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); 4417 } else if (VT.getVectorElementType() == MVT::i1) { 4418 4419 assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) 4420 && "Unexpected vector type"); 4421 assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) 4422 && "Unexpected vector type"); 4423 SDValue Cst = DAG.getConstant(0, dl, MVT::i1); 4424 SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); 4425 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 4426 } else 4427 llvm_unreachable("Unexpected vector type"); 4428 4429 return DAG.getBitcast(VT, Vec); 4430 } 4431 4432 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, 4433 SelectionDAG &DAG, SDLoc dl, 4434 unsigned vectorWidth) { 4435 assert((vectorWidth == 128 || vectorWidth == 256) && 4436 "Unsupported vector width"); 4437 EVT VT = Vec.getValueType(); 4438 EVT ElVT = VT.getVectorElementType(); 4439 unsigned Factor = VT.getSizeInBits()/vectorWidth; 4440 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, 4441 VT.getVectorNumElements()/Factor); 4442 4443 // Extract from UNDEF is UNDEF. 4444 if (Vec.getOpcode() == ISD::UNDEF) 4445 return DAG.getUNDEF(ResultVT); 4446 4447 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR 4448 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); 4449 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); 4450 4451 // This is the index of the first element of the vectorWidth-bit chunk 4452 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. 4453 IdxVal &= ~(ElemsPerChunk - 1); 4454 4455 // If the input is a buildvector just emit a smaller one. 4456 if (Vec.getOpcode() == ISD::BUILD_VECTOR) 4457 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, 4458 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); 4459 4460 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); 4461 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); 4462 } 4463 4464 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This 4465 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 4466 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 4467 /// instructions or a simple subregister reference. Idx is an index in the 4468 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes 4469 /// lowering EXTRACT_VECTOR_ELT operations easier. 4470 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, 4471 SelectionDAG &DAG, SDLoc dl) { 4472 assert((Vec.getValueType().is256BitVector() || 4473 Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); 4474 return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); 4475 } 4476 4477 /// Generate a DAG to grab 256-bits from a 512-bit vector. 
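/// As with the 128-bit variant, IdxVal need not be 256-bit aligned: it is
/// rounded down to a 256-bit boundary, so e.g. index 11 into a v16f32
/// extracts elements 8-15 as a v8f32.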
4478 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, 4479 SelectionDAG &DAG, SDLoc dl) { 4480 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); 4481 return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); 4482 } 4483 4484 static SDValue InsertSubVector(SDValue Result, SDValue Vec, 4485 unsigned IdxVal, SelectionDAG &DAG, 4486 SDLoc dl, unsigned vectorWidth) { 4487 assert((vectorWidth == 128 || vectorWidth == 256) && 4488 "Unsupported vector width"); 4489 // Inserting UNDEF is Result 4490 if (Vec.getOpcode() == ISD::UNDEF) 4491 return Result; 4492 EVT VT = Vec.getValueType(); 4493 EVT ElVT = VT.getVectorElementType(); 4494 EVT ResultVT = Result.getValueType(); 4495 4496 // Insert the relevant vectorWidth bits. 4497 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); 4498 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); 4499 4500 // This is the index of the first element of the vectorWidth-bit chunk 4501 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. 4502 IdxVal &= ~(ElemsPerChunk - 1); 4503 4504 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); 4505 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); 4506 } 4507 4508 /// Generate a DAG to put 128-bits into a vector > 128 bits. This 4509 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or 4510 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a 4511 /// simple superregister reference. Idx is an index in the 128 bits 4512 /// we want. It need not be aligned to a 128-bit boundary. That makes 4513 /// lowering INSERT_VECTOR_ELT operations easier. 4514 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, 4515 SelectionDAG &DAG, SDLoc dl) { 4516 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); 4517 4518 // For insertion into the zero index (low half) of a 256-bit vector, it is 4519 // more efficient to generate a blend with immediate instead of an insert*128. 4520 // We are still creating an INSERT_SUBVECTOR below with an undef node to 4521 // extend the subvector to the size of the result vector. Make sure that 4522 // we are not recursing on that node by checking for undef here. 4523 if (IdxVal == 0 && Result.getValueType().is256BitVector() && 4524 Result.getOpcode() != ISD::UNDEF) { 4525 EVT ResultVT = Result.getValueType(); 4526 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); 4527 SDValue Undef = DAG.getUNDEF(ResultVT); 4528 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, 4529 Vec, ZeroIndex); 4530 4531 // The blend instruction, and therefore its mask, depend on the data type. 4532 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); 4533 if (ScalarType.isFloatingPoint()) { 4534 // Choose either vblendps (float) or vblendpd (double). 4535 unsigned ScalarSize = ScalarType.getSizeInBits(); 4536 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); 4537 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; 4538 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); 4539 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); 4540 } 4541 4542 const X86Subtarget &Subtarget = 4543 static_cast<const X86Subtarget &>(DAG.getSubtarget()); 4544 4545 // AVX2 is needed for 256-bit integer blend support. 4546 // Integers must be cast to 32-bit because there is only vpblendd; 4547 // vpblendw can't be used for this because it has a handicapped mask. 
4548 4549 // If we don't have AVX2, then cast to float. Using a wrong domain blend 4550 // is still more efficient than using the wrong domain vinsertf128 that 4551 // will be created by InsertSubVector(). 4552 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; 4553 4554 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); 4555 Vec256 = DAG.getBitcast(CastVT, Vec256); 4556 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); 4557 return DAG.getBitcast(ResultVT, Vec256); 4558 } 4559 4560 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); 4561 } 4562 4563 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, 4564 SelectionDAG &DAG, SDLoc dl) { 4565 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); 4566 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); 4567 } 4568 4569 /// Insert i1-subvector to i1-vector. 4570 static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { 4571 4572 SDLoc dl(Op); 4573 SDValue Vec = Op.getOperand(0); 4574 SDValue SubVec = Op.getOperand(1); 4575 SDValue Idx = Op.getOperand(2); 4576 4577 if (!isa<ConstantSDNode>(Idx)) 4578 return SDValue(); 4579 4580 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 4581 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal 4582 return Op; 4583 4584 MVT OpVT = Op.getSimpleValueType(); 4585 MVT SubVecVT = SubVec.getSimpleValueType(); 4586 unsigned NumElems = OpVT.getVectorNumElements(); 4587 unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); 4588 4589 assert(IdxVal + SubVecNumElems <= NumElems && 4590 IdxVal % SubVecVT.getSizeInBits() == 0 && 4591 "Unexpected index value in INSERT_SUBVECTOR"); 4592 4593 // There are 3 possible cases: 4594 // 1. Subvector should be inserted in the lower part (IdxVal == 0) 4595 // 2. Subvector should be inserted in the upper part 4596 // (IdxVal + SubVecNumElems == NumElems) 4597 // 3. Subvector should be inserted in the middle (for example v2i1 4598 // to v16i1, index 2) 4599 4600 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); 4601 SDValue Undef = DAG.getUNDEF(OpVT); 4602 SDValue WideSubVec = 4603 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); 4604 if (Vec.isUndef()) 4605 return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, 4606 DAG.getConstant(IdxVal, dl, MVT::i8)); 4607 4608 if (ISD::isBuildVectorAllZeros(Vec.getNode())) { 4609 unsigned ShiftLeft = NumElems - SubVecNumElems; 4610 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; 4611 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, 4612 DAG.getConstant(ShiftLeft, dl, MVT::i8)); 4613 return ShiftRight ? 
DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
4614 DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec;
4615 }
4616
4617 if (IdxVal == 0) {
4618 // Zero lower bits of the Vec
4619 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4620 Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
4621 Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
4622 // Merge them together
4623 return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
4624 }
4625
4626 // Simple case when we put subvector in the upper part
4627 if (IdxVal + SubVecNumElems == NumElems) {
4628 // Shift the subvector into place and zero the upper bits of the Vec.
4629 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
4630 DAG.getConstant(IdxVal, dl, MVT::i8));
4631 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4632 Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
4633 Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
4634 return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
4635 }
4636 // Subvector should be inserted in the middle - use shuffle
4637 SmallVector<int, 64> Mask;
4638 for (unsigned i = 0; i < NumElems; ++i)
4639 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
4640 i : i + NumElems);
4641 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
4642 }
4643
4644 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
4645 /// instructions. This is used because creating CONCAT_VECTORS nodes of
4646 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4647 /// large BUILD_VECTORS.
4648 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4649 unsigned NumElems, SelectionDAG &DAG,
4650 SDLoc dl) {
4651 SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4652 return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
4653 }
4654
4655 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4656 unsigned NumElems, SelectionDAG &DAG,
4657 SDLoc dl) {
4658 SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4659 return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
4660 }
4661
4662 /// Returns a vector of specified type with all bits set.
4663 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4664 /// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
4665 /// Then bitcast to their original type, ensuring they get CSE'd.
4666 static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
4667 SelectionDAG &DAG, SDLoc dl) {
4668 assert(VT.isVector() && "Expected a vector type");
4669
4670 SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
4671 SDValue Vec;
4672 if (VT.is512BitVector()) {
4673 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
4674 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4675 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
4676 } else if (VT.is256BitVector()) {
4677 if (Subtarget->hasInt256()) { // AVX2
4678 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4679 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
4680 } else { // AVX
4681 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4682 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4683 }
4684 } else if (VT.is128BitVector()) {
4685 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4686 } else
4687 llvm_unreachable("Unexpected vector type");
4688
4689 return DAG.getBitcast(VT, Vec);
4690 }
4691
4692 /// Returns a vector_shuffle node for an unpackl operation.
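/// For example, for v4i32 this builds the mask <0, 4, 1, 5>, interleaving
/// the low halves of V1 and V2 exactly like punpckldq.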
4693 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4694 SDValue V2) {
4695 unsigned NumElems = VT.getVectorNumElements();
4696 SmallVector<int, 8> Mask;
4697 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4698 Mask.push_back(i);
4699 Mask.push_back(i + NumElems);
4700 }
4701 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4702 }
4703
4704 /// Returns a vector_shuffle node for an unpackh operation.
4705 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
4706 SDValue V2) {
4707 unsigned NumElems = VT.getVectorNumElements();
4708 SmallVector<int, 8> Mask;
4709 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4710 Mask.push_back(i + Half);
4711 Mask.push_back(i + NumElems + Half);
4712 }
4713 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4714 }
4715
4716 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4717 /// This produces a shuffle where the low element of V2 is swizzled into the
4718 /// zero/undef vector, landing at element Idx.
4719 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4720 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4721 bool IsZero,
4722 const X86Subtarget *Subtarget,
4723 SelectionDAG &DAG) {
4724 MVT VT = V2.getSimpleValueType();
4725 SDValue V1 = IsZero
4726 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4727 unsigned NumElems = VT.getVectorNumElements();
4728 SmallVector<int, 16> MaskVec;
4729 for (unsigned i = 0; i != NumElems; ++i)
4730 // If this is the insertion idx, put the low elt of V2 here.
4731 MaskVec.push_back(i == Idx ? NumElems : i);
4732 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
4733 }
4734
4735 /// Calculates the shuffle mask corresponding to the target-specific opcode.
4736 /// Returns true if the Mask could be calculated. Sets IsUnary to true if the
4737 /// node uses only one source. Note that this will set IsUnary for shuffles
4738 /// which use a single input multiple times, and in those cases it will
4739 /// adjust the mask to only have indices within that single input.
4740 /// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
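/// For example, an X86ISD::UNPCKL of two v4i32 inputs decodes to the mask
/// <0, 4, 1, 5>; if both operands are the same node, IsUnary is set and the
/// mask is remapped to <0, 0, 1, 1> so every index refers to the first input.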
4741 static bool getTargetShuffleMask(SDNode *N, MVT VT, 4742 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4743 unsigned NumElems = VT.getVectorNumElements(); 4744 SDValue ImmN; 4745 4746 IsUnary = false; 4747 bool IsFakeUnary = false; 4748 switch(N->getOpcode()) { 4749 case X86ISD::BLENDI: 4750 ImmN = N->getOperand(N->getNumOperands()-1); 4751 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4752 break; 4753 case X86ISD::SHUFP: 4754 ImmN = N->getOperand(N->getNumOperands()-1); 4755 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4756 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4757 break; 4758 case X86ISD::UNPCKH: 4759 DecodeUNPCKHMask(VT, Mask); 4760 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4761 break; 4762 case X86ISD::UNPCKL: 4763 DecodeUNPCKLMask(VT, Mask); 4764 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4765 break; 4766 case X86ISD::MOVHLPS: 4767 DecodeMOVHLPSMask(NumElems, Mask); 4768 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4769 break; 4770 case X86ISD::MOVLHPS: 4771 DecodeMOVLHPSMask(NumElems, Mask); 4772 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4773 break; 4774 case X86ISD::PALIGNR: 4775 ImmN = N->getOperand(N->getNumOperands()-1); 4776 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4777 break; 4778 case X86ISD::PSHUFD: 4779 case X86ISD::VPERMILPI: 4780 ImmN = N->getOperand(N->getNumOperands()-1); 4781 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4782 IsUnary = true; 4783 break; 4784 case X86ISD::PSHUFHW: 4785 ImmN = N->getOperand(N->getNumOperands()-1); 4786 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4787 IsUnary = true; 4788 break; 4789 case X86ISD::PSHUFLW: 4790 ImmN = N->getOperand(N->getNumOperands()-1); 4791 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4792 IsUnary = true; 4793 break; 4794 case X86ISD::PSHUFB: { 4795 IsUnary = true; 4796 SDValue MaskNode = N->getOperand(1); 4797 while (MaskNode->getOpcode() == ISD::BITCAST) 4798 MaskNode = MaskNode->getOperand(0); 4799 4800 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { 4801 // If we have a build-vector, then things are easy. 4802 MVT VT = MaskNode.getSimpleValueType(); 4803 assert(VT.isVector() && 4804 "Can't produce a non-vector with a build_vector!"); 4805 if (!VT.isInteger()) 4806 return false; 4807 4808 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; 4809 4810 SmallVector<uint64_t, 32> RawMask; 4811 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { 4812 SDValue Op = MaskNode->getOperand(i); 4813 if (Op->getOpcode() == ISD::UNDEF) { 4814 RawMask.push_back((uint64_t)SM_SentinelUndef); 4815 continue; 4816 } 4817 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); 4818 if (!CN) 4819 return false; 4820 APInt MaskElement = CN->getAPIntValue(); 4821 4822 // We now have to decode the element which could be any integer size and 4823 // extract each byte of it. 4824 for (int j = 0; j < NumBytesPerElement; ++j) { 4825 // Note that this is x86 and so always little endian: the low byte is 4826 // the first byte of the mask. 
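// For example, a v4i32 mask element 0x80030201 contributes the byte
// sequence 0x01, 0x02, 0x03, 0x80.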
4827 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); 4828 MaskElement = MaskElement.lshr(8); 4829 } 4830 } 4831 DecodePSHUFBMask(RawMask, Mask); 4832 break; 4833 } 4834 4835 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); 4836 if (!MaskLoad) 4837 return false; 4838 4839 SDValue Ptr = MaskLoad->getBasePtr(); 4840 if (Ptr->getOpcode() == X86ISD::Wrapper || 4841 Ptr->getOpcode() == X86ISD::WrapperRIP) 4842 Ptr = Ptr->getOperand(0); 4843 4844 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); 4845 if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) 4846 return false; 4847 4848 if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { 4849 DecodePSHUFBMask(C, Mask); 4850 if (Mask.empty()) 4851 return false; 4852 break; 4853 } 4854 4855 return false; 4856 } 4857 case X86ISD::VPERMI: 4858 ImmN = N->getOperand(N->getNumOperands()-1); 4859 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4860 IsUnary = true; 4861 break; 4862 case X86ISD::MOVSS: 4863 case X86ISD::MOVSD: 4864 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); 4865 break; 4866 case X86ISD::VPERM2X128: 4867 ImmN = N->getOperand(N->getNumOperands()-1); 4868 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4869 if (Mask.empty()) return false; 4870 // Mask only contains negative index if an element is zero. 4871 if (std::any_of(Mask.begin(), Mask.end(), 4872 [](int M){ return M == SM_SentinelZero; })) 4873 return false; 4874 break; 4875 case X86ISD::MOVSLDUP: 4876 DecodeMOVSLDUPMask(VT, Mask); 4877 IsUnary = true; 4878 break; 4879 case X86ISD::MOVSHDUP: 4880 DecodeMOVSHDUPMask(VT, Mask); 4881 IsUnary = true; 4882 break; 4883 case X86ISD::MOVDDUP: 4884 DecodeMOVDDUPMask(VT, Mask); 4885 IsUnary = true; 4886 break; 4887 case X86ISD::MOVLHPD: 4888 case X86ISD::MOVLPD: 4889 case X86ISD::MOVLPS: 4890 // Not yet implemented 4891 return false; 4892 case X86ISD::VPERMV: { 4893 IsUnary = true; 4894 SDValue MaskNode = N->getOperand(0); 4895 while (MaskNode->getOpcode() == ISD::BITCAST) 4896 MaskNode = MaskNode->getOperand(0); 4897 4898 unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); 4899 SmallVector<uint64_t, 32> RawMask; 4900 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { 4901 // If we have a build-vector, then things are easy. 
4902 assert(MaskNode.getSimpleValueType().isInteger() &&
4903 MaskNode.getSimpleValueType().getVectorNumElements() ==
4904 VT.getVectorNumElements());
4905
4906 for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
4907 SDValue Op = MaskNode->getOperand(i);
4908 if (Op->getOpcode() == ISD::UNDEF)
4909 RawMask.push_back((uint64_t)SM_SentinelUndef);
4910 else if (isa<ConstantSDNode>(Op)) {
4911 APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
4912 RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
4913 } else
4914 return false;
4915 }
4916 DecodeVPERMVMask(RawMask, Mask);
4917 break;
4918 }
4919 if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
4920 unsigned NumEltsInMask = MaskNode->getNumOperands();
4921 MaskNode = MaskNode->getOperand(0);
4922 auto *CN = dyn_cast<ConstantSDNode>(MaskNode);
4923 if (CN) {
4924 APInt MaskEltValue = CN->getAPIntValue();
4925 for (unsigned i = 0; i < NumEltsInMask; ++i)
4926 RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
4927 DecodeVPERMVMask(RawMask, Mask);
4928 break;
4929 }
4930 // It may be a scalar load
4931 }
4932
4933 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4934 if (!MaskLoad)
4935 return false;
4936
4937 SDValue Ptr = MaskLoad->getBasePtr();
4938 if (Ptr->getOpcode() == X86ISD::Wrapper ||
4939 Ptr->getOpcode() == X86ISD::WrapperRIP)
4940 Ptr = Ptr->getOperand(0);
4941
4942 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4943 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4944 return false;
4945
4946 auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
4947 if (C) {
4948 DecodeVPERMVMask(C, VT, Mask);
4949 if (Mask.empty())
4950 return false;
4951 break;
4952 }
4953 return false;
4954 }
4955 case X86ISD::VPERMV3: {
4956 IsUnary = false;
4957 SDValue MaskNode = N->getOperand(1);
4958 while (MaskNode->getOpcode() == ISD::BITCAST)
4959 MaskNode = MaskNode->getOperand(0);
4960
4961 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
4962 // If we have a build-vector, then things are easy.
4963 assert(MaskNode.getSimpleValueType().isInteger() && 4964 MaskNode.getSimpleValueType().getVectorNumElements() == 4965 VT.getVectorNumElements()); 4966 4967 SmallVector<uint64_t, 32> RawMask; 4968 unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); 4969 4970 for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { 4971 SDValue Op = MaskNode->getOperand(i); 4972 if (Op->getOpcode() == ISD::UNDEF) 4973 RawMask.push_back((uint64_t)SM_SentinelUndef); 4974 else { 4975 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); 4976 if (!CN) 4977 return false; 4978 APInt MaskElement = CN->getAPIntValue(); 4979 RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); 4980 } 4981 } 4982 DecodeVPERMV3Mask(RawMask, Mask); 4983 break; 4984 } 4985 4986 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); 4987 if (!MaskLoad) 4988 return false; 4989 4990 SDValue Ptr = MaskLoad->getBasePtr(); 4991 if (Ptr->getOpcode() == X86ISD::Wrapper || 4992 Ptr->getOpcode() == X86ISD::WrapperRIP) 4993 Ptr = Ptr->getOperand(0); 4994 4995 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); 4996 if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) 4997 return false; 4998 4999 auto *C = dyn_cast<Constant>(MaskCP->getConstVal()); 5000 if (C) { 5001 DecodeVPERMV3Mask(C, VT, Mask); 5002 if (Mask.empty()) 5003 return false; 5004 break; 5005 } 5006 return false; 5007 } 5008 default: llvm_unreachable("unknown target shuffle node"); 5009 } 5010 5011 // If we have a fake unary shuffle, the shuffle mask is spread across two 5012 // inputs that are actually the same node. Re-map the mask to always point 5013 // into the first input. 5014 if (IsFakeUnary) 5015 for (int &M : Mask) 5016 if (M >= (int)Mask.size()) 5017 M -= Mask.size(); 5018 5019 return true; 5020 } 5021 5022 /// Returns the scalar element that will make up the ith 5023 /// element of the result of the vector shuffle. 5024 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 5025 unsigned Depth) { 5026 if (Depth == 6) 5027 return SDValue(); // Limit search depth. 5028 5029 SDValue V = SDValue(N, 0); 5030 EVT VT = V.getValueType(); 5031 unsigned Opcode = V.getOpcode(); 5032 5033 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 5034 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 5035 int Elt = SV->getMaskElt(Index); 5036 5037 if (Elt < 0) 5038 return DAG.getUNDEF(VT.getVectorElementType()); 5039 5040 unsigned NumElems = VT.getVectorNumElements(); 5041 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 5042 : SV->getOperand(1); 5043 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 5044 } 5045 5046 // Recurse into target specific vector shuffles to find scalars. 5047 if (isTargetShuffle(Opcode)) { 5048 MVT ShufVT = V.getSimpleValueType(); 5049 unsigned NumElems = ShufVT.getVectorNumElements(); 5050 SmallVector<int, 16> ShuffleMask; 5051 bool IsUnary; 5052 5053 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) 5054 return SDValue(); 5055 5056 int Elt = ShuffleMask[Index]; 5057 if (Elt < 0) 5058 return DAG.getUNDEF(ShufVT.getVectorElementType()); 5059 5060 SDValue NewV = (Elt < (int)NumElems) ? 
N->getOperand(0) 5061 : N->getOperand(1); 5062 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 5063 Depth+1); 5064 } 5065 5066 // Actual nodes that may contain scalar elements 5067 if (Opcode == ISD::BITCAST) { 5068 V = V.getOperand(0); 5069 EVT SrcVT = V.getValueType(); 5070 unsigned NumElems = VT.getVectorNumElements(); 5071 5072 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 5073 return SDValue(); 5074 } 5075 5076 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5077 return (Index == 0) ? V.getOperand(0) 5078 : DAG.getUNDEF(VT.getVectorElementType()); 5079 5080 if (V.getOpcode() == ISD::BUILD_VECTOR) 5081 return V.getOperand(Index); 5082 5083 return SDValue(); 5084 } 5085 5086 /// Custom lower build_vector of v16i8. 5087 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 5088 unsigned NumNonZero, unsigned NumZero, 5089 SelectionDAG &DAG, 5090 const X86Subtarget* Subtarget, 5091 const TargetLowering &TLI) { 5092 if (NumNonZero > 8) 5093 return SDValue(); 5094 5095 SDLoc dl(Op); 5096 SDValue V; 5097 bool First = true; 5098 5099 // SSE4.1 - use PINSRB to insert each byte directly. 5100 if (Subtarget->hasSSE41()) { 5101 for (unsigned i = 0; i < 16; ++i) { 5102 bool isNonZero = (NonZeros & (1 << i)) != 0; 5103 if (isNonZero) { 5104 if (First) { 5105 if (NumZero) 5106 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); 5107 else 5108 V = DAG.getUNDEF(MVT::v16i8); 5109 First = false; 5110 } 5111 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 5112 MVT::v16i8, V, Op.getOperand(i), 5113 DAG.getIntPtrConstant(i, dl)); 5114 } 5115 } 5116 5117 return V; 5118 } 5119 5120 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 5121 for (unsigned i = 0; i < 16; ++i) { 5122 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 5123 if (ThisIsNonZero && First) { 5124 if (NumZero) 5125 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5126 else 5127 V = DAG.getUNDEF(MVT::v8i16); 5128 First = false; 5129 } 5130 5131 if ((i & 1) != 0) { 5132 SDValue ThisElt, LastElt; 5133 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 5134 if (LastIsNonZero) { 5135 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 5136 MVT::i16, Op.getOperand(i-1)); 5137 } 5138 if (ThisIsNonZero) { 5139 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 5140 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 5141 ThisElt, DAG.getConstant(8, dl, MVT::i8)); 5142 if (LastIsNonZero) 5143 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 5144 } else 5145 ThisElt = LastElt; 5146 5147 if (ThisElt.getNode()) 5148 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 5149 DAG.getIntPtrConstant(i/2, dl)); 5150 } 5151 } 5152 5153 return DAG.getBitcast(MVT::v16i8, V); 5154 } 5155 5156 /// Custom lower build_vector of v8i16. 
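/// Inserts each non-zero element with PINSRW into a zero (or undef) vector,
/// giving up if more than four elements would need inserting.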
5157 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5158 unsigned NumNonZero, unsigned NumZero,
5159 SelectionDAG &DAG,
5160 const X86Subtarget* Subtarget,
5161 const TargetLowering &TLI) {
5162 if (NumNonZero > 4)
5163 return SDValue();
5164
5165 SDLoc dl(Op);
5166 SDValue V;
5167 bool First = true;
5168 for (unsigned i = 0; i < 8; ++i) {
5169 bool isNonZero = (NonZeros & (1 << i)) != 0;
5170 if (isNonZero) {
5171 if (First) {
5172 if (NumZero)
5173 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5174 else
5175 V = DAG.getUNDEF(MVT::v8i16);
5176 First = false;
5177 }
5178 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5179 MVT::v8i16, V, Op.getOperand(i),
5180 DAG.getIntPtrConstant(i, dl));
5181 }
5182 }
5183
5184 return V;
5185 }
5186
5187 /// Custom lower build_vector of v4i32 or v4f32.
5188 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5189 const X86Subtarget *Subtarget,
5190 const TargetLowering &TLI) {
5191 // Find all zeroable elements.
5192 std::bitset<4> Zeroable;
5193 for (int i=0; i < 4; ++i) {
5194 SDValue Elt = Op->getOperand(i);
5195 Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5196 }
5197 assert(Zeroable.size() - Zeroable.count() > 1 &&
5198 "We expect at least two non-zero elements!");
5199
5200 // We only know how to deal with build_vector nodes where elements are either
5201 // zeroable or extract_vector_elt with constant index.
5202 SDValue FirstNonZero;
5203 unsigned FirstNonZeroIdx;
5204 for (unsigned i=0; i < 4; ++i) {
5205 if (Zeroable[i])
5206 continue;
5207 SDValue Elt = Op->getOperand(i);
5208 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5209 !isa<ConstantSDNode>(Elt.getOperand(1)))
5210 return SDValue();
5211 // Make sure that this node is extracting from a 128-bit vector.
5212 MVT VT = Elt.getOperand(0).getSimpleValueType();
5213 if (!VT.is128BitVector())
5214 return SDValue();
5215 if (!FirstNonZero.getNode()) {
5216 FirstNonZero = Elt;
5217 FirstNonZeroIdx = i;
5218 }
5219 }
5220
5221 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5222 SDValue V1 = FirstNonZero.getOperand(0);
5223 MVT VT = V1.getSimpleValueType();
5224
5225 // See if this build_vector can be lowered as a blend with zero.
5226 SDValue Elt;
5227 unsigned EltMaskIdx, EltIdx;
5228 int Mask[4];
5229 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5230 if (Zeroable[EltIdx]) {
5231 // The zero vector will be on the right hand side.
5232 Mask[EltIdx] = EltIdx+4;
5233 continue;
5234 }
5235
5236 Elt = Op->getOperand(EltIdx);
5237 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
5238 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5239 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5240 break;
5241 Mask[EltIdx] = EltIdx;
5242 }
5243
5244 if (EltIdx == 4) {
5245 // Let the shuffle legalizer deal with blend operations.
5246 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5247 if (V1.getSimpleValueType() != VT)
5248 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5249 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5250 }
5251
5252 // See if we can lower this build_vector to an INSERTPS.
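// The INSERTPS immediate encodes, from high bits to low: the source element
// of V2 in bits [7:6], the destination slot in bits [5:4], and a 4-bit zero
// mask in bits [3:0]; the mask built below follows that layout.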
5253 if (!Subtarget->hasSSE41()) 5254 return SDValue(); 5255 5256 SDValue V2 = Elt.getOperand(0); 5257 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) 5258 V1 = SDValue(); 5259 5260 bool CanFold = true; 5261 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { 5262 if (Zeroable[i]) 5263 continue; 5264 5265 SDValue Current = Op->getOperand(i); 5266 SDValue SrcVector = Current->getOperand(0); 5267 if (!V1.getNode()) 5268 V1 = SrcVector; 5269 CanFold = SrcVector == V1 && 5270 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; 5271 } 5272 5273 if (!CanFold) 5274 return SDValue(); 5275 5276 assert(V1.getNode() && "Expected at least two non-zero elements!"); 5277 if (V1.getSimpleValueType() != MVT::v4f32) 5278 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); 5279 if (V2.getSimpleValueType() != MVT::v4f32) 5280 V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); 5281 5282 // Ok, we can emit an INSERTPS instruction. 5283 unsigned ZMask = Zeroable.to_ulong(); 5284 5285 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; 5286 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 5287 SDLoc DL(Op); 5288 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, 5289 DAG.getIntPtrConstant(InsertPSMask, DL)); 5290 return DAG.getBitcast(VT, Result); 5291 } 5292 5293 /// Return a vector logical shift node. 5294 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 5295 unsigned NumBits, SelectionDAG &DAG, 5296 const TargetLowering &TLI, SDLoc dl) { 5297 assert(VT.is128BitVector() && "Unknown type for VShift"); 5298 MVT ShVT = MVT::v2i64; 5299 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 5300 SrcOp = DAG.getBitcast(ShVT, SrcOp); 5301 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); 5302 assert(NumBits % 8 == 0 && "Only support byte sized shifts"); 5303 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); 5304 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); 5305 } 5306 5307 static SDValue 5308 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { 5309 5310 // Check if the scalar load can be widened into a vector load. And if 5311 // the address is "base + cst" see if the cst can be "absorbed" into 5312 // the shuffle mask. 5313 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 5314 SDValue Ptr = LD->getBasePtr(); 5315 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 5316 return SDValue(); 5317 EVT PVT = LD->getValueType(0); 5318 if (PVT != MVT::i32 && PVT != MVT::f32) 5319 return SDValue(); 5320 5321 int FI = -1; 5322 int64_t Offset = 0; 5323 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 5324 FI = FINode->getIndex(); 5325 Offset = 0; 5326 } else if (DAG.isBaseWithConstantOffset(Ptr) && 5327 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 5328 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 5329 Offset = Ptr.getConstantOperandVal(1); 5330 Ptr = Ptr.getOperand(0); 5331 } else { 5332 return SDValue(); 5333 } 5334 5335 // FIXME: 256-bit vector instructions don't require a strict alignment, 5336 // improve this code to support it better. 5337 unsigned RequiredAlign = VT.getSizeInBits()/8; 5338 SDValue Chain = LD->getChain(); 5339 // Make sure the stack object alignment is at least 16 or 32. 5340 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5341 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 5342 if (MFI->isFixedObjectIndex(FI)) { 5343 // Can't change the alignment. 
FIXME: It's possible to compute
5344 // the exact stack offset and reference FI + adjust offset instead,
5345 // if someone *really* cares about this.
5346 return SDValue();
5347 } else {
5348 MFI->setObjectAlignment(FI, RequiredAlign);
5349 }
5350 }
5351
5352 // (Offset % RequiredAlign) must be a multiple of 4. The address is then
5353 // Ptr + (Offset & ~(RequiredAlign - 1)).
5354 if (Offset < 0)
5355 return SDValue();
5356 if ((Offset % RequiredAlign) & 3)
5357 return SDValue();
5358 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
5359 if (StartOffset) {
5360 SDLoc DL(Ptr);
5361 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5362 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
5363 }
5364
5365 int EltNo = (Offset - StartOffset) >> 2;
5366 unsigned NumElems = VT.getVectorNumElements();
5367
5368 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5369 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5370 LD->getPointerInfo().getWithOffset(StartOffset),
5371 false, false, false, 0);
5372
5373 SmallVector<int, 8> Mask(NumElems, EltNo);
5374
5375 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
5376 }
5377
5378 return SDValue();
5379 }
5380
5381 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
5382 /// elements can be replaced by a single large load which has the same value as
5383 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
5384 ///
5385 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5386 ///
5387 /// FIXME: we'd also like to handle the case where the last elements are zero
5388 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5389 /// There's even a handy isZeroNode for that purpose.
5390 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
5391 SDLoc &DL, SelectionDAG &DAG,
5392 bool isAfterLegalize) {
5393 unsigned NumElems = Elts.size();
5394
5395 LoadSDNode *LDBase = nullptr;
5396 unsigned LastLoadedElt = -1U;
5397
5398 // For each element in the initializer, see if we've found a load or an undef.
5399 // If we don't find an initial load element, or later load elements are
5400 // non-consecutive, bail out.
5401 for (unsigned i = 0; i < NumElems; ++i) {
5402 SDValue Elt = Elts[i];
5403 // Look through a bitcast.
5404 if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
5405 Elt = Elt.getOperand(0);
5406 if (!Elt.getNode() ||
5407 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5408 return SDValue();
5409 if (!LDBase) {
5410 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5411 return SDValue();
5412 LDBase = cast<LoadSDNode>(Elt.getNode());
5413 LastLoadedElt = i;
5414 continue;
5415 }
5416 if (Elt.getOpcode() == ISD::UNDEF)
5417 continue;
5418
5419 LoadSDNode *LD = cast<LoadSDNode>(Elt);
5420 EVT LdVT = Elt.getValueType();
5421 // Each loaded element must be the correct fractional portion of the
5422 // requested vector load.
5423 if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
5424 return SDValue();
5425 if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
5426 return SDValue();
5427 LastLoadedElt = i;
5428 }
5429
5430 // If we have found an entire vector of loads and undefs, then return a large
5431 // load of the entire vector width starting at the base pointer. If we found
5432 // consecutive loads for the low half, generate a vzext_load node.
5433 if (LastLoadedElt == NumElems - 1) {
5434 assert(LDBase && "Did not find base load for merging consecutive loads");
5435 EVT EltVT = LDBase->getValueType(0);
5436 // Ensure that the input vector size for the merged loads matches the
5437 // cumulative size of the input elements.
5438 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
5439 return SDValue();
5440
5441 if (isAfterLegalize &&
5442 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
5443 return SDValue();
5444
5445 SDValue NewLd = SDValue();
5446
5447 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5448 LDBase->getPointerInfo(), LDBase->isVolatile(),
5449 LDBase->isNonTemporal(), LDBase->isInvariant(),
5450 LDBase->getAlignment());
5451
5452 if (LDBase->hasAnyUseOfValue(1)) {
5453 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5454 SDValue(LDBase, 1),
5455 SDValue(NewLd.getNode(), 1));
5456 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5457 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5458 SDValue(NewLd.getNode(), 1));
5459 }
5460
5461 return NewLd;
5462 }
5463
5464 // TODO: The code below fires only for loading the low v2i32 / v2f32
5465 // of a v4i32 / v4f32. It's probably worth generalizing.
5466 EVT EltVT = VT.getVectorElementType();
5467 if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
5468 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5469 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
5470 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5471 SDValue ResNode =
5472 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
5473 LDBase->getPointerInfo(),
5474 LDBase->getAlignment(),
5475 false/*isVolatile*/, true/*ReadMem*/,
5476 false/*WriteMem*/);
5477
5478 // Make sure the newly-created LOAD is in the same position as LDBase in
5479 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
5480 // update uses of LDBase's output chain to use the TokenFactor.
5481 if (LDBase->hasAnyUseOfValue(1)) {
5482 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5483 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5484 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5485 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5486 SDValue(ResNode.getNode(), 1));
5487 }
5488
5489 return DAG.getBitcast(VT, ResNode);
5490 }
5491 return SDValue();
5492 }
5493
5494 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5495 /// to generate a splat value for the following cases:
5496 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5497 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5498 /// a scalar load, or a constant.
5499 /// The VBROADCAST node is returned when a pattern is found,
5500 /// or SDValue() otherwise.
5501 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
5502 SelectionDAG &DAG) {
5503 // VBROADCAST requires AVX.
5504 // TODO: Splats could be generated for non-AVX CPUs using SSE
5505 // instructions, but there's less potential gain for only 128-bit vectors.
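// For example, a v8f32 build_vector whose eight operands are the same load
// becomes a single X86ISD::VBROADCAST of that load (one vbroadcastss).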
5506 if (!Subtarget->hasAVX()) 5507 return SDValue(); 5508 5509 MVT VT = Op.getSimpleValueType(); 5510 SDLoc dl(Op); 5511 5512 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 5513 "Unsupported vector type for broadcast."); 5514 5515 SDValue Ld; 5516 bool ConstSplatVal; 5517 5518 switch (Op.getOpcode()) { 5519 default: 5520 // Unknown pattern found. 5521 return SDValue(); 5522 5523 case ISD::BUILD_VECTOR: { 5524 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); 5525 BitVector UndefElements; 5526 SDValue Splat = BVOp->getSplatValue(&UndefElements); 5527 5528 // We need a splat of a single value to use broadcast, and it doesn't 5529 // make any sense if the value is only in one element of the vector. 5530 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) 5531 return SDValue(); 5532 5533 Ld = Splat; 5534 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5535 Ld.getOpcode() == ISD::ConstantFP); 5536 5537 // Make sure that all of the users of a non-constant load are from the 5538 // BUILD_VECTOR node. 5539 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) 5540 return SDValue(); 5541 break; 5542 } 5543 5544 case ISD::VECTOR_SHUFFLE: { 5545 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5546 5547 // Shuffles must have a splat mask where the first element is 5548 // broadcasted. 5549 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5550 return SDValue(); 5551 5552 SDValue Sc = Op.getOperand(0); 5553 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5554 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5555 5556 if (!Subtarget->hasInt256()) 5557 return SDValue(); 5558 5559 // Use the register form of the broadcast instruction available on AVX2. 5560 if (VT.getSizeInBits() >= 256) 5561 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5562 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5563 } 5564 5565 Ld = Sc.getOperand(0); 5566 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5567 Ld.getOpcode() == ISD::ConstantFP); 5568 5569 // The scalar_to_vector node and the suspected 5570 // load node must have exactly one user. 5571 // Constants may have multiple users. 5572 5573 // AVX-512 has register version of the broadcast 5574 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && 5575 Ld.getValueType().getSizeInBits() >= 32; 5576 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 5577 !hasRegVer)) 5578 return SDValue(); 5579 break; 5580 } 5581 } 5582 5583 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5584 bool IsGE256 = (VT.getSizeInBits() >= 256); 5585 5586 // When optimizing for size, generate up to 5 extra bytes for a broadcast 5587 // instruction to save 8 or more bytes of constant pool data. 5588 // TODO: If multiple splats are generated to load the same constant, 5589 // it may be detrimental to overall size. There needs to be a way to detect 5590 // that condition to know if this is truly a size win. 5591 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); 5592 5593 // Handle broadcasting a single constant scalar from the constant pool 5594 // into a vector. 5595 // On Sandybridge (no AVX2), it is still better to load a constant vector 5596 // from the constant pool and not to broadcast it from a scalar. 5597 // But override that restriction when optimizing for size. 5598 // TODO: Check if splatting is recommended for other AVX-capable CPUs. 
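// For example, splatting one f64 into v4f64 via vbroadcastsd reads an 8-byte
// constant instead of a full 32-byte constant-pool vector, saving 24 bytes
// of constant data.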
  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
          CVT, dl, DAG.getEntryNode(), CP,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
          false, false, Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget->hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget->hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match f64: there is no vbroadcastsd with a 128-bit (xmm)
  // destination.
  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}

/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
  //                            undef)
  //                       Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}

static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than one element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
    // Quit if the index is non-constant.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if the element was extracted from a vector of a different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle.
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
    unsigned Idx = InsertIndices[i];
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));
  }

  return NV;
}

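// Convert a vXi1 constant build vector to an integer bitmask: bit N of the
// result is the value of element N. For example, the v4i1 build_vector
// <1, 0, 1, 1> becomes the i8 constant 0b1101 (13); the result is widened
// to at least 8 bits.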
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");
  uint64_t Immediate = 0;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.getOpcode() != ISD::UNDEF)
      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
  }
  SDLoc dl(Op);
  MVT VT =
      MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
  return DAG.getConstant(Immediate, dl, VT);
}

// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
    SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, Imm);
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // Vector has one or more non-const elements.
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.getOpcode() == ISD::UNDEF)
      continue;
    if (!isa<ConstantSDNode>(In))
      NonConstIdx.push_back(idx);
    else {
      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
      HasConstElts = true;
    }
    if (SplatIdx == -1)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
  if (IsSplat)
    return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
                       DAG.getConstant(1, dl, VT),
                       DAG.getConstant(0, dl, VT));

  // Insert the non-constant elements one by one.
  SDValue DstVec;
  SDValue Imm;
  if (Immediate) {
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
    Imm = DAG.getConstant(Immediate, dl, ImmVT);
  }
  else if (HasConstElts)
    Imm = DAG.getConstant(0, dl, VT);
  else
    Imm = DAG.getUNDEF(VT);
  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
    DstVec = DAG.getBitcast(VT, Imm);
  else {
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
  }

  for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}

/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
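/// For example, with \p Opcode equal to ISD::ADD and the range [0, 4), a
/// v4i32 build_vector of the form
///   (add A[0], A[1]), (add A[2], A[3]), (add B[0], B[1]), (add B[2], B[3])
/// is recognized; A is returned in V0 and B in V1, matching the semantics of
/// the HADD family of instructions.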
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
                              SelectionDAG &DAG,
                              unsigned BaseIdx, unsigned LastIdx,
                              SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);

  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->getOpcode() == ISD::UNDEF) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    if (i * 2 < NumElts) {
      if (V0.getOpcode() == ISD::UNDEF) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.getOpcode() == ISD::UNDEF) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}

/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input
/// the lower 128 bits of V0 and the upper 128 bits of V0. The second
/// horizontal binop dag node takes as input the lower 128 bits of V1
/// and the upper 128 bits of V1.
/// Example:
///   HADD V0_LO, V0_HI
///   HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
/// Example:
///   HADD V0_LO, V1_LO
///   HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128 bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     SDLoc DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  EVT VT = V0.getValueType();
  assert(VT.is256BitVector() && VT == V1.getValueType() &&
         "Invalid nodes in input!");

  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
  EVT NewVT = V0_LO.getValueType();

  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
                       V1_LO->getOpcode() != ISD::UNDEF))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
                       V1_HI->getOpcode() != ISD::UNDEF))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}

/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
/// node.
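/// For example, a v4f32 build_vector of the form
///   (fsub A[0], B[0]), (fadd A[1], B[1]), (fsub A[2], B[2]), (fadd A[3], B[3])
/// is folded to (X86ISD::ADDSUB A, B), which matches the behaviour of the
/// ADDSUBPS instruction.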
static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
                             const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
    return SDValue();

  SDLoc DL(BV);
  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v2f64) && "build_vector with an invalid type found!");

  // Odd-numbered elements in the input build vector are obtained from
  // adding two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two integer/float elements.
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return SDValue();

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return SDValue();

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return SDValue();

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    if (InVec0.getOpcode() == ISD::UNDEF) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return SDValue();
    }
    if (InVec1.getOpcode() == ISD::UNDEF) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return SDValue();
    }

    // Make sure that operands in input to each add/sub node always
    // come from the same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return SDValue();

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return SDValue();
    }

    if (InVec1 != Op1.getOperand(0))
      return SDValue();

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);
  }

  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
  if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
      InVec1.getOpcode() != ISD::UNDEF)
    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);

  return SDValue();
}

/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget *Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget->hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
             isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                               InVec3) &&
             ((InVec0.getOpcode() == ISD::UNDEF ||
               InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
             ((InVec1.getOpcode() == ISD::UNDEF ||
               InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget->hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed
      // by a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }

  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget->hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG);

  // Vectors containing all zeros can be matched by pxor and xorps later.
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, dl);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
      return Op;

    if (!VT.is512BitVector())
      return getOnesVector(VT, Subtarget, DAG, dl);
  }

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
    return Broadcast;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // Special case for a single non-zero, non-undef element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        MVT VecVT = MVT::v4i32;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
                                      Item, Idx * 2, true, Subtarget, DAG));
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
        if (VT.is512BitVector()) {
          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                             Item, DAG.getIntPtrConstant(0, dl));
        }
        assert((VT.is128BitVector() || VT.is256BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.is256BitVector()) {
          if (Subtarget->hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
            Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
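    // For example, (build_vector 0, x) : v2i64 takes this path: x is moved
    // into the low element with SCALAR_TO_VECTOR and the whole vector is then
    // shifted left by 64 bits, leaving x in the high element and zero below.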
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
                                         DAG);
    }
  }

  // Splat is obviously ok. Let the legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check if it's possible to issue this instead:
      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // For AVX-length vectors, see if we can use a vector load to get all of the
  // elements; otherwise build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);

    // Check for a build vector of consecutive loads.
    if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
      return LD;

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
                                makeArrayRef(&V[0], NumElems/2));
    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
                                makeArrayRef(&V[NumElems / 2], NumElems/2));

    // Recreate the wider vector with the lower and upper part.
    if (VT.is256BitVector())
      return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }

  // Let the legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If the element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget, *this))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget, *this))
      return V;

  // If the element VT is == 32 bits and has 4 elems, try to generate an
  // INSERTPS.
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
      return V;

  // If the element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
      return LD;

    // Check for a build vector from mostly shuffle plus few inserting.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (Subtarget->hasSSE41()) {
      SDValue Result;
      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, starting by extending each
    // of our (non-undef) elements to the full vector width with the element in
    // the bottom slot of the vector (which generates no code for SSE).
    for (unsigned i = 0; i < NumElems; ++i) {
      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        V[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If V[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped. This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
            EltStride == NumElems/2)
          continue;

        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return Concat256BitVectors(
        Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
        Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl),
        ResVT, NumElems, DAG, dl);
  }
  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}

static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOfOperands = Op.getNumOperands();

  assert(isPowerOf2_32(NumOfOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  SDValue Undef = DAG.getUNDEF(ResVT);
  if (NumOfOperands > 2) {
    // Specialize the cases when all, or all but one, of the operands are
    // undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return Undef;
    if (NumOfDefinedOps == 1) {
      unsigned SubVecNumElts =
          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
                         Op.getOperand(OpIdx), IdxVal);
    }

    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SmallVector<SDValue, 2> Ops;
    for (unsigned i = 0; i < NumOfOperands/2; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // 2 operands.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  assert(V1.getValueType() == V2.getValueType() &&
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
         "Unexpected operands in CONCAT_VECTORS");

  if (ResVT.getSizeInBits() >= 16)
    return Op; // The operation is legal with KUNPCK

  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);

  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}

static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget *Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                  Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}

//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//

/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
/// an in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] != -1 && Mask[i] != i)
      return false;
  return true;
}

/// \brief Helper function to classify a mask as a single-input mask.
///
/// This isn't a generic single-input test because in the vector shuffle
/// lowering we canonicalize single inputs to be the first input operand. This
/// means we can more quickly test for a single input by only checking whether
/// an input from the second operand exists. We also assume that the size of
/// the mask corresponds to the size of the input vectors, which isn't true in
/// the fully general case.
static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
  for (int M : Mask)
    if (M >= (int)Mask.size())
      return false;
  return true;
}

/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
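/// For example, the v8f32 mask <4, 5, 6, 7, 0, 1, 2, 3> crosses lanes (it
/// swaps the two 128-bit halves), while <1, 0, 3, 2, 5, 4, 7, 6> does not
/// (each element stays within its own 128-bit lane).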
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// *not* suitable for use with existing 128-bit shuffles as it will contain
/// entries from both V1 and V2 inputs to the wider mask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  RepeatedMask.resize(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    if (RepeatedMask[i % LaneSize] == -1)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] =
          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i)
    if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }

  return true;
}

/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
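/// For example, the mask <1, 0, 3, 2> produces the immediate
/// (1 << 0) | (0 << 2) | (3 << 4) | (2 << 6) == 0xB1, the classic
/// "swap adjacent pairs" encoding used by instructions such as SHUFPS.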
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
                                          SelectionDAG &DAG) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
  return DAG.getConstant(Imm, DL, MVT::i8);
}

/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
                                                     SDValue V1, SDValue V2) {
  SmallBitVector Zeroable(Mask.size(), false);

  while (V1.getOpcode() == ISD::BITCAST)
    V1 = V1->getOperand(0);
  while (V2.getOpcode() == ISD::BITCAST)
    V2 = V2->getOperand(0);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable[i] = true;
      continue;
    }

    // If this is an index into a build_vector node (which has the same number
    // of elements), dig out the input value and use it.
    SDValue V = M < Size ? V1 : V2;
    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
      continue;

    SDValue Input = V.getOperand(M % Size);
    // The UNDEF opcode check really should be dead code here, but not quite
    // worth asserting on (it isn't invalid, just unexpected).
    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
      Zeroable[i] = true;
  }

  return Zeroable;
}

// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
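// For example, for v4i32 UNPCKL interleaves the low halves of the inputs
// (shuffle mask <0, 4, 1, 5>) and UNPCKH the high halves (<2, 6, 3, 7>);
// those are exactly the masks the loop below builds for that type.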
static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
                                           SDValue V1, SDValue V2,
                                           SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumEltsInLane = 128 / VT.getScalarSizeInBits();
  SmallVector<int, 8> Unpckl;
  SmallVector<int, 8> Unpckh;

  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
    int HiPos = LoPos + NumEltsInLane / 2;
    Unpckl.push_back(LoPos);
    Unpckh.push_back(HiPos);
  }

  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}

/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           SelectionDAG &DAG) {
  MVT EltVT = VT.getVectorElementType();
  int NumEltBits = EltVT.getSizeInBits();
  MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
  SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
                                    IntEltVT);
  if (EltVT.isFloatingPoint()) {
    Zero = DAG.getBitcast(EltVT, Zero);
    AllOnes = DAG.getBitcast(EltVT, AllOnes);
  }
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
  V = DAG.getNode(VT.isFloatingPoint()
                      ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
                  DL, VT, V, VMask);
  return V;
}

/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
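/// For example, the v4i32 blend mask <0, 5, 2, 7> builds the constant mask
/// M = <-1, 0, -1, 0> and computes (V1 & M) | (V2 & ~M), i.e. an AND, an
/// ANDNP and an OR instead of a dedicated blend instruction.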
static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  int NumEltBits = EltVT.getSizeInBits();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
                                    EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}

/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const X86Subtarget *Subtarget,
                                         SelectionDAG &DAG) {
  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
  SmallVector<int, 8> Mask(Original.begin(), Original.end());
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  bool ForceV1Zero = false, ForceV2Zero = false;

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  unsigned BlendMask = 0;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1u << i;
      continue;
    }
    if (Zeroable[i]) {
      if (V1IsZero) {
        ForceV1Zero = true;
        Mask[i] = i;
        continue;
      }
      if (V2IsZero) {
        ForceV2Zero = true;
        BlendMask |= 1u << i;
        Mask[i] = i + Size;
        continue;
      }
    }
    return SDValue(); // Shuffled input!
  }

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

7081 if (ForceV1Zero) 7082 V1 = getZeroVector(VT, Subtarget, DAG, DL); 7083 if (ForceV2Zero) 7084 V2 = getZeroVector(VT, Subtarget, DAG, DL); 7085 7086 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { 7087 unsigned ScaledMask = 0; 7088 for (int i = 0; i != Size; ++i) 7089 if (BlendMask & (1u << i)) 7090 for (int j = 0; j != Scale; ++j) 7091 ScaledMask |= 1u << (i * Scale + j); 7092 return ScaledMask; 7093 }; 7094 7095 switch (VT.SimpleTy) { 7096 case MVT::v2f64: 7097 case MVT::v4f32: 7098 case MVT::v4f64: 7099 case MVT::v8f32: 7100 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, 7101 DAG.getConstant(BlendMask, DL, MVT::i8)); 7102 7103 case MVT::v4i64: 7104 case MVT::v8i32: 7105 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); 7106 // FALLTHROUGH 7107 case MVT::v2i64: 7108 case MVT::v4i32: 7109 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into 7110 // that instruction. 7111 if (Subtarget->hasAVX2()) { 7112 // Scale the blend by the number of 32-bit dwords per element. 7113 int Scale = VT.getScalarSizeInBits() / 32; 7114 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); 7115 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; 7116 V1 = DAG.getBitcast(BlendVT, V1); 7117 V2 = DAG.getBitcast(BlendVT, V2); 7118 return DAG.getBitcast( 7119 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, 7120 DAG.getConstant(BlendMask, DL, MVT::i8))); 7121 } 7122 // FALLTHROUGH 7123 case MVT::v8i16: { 7124 // For integer shuffles we need to expand the mask and cast the inputs to 7125 // v8i16s prior to blending. 7126 int Scale = 8 / VT.getVectorNumElements(); 7127 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); 7128 V1 = DAG.getBitcast(MVT::v8i16, V1); 7129 V2 = DAG.getBitcast(MVT::v8i16, V2); 7130 return DAG.getBitcast(VT, 7131 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, 7132 DAG.getConstant(BlendMask, DL, MVT::i8))); 7133 } 7134 7135 case MVT::v16i16: { 7136 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); 7137 SmallVector<int, 8> RepeatedMask; 7138 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { 7139 // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 7140 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); 7141 BlendMask = 0; 7142 for (int i = 0; i < 8; ++i) 7143 if (RepeatedMask[i] >= 16) 7144 BlendMask |= 1u << i; 7145 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, 7146 DAG.getConstant(BlendMask, DL, MVT::i8)); 7147 } 7148 } 7149 // FALLTHROUGH 7150 case MVT::v16i8: 7151 case MVT::v32i8: { 7152 assert((VT.is128BitVector() || Subtarget->hasAVX2()) && 7153 "256-bit byte-blends require AVX2 support!"); 7154 7155 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. 7156 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) 7157 return Masked; 7158 7159 // Scale the blend by the number of bytes per element. 7160 int Scale = VT.getScalarSizeInBits() / 8; 7161 7162 // This form of blend is always done on bytes. Compute the byte vector 7163 // type. 7164 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); 7165 7166 // Compute the VSELECT mask. Note that VSELECT is really confusing in the 7167 // mix of LLVM's code generator and the x86 backend. We tell the code 7168 // generator that boolean values in the elements of an x86 vector register 7169 // are -1 for true and 0 for false. 
We then use the LLVM semantics of 'true'
7170     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7171     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7172     // of the element (the remaining bits are ignored) and 0 in that high bit
7173     // would mean operand #1 while 1 in the high bit would mean operand #2. So
7174     // while the LLVM model for boolean values in vector elements gets the
7175     // relevant bit set, it is set backwards and over-constrained relative to
7176     // x86's actual model.
7177     SmallVector<SDValue, 32> VSELECTMask;
7178     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7179       for (int j = 0; j < Scale; ++j)
7180         VSELECTMask.push_back(
7181             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7182                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7183                                           MVT::i8));
7184 
7185     V1 = DAG.getBitcast(BlendVT, V1);
7186     V2 = DAG.getBitcast(BlendVT, V2);
7187     return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7188                                           DAG.getNode(ISD::BUILD_VECTOR, DL,
7189                                                       BlendVT, VSELECTMask),
7190                                           V1, V2));
7191   }
7192 
7193   default:
7194     llvm_unreachable("Not a supported integer vector type!");
7195   }
7196 }
7197 
7198 /// \brief Try to lower as a blend of elements from two inputs followed by
7199 /// a single-input permutation.
7200 ///
7201 /// This matches the pattern where we can blend elements from two inputs and
7202 /// then reduce the shuffle to a single-input permutation.
7203 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
7204                                                    SDValue V2,
7205                                                    ArrayRef<int> Mask,
7206                                                    SelectionDAG &DAG) {
7207   // We build up the blend mask while checking whether a blend is a viable way
7208   // to reduce the shuffle.
7209   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7210   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7211 
7212   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7213     if (Mask[i] < 0)
7214       continue;
7215 
7216     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7217 
7218     if (BlendMask[Mask[i] % Size] == -1)
7219       BlendMask[Mask[i] % Size] = Mask[i];
7220     else if (BlendMask[Mask[i] % Size] != Mask[i])
7221       return SDValue(); // Can't blend in the needed input!
7222 
7223     PermuteMask[i] = Mask[i] % Size;
7224   }
7225 
7226   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7227   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7228 }
7229 
7230 /// \brief Generic routine to decompose a shuffle and blend into independent
7231 /// blends and permutes.
7232 ///
7233 /// This matches the extremely common pattern for handling combined
7234 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7235 /// operations. It will try to pick the best arrangement of shuffles and
7236 /// blends.
7237 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7238                                                           SDValue V1,
7239                                                           SDValue V2,
7240                                                           ArrayRef<int> Mask,
7241                                                           SelectionDAG &DAG) {
7242   // Shuffle the input elements into the desired positions in V1 and V2 and
7243   // blend them together.
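  // For example (illustrative, not from the original comments): a v4i32 mask
  // <0, 6, 3, 5> decomposes into
  //   V1Mask    = <0, -1, 3, -1>  (move V1's elements into place)
  //   V2Mask    = <-1, 2, -1, 1>  (move V2's elements into place)
  //   BlendMask = <0, 5, 2, 7>    (an element-wise blend of the two results)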
7244   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7245   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7246   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7247   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7248     if (Mask[i] >= 0 && Mask[i] < Size) {
7249       V1Mask[i] = Mask[i];
7250       BlendMask[i] = i;
7251     } else if (Mask[i] >= Size) {
7252       V2Mask[i] = Mask[i] - Size;
7253       BlendMask[i] = i + Size;
7254     }
7255 
7256   // Try to lower with the simpler initial blend strategy unless one of the
7257   // input shuffles would be a no-op. We prefer to shuffle inputs as the
7258   // shuffle may be able to fold with a load or provide some other benefit.
7259   // However, when we'll have to do 2x as many shuffles in order to achieve
7260   // this, blending first is a better strategy.
7261   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7262     if (SDValue BlendPerm =
7263             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7264       return BlendPerm;
7265 
7266   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7267   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7268   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7269 }
7270 
7271 /// \brief Try to lower a vector shuffle as a byte rotation.
7272 ///
7273 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7274 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7275 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7276 /// try to generically lower a vector shuffle through such a pattern. It
7277 /// does not check for the profitability of lowering either as PALIGNR or
7278 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7279 /// This matches shuffle vectors that look like:
7280 ///
7281 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7282 ///
7283 /// Essentially it concatenates V1 and V2, shifts right by some number of
7284 /// elements, and takes the low elements as the result. Note that while this is
7285 /// specified as a *right shift* because x86 is little-endian, it is a *left
7286 /// rotate* of the vector lanes.
7287 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7288                                               SDValue V2,
7289                                               ArrayRef<int> Mask,
7290                                               const X86Subtarget *Subtarget,
7291                                               SelectionDAG &DAG) {
7292   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7293 
7294   int NumElts = Mask.size();
7295   int NumLanes = VT.getSizeInBits() / 128;
7296   int NumLaneElts = NumElts / NumLanes;
7297 
7298   // We need to detect various ways of spelling a rotation:
7299   //   [11, 12, 13, 14, 15,  0,  1,  2]
7300   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7301   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7302   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7303   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7304   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7305   int Rotation = 0;
7306   SDValue Lo, Hi;
7307   for (int l = 0; l < NumElts; l += NumLaneElts) {
7308     for (int i = 0; i < NumLaneElts; ++i) {
7309       if (Mask[l + i] == -1)
7310         continue;
7311       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
7312 
7313       // Get the mod-Size index and lane correct it.
7314       int LaneIdx = (Mask[l + i] % NumElts) - l;
7315       // Make sure it was in this lane.
7316       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7317         return SDValue();
7318 
7319       // Determine where a rotated vector would have started.
7320       int StartIdx = i - LaneIdx;
7321       if (StartIdx == 0)
7322         // The identity rotation isn't interesting, stop.
7323         return SDValue();
7324 
7325       // If we found the tail of a vector, the rotation must be the missing
7326       // front. If we found the head of a vector, the rotation must be how
7327       // much of that head we found.
7328       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
7329 
7330       if (Rotation == 0)
7331         Rotation = CandidateRotation;
7332       else if (Rotation != CandidateRotation)
7333         // The rotations don't match, so we can't match this mask.
7334         return SDValue();
7335 
7336       // Compute which value this mask is pointing at.
7337       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7338 
7339       // Compute which of the two target values this index should be assigned
7340       // to. This reflects whether the high elements are remaining or the low
7341       // elements are remaining.
7342       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7343 
7344       // Either set up this value if we've not encountered it before, or check
7345       // that it remains consistent.
7346       if (!TargetV)
7347         TargetV = MaskV;
7348       else if (TargetV != MaskV)
7349         // This may be a rotation, but it pulls from the inputs in some
7350         // unsupported interleaving.
7351         return SDValue();
7352     }
7353   }
7354 
7355   // Check that we successfully analyzed the mask, and normalize the results.
7356   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7357   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7358   if (!Lo)
7359     Lo = Hi;
7360   else if (!Hi)
7361     Hi = Lo;
7362 
7363   // The actual rotate instruction rotates bytes, so we need to scale the
7364   // rotation based on how many bytes are in the vector lane.
7365   int Scale = 16 / NumLaneElts;
7366 
7367   // SSSE3 targets can use the palignr instruction.
7368   if (Subtarget->hasSSSE3()) {
7369     // Cast the inputs to i8 vector of correct length to match PALIGNR.
7370     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7371     Lo = DAG.getBitcast(AlignVT, Lo);
7372     Hi = DAG.getBitcast(AlignVT, Hi);
7373 
7374     return DAG.getBitcast(
7375         VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
7376                         DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
7377   }
7378 
7379   assert(VT.is128BitVector() &&
7380          "Rotate-based lowering only supports 128-bit lowering!");
7381   assert(Mask.size() <= 16 &&
7382          "Can shuffle at most 16 bytes in a 128-bit vector!");
7383 
7384   // Default SSE2 implementation.
7385   int LoByteShift = 16 - Rotation * Scale;
7386   int HiByteShift = Rotation * Scale;
7387 
7388   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7389   Lo = DAG.getBitcast(MVT::v2i64, Lo);
7390   Hi = DAG.getBitcast(MVT::v2i64, Hi);
7391 
7392   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7393                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
7394   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7395                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
7396   return DAG.getBitcast(VT,
7397                         DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7398 }
7399 
7400 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7401 ///
7402 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
7403 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
7404 /// matches elements from one of the input vectors shuffled to the left or
7405 /// right with zeroable elements 'shifted in'. It handles both the strictly
7406 /// bit-wise element shifts and the byte shift across an entire 128-bit double
7407 /// quad word lane.
7408 ///
7409 /// PSLL : (little-endian) left bit shift.
7410 /// [ zz, 0, zz, 2 ] 7411 /// [ -1, 4, zz, -1 ] 7412 /// PSRL : (little-endian) right bit shift. 7413 /// [ 1, zz, 3, zz] 7414 /// [ -1, -1, 7, zz] 7415 /// PSLLDQ : (little-endian) left byte shift 7416 /// [ zz, 0, 1, 2, 3, 4, 5, 6] 7417 /// [ zz, zz, -1, -1, 2, 3, 4, -1] 7418 /// [ zz, zz, zz, zz, zz, zz, -1, 1] 7419 /// PSRLDQ : (little-endian) right byte shift 7420 /// [ 5, 6, 7, zz, zz, zz, zz, zz] 7421 /// [ -1, 5, 6, 7, zz, zz, zz, zz] 7422 /// [ 1, 2, -1, -1, -1, -1, zz, zz] 7423 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, 7424 SDValue V2, ArrayRef<int> Mask, 7425 SelectionDAG &DAG) { 7426 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7427 7428 int Size = Mask.size(); 7429 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); 7430 7431 auto CheckZeros = [&](int Shift, int Scale, bool Left) { 7432 for (int i = 0; i < Size; i += Scale) 7433 for (int j = 0; j < Shift; ++j) 7434 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) 7435 return false; 7436 7437 return true; 7438 }; 7439 7440 auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { 7441 for (int i = 0; i != Size; i += Scale) { 7442 unsigned Pos = Left ? i + Shift : i; 7443 unsigned Low = Left ? i : i + Shift; 7444 unsigned Len = Scale - Shift; 7445 if (!isSequentialOrUndefInRange(Mask, Pos, Len, 7446 Low + (V == V1 ? 0 : Size))) 7447 return SDValue(); 7448 } 7449 7450 int ShiftEltBits = VT.getScalarSizeInBits() * Scale; 7451 bool ByteShift = ShiftEltBits > 64; 7452 unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) 7453 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); 7454 int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); 7455 7456 // Normalize the scale for byte shifts to still produce an i64 element 7457 // type. 7458 Scale = ByteShift ? Scale / 2 : Scale; 7459 7460 // We need to round trip through the appropriate type for the shift. 7461 MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); 7462 MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); 7463 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && 7464 "Illegal integer vector type"); 7465 V = DAG.getBitcast(ShiftVT, V); 7466 7467 V = DAG.getNode(OpCode, DL, ShiftVT, V, 7468 DAG.getConstant(ShiftAmt, DL, MVT::i8)); 7469 return DAG.getBitcast(VT, V); 7470 }; 7471 7472 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just 7473 // keep doubling the size of the integer elements up to that. We can 7474 // then shift the elements of the integer vector by whole multiples of 7475 // their width within the elements of the larger integer vector. Test each 7476 // multiple to see if we can find a match with the moved element indices 7477 // and that the shifted in elements are all zeroable. 7478 for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) 7479 for (int Shift = 1; Shift != Scale; ++Shift) 7480 for (bool Left : {true, false}) 7481 if (CheckZeros(Shift, Scale, Left)) 7482 for (SDValue V : {V1, V2}) 7483 if (SDValue Match = MatchShift(Shift, Scale, Left, V)) 7484 return Match; 7485 7486 // no match 7487 return SDValue(); 7488 } 7489 7490 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. 
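/// For example (illustrative): the v8i16 mask
///   [ 2, 3, zz, zz, undef, undef, undef, undef ]
/// can be lowered as EXTRQI with a bit length of 32 (two 16-bit elements)
/// and a bit index of 32 (starting at element 2).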
7491 static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, 7492 SDValue V2, ArrayRef<int> Mask, 7493 SelectionDAG &DAG) { 7494 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7495 assert(!Zeroable.all() && "Fully zeroable shuffle mask"); 7496 7497 int Size = Mask.size(); 7498 int HalfSize = Size / 2; 7499 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); 7500 7501 // Upper half must be undefined. 7502 if (!isUndefInRange(Mask, HalfSize, HalfSize)) 7503 return SDValue(); 7504 7505 // EXTRQ: Extract Len elements from lower half of source, starting at Idx. 7506 // Remainder of lower half result is zero and upper half is all undef. 7507 auto LowerAsEXTRQ = [&]() { 7508 // Determine the extraction length from the part of the 7509 // lower half that isn't zeroable. 7510 int Len = HalfSize; 7511 for (; Len > 0; --Len) 7512 if (!Zeroable[Len - 1]) 7513 break; 7514 assert(Len > 0 && "Zeroable shuffle mask"); 7515 7516 // Attempt to match first Len sequential elements from the lower half. 7517 SDValue Src; 7518 int Idx = -1; 7519 for (int i = 0; i != Len; ++i) { 7520 int M = Mask[i]; 7521 if (M < 0) 7522 continue; 7523 SDValue &V = (M < Size ? V1 : V2); 7524 M = M % Size; 7525 7526 // The extracted elements must start at a valid index and all mask 7527 // elements must be in the lower half. 7528 if (i > M || M >= HalfSize) 7529 return SDValue(); 7530 7531 if (Idx < 0 || (Src == V && Idx == (M - i))) { 7532 Src = V; 7533 Idx = M - i; 7534 continue; 7535 } 7536 return SDValue(); 7537 } 7538 7539 if (Idx < 0) 7540 return SDValue(); 7541 7542 assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); 7543 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; 7544 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; 7545 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, 7546 DAG.getConstant(BitLen, DL, MVT::i8), 7547 DAG.getConstant(BitIdx, DL, MVT::i8)); 7548 }; 7549 7550 if (SDValue ExtrQ = LowerAsEXTRQ()) 7551 return ExtrQ; 7552 7553 // INSERTQ: Extract lowest Len elements from lower half of second source and 7554 // insert over first source, starting at Idx. 7555 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } 7556 auto LowerAsInsertQ = [&]() { 7557 for (int Idx = 0; Idx != HalfSize; ++Idx) { 7558 SDValue Base; 7559 7560 // Attempt to match first source from mask before insertion point. 7561 if (isUndefInRange(Mask, 0, Idx)) { 7562 /* EMPTY */ 7563 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { 7564 Base = V1; 7565 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { 7566 Base = V2; 7567 } else { 7568 continue; 7569 } 7570 7571 // Extend the extraction length looking to match both the insertion of 7572 // the second source and the remaining elements of the first. 7573 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { 7574 SDValue Insert; 7575 int Len = Hi - Idx; 7576 7577 // Match insertion. 7578 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { 7579 Insert = V1; 7580 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { 7581 Insert = V2; 7582 } else { 7583 continue; 7584 } 7585 7586 // Match the remaining elements of the lower half. 
7587         if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
7588           /* EMPTY */
7589         } else if ((!Base || (Base == V1)) &&
7590                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
7591           Base = V1;
7592         } else if ((!Base || (Base == V2)) &&
7593                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
7594                                               Size + Hi)) {
7595           Base = V2;
7596         } else {
7597           continue;
7598         }
7599 
7600         // We may not have a base (first source) - this can safely be undefined.
7601         if (!Base)
7602           Base = DAG.getUNDEF(VT);
7603 
7604         int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7605         int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7606         return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
7607                            DAG.getConstant(BitLen, DL, MVT::i8),
7608                            DAG.getConstant(BitIdx, DL, MVT::i8));
7609       }
7610     }
7611 
7612     return SDValue();
7613   };
7614 
7615   if (SDValue InsertQ = LowerAsInsertQ())
7616     return InsertQ;
7617 
7618   return SDValue();
7619 }
7620 
7621 /// \brief Lower a vector shuffle as a zero or any extension.
7622 ///
7623 /// Given a specific number of elements, element bit width, and extension
7624 /// stride, produce either a zero or any extension based on the available
7625 /// features of the subtarget. The extended elements are consecutive and
7626 /// can start from an offset element index in the input; to avoid excess
7627 /// shuffling, the offset must either be in the bottom lane or at the
7628 /// start of a higher lane. All extended elements must be from the same
7629 /// lane.
7630 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7631     SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
7632     ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7633   assert(Scale > 1 && "Need a scale to extend.");
7634   int EltBits = VT.getScalarSizeInBits();
7635   int NumElements = VT.getVectorNumElements();
7636   int NumEltsPerLane = 128 / EltBits;
7637   int OffsetLane = Offset / NumEltsPerLane;
7638   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7639          "Only 8, 16, and 32 bit elements can be extended.");
7640   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7641   assert(0 <= Offset && "Extension offset must be non-negative.");
7642   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
7643          "Extension offset must be in the first lane or start an upper lane.");
7644 
7645   // Check that an index is in the same lane as the base offset.
7646   auto SafeOffset = [&](int Idx) {
7647     return OffsetLane == (Idx / NumEltsPerLane);
7648   };
7649 
7650   // Shift along an input so that the offset base moves to the first element.
7651   auto ShuffleOffset = [&](SDValue V) {
7652     if (!Offset)
7653       return V;
7654 
7655     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
7656     for (int i = 0; i * Scale < NumElements; ++i) {
7657       int SrcIdx = i + Offset;
7658       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
7659     }
7660     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
7661   };
7662 
7663   // Found a valid zext mask! Try various lowering strategies based on the
7664   // input type and available ISA extensions.
7665   if (Subtarget->hasSSE41()) {
7666     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
7667     // PUNPCK will catch this in a later shuffle match.
7668 if (Offset && Scale == 2 && VT.is128BitVector()) 7669 return SDValue(); 7670 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), 7671 NumElements / Scale); 7672 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); 7673 return DAG.getBitcast(VT, InputV); 7674 } 7675 7676 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); 7677 7678 // For any extends we can cheat for larger element sizes and use shuffle 7679 // instructions that can fold with a load and/or copy. 7680 if (AnyExt && EltBits == 32) { 7681 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, 7682 -1}; 7683 return DAG.getBitcast( 7684 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 7685 DAG.getBitcast(MVT::v4i32, InputV), 7686 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); 7687 } 7688 if (AnyExt && EltBits == 16 && Scale > 2) { 7689 int PSHUFDMask[4] = {Offset / 2, -1, 7690 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; 7691 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 7692 DAG.getBitcast(MVT::v4i32, InputV), 7693 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); 7694 int PSHUFWMask[4] = {1, -1, -1, -1}; 7695 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); 7696 return DAG.getBitcast( 7697 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, 7698 DAG.getBitcast(MVT::v8i16, InputV), 7699 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); 7700 } 7701 7702 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes 7703 // to 64-bits. 7704 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { 7705 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); 7706 assert(VT.is128BitVector() && "Unexpected vector width!"); 7707 7708 int LoIdx = Offset * EltBits; 7709 SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, 7710 DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, 7711 DAG.getConstant(EltBits, DL, MVT::i8), 7712 DAG.getConstant(LoIdx, DL, MVT::i8))); 7713 7714 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || 7715 !SafeOffset(Offset + 1)) 7716 return DAG.getNode(ISD::BITCAST, DL, VT, Lo); 7717 7718 int HiIdx = (Offset + 1) * EltBits; 7719 SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, 7720 DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, 7721 DAG.getConstant(EltBits, DL, MVT::i8), 7722 DAG.getConstant(HiIdx, DL, MVT::i8))); 7723 return DAG.getNode(ISD::BITCAST, DL, VT, 7724 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); 7725 } 7726 7727 // If this would require more than 2 unpack instructions to expand, use 7728 // pshufb when available. We can only use more than 2 unpack instructions 7729 // when zero extending i8 elements which also makes it easier to use pshufb. 7730 if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { 7731 assert(NumElements == 16 && "Unexpected byte vector width!"); 7732 SDValue PSHUFBMask[16]; 7733 for (int i = 0; i < 16; ++i) { 7734 int Idx = Offset + (i / Scale); 7735 PSHUFBMask[i] = DAG.getConstant( 7736 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); 7737 } 7738 InputV = DAG.getBitcast(MVT::v16i8, InputV); 7739 return DAG.getBitcast(VT, 7740 DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, 7741 DAG.getNode(ISD::BUILD_VECTOR, DL, 7742 MVT::v16i8, PSHUFBMask))); 7743 } 7744 7745 // If we are extending from an offset, ensure we start on a boundary that 7746 // we can unpack from. 
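  // For example (illustrative): zero-extending v16i8 elements with Scale == 4
  // from Offset == 3 gives AlignToUnpack == 3 % (16 / 4) == 3, so we first
  // shift the input down by three elements and rebase the offset to zero.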
7747   int AlignToUnpack = Offset % (NumElements / Scale);
7748   if (AlignToUnpack) {
7749     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
7750     for (int i = AlignToUnpack; i < NumElements; ++i)
7751       ShMask[i - AlignToUnpack] = i;
7752     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
7753     Offset -= AlignToUnpack;
7754   }
7755 
7756   // Otherwise emit a sequence of unpacks.
7757   do {
7758     unsigned UnpackLoHi = X86ISD::UNPCKL;
7759     if (Offset >= (NumElements / 2)) {
7760       UnpackLoHi = X86ISD::UNPCKH;
7761       Offset -= (NumElements / 2);
7762     }
7763 
7764     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7765     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7766                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7767     InputV = DAG.getBitcast(InputVT, InputV);
7768     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
7769     Scale /= 2;
7770     EltBits *= 2;
7771     NumElements /= 2;
7772   } while (Scale > 1);
7773   return DAG.getBitcast(VT, InputV);
7774 }
7775 
7776 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
7777 ///
7778 /// This routine will try to do everything in its power to cleverly lower
7779 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
7780 /// check for the profitability of this lowering; it tries to aggressively
7781 /// match this pattern. It will use all of the micro-architectural details it
7782 /// can to emit an efficient lowering. It handles both blends with all-zero
7783 /// inputs to explicitly zero-extend and undef lanes (sometimes undef due to
7784 /// masking out later).
7785 ///
7786 /// The reason we have dedicated lowering for zext-style shuffles is that they
7787 /// are both incredibly common and often quite performance sensitive.
7788 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7789     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7790     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7791   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7792 
7793   int Bits = VT.getSizeInBits();
7794   int NumLanes = Bits / 128;
7795   int NumElements = VT.getVectorNumElements();
7796   int NumEltsPerLane = NumElements / NumLanes;
7797   assert(VT.getScalarSizeInBits() <= 32 &&
7798          "Exceeds 32-bit integer zero extension limit");
7799   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
7800 
7801   // Define a helper function to check a particular ext-scale and lower to it
7802   // if valid.
7803   auto Lower = [&](int Scale) -> SDValue {
7804     SDValue InputV;
7805     bool AnyExt = true;
7806     int Offset = 0;
7807     int Matches = 0;
7808     for (int i = 0; i < NumElements; ++i) {
7809       int M = Mask[i];
7810       if (M == -1)
7811         continue; // Valid anywhere but doesn't tell us anything.
7812       if (i % Scale != 0) {
7813         // Each of the extended elements needs to be zeroable.
7814         if (!Zeroable[i])
7815           return SDValue();
7816 
7817         // We are no longer in the anyext case.
7818         AnyExt = false;
7819         continue;
7820       }
7821 
7822       // Each of the base elements needs to be consecutive indices into the
7823       // same input vector.
7824       SDValue V = M < NumElements ? V1 : V2;
7825       M = M % NumElements;
7826       if (!InputV) {
7827         InputV = V;
7828         Offset = M - (i / Scale);
7829       } else if (InputV != V)
7830         return SDValue(); // Flip-flopping inputs.
7831 
7832       // Offset must start in the lowest 128-bit lane or at the start of an
7833       // upper lane.
7834       // FIXME: Is it ever worth allowing a negative base offset?
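      // For example (illustrative): in a v8i32 shuffle (NumEltsPerLane == 4)
      // the valid offsets are 0-3 (within the bottom lane) and 4 (the start
      // of the upper lane).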
7835 if (!((0 <= Offset && Offset < NumEltsPerLane) || 7836 (Offset % NumEltsPerLane) == 0)) 7837 return SDValue(); 7838 7839 // If we are offsetting, all referenced entries must come from the same 7840 // lane. 7841 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) 7842 return SDValue(); 7843 7844 if ((M % NumElements) != (Offset + (i / Scale))) 7845 return SDValue(); // Non-consecutive strided elements. 7846 Matches++; 7847 } 7848 7849 // If we fail to find an input, we have a zero-shuffle which should always 7850 // have already been handled. 7851 // FIXME: Maybe handle this here in case during blending we end up with one? 7852 if (!InputV) 7853 return SDValue(); 7854 7855 // If we are offsetting, don't extend if we only match a single input, we 7856 // can always do better by using a basic PSHUF or PUNPCK. 7857 if (Offset != 0 && Matches < 2) 7858 return SDValue(); 7859 7860 return lowerVectorShuffleAsSpecificZeroOrAnyExtend( 7861 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); 7862 }; 7863 7864 // The widest scale possible for extending is to a 64-bit integer. 7865 assert(Bits % 64 == 0 && 7866 "The number of bits in a vector must be divisible by 64 on x86!"); 7867 int NumExtElements = Bits / 64; 7868 7869 // Each iteration, try extending the elements half as much, but into twice as 7870 // many elements. 7871 for (; NumExtElements < NumElements; NumExtElements *= 2) { 7872 assert(NumElements % NumExtElements == 0 && 7873 "The input vector size must be divisible by the extended size."); 7874 if (SDValue V = Lower(NumElements / NumExtElements)) 7875 return V; 7876 } 7877 7878 // General extends failed, but 128-bit vectors may be able to use MOVQ. 7879 if (Bits != 128) 7880 return SDValue(); 7881 7882 // Returns one of the source operands if the shuffle can be reduced to a 7883 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. 7884 auto CanZExtLowHalf = [&]() { 7885 for (int i = NumElements / 2; i != NumElements; ++i) 7886 if (!Zeroable[i]) 7887 return SDValue(); 7888 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) 7889 return V1; 7890 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) 7891 return V2; 7892 return SDValue(); 7893 }; 7894 7895 if (SDValue V = CanZExtLowHalf()) { 7896 V = DAG.getBitcast(MVT::v2i64, V); 7897 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); 7898 return DAG.getBitcast(VT, V); 7899 } 7900 7901 // No viable ext lowering found. 7902 return SDValue(); 7903 } 7904 7905 /// \brief Try to get a scalar value for a specific element of a vector. 7906 /// 7907 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. 7908 static SDValue getScalarValueForVectorElement(SDValue V, int Idx, 7909 SelectionDAG &DAG) { 7910 MVT VT = V.getSimpleValueType(); 7911 MVT EltVT = VT.getVectorElementType(); 7912 while (V.getOpcode() == ISD::BITCAST) 7913 V = V.getOperand(0); 7914 // If the bitcasts shift the element size, we can't extract an equivalent 7915 // element from it. 7916 MVT NewVT = V.getSimpleValueType(); 7917 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) 7918 return SDValue(); 7919 7920 if (V.getOpcode() == ISD::BUILD_VECTOR || 7921 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { 7922 // Ensure the scalar operand is the same size as the destination. 7923 // FIXME: Add support for scalar truncation where possible. 
7924     SDValue S = V.getOperand(Idx);
7925     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
7926       return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
7927   }
7928 
7929   return SDValue();
7930 }
7931 
7932 /// \brief Helper to test for a load that can be folded with x86 shuffles.
7933 ///
7934 /// This is particularly important because the set of instructions varies
7935 /// significantly based on whether the operand is a load or not.
7936 static bool isShuffleFoldableLoad(SDValue V) {
7937   while (V.getOpcode() == ISD::BITCAST)
7938     V = V.getOperand(0);
7939 
7940   return ISD::isNON_EXTLoad(V.getNode());
7941 }
7942 
7943 /// \brief Try to lower insertion of a single element into a zero vector.
7944 ///
7945 /// This is a common pattern for which we have especially efficient patterns
7946 /// to lower across all subtarget feature sets.
7947 static SDValue lowerVectorShuffleAsElementInsertion(
7948     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7949     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7950   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7951   MVT ExtVT = VT;
7952   MVT EltVT = VT.getVectorElementType();
7953 
7954   int V2Index = std::find_if(Mask.begin(), Mask.end(),
7955                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
7956                 Mask.begin();
7957   bool IsV1Zeroable = true;
7958   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7959     if (i != V2Index && !Zeroable[i]) {
7960       IsV1Zeroable = false;
7961       break;
7962     }
7963 
7964   // Check for a single input from a SCALAR_TO_VECTOR node.
7965   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
7966   // all the smarts here sunk into that routine. However, the current
7967   // lowering of BUILD_VECTOR makes that nearly impossible until the old
7968   // vector shuffle lowering is dead.
7969   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
7970                                                DAG);
7971   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
7972     // We need to zext the scalar if it is smaller than an i32.
7973     V2S = DAG.getBitcast(EltVT, V2S);
7974     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
7975       // Using zext to expand a narrow element won't work for non-zero
7976       // insertions.
7977       if (!IsV1Zeroable)
7978         return SDValue();
7979 
7980       // Zero-extend directly to i32.
7981       ExtVT = MVT::v4i32;
7982       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
7983     }
7984     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
7985   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
7986              EltVT == MVT::i16) {
7987     // Either not inserting from the low element of the input or the input
7988     // element size is too small to use VZEXT_MOVL to clear the high bits.
7989     return SDValue();
7990   }
7991 
7992   if (!IsV1Zeroable) {
7993     // If V1 can't be treated as a zero vector we have fewer options to lower
7994     // this. We can't support integer vectors or non-zero targets cheaply, and
7995     // the V1 elements can't be permuted in any way.
7996     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
7997     if (!VT.isFloatingPoint() || V2Index != 0)
7998       return SDValue();
7999     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8000     V1Mask[V2Index] = -1;
8001     if (!isNoopShuffleMask(V1Mask))
8002       return SDValue();
8003     // This is essentially a special case blend operation, but if we have
8004     // general purpose blend operations, they are always faster. Bail and let
8005     // the rest of the lowering handle these as blends.
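    // For example (illustrative): a v4f32 shuffle <4, 1, 2, 3> reaches this
    // point and becomes MOVSS(V1, V2) below when SSE4.1 blends are
    // unavailable.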
8006 if (Subtarget->hasSSE41()) 8007 return SDValue(); 8008 8009 // Otherwise, use MOVSD or MOVSS. 8010 assert((EltVT == MVT::f32 || EltVT == MVT::f64) && 8011 "Only two types of floating point element types to handle!"); 8012 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, 8013 ExtVT, V1, V2); 8014 } 8015 8016 // This lowering only works for the low element with floating point vectors. 8017 if (VT.isFloatingPoint() && V2Index != 0) 8018 return SDValue(); 8019 8020 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); 8021 if (ExtVT != VT) 8022 V2 = DAG.getBitcast(VT, V2); 8023 8024 if (V2Index != 0) { 8025 // If we have 4 or fewer lanes we can cheaply shuffle the element into 8026 // the desired position. Otherwise it is more efficient to do a vector 8027 // shift left. We know that we can do a vector shift left because all 8028 // the inputs are zero. 8029 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { 8030 SmallVector<int, 4> V2Shuffle(Mask.size(), 1); 8031 V2Shuffle[V2Index] = 0; 8032 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); 8033 } else { 8034 V2 = DAG.getBitcast(MVT::v2i64, V2); 8035 V2 = DAG.getNode( 8036 X86ISD::VSHLDQ, DL, MVT::v2i64, V2, 8037 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, 8038 DAG.getTargetLoweringInfo().getScalarShiftAmountTy( 8039 DAG.getDataLayout(), VT))); 8040 V2 = DAG.getBitcast(VT, V2); 8041 } 8042 } 8043 return V2; 8044 } 8045 8046 /// \brief Try to lower broadcast of a single - truncated - integer element, 8047 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. 8048 /// 8049 /// This assumes we have AVX2. 8050 static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, 8051 int BroadcastIdx, 8052 const X86Subtarget *Subtarget, 8053 SelectionDAG &DAG) { 8054 assert(Subtarget->hasAVX2() && 8055 "We can only lower integer broadcasts with AVX2!"); 8056 8057 EVT EltVT = VT.getVectorElementType(); 8058 EVT V0VT = V0.getValueType(); 8059 8060 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); 8061 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); 8062 8063 EVT V0EltVT = V0VT.getVectorElementType(); 8064 if (!V0EltVT.isInteger()) 8065 return SDValue(); 8066 8067 const unsigned EltSize = EltVT.getSizeInBits(); 8068 const unsigned V0EltSize = V0EltVT.getSizeInBits(); 8069 8070 // This is only a truncation if the original element type is larger. 8071 if (V0EltSize <= EltSize) 8072 return SDValue(); 8073 8074 assert(((V0EltSize % EltSize) == 0) && 8075 "Scalar type sizes must all be powers of 2 on x86!"); 8076 8077 const unsigned V0Opc = V0.getOpcode(); 8078 const unsigned Scale = V0EltSize / EltSize; 8079 const unsigned V0BroadcastIdx = BroadcastIdx / Scale; 8080 8081 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && 8082 V0Opc != ISD::BUILD_VECTOR) 8083 return SDValue(); 8084 8085 SDValue Scalar = V0.getOperand(V0BroadcastIdx); 8086 8087 // If we're extracting non-least-significant bits, shift so we can truncate. 8088 // Hopefully, we can fold away the trunc/srl/load into the broadcast. 8089 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer 8090 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. 
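  // For example (illustrative): broadcasting byte 5 of a v4i32 build_vector
  // takes scalar operand 1 (Scale == 4), shifts it right by 8 bits
  // (OffsetIdx == 1, EltSize == 8), then truncates to i8 for the broadcast.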
8091 if (const int OffsetIdx = BroadcastIdx % Scale) 8092 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, 8093 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); 8094 8095 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, 8096 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); 8097 } 8098 8099 /// \brief Try to lower broadcast of a single element. 8100 /// 8101 /// For convenience, this code also bundles all of the subtarget feature set 8102 /// filtering. While a little annoying to re-dispatch on type here, there isn't 8103 /// a convenient way to factor it out. 8104 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? 8105 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, 8106 ArrayRef<int> Mask, 8107 const X86Subtarget *Subtarget, 8108 SelectionDAG &DAG) { 8109 if (!Subtarget->hasAVX()) 8110 return SDValue(); 8111 if (VT.isInteger() && !Subtarget->hasAVX2()) 8112 return SDValue(); 8113 8114 // Check that the mask is a broadcast. 8115 int BroadcastIdx = -1; 8116 for (int M : Mask) 8117 if (M >= 0 && BroadcastIdx == -1) 8118 BroadcastIdx = M; 8119 else if (M >= 0 && M != BroadcastIdx) 8120 return SDValue(); 8121 8122 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " 8123 "a sorted mask where the broadcast " 8124 "comes from V1."); 8125 8126 // Go up the chain of (vector) values to find a scalar load that we can 8127 // combine with the broadcast. 8128 for (;;) { 8129 switch (V.getOpcode()) { 8130 case ISD::CONCAT_VECTORS: { 8131 int OperandSize = Mask.size() / V.getNumOperands(); 8132 V = V.getOperand(BroadcastIdx / OperandSize); 8133 BroadcastIdx %= OperandSize; 8134 continue; 8135 } 8136 8137 case ISD::INSERT_SUBVECTOR: { 8138 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); 8139 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); 8140 if (!ConstantIdx) 8141 break; 8142 8143 int BeginIdx = (int)ConstantIdx->getZExtValue(); 8144 int EndIdx = 8145 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); 8146 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { 8147 BroadcastIdx -= BeginIdx; 8148 V = VInner; 8149 } else { 8150 V = VOuter; 8151 } 8152 continue; 8153 } 8154 } 8155 break; 8156 } 8157 8158 // Check if this is a broadcast of a scalar. We special case lowering 8159 // for scalars so that we can more effectively fold with loads. 8160 // First, look through bitcast: if the original value has a larger element 8161 // type than the shuffle, the broadcast element is in essence truncated. 8162 // Make that explicit to ease folding. 8163 if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) 8164 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( 8165 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) 8166 return TruncBroadcast; 8167 8168 // Also check the simpler case, where we can directly reuse the scalar. 8169 if (V.getOpcode() == ISD::BUILD_VECTOR || 8170 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { 8171 V = V.getOperand(BroadcastIdx); 8172 8173 // If the scalar isn't a load, we can't broadcast from it in AVX1. 8174 // Only AVX2 has register broadcasts. 8175 if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) 8176 return SDValue(); 8177 } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { 8178 // If we are broadcasting a load that is only used by the shuffle 8179 // then we can reduce the vector load to the broadcasted scalar load. 
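    // For example (illustrative): broadcasting element 2 of a single-use
    // v4f32 load becomes a VBROADCAST of an f32 load from BaseAddr + 8.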
8180 LoadSDNode *Ld = cast<LoadSDNode>(V); 8181 SDValue BaseAddr = Ld->getOperand(1); 8182 EVT AddrVT = BaseAddr.getValueType(); 8183 EVT SVT = VT.getScalarType(); 8184 unsigned Offset = BroadcastIdx * SVT.getStoreSize(); 8185 SDValue NewAddr = DAG.getNode( 8186 ISD::ADD, DL, AddrVT, BaseAddr, 8187 DAG.getConstant(Offset, DL, AddrVT)); 8188 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, 8189 DAG.getMachineFunction().getMachineMemOperand( 8190 Ld->getMemOperand(), Offset, SVT.getStoreSize())); 8191 } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { 8192 // We can't broadcast from a vector register without AVX2, and we can only 8193 // broadcast from the zero-element of a vector register. 8194 return SDValue(); 8195 } 8196 8197 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); 8198 } 8199 8200 // Check for whether we can use INSERTPS to perform the shuffle. We only use 8201 // INSERTPS when the V1 elements are already in the correct locations 8202 // because otherwise we can just always use two SHUFPS instructions which 8203 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also 8204 // perform INSERTPS if a single V1 element is out of place and all V2 8205 // elements are zeroable. 8206 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, 8207 ArrayRef<int> Mask, 8208 SelectionDAG &DAG) { 8209 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); 8210 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 8211 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 8212 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 8213 8214 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 8215 8216 unsigned ZMask = 0; 8217 int V1DstIndex = -1; 8218 int V2DstIndex = -1; 8219 bool V1UsedInPlace = false; 8220 8221 for (int i = 0; i < 4; ++i) { 8222 // Synthesize a zero mask from the zeroable elements (includes undefs). 8223 if (Zeroable[i]) { 8224 ZMask |= 1 << i; 8225 continue; 8226 } 8227 8228 // Flag if we use any V1 inputs in place. 8229 if (i == Mask[i]) { 8230 V1UsedInPlace = true; 8231 continue; 8232 } 8233 8234 // We can only insert a single non-zeroable element. 8235 if (V1DstIndex != -1 || V2DstIndex != -1) 8236 return SDValue(); 8237 8238 if (Mask[i] < 4) { 8239 // V1 input out of place for insertion. 8240 V1DstIndex = i; 8241 } else { 8242 // V2 input for insertion. 8243 V2DstIndex = i; 8244 } 8245 } 8246 8247 // Don't bother if we have no (non-zeroable) element for insertion. 8248 if (V1DstIndex == -1 && V2DstIndex == -1) 8249 return SDValue(); 8250 8251 // Determine element insertion src/dst indices. The src index is from the 8252 // start of the inserted vector, not the start of the concatenated vector. 8253 unsigned V2SrcIndex = 0; 8254 if (V1DstIndex != -1) { 8255 // If we have a V1 input out of place, we use V1 as the V2 element insertion 8256 // and don't use the original V2 at all. 8257 V2SrcIndex = Mask[V1DstIndex]; 8258 V2DstIndex = V1DstIndex; 8259 V2 = V1; 8260 } else { 8261 V2SrcIndex = Mask[V2DstIndex] - 4; 8262 } 8263 8264 // If no V1 inputs are used in place, then the result is created only from 8265 // the zero mask and the V2 insertion - so remove V1 dependency. 8266 if (!V1UsedInPlace) 8267 V1 = DAG.getUNDEF(MVT::v4f32); 8268 8269 unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; 8270 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 8271 8272 // Insert the V2 element into the desired position. 
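  // The INSERTPS immediate encodes the source element in bits [7:6], the
  // destination element in bits [5:4] and the zero mask in bits [3:0],
  // matching the InsertPSMask computed above.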
8273   SDLoc DL(Op);
8274   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8275                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
8276 }
8277 
8278 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
8279 /// UNPCK instruction.
8280 ///
8281 /// This specifically targets cases where we end up alternating between the
8282 /// two inputs, and so can permute them into something that feeds a single
8283 /// UNPCK instruction. Note that this routine only targets integer vectors
8284 /// because for floating point vectors we have a generalized SHUFPS lowering
8285 /// strategy that handles everything that doesn't *exactly* match an unpack,
8286 /// making this clever lowering unnecessary.
8287 static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
8288                                                     SDValue V1, SDValue V2,
8289                                                     ArrayRef<int> Mask,
8290                                                     SelectionDAG &DAG) {
8291   assert(!VT.isFloatingPoint() &&
8292          "This routine only supports integer vectors.");
8293   assert(!isSingleInputShuffleMask(Mask) &&
8294          "This routine should only be used when blending two inputs.");
8295   assert(Mask.size() >= 2 && "Single element masks are invalid.");
8296 
8297   int Size = Mask.size();
8298 
8299   int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
8300     return M >= 0 && M % Size < Size / 2;
8301   });
8302   int NumHiInputs = std::count_if(
8303       Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
8304 
8305   bool UnpackLo = NumLoInputs >= NumHiInputs;
8306 
8307   auto TryUnpack = [&](MVT UnpackVT, int Scale) {
8308     SmallVector<int, 32> V1Mask(Mask.size(), -1);
8309     SmallVector<int, 32> V2Mask(Mask.size(), -1);
8310 
8311     for (int i = 0; i < Size; ++i) {
8312       if (Mask[i] < 0)
8313         continue;
8314 
8315       // Each element of the unpack contains Scale elements from this mask.
8316       int UnpackIdx = i / Scale;
8317 
8318       // We only handle the case where V1 feeds the first slots of the unpack.
8319       // We rely on canonicalization to ensure this is the case.
8320       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
8321         return SDValue();
8322 
8323       // Setup the mask for this input. The indexing is tricky as we have to
8324       // handle the unpack stride.
8325       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
8326       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
8327           Mask[i] % Size;
8328     }
8329 
8330     // If we will have to shuffle both inputs to use the unpack, check whether
8331     // we can just unpack first and shuffle the result. If so, skip this unpack.
8332     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
8333         !isNoopShuffleMask(V2Mask))
8334       return SDValue();
8335 
8336     // Shuffle the inputs into place.
8337     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8338     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8339 
8340     // Cast the inputs to the type we will use to unpack them.
8341     V1 = DAG.getBitcast(UnpackVT, V1);
8342     V2 = DAG.getBitcast(UnpackVT, V2);
8343 
8344     // Unpack the inputs and cast the result back to the desired type.
8345     return DAG.getBitcast(
8346         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8347                         UnpackVT, V1, V2));
8348   };
8349 
8350   // We try each unpack from the largest to the smallest to find one that
8351   // fits this mask.
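  // For example (illustrative): for a v8i16 shuffle this tries v2i64, then
  // v4i32, and finally v8i16 unpacks.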
8352   int OrigNumElements = VT.getVectorNumElements();
8353   int OrigScalarSize = VT.getScalarSizeInBits();
8354   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
8355     int Scale = ScalarSize / OrigScalarSize;
8356     int NumElements = OrigNumElements / Scale;
8357     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
8358     if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
8359       return Unpack;
8360   }
8361 
8362   // If none of the unpack-rooted lowerings worked (or were profitable) try an
8363   // initial unpack.
8364   if (NumLoInputs == 0 || NumHiInputs == 0) {
8365     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
8366            "We have to have *some* inputs!");
8367     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
8368 
8369     // FIXME: We could consider the total complexity of the permute of each
8370     // possible unpacking. Or at the least we should consider how many
8371     // half-crossings are created.
8372     // FIXME: We could consider commuting the unpacks.
8373 
8374     SmallVector<int, 32> PermMask;
8375     PermMask.assign(Size, -1);
8376     for (int i = 0; i < Size; ++i) {
8377       if (Mask[i] < 0)
8378         continue;
8379 
8380       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
8381 
8382       PermMask[i] =
8383           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
8384     }
8385     return DAG.getVectorShuffle(
8386         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
8387                             DL, VT, V1, V2),
8388         DAG.getUNDEF(VT), PermMask);
8389   }
8390 
8391   return SDValue();
8392 }
8393 
8394 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8395 ///
8396 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8397 /// support for floating point shuffles but not integer shuffles. These
8398 /// instructions will incur a domain crossing penalty on some chips, though,
8399 /// so it is better to avoid lowering through this for integer vectors where
8400 /// possible.
8401 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8402                                        const X86Subtarget *Subtarget,
8403                                        SelectionDAG &DAG) {
8404   SDLoc DL(Op);
8405   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8406   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8407   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8408   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8409   ArrayRef<int> Mask = SVOp->getMask();
8410   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8411 
8412   if (isSingleInputShuffleMask(Mask)) {
8413     // Use low duplicate instructions for masks that match their pattern.
8414     if (Subtarget->hasSSE3())
8415       if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
8416         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8417 
8418     // Straight shuffle of a single input vector. Simulate this by using the
8419     // single input as both of the "inputs" to this instruction.
8420     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8421 
8422     if (Subtarget->hasAVX()) {
8423       // If we have AVX, we can use VPERMILPD, which will allow folding a load
8424       // into the shuffle.
8425 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, 8426 DAG.getConstant(SHUFPDMask, DL, MVT::i8)); 8427 } 8428 8429 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, 8430 DAG.getConstant(SHUFPDMask, DL, MVT::i8)); 8431 } 8432 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); 8433 assert(Mask[1] >= 2 && "Non-canonicalized blend!"); 8434 8435 // If we have a single input, insert that into V1 if we can do so cheaply. 8436 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { 8437 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 8438 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) 8439 return Insertion; 8440 // Try inverting the insertion since for v2 masks it is easy to do and we 8441 // can't reliably sort the mask one way or the other. 8442 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), 8443 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; 8444 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 8445 DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) 8446 return Insertion; 8447 } 8448 8449 // Try to use one of the special instruction patterns to handle two common 8450 // blend patterns if a zero-blend above didn't work. 8451 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || 8452 isShuffleEquivalent(V1, V2, Mask, {1, 3})) 8453 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) 8454 // We can either use a special instruction to load over the low double or 8455 // to move just the low double. 8456 return DAG.getNode( 8457 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, 8458 DL, MVT::v2f64, V2, 8459 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); 8460 8461 if (Subtarget->hasSSE41()) 8462 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, 8463 Subtarget, DAG)) 8464 return Blend; 8465 8466 // Use dedicated unpack instructions for masks that match their pattern. 8467 if (SDValue V = 8468 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) 8469 return V; 8470 8471 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); 8472 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, 8473 DAG.getConstant(SHUFPDMask, DL, MVT::i8)); 8474 } 8475 8476 /// \brief Handle lowering of 2-lane 64-bit integer shuffles. 8477 /// 8478 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by 8479 /// the integer unit to minimize domain crossing penalties. However, for blends 8480 /// it falls back to the floating point shuffle operation with appropriate bit 8481 /// casting. 8482 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 8483 const X86Subtarget *Subtarget, 8484 SelectionDAG &DAG) { 8485 SDLoc DL(Op); 8486 assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); 8487 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); 8488 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); 8489 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 8490 ArrayRef<int> Mask = SVOp->getMask(); 8491 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); 8492 8493 if (isSingleInputShuffleMask(Mask)) { 8494 // Check for being able to broadcast a single element. 8495 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, 8496 Mask, Subtarget, DAG)) 8497 return Broadcast; 8498 8499 // Straight shuffle of a single input vector. For everything from SSE2 8500 // onward this has a single fast instruction with no scary immediates. 
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // If we have a blend of two PACKUS operations and the blend aligns with the
  // low and high halves, we can just merge the PACKUS operations. This is
  // particularly important as it lets us merge shuffles that this routine
  // itself creates.
  auto GetPackNode = [](SDValue V) {
    while (V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);

    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2))
      return DAG.getBitcast(MVT::v2i64,
                            DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
                                        Mask[0] == 0 ? V1Pack.getOperand(0)
                                                     : V1Pack.getOperand(1),
                                        Mask[1] == 2 ? V2Pack.getOperand(0)
                                                     : V2Pack.getOperand(1)));

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget->hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget->hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles
  // here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}

/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}

/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using
/// SHUFPS. It makes no assumptions about whether this is the *best* lowering;
/// it simply uses it.
static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index =
        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
        Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] == -1) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle
      // to arrange things in the right direction.
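      // For example, Mask = {4, 5, 0, 1} becomes NewMask = {0, 1, 0, 1} once
      // the two inputs are swapped below.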
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the
      // final shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a blend.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}

/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget->hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget->hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However,
  // we have custom ways we can lower more complex single-element blends below
  // that we defer to if both this and BLENDPS fail to match, so restrict this
  // to when the V2 input is targeting element 0 of the mask -- that is the
  // fast case here.
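  // (For example, Mask = {4, 1, 2, 3} targets V2's first element at lane 0
  // and is handled here.)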
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  if (Subtarget->hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}

/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
                                                         Mask, Subtarget, DAG))
    return ZExt;

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget->hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  if (SDValue Masked =
          lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget->hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                      Mask, DAG);

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  return DAG.getBitcast(
      MVT::v4i32,
      DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
                           DAG.getBitcast(MVT::v4f32, V2), Mask));
}

/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
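///
/// For example, a v16i16 shuffle whose mask repeats the pattern
/// {1, 0, 3, 2, 5, 4, 7, 6} within each 128-bit lane should be lowered by
/// passing that 8-element mask.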
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
               [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
               [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter
  // this path. We will also combine away any sequence of PSHUFD instructions
  // that results into a single instruction. Here is an example of the tricky
  // case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] ------------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of the dword with only one word among the three inputs
    // in a half by taking the sum of the half with three inputs and
    // subtracting the sum of the actual three inputs. The difference is the
    // remaining slot.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;

    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for
    // AToA and BToA inputs. If there is also such a problem with the BToB and
    // AToB inputs, we don't try to fix it necessarily -- we'll recurse and
    // see it in the next pass. However, if we have a 2<-2 in the BToB and
    // AToB inputs, it is essential that we don't *create* a 3<-1 as then we
    // might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
                                         PinnedIdx ^ 1) != Inputs.end();
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this
          // bit in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
                                             FixFreeIdx) != Inputs.end();
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
                                        FixFreeIdx) != Inputs.end();
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
                          MVT::v8i16, V,
                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M != -1 && M == FixIdx)
              M = FixFreeIdx;
            else if (M != -1 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M != -1 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M != -1 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
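  // (An input is "in place" when it already lives in the half it targets;
  // only its position within that half may still need to change.)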
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = std::find(std::begin(SourceHalfMask),
                                   std::end(SourceHalfMask), -1) -
                         std::begin(SourceHalfMask) + SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is
          // no free slot adjacent to one of the inputs. In this case, we have
          // to swap an input with a non-input.
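          // (For example, with inputs at words 0 and 2 and words 1 and 3
          // already pinned in place, the input at word 2 is swapped with the
          // non-input at word 1 so the two inputs become adjacent.)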
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target half.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(std::count_if(LoMask.begin(), LoMask.end(),
                       [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(std::count_if(HiMask.begin(), HiMask.end(),
                       [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// \brief Helper to form a PSHUFB-based shuffle+blend.
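///
/// Lanes that are zeroable or sourced from the other input are encoded with
/// the 0x80 "zero this byte" selector in each input's PSHUFB control mask.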
static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG, bool &V1InUse,
                                          bool &V2InUse) {
  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] == -1) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> OrigMask = SVOp->getMask();
  int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
                        OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
  MutableArrayRef<int> Mask(MaskStorage);

  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
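  // (For example, the mask {0, 8, 1, 8, 2, 8, 3, 8} with a known-zero V2 is
  // just a zero-extension of the low four i16 elements.)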
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
    return ZExt;

  auto isV1 = [](int M) { return M >= 0 && M < 8; };
  (void)isV1;
  auto isV2 = [](int M) { return M >= 8; };

  int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift =
            lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
                                                     Subtarget, DAG);
  }

  assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget->hasSSE4A())
    if (SDValue V =
            lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget->hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;

  if (SDValue Masked =
          lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget->hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
                                      V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to, so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every 2^N-th element. Example shuffle
/// masks:
///
///  N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
///  N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
///  N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
///  N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
///  N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
  // Figure out whether we're looping over two inputs or just one.
  bool IsSingleInput = isSingleInputShuffleMask(Mask);

  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern
    // we want.
    if (Mask[i] == -1)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}

/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget->hasSSE4A())
    if (SDValue V =
            lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
      return V;

  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can
  // use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
                                                          Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                   [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
                   [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] != -1)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] != -1) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] == -1)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked =
          lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB,
  // we want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget->hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
                                                DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget->hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // prefer this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend
      // as an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
                                                         Mask, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some power-of-two N. See the helper
  // function for details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
    // NumEvenDrops is the log2 of the stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    bool IsSingleInput = isSingleInputShuffleMask(Mask);

    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = {MVT::v8i16, MVT::v4i32, MVT::v2i64};
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
                   [](int M) { return M >= 0 && M % 2 == 1; }) &&
      std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
                   [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zeroed lane of a vector.
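///
/// For example (illustrative): the v4i32 mask <0, 1, 6, 7> widens to the
/// v2i64 mask <0, 3>, whereas <1, 0, 6, 7> cannot be widened because its
/// first pair is not an aligned pair of adjacent elements.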
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    // If both elements are undef, it's trivial.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
      WidenedMask.push_back(SM_SentinelUndef);
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
        Mask[i + 1] % 2 == 1) {
      WidenedMask.push_back(Mask[i + 1] / 2);
      continue;
    }
    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
      WidenedMask.push_back(Mask[i] / 2);
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
        WidenedMask.push_back(SM_SentinelZero);
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
        Mask[i] + 1 == Mask[i + 1]) {
      WidenedMask.push_back(Mask[i] / 2);
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}

/// \brief Generic routine to split a vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
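  //
  // For example (illustrative): a v8i32 build vector (a, b, c, d, e, f, g, h)
  // is split into the two v4i32 build vectors (a, b, c, d) and (e, f, g, h)
  // rather than into subvector extracts of the wide node.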
  auto SplitVector = [&](SDValue V) {
    while (V.getOpcode() == ISD::BITCAST)
      V = V->getOperand(0);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
      HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask.push_back(M - NumElements);
        V1BlendMask.push_back(-1);
        BlendMask.push_back(SplitNumElements + i);
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V2BlendMask.push_back(-1);
        V1BlendMask.push_back(M);
        BlendMask.push_back(i);
      } else {
        V2BlendMask.push_back(-1);
        V1BlendMask.push_back(-1);
        BlendMask.push_back(-1);
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend
      // mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend
      // mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}

/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
                                                SDValue V2, ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
         "lower single-input shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx == -1)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx == -1)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This
  // requires that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector.
/// This lowering strategy results in four
/// instructions in the worst case for a single-input cross-lane shuffle,
/// which is lower than any other fully general cross-lane shuffle strategy
/// I'm aware of. Special cases for each particular shuffle pattern should be
/// handled prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int LaneSize = Mask.size() / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  if (isSingleInputShuffleMask(Mask)) {
    SmallVector<int, 32> FlippedBlendMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      FlippedBlendMask.push_back(
          Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                  ? Mask[i]
                                  : Mask[i] % LaneSize +
                                        (i / LaneSize) * LaneSize + Size));

    // Flip the vector, and blend the results which should now be in-lane. The
    // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
    // 5 for the high source. The value 3 selects the high half of source 2 and
    // the value 2 selects the low half of source 2. We only use source 2 to
    // allow folding it into a memory operand.
    unsigned PERMMask = 3 | 2 << 4;
    SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                  V1, DAG.getConstant(PERMMask, DL, MVT::i8));
    return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
  }

  // This now reduces to two single-input shuffles of V1 and V2 which at worst
  // will be handled by the above logic and a blend of the results, much like
  // other patterns in AVX.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
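  //
  // For example (illustrative): for v4f64, the mask <0, 1, 4, 5> with a
  // known-zero V2 first computes a control byte of 0x20, and the zeroing
  // fixups below turn it into 0x80: select V1's low half for the low half of
  // the result and zero the high half, with no explicit zero vector needed.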
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a
    // 128-bit subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                   VT.getVectorNumElements() / 2);
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0, DL));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                OnlyUsesV1 ? V1 : V2,
                                DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.

  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = Mask[0];
  if (MaskLO == SM_SentinelUndef)
    MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];

  int MaskHI = Mask[2];
  if (MaskHI == SM_SentinelUndef)
    MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];

  unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;

  // If either input is a zero vector, replace it with an undef input.
  // Shuffle mask values < 4 are selecting elements of V1.
  // Shuffle mask values >= 4 are selecting elements of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 4)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 4)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 4)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 4)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask
/// in each 128-bit lane. This handles many cases where we can quickly blend
/// away the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
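///
/// For example (illustrative): the two-input v8f32 mask
/// <12, 13, 14, 15, 4, 5, 6, 7> first resolves the lane crosses with a
/// v4f64-style lane shuffle that selects V2's high lane then V1's high lane,
/// after which the remaining in-lane mask is the identity repeated in both
/// lanes.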
static SDValue lowerVectorShuffleByMerging128BitLanes(
    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
  assert(!isSingleInputShuffleMask(Mask) &&
         "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes;
  Lanes.resize(NumLanes, -1);
  SmallVector<int, 4> InLaneMask;
  InLaneMask.resize(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask;
  LaneMask.resize(NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask;
  NewMask.resize(Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}

/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in
/// the slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}

static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
  // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
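  //
  // For example (illustrative): the v4f64 mask <0, 5, 2, 7> fits the SHUFPD
  // pattern directly and yields the immediate
  // (0 % 2) | (5 % 2) << 1 | (2 % 2) << 2 | (7 % 2) << 3 = 0b1010.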
  assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
  int NumElts = VT.getVectorNumElements();
  bool ShufpdMask = true;
  bool CommutableMask = true;
  unsigned Immediate = 0;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      CommutableMask = false;
    Immediate |= (Mask[i] % 2) << i;
  }
  if (ShufpdMask)
    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       DAG.getConstant(Immediate, DL, MVT::i8));
  if (CommutableMask)
    return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       DAG.getConstant(Immediate, DL, MVT::i8));
  return SDValue();
}

/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  SmallVector<int, 4> WidenedMask;
  if (canWidenShuffleElements(Mask, WidenedMask))
    return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
                                    DAG);

  if (isSingleInputShuffleMask(Mask)) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
                                                          Mask, Subtarget,
                                                          DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget->hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2,
                                                   Mask, DAG);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return Op;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input across lanes in a single
  // instruction, so skip this pattern.
  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                 isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
  if (Subtarget->hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");

  SmallVector<int, 4> WidenedMask;
  if (canWidenShuffleElements(Mask, WidenedMask))
    return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
                                    DAG);

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
  // use lower latency instructions that will operate on both 128-bit lanes.
  SmallVector<int, 2> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
    if (isSingleInputShuffleMask(Mask)) {
      int PSHUFDMask[] = {-1, -1, -1, -1};
      for (int i = 0; i < 2; ++i)
        if (RepeatedMask[i] >= 0) {
          PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
          PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
        }
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }
  }

  // AVX2 provides a direct instruction for permuting a single input across
  // lanes.
  if (isSingleInputShuffleMask(Mask))
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
    return Shift;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input across lanes in a single
  // instruction, so skip this pattern.
  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                 isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                    Mask, DAG);
}

/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (isSingleInputShuffleMask(Mask))
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends. We also need to squash the
    // repeated mask into a simulated v4f32 mask.
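    //
    // For example (illustrative): the repeated two-input mask <0, 8, 1, 9>
    // (with V2 entries offset by the v8f32 element count) squashes to the
    // v4f32-style mask <0, 4, 1, 5> that the SHUFPS lowering expects.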
    for (int i = 0; i < 4; ++i)
      if (RepeatedMask[i] >= 8)
        RepeatedMask[i] -= 4;
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2,
                                        DAG);
  }

  // If we have a single-input shuffle with different shuffle patterns in the
  // two 128-bit lanes, use the variable-mask VPERMILPS.
  if (isSingleInputShuffleMask(Mask)) {
    SDValue VPermMask[8];
    for (int i = 0; i < 8; ++i)
      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
                                 : DAG.getConstant(Mask[i], DL, MVT::i32);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(
          X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));

    if (Subtarget->hasAVX2())
      return DAG.getNode(
          X86ISD::VPERMV, DL, MVT::v8f32,
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2,
                                                   Mask, DAG);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget->hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1,
                                                         V2, Mask, Subtarget,
                                                         DAG))
    return ZExt;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
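  //
  // For example (illustrative): the v8i32 mask <1, 0, 3, 2, 5, 4, 7, 6>
  // repeats <1, 0, 3, 2> in both 128-bit lanes, so a single PSHUFD immediate
  // can describe the whole vector.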
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (isSingleInputShuffleMask(Mask))
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
    return Shift;

  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (isSingleInputShuffleMask(Mask)) {
    SDValue VPermMask[8];
    for (int i = 0; i < 8; ++i)
      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
                                 : DAG.getConstant(Mask[i], DL, MVT::i32);
    return DAG.getNode(
        X86ISD::VPERMV, DL, MVT::v8i32,
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                    Mask, DAG);
}

/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (isSingleInputShuffleMask(Mask)) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                     Mask, DAG);

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }

    SDValue PSHUFBMask[32];
    for (int i = 0; i < 16; ++i) {
      if (Mask[i] == -1) {
        PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
        continue;
      }

      int M = i < 8 ? Mask[i] : Mask[i] - 8;
      assert(M >= 0 && M < 8 && "Invalid single-input mask!");
      PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
      PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
    }
    return DAG.getBitcast(MVT::v16i16,
                          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
                                      DAG.getBitcast(MVT::v32i8, V1),
                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
                                                  MVT::v32i8, PSHUFBMask)));
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}

/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1,
                                                         V2, Mask, Subtarget,
                                                         DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (isSingleInputShuffleMask(Mask)) {
    // There are no generalized cross-lane shuffle operations available on i8
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
                                                     Mask, DAG);

    SDValue PSHUFBMask[32];
    for (int i = 0; i < 32; ++i)
      PSHUFBMask[i] =
          Mask[i] < 0
              ? DAG.getUNDEF(MVT::i8)
              : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
                                MVT::i8);

    return DAG.getNode(
        X86ISD::PSHUFB, DL, MVT::v32i8, V1,
        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}

/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
    return M >= NumElts;
  });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Subtarget, DAG))
      return Insertion;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that
  // domain.
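  //
  // For example (illustrative): under AVX1, a v8i32 shuffle is bitcast to
  // v8f32, lowered with the floating-point shuffle instructions, and bitcast
  // back to v8i32 at the end.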
  if (VT.isInteger() && !Subtarget->hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32)
      // No floating point type available, decompose into 128-bit vectors.
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}

/// \brief Try to lower a vector shuffle as a 128-bit block shuffle.
static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
                                        ArrayRef<int> Mask,
                                        SDValue V1, SDValue V2,
                                        SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128-bit shuffle.");

  // Handling 256-bit vectors requires VLX, and lowerV2X128VectorShuffle() is
  // most probably the better solution there.
  assert(VT.is512BitVector() && "Unexpected vector size for 128-bit shuffle.");

  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // Form a 128-bit permutation.
  // Convert the 64-bit shuffle mask selection values into 128-bit selection
  // bits defined by a vshuf64x2 instruction's immediate control byte.
  unsigned PermMask = 0, Imm = 0;
  unsigned ControlBitsNum = WidenedMask.size() / 2;

  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
    if (WidenedMask[i] == SM_SentinelZero)
      return SDValue();

    // Use the first element in place of an undef mask.
    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
    PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");

  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());

  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
  if (isSingleInputShuffleMask(Mask))
    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);

  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}

/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Shuf128;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}

/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
    return Unpck;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}

/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Shuf128;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}

/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
    return Unpck;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}

/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}

/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}

/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        MVT VT, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  SDLoc DL(Op);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Subtarget->hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume
  // that the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    if (Subtarget->hasBWI())
      return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
    break;
  case MVT::v64i8:
    if (Subtarget->hasBWI())
      return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
    break;

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }

  // Otherwise fall back on splitting.
11174 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11175 }
11176
11177 // Lower vXi1 vector shuffles.
11178 // There is no dedicated instruction on AVX-512 that shuffles mask vectors.
11179 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
11180 // vector, shuffle it, and then truncate the result back.
11181 static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11182 MVT VT, const X86Subtarget *Subtarget,
11183 SelectionDAG &DAG) {
11184 SDLoc DL(Op);
11185 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11186 ArrayRef<int> Mask = SVOp->getMask();
11187 assert(Subtarget->hasAVX512() &&
11188 "Cannot lower 512-bit vectors w/o basic ISA!");
11189 MVT ExtVT;
11190 switch (VT.SimpleTy) {
11191 default:
11192 llvm_unreachable("Expected a vector of i1 elements");
11193 case MVT::v2i1:
11194 ExtVT = MVT::v2i64;
11195 break;
11196 case MVT::v4i1:
11197 ExtVT = MVT::v4i32;
11198 break;
11199 case MVT::v8i1:
11200 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
11201 break;
11202 case MVT::v16i1:
11203 ExtVT = MVT::v16i32;
11204 break;
11205 case MVT::v32i1:
11206 ExtVT = MVT::v32i16;
11207 break;
11208 case MVT::v64i1:
11209 ExtVT = MVT::v64i8;
11210 break;
11211 }
11212
11213 if (ISD::isBuildVectorAllZeros(V1.getNode()))
11214 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
11215 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
11216 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
11217 else
11218 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
11219
11220 if (V2.isUndef())
11221 V2 = DAG.getUNDEF(ExtVT);
11222 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
11223 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
11224 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
11225 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
11226 else
11227 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
11228 return DAG.getNode(ISD::TRUNCATE, DL, VT,
11229 DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
11230 }
11231 /// \brief Top-level lowering for x86 vector shuffles.
11232 ///
11233 /// This handles decomposition, canonicalization, and lowering of all x86
11234 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11235 /// above in helper routines. The canonicalization attempts to widen shuffles
11236 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11237 /// s.t. only one of the two inputs needs to be tested, etc.
11238 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11239 SelectionDAG &DAG) {
11240 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11241 ArrayRef<int> Mask = SVOp->getMask();
11242 SDValue V1 = Op.getOperand(0);
11243 SDValue V2 = Op.getOperand(1);
11244 MVT VT = Op.getSimpleValueType();
11245 int NumElements = VT.getVectorNumElements();
11246 SDLoc dl(Op);
11247 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
11248
11249 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
11250 "Can't lower MMX shuffles");
11251
11252 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11253 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11254 if (V1IsUndef && V2IsUndef)
11255 return DAG.getUNDEF(VT);
11256
11257 // When we create a shuffle node we put the UNDEF node as the second operand,
11258 // but in some cases the first operand may be transformed to UNDEF.
11259 // In this case we should just commute the node.
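// For example, on v4i32, shuffle<4,5,6,7>(undef, V2) commutes to
// shuffle<0,1,2,3>(V2, undef), so real inputs always end up in operand 0.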
11260 if (V1IsUndef)
11261 return DAG.getCommutedVectorShuffle(*SVOp);
11262
11263 // Check for non-undef masks pointing at an undef vector and make the masks
11264 // undef as well. This makes it easier to match the shuffle based solely on
11265 // the mask.
11266 if (V2IsUndef)
11267 for (int M : Mask)
11268 if (M >= NumElements) {
11269 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11270 for (int &M : NewMask)
11271 if (M >= NumElements)
11272 M = -1;
11273 return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11274 }
11275
11276 // We actually see shuffles that are entirely re-arrangements of a set of
11277 // zero inputs. This mostly happens while decomposing complex shuffles into
11278 // simple ones. Directly lower these as a buildvector of zeros.
11279 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
11280 if (Zeroable.all())
11281 return getZeroVector(VT, Subtarget, DAG, dl);
11282
11283 // Try to collapse shuffles into using a vector type with fewer elements but
11284 // wider element types. We cap this to not form integers or floating point
11285 // elements wider than 64 bits, but it might be interesting to form i128
11286 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11287 SmallVector<int, 16> WidenedMask;
11288 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
11289 canWidenShuffleElements(Mask, WidenedMask)) {
11290 MVT NewEltVT = VT.isFloatingPoint()
11291 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11292 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11293 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11294 // Make sure that the new vector type is legal. For example, v2f64 isn't
11295 // legal on SSE1.
11296 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11297 V1 = DAG.getBitcast(NewVT, V1);
11298 V2 = DAG.getBitcast(NewVT, V2);
11299 return DAG.getBitcast(
11300 VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11301 }
11302 }
11303
11304 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11305 for (int M : SVOp->getMask())
11306 if (M < 0)
11307 ++NumUndefElements;
11308 else if (M < NumElements)
11309 ++NumV1Elements;
11310 else
11311 ++NumV2Elements;
11312
11313 // Commute the shuffle as needed such that more elements come from V1 than
11314 // V2. This allows us to match the shuffle pattern strictly on how many
11315 // elements come from V1 without handling the symmetric cases.
11316 if (NumV2Elements > NumV1Elements)
11317 return DAG.getCommutedVectorShuffle(*SVOp);
11318
11319 // When the number of V1 and V2 elements is the same, try to minimize the
11320 // number of uses of V2 in the low half of the vector. When that is tied,
11321 // ensure that the sum of indices for V1 is equal to or lower than the sum of
11322 // indices for V2. When those are equal, try to ensure that the number of odd
11323 // indices for V1 is lower than the number of odd indices for V2.
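// All three tie-breakers merely pick a canonical representative out of a
// shuffle and its commuted twin, so the per-width routines below only have
// to match one of the two forms.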
11324 if (NumV1Elements == NumV2Elements) {
11325 int LowV1Elements = 0, LowV2Elements = 0;
11326 for (int M : SVOp->getMask().slice(0, NumElements / 2))
11327 if (M >= NumElements)
11328 ++LowV2Elements;
11329 else if (M >= 0)
11330 ++LowV1Elements;
11331 if (LowV2Elements > LowV1Elements) {
11332 return DAG.getCommutedVectorShuffle(*SVOp);
11333 } else if (LowV2Elements == LowV1Elements) {
11334 int SumV1Indices = 0, SumV2Indices = 0;
11335 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11336 if (SVOp->getMask()[i] >= NumElements)
11337 SumV2Indices += i;
11338 else if (SVOp->getMask()[i] >= 0)
11339 SumV1Indices += i;
11340 if (SumV2Indices < SumV1Indices) {
11341 return DAG.getCommutedVectorShuffle(*SVOp);
11342 } else if (SumV2Indices == SumV1Indices) {
11343 int NumV1OddIndices = 0, NumV2OddIndices = 0;
11344 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11345 if (SVOp->getMask()[i] >= NumElements)
11346 NumV2OddIndices += i % 2;
11347 else if (SVOp->getMask()[i] >= 0)
11348 NumV1OddIndices += i % 2;
11349 if (NumV2OddIndices < NumV1OddIndices)
11350 return DAG.getCommutedVectorShuffle(*SVOp);
11351 }
11352 }
11353 }
11354
11355 // For each vector width, delegate to a specialized lowering routine.
11356 if (VT.is128BitVector())
11357 return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11358
11359 if (VT.is256BitVector())
11360 return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11361
11362 if (VT.is512BitVector())
11363 return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11364
11365 if (Is1BitVector)
11366 return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11367 llvm_unreachable("Unimplemented!");
11368 }
11369
11370 // This function assumes its argument is a BUILD_VECTOR of constants or
11371 // undef SDNodes, i.e., ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
11372 // true.
11373 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
11374 unsigned &MaskValue) {
11375 MaskValue = 0;
11376 unsigned NumElems = BuildVector->getNumOperands();
11377
11378 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11379 // We don't handle the >2 lanes case right now.
11380 unsigned NumLanes = (NumElems - 1) / 8 + 1;
11381 if (NumLanes > 2)
11382 return false;
11383
11384 unsigned NumElemsInLane = NumElems / NumLanes;
11385
11386 // Blend for v16i16 should be symmetric for both lanes.
11387 for (unsigned i = 0; i < NumElemsInLane; ++i) {
11388 SDValue EltCond = BuildVector->getOperand(i);
11389 SDValue SndLaneEltCond =
11390 (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
11391
11392 int Lane1Cond = -1, Lane2Cond = -1;
11393 if (isa<ConstantSDNode>(EltCond))
11394 Lane1Cond = !isNullConstant(EltCond);
11395 if (isa<ConstantSDNode>(SndLaneEltCond))
11396 Lane2Cond = !isNullConstant(SndLaneEltCond);
11397
11398 unsigned LaneMask = 0;
11399 if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
11400 // Lane1Cond != 0 means we want the first argument.
11401 // Lane1Cond == 0 means we want the second argument.
11402 // The encoding of this argument is 0 for the first argument, 1
11403 // for the second. Therefore, invert the condition.
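// For example, a v4i32 condition of <-1, 0, -1, 0> selects the first
// operand in lanes 0 and 2, so the inverted bits produce
// MaskValue = 0b1010.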
11404 LaneMask = !Lane1Cond << i;
11405 else if (Lane1Cond < 0)
11406 LaneMask = !Lane2Cond << i;
11407 else
11408 return false;
11409
11410 MaskValue |= LaneMask;
11411 if (NumLanes == 2)
11412 MaskValue |= LaneMask << NumElemsInLane;
11413 }
11414 return true;
11415 }
11416
11417 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
11418 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
11419 const X86Subtarget *Subtarget,
11420 SelectionDAG &DAG) {
11421 SDValue Cond = Op.getOperand(0);
11422 SDValue LHS = Op.getOperand(1);
11423 SDValue RHS = Op.getOperand(2);
11424 SDLoc dl(Op);
11425 MVT VT = Op.getSimpleValueType();
11426
11427 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
11428 return SDValue();
11429 auto *CondBV = cast<BuildVectorSDNode>(Cond);
11430
11431 // Only non-legal VSELECTs reach this lowering; convert those into generic
11432 // shuffles and reuse the shuffle lowering path for blends.
11433 SmallVector<int, 32> Mask;
11434 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
11435 SDValue CondElt = CondBV->getOperand(i);
11436 Mask.push_back(
11437 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
11438 : -1);
11439 }
11440 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
11441 }
11442
11443 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
11444 // A vselect where all conditions and data are constants can be optimized into
11445 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
11446 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
11447 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
11448 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
11449 return SDValue();
11450
11451 // Try to lower this to a blend-style vector shuffle. This can handle all
11452 // constant condition cases.
11453 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
11454 return BlendOp;
11455
11456 // Variable blends are only legal from SSE4.1 onward.
11457 if (!Subtarget->hasSSE41())
11458 return SDValue();
11459
11460 // Only some types will be legal on some subtargets. If we can emit a legal
11461 // VSELECT-matching blend, return Op; if we need to expand, return
11462 // a null value.
11463 switch (Op.getSimpleValueType().SimpleTy) {
11464 default:
11465 // Most of the vector types have blends past SSE4.1.
11466 return Op;
11467
11468 case MVT::v32i8:
11469 // The byte blends for AVX vectors were introduced only in AVX2.
11470 if (Subtarget->hasAVX2())
11471 return Op;
11472
11473 return SDValue();
11474
11475 case MVT::v8i16:
11476 case MVT::v16i16:
11477 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
11478 if (Subtarget->hasBWI() && Subtarget->hasVLX())
11479 return Op;
11480
11481 // FIXME: We should custom lower this by fixing the condition and using i8
11482 // blends.
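// (One possible shape for that, sketched here rather than implemented:
// bitcast the operands to v16i8/v32i8 and splat each i16 condition into
// both bytes of its lane, reducing this to the byte blends handled above.)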
11483 return SDValue();
11484 }
11485 }
11486
11487 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
11488 MVT VT = Op.getSimpleValueType();
11489 SDLoc dl(Op);
11490
11491 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
11492 return SDValue();
11493
11494 if (VT.getSizeInBits() == 8) {
11495 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
11496 Op.getOperand(0), Op.getOperand(1));
11497 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
11498 DAG.getValueType(VT));
11499 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
11500 }
11501
11502 if (VT.getSizeInBits() == 16) {
11503 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
11504 if (isNullConstant(Op.getOperand(1)))
11505 return DAG.getNode(
11506 ISD::TRUNCATE, dl, MVT::i16,
11507 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
11508 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
11509 Op.getOperand(1)));
11510 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
11511 Op.getOperand(0), Op.getOperand(1));
11512 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
11513 DAG.getValueType(VT));
11514 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
11515 }
11516
11517 if (VT == MVT::f32) {
11518 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
11519 // the result back to an FR32 register. It's only worth matching if the
11520 // result has a single use which is a store or a bitcast to i32. And in
11521 // the case of a store, it's not worth it if the index is a constant 0,
11522 // because a MOVSSmr can be used instead, which is smaller and faster.
11523 if (!Op.hasOneUse())
11524 return SDValue();
11525 SDNode *User = *Op.getNode()->use_begin();
11526 if ((User->getOpcode() != ISD::STORE ||
11527 isNullConstant(Op.getOperand(1))) &&
11528 (User->getOpcode() != ISD::BITCAST ||
11529 User->getValueType(0) != MVT::i32))
11530 return SDValue();
11531 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
11532 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
11533 Op.getOperand(1));
11534 return DAG.getBitcast(MVT::f32, Extract);
11535 }
11536
11537 if (VT == MVT::i32 || VT == MVT::i64) {
11538 // EXTRACTPS/PEXTRQ work with a constant index.
11539 if (isa<ConstantSDNode>(Op.getOperand(1)))
11540 return Op;
11541 }
11542 return SDValue();
11543 }
11544
11545 /// Extract one bit from mask vector, like v16i1 or v8i1.
11546 /// AVX-512 feature.
11547 SDValue
11548 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
11549 SDValue Vec = Op.getOperand(0);
11550 SDLoc dl(Vec);
11551 MVT VecVT = Vec.getSimpleValueType();
11552 SDValue Idx = Op.getOperand(1);
11553 MVT EltVT = Op.getSimpleValueType();
11554
11555 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
11556 assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
11557 "Unexpected vector type in ExtractBitFromMaskVector");
11558
11559 // A variable index can't be handled in mask registers;
11560 // extend the vector to VR512.
11561 if (!isa<ConstantSDNode>(Idx)) {
11562 MVT ExtVT = (VecVT == MVT::v8i1 ?
MVT::v8i64 : MVT::v16i32);
11563 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
11564 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
11565 ExtVT.getVectorElementType(), Ext, Idx);
11566 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
11567 }
11568
11569 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
11570 const TargetRegisterClass* rc = getRegClassFor(VecVT);
11571 if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
11572 rc = getRegClassFor(MVT::v16i1);
11573 unsigned MaxShift = rc->getSize()*8 - 1;
11574 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
11575 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
11576 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
11577 DAG.getConstant(MaxShift, dl, MVT::i8));
11578 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
11579 DAG.getIntPtrConstant(0, dl));
11580 }
11581
11582 SDValue
11583 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
11584 SelectionDAG &DAG) const {
11585 SDLoc dl(Op);
11586 SDValue Vec = Op.getOperand(0);
11587 MVT VecVT = Vec.getSimpleValueType();
11588 SDValue Idx = Op.getOperand(1);
11589
11590 if (Op.getSimpleValueType() == MVT::i1)
11591 return ExtractBitFromMaskVector(Op, DAG);
11592
11593 if (!isa<ConstantSDNode>(Idx)) {
11594 if (VecVT.is512BitVector() ||
11595 (VecVT.is256BitVector() && Subtarget->hasInt256() &&
11596 VecVT.getVectorElementType().getSizeInBits() == 32)) {
11597
11598 MVT MaskEltVT =
11599 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
11600 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
11601 MaskEltVT.getSizeInBits());
11602
11603 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
11604 auto PtrVT = getPointerTy(DAG.getDataLayout());
11605 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
11606 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
11607 DAG.getConstant(0, dl, PtrVT));
11608 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
11609 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
11610 DAG.getConstant(0, dl, PtrVT));
11611 }
11612 return SDValue();
11613 }
11614
11615 // If this is a 256-bit vector result, first extract the 128-bit vector and
11616 // then extract the element from the 128-bit vector.
11617 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
11618
11619 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
11620 // Get the 128-bit vector.
11621 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
11622 MVT EltVT = VecVT.getVectorElementType();
11623
11624 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
11625 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
11626
11627 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
11628 // this can be done with a mask.
11629 IdxVal &= ElemsPerChunk - 1;
11630 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
11631 DAG.getConstant(IdxVal, dl, MVT::i32));
11632 }
11633
11634 assert(VecVT.is128BitVector() && "Unexpected vector length");
11635
11636 if (Subtarget->hasSSE41())
11637 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
11638 return Res;
11639
11640 MVT VT = Op.getSimpleValueType();
11641 // TODO: handle v16i8.
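// An i16 extract lowers to PEXTRW, which always writes a full 32-bit GPR;
// the AssertZext below records that the upper 16 bits are zero so the
// final TRUNCATE can fold away.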
11642 if (VT.getSizeInBits() == 16) {
11643 SDValue Vec = Op.getOperand(0);
11644 if (isNullConstant(Op.getOperand(1)))
11645 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
11646 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
11647 DAG.getBitcast(MVT::v4i32, Vec),
11648 Op.getOperand(1)));
11649 // Transform it so it matches pextrw, which produces a 32-bit result.
11650 MVT EltVT = MVT::i32;
11651 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
11652 Op.getOperand(0), Op.getOperand(1));
11653 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
11654 DAG.getValueType(VT));
11655 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
11656 }
11657
11658 if (VT.getSizeInBits() == 32) {
11659 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11660 if (Idx == 0)
11661 return Op;
11662
11663 // SHUFPS the element to the lowest double word, then movss.
11664 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
11665 MVT VVT = Op.getOperand(0).getSimpleValueType();
11666 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
11667 DAG.getUNDEF(VVT), Mask);
11668 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
11669 DAG.getIntPtrConstant(0, dl));
11670 }
11671
11672 if (VT.getSizeInBits() == 64) {
11673 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
11674 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
11675 // to match extract_elt for f64.
11676 if (isNullConstant(Op.getOperand(1)))
11677 return Op;
11678
11679 // UNPCKHPD the element to the lowest double word, then movsd.
11680 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
11681 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
11682 int Mask[2] = { 1, -1 };
11683 MVT VVT = Op.getOperand(0).getSimpleValueType();
11684 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
11685 DAG.getUNDEF(VVT), Mask);
11686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
11687 DAG.getIntPtrConstant(0, dl));
11688 }
11689
11690 return SDValue();
11691 }
11692
11693 /// Insert one bit to mask vector, like v16i1 or v8i1.
11694 /// AVX-512 feature.
11695 SDValue
11696 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
11697 SDLoc dl(Op);
11698 SDValue Vec = Op.getOperand(0);
11699 SDValue Elt = Op.getOperand(1);
11700 SDValue Idx = Op.getOperand(2);
11701 MVT VecVT = Vec.getSimpleValueType();
11702
11703 if (!isa<ConstantSDNode>(Idx)) {
11704 // Non-constant index: extend the source and destination,
11705 // insert the element, and then truncate the result.
11706 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
11707 MVT ExtEltVT = (VecVT == MVT::v8i1 ?
MVT::i64 : MVT::i32);
11708 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
11709 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
11710 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
11711 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
11712 }
11713
11714 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
11715 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
11716 if (IdxVal)
11717 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
11718 DAG.getConstant(IdxVal, dl, MVT::i8));
11719 if (Vec.getOpcode() == ISD::UNDEF)
11720 return EltInVec;
11721 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
11722 }
11723
11724 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
11725 SelectionDAG &DAG) const {
11726 MVT VT = Op.getSimpleValueType();
11727 MVT EltVT = VT.getVectorElementType();
11728
11729 if (EltVT == MVT::i1)
11730 return InsertBitToMaskVector(Op, DAG);
11731
11732 SDLoc dl(Op);
11733 SDValue N0 = Op.getOperand(0);
11734 SDValue N1 = Op.getOperand(1);
11735 SDValue N2 = Op.getOperand(2);
11736 if (!isa<ConstantSDNode>(N2))
11737 return SDValue();
11738 auto *N2C = cast<ConstantSDNode>(N2);
11739 unsigned IdxVal = N2C->getZExtValue();
11740
11741 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
11742 // into that, and then insert the subvector back into the result.
11743 if (VT.is256BitVector() || VT.is512BitVector()) {
11744 // With a 256-bit vector, we can insert into the zero element efficiently
11745 // using a blend if we have AVX or AVX2 and the right data type.
11746 if (VT.is256BitVector() && IdxVal == 0) {
11747 // TODO: It is worthwhile to cast integer to floating point and back
11748 // and incur a domain crossing penalty if that's what we'll end up
11749 // doing anyway after extracting to a 128-bit vector.
11750 if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
11751 (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
11752 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
11753 N2 = DAG.getIntPtrConstant(1, dl);
11754 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
11755 }
11756 }
11757
11758 // Get the desired 128-bit vector chunk.
11759 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
11760
11761 // Insert the element into the desired chunk.
11762 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
11763 assert(isPowerOf2_32(NumEltsIn128));
11764 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
11765 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
11766
11767 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
11768 DAG.getConstant(IdxIn128, dl, MVT::i32));
11769
11770 // Insert the changed part back into the bigger vector.
11771 return Insert128BitVector(N0, V, IdxVal, DAG, dl);
11772 }
11773 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
11774
11775 if (Subtarget->hasSSE41()) {
11776 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
11777 unsigned Opc;
11778 if (VT == MVT::v8i16) {
11779 Opc = X86ISD::PINSRW;
11780 } else {
11781 assert(VT == MVT::v16i8);
11782 Opc = X86ISD::PINSRB;
11783 }
11784
11785 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
11786 // argument.
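// For example, inserting i8 %x at index 5 of a v16i8 becomes
// (PINSRB V, (any_extend %x to i32), 5); the instruction only consumes
// the low 8 bits of the GR32.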
11787 if (N1.getValueType() != MVT::i32)
11788 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
11789 if (N2.getValueType() != MVT::i32)
11790 N2 = DAG.getIntPtrConstant(IdxVal, dl);
11791 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
11792 }
11793
11794 if (EltVT == MVT::f32) {
11795 // Bits [7:6] of the constant are the source select. This will always be
11796 // zero here. The DAG Combiner may combine an extract_elt index into
11797 // these bits. For example (insert (extract, 3), 2) could be matched by
11798 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
11799 // Bits [5:4] of the constant are the destination select. This is the
11800 // value of the incoming immediate.
11801 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
11802 // combine either bitwise AND or insert of float 0.0 to set these bits.
11803
11804 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
11805 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
11806 // If this is an insertion of 32 bits into the low 32 bits of
11807 // a vector, we prefer to generate a blend with immediate rather
11808 // than an insertps. Blends are simpler operations in hardware and so
11809 // will always have equal or better performance than insertps.
11810 // But if optimizing for size and there's a load folding opportunity,
11811 // generate insertps because blendps does not have a 32-bit memory
11812 // operand form.
11813 N2 = DAG.getIntPtrConstant(1, dl);
11814 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
11815 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
11816 }
11817 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
11818 // Create this as a scalar-to-vector.
11819 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
11820 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
11821 }
11822
11823 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
11824 // PINSR* works with a constant index.
11825 return Op;
11826 }
11827 }
11828
11829 if (EltVT == MVT::i8)
11830 return SDValue();
11831
11832 if (EltVT.getSizeInBits() == 16) {
11833 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
11834 // as its second argument.
11835 if (N1.getValueType() != MVT::i32)
11836 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
11837 if (N2.getValueType() != MVT::i32)
11838 N2 = DAG.getIntPtrConstant(IdxVal, dl);
11839 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
11840 }
11841 return SDValue();
11842 }
11843
11844 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
11845 SDLoc dl(Op);
11846 MVT OpVT = Op.getSimpleValueType();
11847
11848 // If this is a 256-bit vector result, first insert into a 128-bit
11849 // vector and then insert into the 256-bit vector.
11850 if (!OpVT.is128BitVector()) {
11851 // Insert into a 128-bit vector.
11852 unsigned SizeFactor = OpVT.getSizeInBits()/128;
11853 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
11854 OpVT.getVectorNumElements() / SizeFactor);
11855
11856 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
11857
11858 // Insert the 128-bit vector.
11859 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 11860 } 11861 11862 if (OpVT == MVT::v1i64 && 11863 Op.getOperand(0).getValueType() == MVT::i64) 11864 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 11865 11866 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 11867 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 11868 return DAG.getBitcast( 11869 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); 11870 } 11871 11872 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 11873 // a simple subregister reference or explicit instructions to grab 11874 // upper bits of a vector. 11875 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 11876 SelectionDAG &DAG) { 11877 SDLoc dl(Op); 11878 SDValue In = Op.getOperand(0); 11879 SDValue Idx = Op.getOperand(1); 11880 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11881 MVT ResVT = Op.getSimpleValueType(); 11882 MVT InVT = In.getSimpleValueType(); 11883 11884 if (Subtarget->hasFp256()) { 11885 if (ResVT.is128BitVector() && 11886 (InVT.is256BitVector() || InVT.is512BitVector()) && 11887 isa<ConstantSDNode>(Idx)) { 11888 return Extract128BitVector(In, IdxVal, DAG, dl); 11889 } 11890 if (ResVT.is256BitVector() && InVT.is512BitVector() && 11891 isa<ConstantSDNode>(Idx)) { 11892 return Extract256BitVector(In, IdxVal, DAG, dl); 11893 } 11894 } 11895 return SDValue(); 11896 } 11897 11898 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 11899 // simple superregister reference or explicit instructions to insert 11900 // the upper bits of a vector. 11901 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 11902 SelectionDAG &DAG) { 11903 if (!Subtarget->hasAVX()) 11904 return SDValue(); 11905 11906 SDLoc dl(Op); 11907 SDValue Vec = Op.getOperand(0); 11908 SDValue SubVec = Op.getOperand(1); 11909 SDValue Idx = Op.getOperand(2); 11910 11911 if (!isa<ConstantSDNode>(Idx)) 11912 return SDValue(); 11913 11914 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11915 MVT OpVT = Op.getSimpleValueType(); 11916 MVT SubVecVT = SubVec.getSimpleValueType(); 11917 11918 // Fold two 16-byte subvector loads into one 32-byte load: 11919 // (insert_subvector (insert_subvector undef, (load addr), 0), 11920 // (load addr + 16), Elts/2) 11921 // --> load32 addr 11922 if ((IdxVal == OpVT.getVectorNumElements() / 2) && 11923 Vec.getOpcode() == ISD::INSERT_SUBVECTOR && 11924 OpVT.is256BitVector() && SubVecVT.is128BitVector()) { 11925 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); 11926 if (Idx2 && Idx2->getZExtValue() == 0) { 11927 SDValue SubVec2 = Vec.getOperand(1); 11928 // If needed, look through a bitcast to get to the load. 
11929 if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
11930 SubVec2 = SubVec2.getOperand(0);
11931
11932 if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
11933 bool Fast;
11934 unsigned Alignment = FirstLd->getAlignment();
11935 unsigned AS = FirstLd->getAddressSpace();
11936 const X86TargetLowering *TLI = Subtarget->getTargetLowering();
11937 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
11938 OpVT, AS, Alignment, &Fast) && Fast) {
11939 SDValue Ops[] = { SubVec2, SubVec };
11940 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
11941 return Ld;
11942 }
11943 }
11944 }
11945 }
11946
11947 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
11948 SubVecVT.is128BitVector())
11949 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
11950
11951 if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
11952 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
11953
11954 if (OpVT.getVectorElementType() == MVT::i1)
11955 return Insert1BitVector(Op, DAG);
11956
11957 return SDValue();
11958 }
11959
11960 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
11961 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
11962 // one of the above-mentioned nodes. It has to be wrapped because otherwise
11963 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
11964 // be used to form an addressing mode. These wrapped nodes will be selected
11965 // into MOV32ri.
11966 SDValue
11967 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
11968 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11969
11970 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
11971 // global base reg.
11972 unsigned char OpFlag = 0;
11973 unsigned WrapperKind = X86ISD::Wrapper;
11974 CodeModel::Model M = DAG.getTarget().getCodeModel();
11975
11976 if (Subtarget->isPICStyleRIPRel() &&
11977 (M == CodeModel::Small || M == CodeModel::Kernel))
11978 WrapperKind = X86ISD::WrapperRIP;
11979 else if (Subtarget->isPICStyleGOT())
11980 OpFlag = X86II::MO_GOTOFF;
11981 else if (Subtarget->isPICStyleStubPIC())
11982 OpFlag = X86II::MO_PIC_BASE_OFFSET;
11983
11984 auto PtrVT = getPointerTy(DAG.getDataLayout());
11985 SDValue Result = DAG.getTargetConstantPool(
11986 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
11987 SDLoc DL(CP);
11988 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
11989 // With PIC, the address is actually $g + Offset.
11990 if (OpFlag) {
11991 Result =
11992 DAG.getNode(ISD::ADD, DL, PtrVT,
11993 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
11994 }
11995
11996 return Result;
11997 }
11998
11999 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
12000 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12001
12002 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12003 // global base reg.
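// That is, for 32-bit GOT-style PIC the jump table address is materialized
// as (add GlobalBaseReg, jumptable@GOTOFF) rather than as an absolute
// address.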
12004 unsigned char OpFlag = 0;
12005 unsigned WrapperKind = X86ISD::Wrapper;
12006 CodeModel::Model M = DAG.getTarget().getCodeModel();
12007
12008 if (Subtarget->isPICStyleRIPRel() &&
12009 (M == CodeModel::Small || M == CodeModel::Kernel))
12010 WrapperKind = X86ISD::WrapperRIP;
12011 else if (Subtarget->isPICStyleGOT())
12012 OpFlag = X86II::MO_GOTOFF;
12013 else if (Subtarget->isPICStyleStubPIC())
12014 OpFlag = X86II::MO_PIC_BASE_OFFSET;
12015
12016 auto PtrVT = getPointerTy(DAG.getDataLayout());
12017 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
12018 SDLoc DL(JT);
12019 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12020
12021 // With PIC, the address is actually $g + Offset.
12022 if (OpFlag)
12023 Result =
12024 DAG.getNode(ISD::ADD, DL, PtrVT,
12025 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12026
12027 return Result;
12028 }
12029
12030 SDValue
12031 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
12032 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
12033
12034 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12035 // global base reg.
12036 unsigned char OpFlag = 0;
12037 unsigned WrapperKind = X86ISD::Wrapper;
12038 CodeModel::Model M = DAG.getTarget().getCodeModel();
12039
12040 if (Subtarget->isPICStyleRIPRel() &&
12041 (M == CodeModel::Small || M == CodeModel::Kernel)) {
12042 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
12043 OpFlag = X86II::MO_GOTPCREL;
12044 WrapperKind = X86ISD::WrapperRIP;
12045 } else if (Subtarget->isPICStyleGOT()) {
12046 OpFlag = X86II::MO_GOT;
12047 } else if (Subtarget->isPICStyleStubPIC()) {
12048 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
12049 } else if (Subtarget->isPICStyleStubNoDynamic()) {
12050 OpFlag = X86II::MO_DARWIN_NONLAZY;
12051 }
12052
12053 auto PtrVT = getPointerTy(DAG.getDataLayout());
12054 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
12055
12056 SDLoc DL(Op);
12057 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12058
12059 // With PIC, the address is actually $g + Offset.
12060 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
12061 !Subtarget->is64Bit()) {
12062 Result =
12063 DAG.getNode(ISD::ADD, DL, PtrVT,
12064 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12065 }
12066
12067 // For symbols that require a load from a stub to get the address, emit the
12068 // load.
12069 if (isGlobalStubReference(OpFlag))
12070 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
12071 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12072 false, false, false, 0);
12073
12074 return Result;
12075 }
12076
12077 SDValue
12078 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
12079 // Create the TargetBlockAddress node.
12080 unsigned char OpFlags = 12081 Subtarget->ClassifyBlockAddressReference(); 12082 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12083 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 12084 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); 12085 SDLoc dl(Op); 12086 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12087 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); 12088 12089 if (Subtarget->isPICStyleRIPRel() && 12090 (M == CodeModel::Small || M == CodeModel::Kernel)) 12091 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); 12092 else 12093 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); 12094 12095 // With PIC, the address is actually $g + Offset. 12096 if (isGlobalRelativeToPICBase(OpFlags)) { 12097 Result = DAG.getNode(ISD::ADD, dl, PtrVT, 12098 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); 12099 } 12100 12101 return Result; 12102 } 12103 12104 SDValue 12105 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, 12106 int64_t Offset, SelectionDAG &DAG) const { 12107 // Create the TargetGlobalAddress node, folding in the constant 12108 // offset if it is legal. 12109 unsigned char OpFlags = 12110 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); 12111 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12112 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12113 SDValue Result; 12114 if (OpFlags == X86II::MO_NO_FLAG && 12115 X86::isOffsetSuitableForCodeModel(Offset, M)) { 12116 // A direct static reference to a global. 12117 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); 12118 Offset = 0; 12119 } else { 12120 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); 12121 } 12122 12123 if (Subtarget->isPICStyleRIPRel() && 12124 (M == CodeModel::Small || M == CodeModel::Kernel)) 12125 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); 12126 else 12127 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); 12128 12129 // With PIC, the address is actually $g + Offset. 12130 if (isGlobalRelativeToPICBase(OpFlags)) { 12131 Result = DAG.getNode(ISD::ADD, dl, PtrVT, 12132 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); 12133 } 12134 12135 // For globals that require a load from a stub to get the address, emit the 12136 // load. 12137 if (isGlobalStubReference(OpFlags)) 12138 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 12139 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 12140 false, false, false, 0); 12141 12142 // If there was a non-zero offset that we didn't fold, create an explicit 12143 // addition for it. 
12144 if (Offset != 0) 12145 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, 12146 DAG.getConstant(Offset, dl, PtrVT)); 12147 12148 return Result; 12149 } 12150 12151 SDValue 12152 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 12153 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 12154 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 12155 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); 12156 } 12157 12158 static SDValue 12159 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 12160 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 12161 unsigned char OperandFlags, bool LocalDynamic = false) { 12162 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 12163 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 12164 SDLoc dl(GA); 12165 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 12166 GA->getValueType(0), 12167 GA->getOffset(), 12168 OperandFlags); 12169 12170 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 12171 : X86ISD::TLSADDR; 12172 12173 if (InFlag) { 12174 SDValue Ops[] = { Chain, TGA, *InFlag }; 12175 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 12176 } else { 12177 SDValue Ops[] = { Chain, TGA }; 12178 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 12179 } 12180 12181 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 12182 MFI->setAdjustsStack(true); 12183 MFI->setHasCalls(true); 12184 12185 SDValue Flag = Chain.getValue(1); 12186 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 12187 } 12188 12189 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 12190 static SDValue 12191 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 12192 const EVT PtrVT) { 12193 SDValue InFlag; 12194 SDLoc dl(GA); // ? function entry point might be better 12195 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 12196 DAG.getNode(X86ISD::GlobalBaseReg, 12197 SDLoc(), PtrVT), InFlag); 12198 InFlag = Chain.getValue(1); 12199 12200 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 12201 } 12202 12203 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 12204 static SDValue 12205 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 12206 const EVT PtrVT) { 12207 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, 12208 X86::RAX, X86II::MO_TLSGD); 12209 } 12210 12211 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 12212 SelectionDAG &DAG, 12213 const EVT PtrVT, 12214 bool is64Bit) { 12215 SDLoc dl(GA); 12216 12217 // Get the start address of the TLS block for this module. 12218 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 12219 .getInfo<X86MachineFunctionInfo>(); 12220 MFI->incNumLocalDynamicTLSAccesses(); 12221 12222 SDValue Base; 12223 if (is64Bit) { 12224 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, 12225 X86II::MO_TLSLD, /*LocalDynamic=*/true); 12226 } else { 12227 SDValue InFlag; 12228 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 12229 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); 12230 InFlag = Chain.getValue(1); 12231 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 12232 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 12233 } 12234 12235 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 12236 // of Base. 12237 12238 // Build x@dtpoff. 
12239 unsigned char OperandFlags = X86II::MO_DTPOFF; 12240 unsigned WrapperKind = X86ISD::Wrapper; 12241 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 12242 GA->getValueType(0), 12243 GA->getOffset(), OperandFlags); 12244 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 12245 12246 // Add x@dtpoff with the base. 12247 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 12248 } 12249 12250 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 12251 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 12252 const EVT PtrVT, TLSModel::Model model, 12253 bool is64Bit, bool isPIC) { 12254 SDLoc dl(GA); 12255 12256 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 12257 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 12258 is64Bit ? 257 : 256)); 12259 12260 SDValue ThreadPointer = 12261 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), 12262 MachinePointerInfo(Ptr), false, false, false, 0); 12263 12264 unsigned char OperandFlags = 0; 12265 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 12266 // initialexec. 12267 unsigned WrapperKind = X86ISD::Wrapper; 12268 if (model == TLSModel::LocalExec) { 12269 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 12270 } else if (model == TLSModel::InitialExec) { 12271 if (is64Bit) { 12272 OperandFlags = X86II::MO_GOTTPOFF; 12273 WrapperKind = X86ISD::WrapperRIP; 12274 } else { 12275 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 12276 } 12277 } else { 12278 llvm_unreachable("Unexpected model"); 12279 } 12280 12281 // emit "addl x@ntpoff,%eax" (local exec) 12282 // or "addl x@indntpoff,%eax" (initial exec) 12283 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 12284 SDValue TGA = 12285 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), 12286 GA->getOffset(), OperandFlags); 12287 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 12288 12289 if (model == TLSModel::InitialExec) { 12290 if (isPIC && !is64Bit) { 12291 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 12292 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), 12293 Offset); 12294 } 12295 12296 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 12297 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 12298 false, false, false, 0); 12299 } 12300 12301 // The address of the thread local variable is the add of the thread 12302 // pointer with the offset of the variable. 12303 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 12304 } 12305 12306 SDValue 12307 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 12308 12309 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 12310 12311 // Cygwin uses emutls. 12312 // FIXME: It may be EmulatedTLS-generic also for X86-Android. 
12313 if (Subtarget->isTargetWindowsCygwin())
12314 return LowerToTLSEmulatedModel(GA, DAG);
12315
12316 const GlobalValue *GV = GA->getGlobal();
12317 auto PtrVT = getPointerTy(DAG.getDataLayout());
12318
12319 if (Subtarget->isTargetELF()) {
12320 if (DAG.getTarget().Options.EmulatedTLS)
12321 return LowerToTLSEmulatedModel(GA, DAG);
12322 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
12323 switch (model) {
12324 case TLSModel::GeneralDynamic:
12325 if (Subtarget->is64Bit())
12326 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
12327 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
12328 case TLSModel::LocalDynamic:
12329 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
12330 Subtarget->is64Bit());
12331 case TLSModel::InitialExec:
12332 case TLSModel::LocalExec:
12333 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
12334 DAG.getTarget().getRelocationModel() ==
12335 Reloc::PIC_);
12336 }
12337 llvm_unreachable("Unknown TLS model.");
12338 }
12339
12340 if (Subtarget->isTargetDarwin()) {
12341 // Darwin only has one model of TLS. Lower to that.
12342 unsigned char OpFlag = 0;
12343 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
12344 X86ISD::WrapperRIP : X86ISD::Wrapper;
12345
12346 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12347 // global base reg.
12348 bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
12349 !Subtarget->is64Bit();
12350 if (PIC32)
12351 OpFlag = X86II::MO_TLVP_PIC_BASE;
12352 else
12353 OpFlag = X86II::MO_TLVP;
12354 SDLoc DL(Op);
12355 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
12356 GA->getValueType(0),
12357 GA->getOffset(), OpFlag);
12358 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12359
12360 // With PIC32, the address is actually $g + Offset.
12361 if (PIC32)
12362 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
12363 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
12364 Offset);
12365
12366 // Lowering the machine ISD node will make sure everything is in the right
12367 // location.
12368 SDValue Chain = DAG.getEntryNode();
12369 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
12370 SDValue Args[] = { Chain, Offset };
12371 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
12372
12373 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
12374 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
12375 MFI->setAdjustsStack(true);
12376
12377 // And our return value (tls address) is in the standard call return value
12378 // location.
12379 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
12380 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
12381 }
12382
12383 if (Subtarget->isTargetKnownWindowsMSVC() ||
12384 Subtarget->isTargetWindowsGNU()) {
12385 // Just use the implicit TLS architecture.
12386 // Need to generate something similar to:
12387 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
12388 // ; from TEB
12389 // mov ecx, dword [rel _tls_index]; Load index (from C runtime)
12390 // mov rcx, qword [rdx+rcx*8]
12391 // mov eax, .tls$:tlsvar
12392 // [rax+rcx] contains the address
12393 // Windows 64bit: gs:0x58
12394 // Windows 32bit: fs:__tls_array
12395
12396 SDLoc dl(GA);
12397 SDValue Chain = DAG.getEntryNode();
12398
12399 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
12400 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
12401 // use its literal value of 0x2C.
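// The segment override is modeled through X86's reserved address spaces:
// 256 is %gs and 257 is %fs. The null 'pointer' below only supplies that
// address space to the MachinePointerInfo; the offset actually loaded from
// is the TlsArray operand.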
12402 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
12403 ? Type::getInt8PtrTy(*DAG.getContext(),
12404 256)
12405 : Type::getInt32PtrTy(*DAG.getContext(),
12406 257));
12407
12408 SDValue TlsArray = Subtarget->is64Bit()
12409 ? DAG.getIntPtrConstant(0x58, dl)
12410 : (Subtarget->isTargetWindowsGNU()
12411 ? DAG.getIntPtrConstant(0x2C, dl)
12412 : DAG.getExternalSymbol("_tls_array", PtrVT));
12413
12414 SDValue ThreadPointer =
12415 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
12416 false, false, 0);
12417
12418 SDValue res;
12419 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
12420 res = ThreadPointer;
12421 } else {
12422 // Load the _tls_index variable.
12423 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
12424 if (Subtarget->is64Bit())
12425 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
12426 MachinePointerInfo(), MVT::i32, false, false,
12427 false, 0);
12428 else
12429 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
12430 false, false, 0);
12431
12432 auto &DL = DAG.getDataLayout();
12433 SDValue Scale =
12434 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
12435 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
12436
12437 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
12438 }
12439
12440 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
12441 false, 0);
12442
12443 // Get the offset of the start of the .tls section.
12444 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
12445 GA->getValueType(0),
12446 GA->getOffset(), X86II::MO_SECREL);
12447 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
12448
12449 // The address of the thread local variable is the thread pointer plus
12450 // the offset of the variable.
12451 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
12452 }
12453
12454 llvm_unreachable("TLS not implemented for this target.");
12455 }
12456
12457 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
12458 /// and take a 2 x i32 value to shift plus a shift amount.
12459 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
12460 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
12461 MVT VT = Op.getSimpleValueType();
12462 unsigned VTBits = VT.getSizeInBits();
12463 SDLoc dl(Op);
12464 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
12465 SDValue ShOpLo = Op.getOperand(0);
12466 SDValue ShOpHi = Op.getOperand(1);
12467 SDValue ShAmt = Op.getOperand(2);
12468 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
12469 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
12470 // during isel.
12471 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
12472 DAG.getConstant(VTBits - 1, dl, MVT::i8));
12473 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
12474 DAG.getConstant(VTBits - 1, dl, MVT::i8))
12475 : DAG.getConstant(0, dl, VT);
12476
12477 SDValue Tmp2, Tmp3;
12478 if (Op.getOpcode() == ISD::SHL_PARTS) {
12479 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
12480 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
12481 } else {
12482 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
12483 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
12484 }
12485
12486 // If the shift amount is larger than or equal to the width of a part, we can't
12487 // rely on the results of shld/shrd.
Insert a test and select the appropriate 12488 // values for large shift amounts. 12489 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 12490 DAG.getConstant(VTBits, dl, MVT::i8)); 12491 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 12492 AndNode, DAG.getConstant(0, dl, MVT::i8)); 12493 12494 SDValue Hi, Lo; 12495 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); 12496 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 12497 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 12498 12499 if (Op.getOpcode() == ISD::SHL_PARTS) { 12500 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 12501 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 12502 } else { 12503 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 12504 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 12505 } 12506 12507 SDValue Ops[2] = { Lo, Hi }; 12508 return DAG.getMergeValues(Ops, dl); 12509 } 12510 12511 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 12512 SelectionDAG &DAG) const { 12513 SDValue Src = Op.getOperand(0); 12514 MVT SrcVT = Src.getSimpleValueType(); 12515 MVT VT = Op.getSimpleValueType(); 12516 SDLoc dl(Op); 12517 12518 if (SrcVT.isVector()) { 12519 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { 12520 return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, 12521 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, 12522 DAG.getUNDEF(SrcVT))); 12523 } 12524 if (SrcVT.getVectorElementType() == MVT::i1) { 12525 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); 12526 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 12527 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); 12528 } 12529 return SDValue(); 12530 } 12531 12532 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && 12533 "Unknown SINT_TO_FP to lower!"); 12534 12535 // These are really Legal; return the operand so the caller accepts it as 12536 // Legal. 
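// (They are Legal because CVTSI2SS/CVTSI2SD accept an i32 source directly,
// and their REX.W forms cover i64 on 64-bit targets.)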
12537 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 12538 return Op; 12539 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 12540 Subtarget->is64Bit()) { 12541 return Op; 12542 } 12543 12544 unsigned Size = SrcVT.getSizeInBits()/8; 12545 MachineFunction &MF = DAG.getMachineFunction(); 12546 auto PtrVT = getPointerTy(MF.getDataLayout()); 12547 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 12548 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 12549 SDValue Chain = DAG.getStore( 12550 DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, 12551 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, 12552 false, 0); 12553 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 12554 } 12555 12556 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 12557 SDValue StackSlot, 12558 SelectionDAG &DAG) const { 12559 // Build the FILD 12560 SDLoc DL(Op); 12561 SDVTList Tys; 12562 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 12563 if (useSSE) 12564 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 12565 else 12566 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 12567 12568 unsigned ByteSize = SrcVT.getSizeInBits()/8; 12569 12570 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 12571 MachineMemOperand *MMO; 12572 if (FI) { 12573 int SSFI = FI->getIndex(); 12574 MMO = DAG.getMachineFunction().getMachineMemOperand( 12575 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), 12576 MachineMemOperand::MOLoad, ByteSize, ByteSize); 12577 } else { 12578 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 12579 StackSlot = StackSlot.getOperand(1); 12580 } 12581 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 12582 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 12583 X86ISD::FILD, DL, 12584 Tys, Ops, SrcVT, MMO); 12585 12586 if (useSSE) { 12587 Chain = Result.getValue(1); 12588 SDValue InFlag = Result.getValue(2); 12589 12590 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 12591 // shouldn't be necessary except that RFP cannot be live across 12592 // multiple blocks. When stackifier is fixed, they can be uncoupled. 12593 MachineFunction &MF = DAG.getMachineFunction(); 12594 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 12595 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 12596 auto PtrVT = getPointerTy(MF.getDataLayout()); 12597 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 12598 Tys = DAG.getVTList(MVT::Other); 12599 SDValue Ops[] = { 12600 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 12601 }; 12602 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( 12603 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), 12604 MachineMemOperand::MOStore, SSFISize, SSFISize); 12605 12606 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 12607 Ops, Op.getValueType(), MMO); 12608 Result = DAG.getLoad( 12609 Op.getValueType(), DL, Chain, StackSlot, 12610 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), 12611 false, false, false, 0); 12612 } 12613 12614 return Result; 12615 } 12616 12617 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 12618 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 12619 SelectionDAG &DAG) const { 12620 // This algorithm is not obvious. 
Here is what we're trying to output:
12621 /*
12622 movq %rax, %xmm0
12623 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
12624 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
12625 #ifdef __SSE3__
12626 haddpd %xmm0, %xmm0
12627 #else
12628 pshufd $0x4e, %xmm0, %xmm1
12629 addpd %xmm1, %xmm0
12630 #endif
12631 */
12632 
12633 SDLoc dl(Op);
12634 LLVMContext *Context = DAG.getContext();
12635 
12636 // Build some magic constants.
12637 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
12638 Constant *C0 = ConstantDataVector::get(*Context, CV0);
12639 auto PtrVT = getPointerTy(DAG.getDataLayout());
12640 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
12641 
12642 SmallVector<Constant*,2> CV1;
12643 CV1.push_back(
12644 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
12645 APInt(64, 0x4330000000000000ULL))));
12646 CV1.push_back(
12647 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
12648 APInt(64, 0x4530000000000000ULL))));
12649 Constant *C1 = ConstantVector::get(CV1);
12650 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
12651 
12652 // Load the 64-bit value into an XMM register.
12653 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
12654 Op.getOperand(0));
12655 SDValue CLod0 =
12656 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
12657 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
12658 false, false, false, 16);
12659 SDValue Unpck1 =
12660 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
12661 
12662 SDValue CLod1 =
12663 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
12664 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
12665 false, false, false, 16);
12666 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
12667 // TODO: Are there any fast-math-flags to propagate here?
12668 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
12669 SDValue Result;
12670 
12671 if (Subtarget->hasSSE3()) {
12672 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
12673 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
12674 } else {
12675 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
12676 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
12677 S2F, 0x4E, DAG);
12678 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
12679 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
12680 }
12681 
12682 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
12683 DAG.getIntPtrConstant(0, dl));
12684 }
12685 
12686 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
12687 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
12688 SelectionDAG &DAG) const {
12689 SDLoc dl(Op);
12690 // FP constant to bias-correct the final result.
12691 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
12692 MVT::f64);
12693 
12694 // Load the 32-bit value into an XMM register.
12695 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
12696 Op.getOperand(0));
12697 
12698 // Zero out the upper parts of the register.
12699 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
12700 
12701 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
12702 DAG.getBitcast(MVT::v2f64, Load),
12703 DAG.getIntPtrConstant(0, dl));
12704 
12705 // Or the load with the bias.
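// In scalar terms, the bias trick below computes roughly the following
// (a sketch, not the emitted code; 'x' is the unsigned 32-bit input):
//   uint64_t bits = 0x4330000000000000ULL | x; // 0x1.0p52 with x in the
//                                              // low half of the mantissa
//   double d = bitcast<double>(bits);          // == 0x1.0p52 + (double)x
//   double r = d - 0x1.0p52;                   // == (double)x, exactly
// The OR assembles 'bits' without any integer-to-FP conversion instruction.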
12706 SDValue Or = DAG.getNode(
12707 ISD::OR, dl, MVT::v2i64,
12708 DAG.getBitcast(MVT::v2i64,
12709 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
12710 DAG.getBitcast(MVT::v2i64,
12711 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
12712 Or =
12713 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
12714 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
12715 
12716 // Subtract the bias.
12717 // TODO: Are there any fast-math-flags to propagate here?
12718 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
12719 
12720 // Handle final rounding.
12721 MVT DestVT = Op.getSimpleValueType();
12722 
12723 if (DestVT.bitsLT(MVT::f64))
12724 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
12725 DAG.getIntPtrConstant(0, dl));
12726 if (DestVT.bitsGT(MVT::f64))
12727 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
12728 
12729 // The destination is already f64; no rounding is needed.
12730 return Sub;
12731 }
12732 
12733 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
12734 const X86Subtarget &Subtarget) {
12735 // The algorithm is the following:
12736 // #ifdef __SSE4_1__
12737 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
12738 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
12739 // (uint4) 0x53000000, 0xaa);
12740 // #else
12741 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
12742 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
12743 // #endif
12744 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
12745 // return (float4) lo + fhi;
12746 
12747 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
12748 // reassociate the two FADDs, and if we do that, the algorithm fails
12749 // spectacularly (PR24512).
12750 // FIXME: If we ever have some kind of Machine FMF, this should be marked
12751 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
12752 // there are also the MachineCombiner reassociations happening on Machine IR.
12753 if (DAG.getTarget().Options.UnsafeFPMath)
12754 return SDValue();
12755 
12756 SDLoc DL(Op);
12757 SDValue V = Op->getOperand(0);
12758 MVT VecIntVT = V.getSimpleValueType();
12759 bool Is128 = VecIntVT == MVT::v4i32;
12760 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
12761 // If we convert to something other than the supported type, e.g., to v4f64,
12762 // abort early.
12763 if (VecFloatVT != Op->getSimpleValueType(0))
12764 return SDValue();
12765 
12766 unsigned NumElts = VecIntVT.getVectorNumElements();
12767 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
12768 "Unsupported custom type");
12769 assert(NumElts <= 8 && "The size of the constant array must be fixed");
12770 
12771 // In the #ifdef/#else code, we have in common:
12772 // - The vector of constants:
12773 // -- 0x4b000000
12774 // -- 0x53000000
12775 // - A shift:
12776 // -- v >> 16
12777 
12778 // Create the splat vector for 0x4b000000.
12779 SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
12780 SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
12781 CstLow, CstLow, CstLow, CstLow};
12782 SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
12783 makeArrayRef(&CstLowArray[0], NumElts));
12784 // Create the splat vector for 0x53000000.
12785 SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32); 12786 SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, 12787 CstHigh, CstHigh, CstHigh, CstHigh}; 12788 SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, 12789 makeArrayRef(&CstHighArray[0], NumElts)); 12790 12791 // Create the right shift. 12792 SDValue CstShift = DAG.getConstant(16, DL, MVT::i32); 12793 SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, 12794 CstShift, CstShift, CstShift, CstShift}; 12795 SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, 12796 makeArrayRef(&CstShiftArray[0], NumElts)); 12797 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); 12798 12799 SDValue Low, High; 12800 if (Subtarget.hasSSE41()) { 12801 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; 12802 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); 12803 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); 12804 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); 12805 // Low will be bitcasted right away, so do not bother bitcasting back to its 12806 // original type. 12807 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, 12808 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); 12809 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), 12810 // (uint4) 0x53000000, 0xaa); 12811 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); 12812 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); 12813 // High will be bitcasted right away, so do not bother bitcasting back to 12814 // its original type. 12815 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, 12816 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); 12817 } else { 12818 SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32); 12819 SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, 12820 CstMask, CstMask, CstMask); 12821 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; 12822 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); 12823 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); 12824 12825 // uint4 hi = (v >> 16) | (uint4) 0x53000000; 12826 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); 12827 } 12828 12829 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). 12830 SDValue CstFAdd = DAG.getConstantFP( 12831 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32); 12832 SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, 12833 CstFAdd, CstFAdd, CstFAdd, CstFAdd}; 12834 SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, 12835 makeArrayRef(&CstFAddArray[0], NumElts)); 12836 12837 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); 12838 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); 12839 // TODO: Are there any fast-math-flags to propagate here? 
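// Why the algorithm works: reinterpreted as floats, lo == 0x1.0p23 +
// (v & 0xffff) and hi == 0x1.0p39 + (v >> 16) * 0x1.0p16, both exact.
// The FADD below cancels the 0x1.0p39 and 0x1.0p23 biases, so
// lo + fhi == (v & 0xffff) + (v >> 16) * 65536, i.e. v itself.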
12840 SDValue FHigh = 12841 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); 12842 // return (float4) lo + fhi; 12843 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); 12844 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); 12845 } 12846 12847 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, 12848 SelectionDAG &DAG) const { 12849 SDValue N0 = Op.getOperand(0); 12850 MVT SVT = N0.getSimpleValueType(); 12851 SDLoc dl(Op); 12852 12853 switch (SVT.SimpleTy) { 12854 default: 12855 llvm_unreachable("Custom UINT_TO_FP is not supported!"); 12856 case MVT::v4i8: 12857 case MVT::v4i16: 12858 case MVT::v8i8: 12859 case MVT::v8i16: { 12860 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); 12861 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 12862 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); 12863 } 12864 case MVT::v4i32: 12865 case MVT::v8i32: 12866 return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); 12867 case MVT::v16i8: 12868 case MVT::v16i16: 12869 assert(Subtarget->hasAVX512()); 12870 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), 12871 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); 12872 } 12873 } 12874 12875 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 12876 SelectionDAG &DAG) const { 12877 SDValue N0 = Op.getOperand(0); 12878 SDLoc dl(Op); 12879 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12880 12881 if (Op.getSimpleValueType().isVector()) 12882 return lowerUINT_TO_FP_vec(Op, DAG); 12883 12884 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 12885 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 12886 // the optimization here. 12887 if (DAG.SignBitIsZero(N0)) 12888 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 12889 12890 MVT SrcVT = N0.getSimpleValueType(); 12891 MVT DstVT = Op.getSimpleValueType(); 12892 12893 if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && 12894 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { 12895 // Conversions from unsigned i32 to f32/f64 are legal, 12896 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. 12897 return Op; 12898 } 12899 12900 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 12901 return LowerUINT_TO_FP_i64(Op, DAG); 12902 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 12903 return LowerUINT_TO_FP_i32(Op, DAG); 12904 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 12905 return SDValue(); 12906 12907 // Make a 64-bit buffer, and use it to build an FILD. 12908 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 12909 if (SrcVT == MVT::i32) { 12910 SDValue WordOff = DAG.getConstant(4, dl, PtrVT); 12911 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff); 12912 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 12913 StackSlot, MachinePointerInfo(), 12914 false, false, 0); 12915 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), 12916 OffsetSlot, MachinePointerInfo(), 12917 false, false, 0); 12918 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 12919 return Fild; 12920 } 12921 12922 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 12923 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 12924 StackSlot, MachinePointerInfo(), 12925 false, false, 0); 12926 // For i64 source, we need to add the appropriate power of 2 if the input 12927 // was negative. 
This is the same as the optimization in
12928 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
12929 // we must be careful to do the computation in x87 extended precision, not
12930 // in SSE. (The generic code can't know it's OK to do this, or how to.)
12931 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
12932 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
12933 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
12934 MachineMemOperand::MOLoad, 8, 8);
12935 
12936 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
12937 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
12938 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
12939 MVT::i64, MMO);
12940 
12941 APInt FF(32, 0x5F800000ULL);
12942 
12943 // Check whether the sign bit is set.
12944 SDValue SignSet = DAG.getSetCC(
12945 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
12946 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
12947 
12948 // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
12949 SDValue FudgePtr = DAG.getConstantPool(
12950 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
12951 
12952 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
12953 SDValue Zero = DAG.getIntPtrConstant(0, dl);
12954 SDValue Four = DAG.getIntPtrConstant(4, dl);
12955 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
12956 Zero, Four);
12957 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
12958 
12959 // Load the value out, extending it from f32 to f80.
12960 // FIXME: Avoid the extend by constructing the right constant pool?
12961 SDValue Fudge = DAG.getExtLoad(
12962 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
12963 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
12964 false, false, false, 4);
12965 // Extend everything to 80 bits to force it to be done on x87.
12966 // TODO: Are there any fast-math-flags to propagate here?
12967 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
12968 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
12969 DAG.getIntPtrConstant(0, dl));
12970 }
12971 
12972 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
12973 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
12974 // just return an <SDValue(), SDValue()> pair.
12975 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
12976 // to i16, i32 or i64, and we lower it to a legal sequence.
12977 // If lowered to the final integer result we return a <result, SDValue()> pair.
12978 // Otherwise we lower it to a sequence ending with a FIST and return a
12979 // <FIST, StackSlot> pair; the caller is responsible for loading
12980 // the final integer result from StackSlot.
12981 std::pair<SDValue,SDValue>
12982 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
12983 bool IsSigned, bool IsReplace) const {
12984 SDLoc DL(Op);
12985 
12986 EVT DstTy = Op.getValueType();
12987 EVT TheVT = Op.getOperand(0).getValueType();
12988 auto PtrVT = getPointerTy(DAG.getDataLayout());
12989 
12990 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
12991 // f16 must be promoted before using the lowering in this routine.
12992 // fp128 does not use this lowering.
12993 return std::make_pair(SDValue(), SDValue()); 12994 } 12995 12996 // If using FIST to compute an unsigned i64, we'll need some fixup 12997 // to handle values above the maximum signed i64. A FIST is always 12998 // used for the 32-bit subtarget, but also for f80 on a 64-bit target. 12999 bool UnsignedFixup = !IsSigned && 13000 DstTy == MVT::i64 && 13001 (!Subtarget->is64Bit() || 13002 !isScalarFPTypeInSSEReg(TheVT)); 13003 13004 if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { 13005 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. 13006 // The low 32 bits of the fist result will have the correct uint32 result. 13007 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 13008 DstTy = MVT::i64; 13009 } 13010 13011 assert(DstTy.getSimpleVT() <= MVT::i64 && 13012 DstTy.getSimpleVT() >= MVT::i16 && 13013 "Unknown FP_TO_INT to lower!"); 13014 13015 // These are really Legal. 13016 if (DstTy == MVT::i32 && 13017 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 13018 return std::make_pair(SDValue(), SDValue()); 13019 if (Subtarget->is64Bit() && 13020 DstTy == MVT::i64 && 13021 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 13022 return std::make_pair(SDValue(), SDValue()); 13023 13024 // We lower FP->int64 into FISTP64 followed by a load from a temporary 13025 // stack slot. 13026 MachineFunction &MF = DAG.getMachineFunction(); 13027 unsigned MemSize = DstTy.getSizeInBits()/8; 13028 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 13029 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 13030 13031 unsigned Opc; 13032 switch (DstTy.getSimpleVT().SimpleTy) { 13033 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 13034 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 13035 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 13036 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 13037 } 13038 13039 SDValue Chain = DAG.getEntryNode(); 13040 SDValue Value = Op.getOperand(0); 13041 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. 13042 13043 if (UnsignedFixup) { 13044 // 13045 // Conversion to unsigned i64 is implemented with a select, 13046 // depending on whether the source value fits in the range 13047 // of a signed i64. Let Thresh be the FP equivalent of 13048 // 0x8000000000000000ULL. 13049 // 13050 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; 13051 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); 13052 // Fist-to-mem64 FistSrc 13053 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent 13054 // to XOR'ing the high 32 bits with Adjust. 13055 // 13056 // Being a power of 2, Thresh is exactly representable in all FP formats. 13057 // For X87 we'd like to use the smallest FP type for this constant, but 13058 // for DAG type consistency we have to match the FP operand type. 13059 13060 APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); 13061 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; 13062 bool LosesInfo = false; 13063 if (TheVT == MVT::f64) 13064 // The rounding mode is irrelevant as the conversion should be exact. 
13065 Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, 13066 &LosesInfo); 13067 else if (TheVT == MVT::f80) 13068 Status = Thresh.convert(APFloat::x87DoubleExtended, 13069 APFloat::rmNearestTiesToEven, &LosesInfo); 13070 13071 assert(Status == APFloat::opOK && !LosesInfo && 13072 "FP conversion should have been exact"); 13073 13074 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); 13075 13076 SDValue Cmp = DAG.getSetCC(DL, 13077 getSetCCResultType(DAG.getDataLayout(), 13078 *DAG.getContext(), TheVT), 13079 Value, ThreshVal, ISD::SETLT); 13080 Adjust = DAG.getSelect(DL, MVT::i32, Cmp, 13081 DAG.getConstant(0, DL, MVT::i32), 13082 DAG.getConstant(0x80000000, DL, MVT::i32)); 13083 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); 13084 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), 13085 *DAG.getContext(), TheVT), 13086 Value, ThreshVal, ISD::SETLT); 13087 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); 13088 } 13089 13090 // FIXME This causes a redundant load/store if the SSE-class value is already 13091 // in memory, such as if it is on the callstack. 13092 if (isScalarFPTypeInSSEReg(TheVT)) { 13093 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 13094 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 13095 MachinePointerInfo::getFixedStack(MF, SSFI), false, 13096 false, 0); 13097 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 13098 SDValue Ops[] = { 13099 Chain, StackSlot, DAG.getValueType(TheVT) 13100 }; 13101 13102 MachineMemOperand *MMO = 13103 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), 13104 MachineMemOperand::MOLoad, MemSize, MemSize); 13105 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); 13106 Chain = Value.getValue(1); 13107 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 13108 StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 13109 } 13110 13111 MachineMemOperand *MMO = 13112 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), 13113 MachineMemOperand::MOStore, MemSize, MemSize); 13114 13115 if (UnsignedFixup) { 13116 13117 // Insert the FIST, load its result as two i32's, 13118 // and XOR the high i32 with Adjust. 13119 13120 SDValue FistOps[] = { Chain, Value, StackSlot }; 13121 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 13122 FistOps, DstTy, MMO); 13123 13124 SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, 13125 MachinePointerInfo(), 13126 false, false, false, 0); 13127 SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, 13128 DAG.getConstant(4, DL, PtrVT)); 13129 13130 SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, 13131 MachinePointerInfo(), 13132 false, false, false, 0); 13133 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); 13134 13135 if (Subtarget->is64Bit()) { 13136 // Join High32 and Low32 into a 64-bit result. 13137 // (High32 << 32) | Low32 13138 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); 13139 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); 13140 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, 13141 DAG.getConstant(32, DL, MVT::i8)); 13142 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); 13143 return std::make_pair(Result, SDValue()); 13144 } 13145 13146 SDValue ResultOps[] = { Low32, High32 }; 13147 13148 SDValue pair = IsReplace 13149 ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) 13150 : DAG.getMergeValues(ResultOps, DL); 13151 return std::make_pair(pair, SDValue()); 13152 } else { 13153 // Build the FP_TO_INT*_IN_MEM 13154 SDValue Ops[] = { Chain, Value, StackSlot }; 13155 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 13156 Ops, DstTy, MMO); 13157 return std::make_pair(FIST, StackSlot); 13158 } 13159 } 13160 13161 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, 13162 const X86Subtarget *Subtarget) { 13163 MVT VT = Op->getSimpleValueType(0); 13164 SDValue In = Op->getOperand(0); 13165 MVT InVT = In.getSimpleValueType(); 13166 SDLoc dl(Op); 13167 13168 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) 13169 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); 13170 13171 // Optimize vectors in AVX mode: 13172 // 13173 // v8i16 -> v8i32 13174 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 13175 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 13176 // Concat upper and lower parts. 13177 // 13178 // v4i32 -> v4i64 13179 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 13180 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 13181 // Concat upper and lower parts. 13182 // 13183 13184 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && 13185 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && 13186 ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) 13187 return SDValue(); 13188 13189 if (Subtarget->hasInt256()) 13190 return DAG.getNode(X86ISD::VZEXT, dl, VT, In); 13191 13192 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); 13193 SDValue Undef = DAG.getUNDEF(InVT); 13194 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; 13195 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 13196 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 13197 13198 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), 13199 VT.getVectorNumElements()/2); 13200 13201 OpLo = DAG.getBitcast(HVT, OpLo); 13202 OpHi = DAG.getBitcast(HVT, OpHi); 13203 13204 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 13205 } 13206 13207 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, 13208 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 13209 MVT VT = Op->getSimpleValueType(0); 13210 SDValue In = Op->getOperand(0); 13211 MVT InVT = In.getSimpleValueType(); 13212 SDLoc DL(Op); 13213 unsigned int NumElts = VT.getVectorNumElements(); 13214 if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) 13215 return SDValue(); 13216 13217 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 13218 return DAG.getNode(X86ISD::VZEXT, DL, VT, In); 13219 13220 assert(InVT.getVectorElementType() == MVT::i1); 13221 MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32;
13222 SDValue One =
13223 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
13224 SDValue Zero =
13225 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
13226 
13227 SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
13228 if (VT.is512BitVector())
13229 return V;
13230 return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
13231 }
13232 
13233 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
13234 SelectionDAG &DAG) {
13235 if (Subtarget->hasFp256())
13236 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
13237 return Res;
13238 
13239 return SDValue();
13240 }
13241 
13242 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
13243 SelectionDAG &DAG) {
13244 SDLoc DL(Op);
13245 MVT VT = Op.getSimpleValueType();
13246 SDValue In = Op.getOperand(0);
13247 MVT SVT = In.getSimpleValueType();
13248 
13249 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
13250 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
13251 
13252 if (Subtarget->hasFp256())
13253 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
13254 return Res;
13255 
13256 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
13257 VT.getVectorNumElements() != SVT.getVectorNumElements());
13258 return SDValue();
13259 }
13260 
13261 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
13262 SDLoc DL(Op);
13263 MVT VT = Op.getSimpleValueType();
13264 SDValue In = Op.getOperand(0);
13265 MVT InVT = In.getSimpleValueType();
13266 
13267 if (VT == MVT::i1) {
13268 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
13269 "Invalid scalar TRUNCATE operation");
13270 if (InVT.getSizeInBits() >= 32)
13271 return SDValue();
13272 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
13273 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
13274 }
13275 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
13276 "Invalid TRUNCATE operation");
13277 
13278 // Move vector to mask - truncate solution for SKX.
13279 if (VT.getVectorElementType() == MVT::i1) {
13280 if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
13281 Subtarget->hasBWI())
13282 return Op; // legal, will go to VPMOVB2M, VPMOVW2M
13283 if ((InVT.is256BitVector() || InVT.is128BitVector())
13284 && InVT.getScalarSizeInBits() <= 16 &&
13285 Subtarget->hasBWI() && Subtarget->hasVLX())
13286 return Op; // legal, will go to VPMOVB2M, VPMOVW2M
13287 if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
13288 Subtarget->hasDQI())
13289 return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
13290 if ((InVT.is256BitVector() || InVT.is128BitVector())
13291 && InVT.getScalarSizeInBits() >= 32 &&
13292 Subtarget->hasDQI() && Subtarget->hasVLX())
13293 return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
13294 }
13295 
13296 if (VT.getVectorElementType() == MVT::i1) {
13297 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
13298 unsigned NumElts = InVT.getVectorNumElements();
13299 assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
13300 if (InVT.getSizeInBits() < 512) {
13301 MVT ExtVT = (NumElts == 16)?
MVT::v16i32 : MVT::v8i64;
13302 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
13303 InVT = ExtVT;
13304 }
13305 
13306 SDValue OneV =
13307 DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
13308 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
13309 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
13310 }
13311 
13312 // vpmovqb/w/d, vpmovdb/w, vpmovwb
13313 if (Subtarget->hasAVX512()) {
13314 // word to byte only under BWI
13315 if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8
13316 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
13317 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
13318 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
13319 }
13320 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
13321 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
13322 if (Subtarget->hasInt256()) {
13323 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
13324 In = DAG.getBitcast(MVT::v8i32, In);
13325 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
13326 ShufMask);
13327 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
13328 DAG.getIntPtrConstant(0, DL));
13329 }
13330 
13331 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
13332 DAG.getIntPtrConstant(0, DL));
13333 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
13334 DAG.getIntPtrConstant(2, DL));
13335 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
13336 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
13337 static const int ShufMask[] = {0, 2, 4, 6};
13338 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
13339 }
13340 
13341 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
13342 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
13343 if (Subtarget->hasInt256()) {
13344 In = DAG.getBitcast(MVT::v32i8, In);
13345 
13346 SmallVector<SDValue,32> pshufbMask;
13347 for (unsigned i = 0; i < 2; ++i) {
13348 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
13349 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
13350 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
13351 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
13352 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
13353 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
13354 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
13355 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
13356 for (unsigned j = 0; j < 8; ++j)
13357 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
13358 }
13359 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
13360 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
13361 In = DAG.getBitcast(MVT::v4i64, In);
13362 
13363 static const int ShufMask[] = {0, 2, -1, -1};
13364 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
13365 &ShufMask[0]);
13366 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
13367 DAG.getIntPtrConstant(0, DL));
13368 return DAG.getBitcast(VT, In);
13369 }
13370 
13371 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
13372 DAG.getIntPtrConstant(0, DL));
13373 
13374 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
13375 DAG.getIntPtrConstant(4, DL));
13376 
13377 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
13378 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
13379 
13380 // The PSHUFB mask:
13381 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
13382 -1, -1, -1, -1, -1, -1, -1, -1};
13383 
13384 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
13385 OpLo =
DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 13386 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 13387 13388 OpLo = DAG.getBitcast(MVT::v4i32, OpLo); 13389 OpHi = DAG.getBitcast(MVT::v4i32, OpHi); 13390 13391 // The MOVLHPS Mask: 13392 static const int ShufMask2[] = {0, 1, 4, 5}; 13393 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 13394 return DAG.getBitcast(MVT::v8i16, res); 13395 } 13396 13397 // Handle truncation of V256 to V128 using shuffles. 13398 if (!VT.is128BitVector() || !InVT.is256BitVector()) 13399 return SDValue(); 13400 13401 assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); 13402 13403 unsigned NumElems = VT.getVectorNumElements(); 13404 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); 13405 13406 SmallVector<int, 16> MaskVec(NumElems * 2, -1); 13407 // Prepare truncation shuffle mask 13408 for (unsigned i = 0; i != NumElems; ++i) 13409 MaskVec[i] = i * 2; 13410 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), 13411 DAG.getUNDEF(NVT), &MaskVec[0]); 13412 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, 13413 DAG.getIntPtrConstant(0, DL)); 13414 } 13415 13416 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 13417 SelectionDAG &DAG) const { 13418 assert(!Op.getSimpleValueType().isVector()); 13419 13420 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 13421 /*IsSigned=*/ true, /*IsReplace=*/ false); 13422 SDValue FIST = Vals.first, StackSlot = Vals.second; 13423 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 13424 if (!FIST.getNode()) 13425 return Op; 13426 13427 if (StackSlot.getNode()) 13428 // Load the result. 13429 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 13430 FIST, StackSlot, MachinePointerInfo(), 13431 false, false, false, 0); 13432 13433 // The node is the result. 13434 return FIST; 13435 } 13436 13437 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 13438 SelectionDAG &DAG) const { 13439 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 13440 /*IsSigned=*/ false, /*IsReplace=*/ false); 13441 SDValue FIST = Vals.first, StackSlot = Vals.second; 13442 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 13443 if (!FIST.getNode()) 13444 return Op; 13445 13446 if (StackSlot.getNode()) 13447 // Load the result. 13448 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 13449 FIST, StackSlot, MachinePointerInfo(), 13450 false, false, false, 0); 13451 13452 // The node is the result. 13453 return FIST; 13454 } 13455 13456 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { 13457 SDLoc DL(Op); 13458 MVT VT = Op.getSimpleValueType(); 13459 SDValue In = Op.getOperand(0); 13460 MVT SVT = In.getSimpleValueType(); 13461 13462 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 13463 13464 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 13465 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 13466 In, DAG.getUNDEF(SVT))); 13467 } 13468 13469 /// The only differences between FABS and FNEG are the mask and the logic op. 13470 /// FNEG also has a folding opportunity for FNEG(FABS(x)). 13471 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { 13472 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && 13473 "Wrong opcode for lowering FABS or FNEG."); 13474 13475 bool IsFABS = (Op.getOpcode() == ISD::FABS); 13476 13477 // If this is a FABS and it has an FNEG user, bail out to fold the combination 13478 // into an FNABS. 
We'll lower the FABS after that if it is still in use. 13479 if (IsFABS) 13480 for (SDNode *User : Op->uses()) 13481 if (User->getOpcode() == ISD::FNEG) 13482 return Op; 13483 13484 SDLoc dl(Op); 13485 MVT VT = Op.getSimpleValueType(); 13486 13487 bool IsF128 = (VT == MVT::f128); 13488 13489 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to 13490 // decide if we should generate a 16-byte constant mask when we only need 4 or 13491 // 8 bytes for the scalar case. 13492 13493 MVT LogicVT; 13494 MVT EltVT; 13495 unsigned NumElts; 13496 13497 if (VT.isVector()) { 13498 LogicVT = VT; 13499 EltVT = VT.getVectorElementType(); 13500 NumElts = VT.getVectorNumElements(); 13501 } else if (IsF128) { 13502 // SSE instructions are used for optimized f128 logical operations. 13503 LogicVT = MVT::f128; 13504 EltVT = VT; 13505 NumElts = 1; 13506 } else { 13507 // There are no scalar bitwise logical SSE/AVX instructions, so we 13508 // generate a 16-byte vector constant and logic op even for the scalar case. 13509 // Using a 16-byte mask allows folding the load of the mask with 13510 // the logic op, so it can save (~4 bytes) on code size. 13511 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; 13512 EltVT = VT; 13513 NumElts = (VT == MVT::f64) ? 2 : 4; 13514 } 13515 13516 unsigned EltBits = EltVT.getSizeInBits(); 13517 LLVMContext *Context = DAG.getContext(); 13518 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... 13519 APInt MaskElt = 13520 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); 13521 Constant *C = ConstantInt::get(*Context, MaskElt); 13522 C = ConstantVector::getSplat(NumElts, C); 13523 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13524 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); 13525 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 13526 SDValue Mask = 13527 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, 13528 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 13529 false, false, false, Alignment); 13530 13531 SDValue Op0 = Op.getOperand(0); 13532 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); 13533 unsigned LogicOp = 13534 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; 13535 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; 13536 13537 if (VT.isVector() || IsF128) 13538 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); 13539 13540 // For the scalar case extend to a 128-bit vector, perform the logic op, 13541 // and extract the scalar result back out. 13542 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); 13543 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); 13544 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, 13545 DAG.getIntPtrConstant(0, dl)); 13546 } 13547 13548 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 13549 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13550 LLVMContext *Context = DAG.getContext(); 13551 SDValue Op0 = Op.getOperand(0); 13552 SDValue Op1 = Op.getOperand(1); 13553 SDLoc dl(Op); 13554 MVT VT = Op.getSimpleValueType(); 13555 MVT SrcVT = Op1.getSimpleValueType(); 13556 bool IsF128 = (VT == MVT::f128); 13557 13558 // If second operand is smaller, extend it first. 13559 if (SrcVT.bitsLT(VT)) { 13560 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 13561 SrcVT = VT; 13562 } 13563 // And if it is bigger, shrink it first. 
13564 if (SrcVT.bitsGT(VT)) { 13565 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); 13566 SrcVT = VT; 13567 } 13568 13569 // At this point the operands and the result should have the same 13570 // type, and that won't be f80 since that is not custom lowered. 13571 assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && 13572 "Unexpected type in LowerFCOPYSIGN"); 13573 13574 const fltSemantics &Sem = 13575 VT == MVT::f64 ? APFloat::IEEEdouble : 13576 (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); 13577 const unsigned SizeInBits = VT.getSizeInBits(); 13578 13579 SmallVector<Constant *, 4> CV( 13580 VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), 13581 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); 13582 13583 // First, clear all bits but the sign bit from the second operand (sign). 13584 CV[0] = ConstantFP::get(*Context, 13585 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); 13586 Constant *C = ConstantVector::get(CV); 13587 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); 13588 SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); 13589 13590 // Perform all logic operations as 16-byte vectors because there are no 13591 // scalar FP logic instructions in SSE. This allows load folding of the 13592 // constants into the logic instructions. 13593 MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); 13594 SDValue Mask1 = 13595 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, 13596 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 13597 false, false, false, 16); 13598 if (!IsF128) 13599 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); 13600 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); 13601 13602 // Next, clear the sign bit from the first operand (magnitude). 13603 // If it's a constant, we can clear it here. 13604 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { 13605 APFloat APF = Op0CN->getValueAPF(); 13606 // If the magnitude is a positive zero, the sign bit alone is enough. 13607 if (APF.isPosZero()) 13608 return IsF128 ? SignBit : 13609 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, 13610 DAG.getIntPtrConstant(0, dl)); 13611 APF.clearSign(); 13612 CV[0] = ConstantFP::get(*Context, APF); 13613 } else { 13614 CV[0] = ConstantFP::get( 13615 *Context, 13616 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); 13617 } 13618 C = ConstantVector::get(CV); 13619 CPIdx = DAG.getConstantPool(C, PtrVT, 16); 13620 SDValue Val = 13621 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, 13622 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 13623 false, false, false, 16); 13624 // If the magnitude operand wasn't a constant, we need to AND out the sign. 13625 if (!isa<ConstantFPSDNode>(Op0)) { 13626 if (!IsF128) 13627 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); 13628 Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); 13629 } 13630 // OR the magnitude value with the sign bit. 13631 Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); 13632 return IsF128 ? Val : 13633 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, 13634 DAG.getIntPtrConstant(0, dl)); 13635 } 13636 13637 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 13638 SDValue N0 = Op.getOperand(0); 13639 SDLoc dl(Op); 13640 MVT VT = Op.getSimpleValueType(); 13641 13642 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 
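// (FGETSIGNx86 is matched to MOVMSKPS/MOVMSKPD, which packs the per-element
// sign bits into a GPR; the AND with 1 then keeps only element 0's sign.)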
13643 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
13644 DAG.getConstant(1, dl, VT));
13645 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
13646 }
13647 
13648 // Check whether an OR'd tree is PTEST-able.
13649 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
13650 SelectionDAG &DAG) {
13651 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
13652 
13653 if (!Subtarget->hasSSE41())
13654 return SDValue();
13655 
13656 if (!Op->hasOneUse())
13657 return SDValue();
13658 
13659 SDNode *N = Op.getNode();
13660 SDLoc DL(N);
13661 
13662 SmallVector<SDValue, 8> Opnds;
13663 DenseMap<SDValue, unsigned> VecInMap;
13664 SmallVector<SDValue, 8> VecIns;
13665 EVT VT = MVT::Other;
13666 
13667 // Recognize a special case where a vector is cast into a wide integer to
13668 // test all 0s.
13669 Opnds.push_back(N->getOperand(0));
13670 Opnds.push_back(N->getOperand(1));
13671 
13672 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
13673 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
13674 // BFS traverse all OR'd operands.
13675 if (I->getOpcode() == ISD::OR) {
13676 Opnds.push_back(I->getOperand(0));
13677 Opnds.push_back(I->getOperand(1));
13678 // Re-evaluate the number of nodes to be traversed.
13679 e += 2; // 2 more nodes (LHS and RHS) are pushed.
13680 continue;
13681 }
13682 
13683 // Quit if this is not an EXTRACT_VECTOR_ELT.
13684 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13685 return SDValue();
13686 
13687 // Quit if the index is not a constant.
13688 SDValue Idx = I->getOperand(1);
13689 if (!isa<ConstantSDNode>(Idx))
13690 return SDValue();
13691 
13692 SDValue ExtractedFromVec = I->getOperand(0);
13693 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
13694 if (M == VecInMap.end()) {
13695 VT = ExtractedFromVec.getValueType();
13696 // Quit if not 128/256-bit vector.
13697 if (!VT.is128BitVector() && !VT.is256BitVector())
13698 return SDValue();
13699 // Quit if not the same type.
13700 if (VecInMap.begin() != VecInMap.end() &&
13701 VT != VecInMap.begin()->first.getValueType())
13702 return SDValue();
13703 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
13704 VecIns.push_back(ExtractedFromVec);
13705 }
13706 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
13707 }
13708 
13709 assert((VT.is128BitVector() || VT.is256BitVector()) &&
13710 "Not extracted from 128-/256-bit vector.");
13711 
13712 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
13713 
13714 for (DenseMap<SDValue, unsigned>::const_iterator
13715 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
13716 // Quit if not all elements are used.
13717 if (I->second != FullMask)
13718 return SDValue();
13719 }
13720 
13721 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
13722 
13723 // Cast all vectors into TestVT for PTEST.
13724 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
13725 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
13726 
13727 // If more than one full vector is evaluated, OR them first before PTEST.
13728 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
13729 // Each iteration will OR 2 nodes and append the result until there is only
13730 // 1 node left, i.e. the final OR'd value of all vectors.
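// For example, with four input vectors the worklist evolves as
//   {A,B,C,D} -> {A,B,C,D, A|B} -> {A,B,C,D, A|B, C|D}
//             -> {A,B,C,D, A|B, C|D, (A|B)|(C|D)}
// and the last element is what gets PTESTed.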
13731 SDValue LHS = VecIns[Slot];
13732 SDValue RHS = VecIns[Slot + 1];
13733 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
13734 }
13735 
13736 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
13737 VecIns.back(), VecIns.back());
13738 }
13739 
13740 /// \brief Return true if \c Op has a use that doesn't just read flags.
13741 static bool hasNonFlagsUse(SDValue Op) {
13742 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
13743 ++UI) {
13744 SDNode *User = *UI;
13745 unsigned UOpNo = UI.getOperandNo();
13746 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
13747 // Look past the truncate.
13748 UOpNo = User->use_begin().getOperandNo();
13749 User = *User->use_begin();
13750 }
13751 
13752 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
13753 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
13754 return true;
13755 }
13756 return false;
13757 }
13758 
13759 /// Emit nodes that will be selected as "test Op0,Op0", or something
13760 /// equivalent.
13761 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
13762 SelectionDAG &DAG) const {
13763 if (Op.getValueType() == MVT::i1) {
13764 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
13765 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
13766 DAG.getConstant(0, dl, MVT::i8));
13767 }
13768 // CF and OF aren't always set the way we want. Determine which
13769 // of these we need.
13770 bool NeedCF = false;
13771 bool NeedOF = false;
13772 switch (X86CC) {
13773 default: break;
13774 case X86::COND_A: case X86::COND_AE:
13775 case X86::COND_B: case X86::COND_BE:
13776 NeedCF = true;
13777 break;
13778 case X86::COND_G: case X86::COND_GE:
13779 case X86::COND_L: case X86::COND_LE:
13780 case X86::COND_O: case X86::COND_NO: {
13781 // Check if we really need to set the
13782 // Overflow flag. If NoSignedWrap is present
13783 // it is not actually needed.
13784 switch (Op->getOpcode()) {
13785 case ISD::ADD:
13786 case ISD::SUB:
13787 case ISD::MUL:
13788 case ISD::SHL: {
13789 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
13790 if (BinNode->Flags.hasNoSignedWrap())
13791 break;
13792 }
13793 default:
13794 NeedOF = true;
13795 break;
13796 }
13797 break;
13798 }
13799 }
13800 // See if we can use the EFLAGS value from the operand instead of
13801 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
13802 // we prove that the arithmetic won't overflow, we can't use OF or CF.
13803 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
13804 // Emit a CMP with 0, which is the TEST pattern.
13805 //if (Op.getValueType() == MVT::i1)
13806 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
13807 // DAG.getConstant(0, MVT::i1));
13808 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
13809 DAG.getConstant(0, dl, Op.getValueType()));
13810 }
13811 unsigned Opcode = 0;
13812 unsigned NumOperands = 0;
13813 
13814 // Truncate operations may prevent the merge of the SETCC instruction
13815 // and the arithmetic instruction before it. Attempt to truncate the operands
13816 // of the arithmetic instruction and use a reduced bit-width instruction.
13817 bool NeedTruncation = false;
13818 SDValue ArithOp = Op;
13819 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
13820 SDValue Arith = Op->getOperand(0);
13821 // Both the trunc and the arithmetic op need to have one user each.
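// For example, (i8 (trunc (i32 (add X, Y)))) compared against zero can use
// the flags of an 8-bit add of the truncated operands, but only if neither
// the wide add nor the truncate has other users.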
13822 if (Arith->hasOneUse())
13823 switch (Arith.getOpcode()) {
13824 default: break;
13825 case ISD::ADD:
13826 case ISD::SUB:
13827 case ISD::AND:
13828 case ISD::OR:
13829 case ISD::XOR: {
13830 NeedTruncation = true;
13831 ArithOp = Arith;
13832 }
13833 }
13834 }
13835 
13836 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
13837 // which may be the result of a cast. We use the variable 'Op', which is the
13838 // non-cast value, when we check for possible users.
13839 switch (ArithOp.getOpcode()) {
13840 case ISD::ADD:
13841 // Due to an isel shortcoming, be conservative if this add is likely to be
13842 // selected as part of a load-modify-store instruction. When the root node
13843 // in a match is a store, isel doesn't know how to remap non-chain non-flag
13844 // uses of other nodes in the match, such as the ADD in this case. This
13845 // leads to the ADD being left around and reselected, with the result being
13846 // two adds in the output. Alas, even if none of our users are stores, that
13847 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
13848 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
13849 // climbing the DAG back to the root, and it doesn't seem to be worth the
13850 // effort.
13851 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
13852 UE = Op.getNode()->use_end(); UI != UE; ++UI)
13853 if (UI->getOpcode() != ISD::CopyToReg &&
13854 UI->getOpcode() != ISD::SETCC &&
13855 UI->getOpcode() != ISD::STORE)
13856 goto default_case;
13857 
13858 if (ConstantSDNode *C =
13859 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
13860 // An add of one will be selected as an INC.
13861 if (C->isOne() && !Subtarget->slowIncDec()) {
13862 Opcode = X86ISD::INC;
13863 NumOperands = 1;
13864 break;
13865 }
13866 
13867 // An add of negative one (subtract of one) will be selected as a DEC.
13868 if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
13869 Opcode = X86ISD::DEC;
13870 NumOperands = 1;
13871 break;
13872 }
13873 }
13874 
13875 // Otherwise use a regular EFLAGS-setting add.
13876 Opcode = X86ISD::ADD;
13877 NumOperands = 2;
13878 break;
13879 case ISD::SHL:
13880 case ISD::SRL:
13881 // If we have a constant logical shift that's only used in a comparison
13882 // against zero turn it into an equivalent AND. This allows turning it into
13883 // a TEST instruction later.
13884 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
13885 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
13886 EVT VT = Op.getValueType();
13887 unsigned BitWidth = VT.getSizeInBits();
13888 unsigned ShAmt = Op->getConstantOperandVal(1);
13889 if (ShAmt >= BitWidth) // Avoid undefined shifts.
13890 break;
13891 APInt Mask = ArithOp.getOpcode() == ISD::SRL
13892 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
13893 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
13894 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
13895 break;
13896 SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
13897 DAG.getConstant(Mask, dl, VT));
13898 DAG.ReplaceAllUsesWith(Op, New);
13899 Op = New;
13900 }
13901 break;
13902 
13903 case ISD::AND:
13904 // If the primary result of the 'and' isn't used, don't bother using
13905 // X86ISD::AND, because a TEST instruction will be better.
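// For example, if (and X, Y) only feeds this comparison, "test X, Y" sets
// the same flags without tying up a register for the AND result.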
13906 if (!hasNonFlagsUse(Op)) 13907 break; 13908 // FALL THROUGH 13909 case ISD::SUB: 13910 case ISD::OR: 13911 case ISD::XOR: 13912 // Due to the ISEL shortcoming noted above, be conservative if this op is 13913 // likely to be selected as part of a load-modify-store instruction. 13914 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 13915 UE = Op.getNode()->use_end(); UI != UE; ++UI) 13916 if (UI->getOpcode() == ISD::STORE) 13917 goto default_case; 13918 13919 // Otherwise use a regular EFLAGS-setting instruction. 13920 switch (ArithOp.getOpcode()) { 13921 default: llvm_unreachable("unexpected operator!"); 13922 case ISD::SUB: Opcode = X86ISD::SUB; break; 13923 case ISD::XOR: Opcode = X86ISD::XOR; break; 13924 case ISD::AND: Opcode = X86ISD::AND; break; 13925 case ISD::OR: { 13926 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { 13927 SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); 13928 if (EFLAGS.getNode()) 13929 return EFLAGS; 13930 } 13931 Opcode = X86ISD::OR; 13932 break; 13933 } 13934 } 13935 13936 NumOperands = 2; 13937 break; 13938 case X86ISD::ADD: 13939 case X86ISD::SUB: 13940 case X86ISD::INC: 13941 case X86ISD::DEC: 13942 case X86ISD::OR: 13943 case X86ISD::XOR: 13944 case X86ISD::AND: 13945 return SDValue(Op.getNode(), 1); 13946 default: 13947 default_case: 13948 break; 13949 } 13950 13951 // If we found that truncation is beneficial, perform the truncation and 13952 // update 'Op'. 13953 if (NeedTruncation) { 13954 EVT VT = Op.getValueType(); 13955 SDValue WideVal = Op->getOperand(0); 13956 EVT WideVT = WideVal.getValueType(); 13957 unsigned ConvertedOp = 0; 13958 // Use a target machine opcode to prevent further DAGCombine 13959 // optimizations that may separate the arithmetic operations 13960 // from the setcc node. 13961 switch (WideVal.getOpcode()) { 13962 default: break; 13963 case ISD::ADD: ConvertedOp = X86ISD::ADD; break; 13964 case ISD::SUB: ConvertedOp = X86ISD::SUB; break; 13965 case ISD::AND: ConvertedOp = X86ISD::AND; break; 13966 case ISD::OR: ConvertedOp = X86ISD::OR; break; 13967 case ISD::XOR: ConvertedOp = X86ISD::XOR; break; 13968 } 13969 13970 if (ConvertedOp) { 13971 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13972 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { 13973 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); 13974 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); 13975 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); 13976 } 13977 } 13978 } 13979 13980 if (Opcode == 0) 13981 // Emit a CMP with 0, which is the TEST pattern. 13982 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 13983 DAG.getConstant(0, dl, Op.getValueType())); 13984 13985 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 13986 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); 13987 13988 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); 13989 DAG.ReplaceAllUsesWith(Op, New); 13990 return SDValue(New.getNode(), 1); 13991 } 13992 13993 /// Emit nodes that will be selected as "cmp Op0,Op1", or something 13994 /// equivalent. 
13995 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
13996 SDLoc dl, SelectionDAG &DAG) const {
13997 if (isNullConstant(Op1))
13998 return EmitTest(Op0, X86CC, dl, DAG);
13999 
14000 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
14001 "Unexpected comparison operation for MVT::i1 operands");
14002 
14003 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
14004 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
14005 // Do the comparison at i32 if it's smaller, except on Atom.
14006 // This avoids subregister aliasing issues. Keep the smaller reference
14007 // if we're optimizing for size, however, as that'll allow better folding
14008 // of memory operations.
14009 if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
14010 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
14011 !Subtarget->isAtom()) {
14012 unsigned ExtendOp =
14013 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
14014 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
14015 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
14016 }
14017 // Use SUB instead of CMP to enable CSE between SUB and CMP.
14018 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
14019 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
14020 Op0, Op1);
14021 return SDValue(Sub.getNode(), 1);
14022 }
14023 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
14024 }
14025 
14026 /// Convert a comparison if required by the subtarget.
14027 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
14028 SelectionDAG &DAG) const {
14029 // If the subtarget does not support the FUCOMI instruction, floating-point
14030 // comparisons have to be converted.
14031 if (Subtarget->hasCMov() ||
14032 Cmp.getOpcode() != X86ISD::CMP ||
14033 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
14034 !Cmp.getOperand(1).getValueType().isFloatingPoint())
14035 return Cmp;
14036 
14037 // The instruction selector will select an FUCOM instruction instead of
14038 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
14039 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
14040 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
14041 SDLoc dl(Cmp);
14042 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
14043 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
14044 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
14045 DAG.getConstant(8, dl, MVT::i8));
14046 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
14047 
14048 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
14049 assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
14050 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
14051 }
14052 
14053 /// The minimum architected relative accuracy is 2^-12. We need one
14054 /// Newton-Raphson step to have a good float result (24 bits of precision).
14055 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
14056 DAGCombinerInfo &DCI,
14057 unsigned &RefinementSteps,
14058 bool &UseOneConstNR) const {
14059 EVT VT = Op.getValueType();
14060 const char *RecipOp;
14061 
14062 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
14063 // TODO: Add support for AVX512 (v16f32).
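// One Newton-Raphson step for 1/sqrt(a) is x1 = x0 * (1.5 - 0.5 * a * x0 * x0);
// each step roughly doubles the number of accurate bits, taking the ~12-bit
// hardware estimate to the full 24-bit significand of a float.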
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  if (VT == MVT::f32 && Subtarget->hasSSE1())
    RecipOp = "sqrtf";
  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
           (VT == MVT::v8f32 && Subtarget->hasAVX()))
    RecipOp = "vec-sqrtf";
  else
    return SDValue();

  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
  if (!Recips.isEnabled(RecipOp))
    return SDValue();

  RefinementSteps = Recips.getRefinementSteps(RecipOp);
  UseOneConstNR = false;
  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
}

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                            DAGCombinerInfo &DCI,
                                            unsigned &RefinementSteps) const {
  EVT VT = Op.getValueType();
  const char *RecipOp;

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  if (VT == MVT::f32 && Subtarget->hasSSE1())
    RecipOp = "divf";
  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
           (VT == MVT::v8f32 && Subtarget->hasAVX()))
    RecipOp = "vec-divf";
  else
    return SDValue();

  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
  if (!Recips.isEnabled(RecipOp))
    return SDValue();

  RefinementSteps = Recips.getRefinementSteps(RecipOp);
  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
}

/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
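/// For example, "(X & (1 << N)) == 0" becomes "BT X, N" plus a SETAE of the
/// carry flag, since BT copies the tested bit into CF.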
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     SDLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        APInt Zeros, Ones;
        DAG.computeKnownBits(Op0, Zeros, Ones);
        if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, dl, MVT::i8), BT);
  }

  return SDValue();
}

/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
/// mask CMPs.
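/// A return value of 8 is a sentinel for SETUEQ/SETONE, which have no single
/// SSE immediate; callers emit two compares (UNORD+EQ resp. ORD+NEQ) and
/// combine them with OR resp. AND.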
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; // Fallthrough
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; // Fallthrough
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}

// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}

static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y)  -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y)  -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}

static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; // Fallthrough
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
  case ISD::SETULE: Unsigned = true; // Fallthrough
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);
  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}

/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
{
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;

  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
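    // The rewrite relies on "x u< C" being equivalent to "x u<= C - 1",
    // which only holds when no lane of C is zero.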
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
}

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, dl, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, dl, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, dl, MVT::i8));
  }

  MVT VTOp0 = Op0.getSimpleValueType();
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
    // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
    // legalizer first checks if the first operand in input to the setcc has
    // a legal type. If so, then it promotes the return type to that same type.
    // Otherwise, the return type is promoted to the 'next legal type' which,
    // for a vector of MVT::i1, is always a 128-bit integer vector type.
    //
    // We reach this code only if the following two conditions are met:
    // 1. Both return type and operand type have been promoted to wider types
    //    by the type legalizer.
    // 2. The original operand type has been promoted to a 256-bit vector.
    //
    // Note that condition 2. only applies for AVX targets.
    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
    return DAG.getZExtOrTrunc(NewOp, dl, VT);
  }

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // Break 256-bit integer vector compare into smaller ones.
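  // For example, on AVX1, which lacks 256-bit integer ops, a v8i32 compare
  // is done as two v4i32 compares whose results are concatenated.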
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  MVT OpVT = Op1.getSimpleValueType();
  if (OpVT.getVectorElementType() == MVT::i1)
    return LowerBoolVSETCC_AVX512(Op, DAG);

  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  if (Subtarget->hasAVX512()) {
    if (Op1.getSimpleValueType().is512BitVector() ||
        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

    // In the AVX-512 architecture, setcc returns a mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements on KNL.
    // We are not talking about 512-bit operands in this case, as those
    // types are illegal.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // Lower using XOP integer comparisons.
  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (SetCCOpcode) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
                       ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
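  // For example, "x s< y" is handled by swapping to "y s> x", and "x u> y"
  // becomes a signed PCMPGT after the sign bits of both inputs are flipped.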
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  bool Subus = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
      (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
      (Subtarget->hasSSE2() && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
    }

    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in the register is no longer
      // clobbered as the destination, so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget->hasAVX())
        break;
      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
      if (ULEOp1.getNode()) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
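      // Flipping the sign bit turns an unsigned compare into a signed one:
      // "x u< y" holds exactly when "(x ^ 0x80000000) s< (y ^ 0x80000000)".
      // For a signed 64-bit compare only the high dwords get flipped; the
      // low dwords are compared as unsigned in either case.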
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
         && "SetCC type must be 8-bit or 1-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
      isNullConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
      return NewSetCC;
    }
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one with
    // the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                  DAG.getConstant(CCode, dl, MVT::i8),
                                  Op0.getOperand(1));
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
      return SetCC;
    }
  }
  if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
    return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
  }

  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                              DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
  if (VT == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
  return SetCC;
}

SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());

  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
  SDVTList VTs =
      DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
  return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
                     DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
}

// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available, or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (SSECC != 8) {
      if (Subtarget->hasAVX512()) {
        SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
                                  DAG.getConstant(SSECC, DL, MVT::i8));
        return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
      }

      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
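      // In pseudo form, the AND/ANDN/OR fallback below computes
      //   mask   = cmpss CC, CondOp0, CondOp1  ; all-ones if true, else zero
      //   result = (mask & Op1) | (~mask & Op2)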

      if (Subtarget->hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.

        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
      Op2Scalar = Op2.getOperand(0);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
                                      Op1Scalar.getValueType(),
                                      Cond, Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
                                    Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1,  y) -> (sign_bit (x - 1)) | y
  // (select (x == 0),  y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0),  y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1,  y) -> ~(sign_bit (x - 1)) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnesConstant(Op2) ?
          Op1 : Op2;

      SDValue CmpOp0 = Cmp.getOperand(0);
      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0),  0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
                                  DAG.getConstant(0, DL,
                                                  CmpOp0.getValueType()),
                                  CmpOp0);
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    addTest = false;
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
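  // For example, (select C, (trunc i32 X), (trunc i32 Y)) becomes
  // (trunc (cmov C, X, Y)) using the 32-bit cmov that does exist.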
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}

static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  MVT VTElt = VT.getVectorElementType();
  MVT InVTElt = InVT.getVectorElementType();
  SDLoc dl(Op);

  // SKX processor
  if ((InVTElt == MVT::i1) &&
      (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||

       ((Subtarget->hasBWI() && VT.is512BitVector() &&
         VTElt.getSizeInBits() <= 16)) ||

       ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||

       ((Subtarget->hasDQI() && VT.is512BitVector() &&
         VTElt.getSizeInBits() >= 32))))
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  unsigned int NumElts = VT.getVectorNumElements();

  if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
    return SDValue();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
      return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
  }

  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
  MVT ExtVT = NumElts == 8 ?
      MVT::v8i64 : MVT::v16i32;
  SDValue NegOne =
      DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
                      ExtVT);
  SDValue Zero =
      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl,
                      ExtVT);

  SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
  if (VT.is512BitVector())
    return V;
  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}

static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
                                             const X86Subtarget *Subtarget,
                                             SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getSizeInBits() == InVT.getSizeInBits());

  MVT InSVT = InVT.getVectorElementType();
  assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits());

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  SDLoc dl(Op);

  // SSE41 targets can use the pmovsx* instructions directly.
  if (Subtarget->hasSSE41())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);
  }

  SDValue SignExt = Curr;
  if (CurrVT != InVT) {
    unsigned SignExtShift =
        CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }

  if (CurrVT == VT)
    return SignExt;

  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                               DAG.getConstant(31, dl, MVT::i8));
    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return SDValue();
}

static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode:
  // sign-extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // Divide the input vector into two parts
  // (for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }),
  // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
  // then concatenate the vectors
  // back to the original VT.

  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);

  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements()/2);

  OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
  OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  MVT RegVT = Op.getSimpleValueType();
  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
  assert(RegVT.isInteger() &&
         "We only custom lower integer vector sext loads.");

  // Nothing useful we can do without SSE2 shuffles.
  assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned RegSz = RegVT.getSizeInBits();

  ISD::LoadExtType Ext = Ld->getExtensionType();

  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
         && "Only anyext and sext are currently implemented.");
  assert(MemVT != RegVT && "Cannot extend to the same type");
  assert(MemVT.isVector() && "Must load a vector from memory");

  unsigned NumElems = RegVT.getVectorNumElements();
  unsigned MemSz = MemVT.getSizeInBits();
  assert(RegSz > MemSz && "Register size must be greater than the mem size");

  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
    // The only way in which we have a legal 256-bit vector result but not the
    // integer 256-bit operations needed to directly lower a sextload is if we
    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
    // a 128-bit vector and a normal sign_extend to 256-bits that should get
    // correctly legalized. We do this late to allow the canonical form of
    // sextload to persist throughout the rest of the DAG combiner -- it wants
    // to fold together any extensions it can, and so will fuse a sign_extend
    // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
      // Just switch this to a normal load.
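      // For example, a sextload of v4i64 whose memory type is v4i32 (128
      // bits) becomes a plain v4i32 load followed by a sign_extend to v4i64.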
      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
                                       "it must be a legal 128-bit vector "
                                       "type!");
      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), Ld->isVolatile(),
                         Ld->isNonTemporal(), Ld->isInvariant(),
                         Ld->getAlignment());
    } else {
      assert(MemSz < 128 &&
             "Can't extend a type wider than 128 bits to a 256 bit vector!");
      // Do an sext load to a 128-bit vector type. We want to use the same
      // number of elements, but elements half as wide. This will end up being
      // recursively lowered by this routine, but will succeed as we definitely
      // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
                         Ld->isNonTemporal(), Ld->isInvariant(),
                         Ld->getAlignment());
    }

    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
  }

  // All sizes must be a power of two.
  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
         "Non-power-of-two elements are not custom lowered!");

  // Attempt to load the original value using scalar loads.
  // Find the largest scalar type that divides the total loaded size.
  MVT SclrLoadTy = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
      SclrLoadTy = Tp;
    }
  }

  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
      (64 <= MemSz))
    SclrLoadTy = MVT::f64;

  // Calculate the number of scalar loads that we need to perform
  // in order to load our vector from memory.
  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
         "Can only lower sext loads with a single scalar load!");

  unsigned LoadRegSize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
    LoadRegSize = 128;

  // Represent our vector as a sequence of elements which are the
  // largest scalar that we can load.
  EVT LoadUnitVecVT = EVT::getVectorVT(
      *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());

  // Represent the data using the same element type that is stored in
  // memory. In practice, we 'widen' MemVT.
  EVT WideVecVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       LoadRegSize / MemVT.getScalarSizeInBits());

  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
         "Invalid vector type");

  // We can't shuffle using an illegal type.
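  // (The shuffle in question, built further below, redistributes the loaded
  // elements with stride SizeRatio: e.g. extending v4i8 in memory to a v4i32
  // register gives SizeRatio = 4, so the four loaded bytes land at indices
  // 0, 4, 8 and 12 of the widened v16i8 vector.)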
  assert(TLI.isTypeLegal(WideVecVT) &&
         "We only lower types that form legal widened vector types");

  SmallVector<SDValue, 8> Chains;
  SDValue Ptr = Ld->getBasePtr();
  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

  for (unsigned i = 0; i < NumLoads; ++i) {
    // Perform a single load.
    SDValue ScalarLoad =
        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
                    Ld->getAlignment());
    Chains.push_back(ScalarLoad.getValue(1));
    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
    // another round of DAGCombining.
    if (i == 0)
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
    else
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
                        ScalarLoad, DAG.getIntPtrConstant(i, dl));

    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
  }

  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

  // Bitcast the loaded value to a vector of the original element type, in
  // the size of the target vector type.
  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
  unsigned SizeRatio = RegSz / MemSz;

  if (Ext == ISD::SEXTLOAD) {
    // If we have SSE4.1, we can directly emit a VSEXT node.
    if (Subtarget->hasSSE41()) {
      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Sext;
    }

    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
    // lanes.
    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");

    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }

  // Redistribute the loaded elements into the different locations.
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i * SizeRatio] = i;

  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
                                       DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);

  // Bitcast to the requested type.
  Shuff = DAG.getBitcast(RegVT, Shuff);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
  return Shuff;
}

// isAndOrOfSetCCs - Return true if node is an ISD::AND or
// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
// from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
// 1 and that the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  if (isOneConstant(Op.getOperand(1)))
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isNullConstant(Cond.getOperand(1)) &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      SDValue NewCond = LowerSETCC(Cond, DAG);
      if (NewCond.getNode())
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
15538 switch (CondOpcode) { 15539 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 15540 case ISD::SADDO: 15541 if (isOneConstant(RHS)) { 15542 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; 15543 break; 15544 } 15545 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 15546 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 15547 case ISD::SSUBO: 15548 if (isOneConstant(RHS)) { 15549 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; 15550 break; 15551 } 15552 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 15553 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 15554 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 15555 default: llvm_unreachable("unexpected overflowing operator"); 15556 } 15557 if (Inverted) 15558 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 15559 if (CondOpcode == ISD::UMULO) 15560 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 15561 MVT::i32); 15562 else 15563 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 15564 15565 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 15566 15567 if (CondOpcode == ISD::UMULO) 15568 Cond = X86Op.getValue(2); 15569 else 15570 Cond = X86Op.getValue(1); 15571 15572 CC = DAG.getConstant(X86Cond, dl, MVT::i8); 15573 addTest = false; 15574 } else { 15575 unsigned CondOpc; 15576 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 15577 SDValue Cmp = Cond.getOperand(0).getOperand(1); 15578 if (CondOpc == ISD::OR) { 15579 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 15580 // two branches instead of an explicit OR instruction with a 15581 // separate test. 15582 if (Cmp == Cond.getOperand(1).getOperand(1) && 15583 isX86LogicalCmp(Cmp)) { 15584 CC = Cond.getOperand(0).getOperand(0); 15585 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 15586 Chain, Dest, CC, Cmp); 15587 CC = Cond.getOperand(1).getOperand(0); 15588 Cond = Cmp; 15589 addTest = false; 15590 } 15591 } else { // ISD::AND 15592 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 15593 // two branches instead of an explicit AND instruction with a 15594 // separate test. However, we only do this if this block doesn't 15595 // have a fall-through edge, because this requires an explicit 15596 // jmp when the condition is false. 15597 if (Cmp == Cond.getOperand(1).getOperand(1) && 15598 isX86LogicalCmp(Cmp) && 15599 Op.getNode()->hasOneUse()) { 15600 X86::CondCode CCode = 15601 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 15602 CCode = X86::GetOppositeBranchCondition(CCode); 15603 CC = DAG.getConstant(CCode, dl, MVT::i8); 15604 SDNode *User = *Op.getNode()->use_begin(); 15605 // Look for an unconditional branch following this conditional branch. 15606 // We need this because we need to reverse the successors in order 15607 // to implement FCMP_OEQ. 
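        // An illustrative x86 sketch of the result for FCMP_OEQ, with TrueBB
        // the original destination and FalseBB the block the following
        // unconditional branch is retargeted away from:
        //   ucomiss %xmm1, %xmm0
        //   jne     FalseBB      ; first condition, inverted
        //   jp      FalseBB      ; second condition, inverted
        //   jmp     TrueBB       ; the rewritten unconditional branch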
15608         if (User->getOpcode() == ISD::BR) {
15609           SDValue FalseBB = User->getOperand(1);
15610           SDNode *NewBR =
15611             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
15612           assert(NewBR == User);
15613           (void)NewBR;
15614           Dest = FalseBB;
15615
15616           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
15617                               Chain, Dest, CC, Cmp);
15618           X86::CondCode CCode =
15619             (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
15620           CCode = X86::GetOppositeBranchCondition(CCode);
15621           CC = DAG.getConstant(CCode, dl, MVT::i8);
15622           Cond = Cmp;
15623           addTest = false;
15624         }
15625       }
15626     }
15627   } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
15628     // Recognize (xor (setcc), 1) patterns; the xor inverts the condition.
15629     // This should be transformed by the DAG combiner, except when the
15630     // condition is set by an arithmetic-with-overflow node.
15631     X86::CondCode CCode =
15632       (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
15633     CCode = X86::GetOppositeBranchCondition(CCode);
15634     CC = DAG.getConstant(CCode, dl, MVT::i8);
15635     Cond = Cond.getOperand(0).getOperand(1);
15636     addTest = false;
15637   } else if (Cond.getOpcode() == ISD::SETCC &&
15638              cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
15639     // For FCMP_OEQ, we can emit
15640     // two branches instead of an explicit AND instruction with a
15641     // separate test. However, we only do this if this block doesn't
15642     // have a fall-through edge, because this requires an explicit
15643     // jmp when the condition is false.
15644     if (Op.getNode()->hasOneUse()) {
15645       SDNode *User = *Op.getNode()->use_begin();
15646       // Look for an unconditional branch following this conditional branch.
15647       // We need this because we need to reverse the successors in order
15648       // to implement FCMP_OEQ.
15649       if (User->getOpcode() == ISD::BR) {
15650         SDValue FalseBB = User->getOperand(1);
15651         SDNode *NewBR =
15652           DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
15653         assert(NewBR == User);
15654         (void)NewBR;
15655         Dest = FalseBB;
15656
15657         SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15658                                   Cond.getOperand(0), Cond.getOperand(1));
15659         Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15660         CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15661         Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
15662                             Chain, Dest, CC, Cmp);
15663         CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
15664         Cond = Cmp;
15665         addTest = false;
15666       }
15667     }
15668   } else if (Cond.getOpcode() == ISD::SETCC &&
15669              cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
15670     // For FCMP_UNE, we can emit
15671     // two branches instead of an explicit OR instruction with a
15672     // separate test. However, we only do this if this block doesn't
15673     // have a fall-through edge, because this requires an explicit
15674     // jmp when the condition is false.
15675     if (Op.getNode()->hasOneUse()) {
15676       SDNode *User = *Op.getNode()->use_begin();
15677       // Look for an unconditional branch following this conditional branch.
15678       // We need this because we need to reverse the successors in order
15679       // to implement FCMP_UNE.
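      // An illustrative sketch for x une y (true on "not equal or unordered"),
      // with Dest the original target and FalseBB the fall-through block:
      //   ucomiss %xmm1, %xmm0
      //   jne     Dest         ; ordered and not equal -> UNE is true
      //   jnp     FalseBB      ; ordered and equal     -> UNE is false
      //   jmp     Dest         ; unordered (NaN)       -> UNE is true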
15680       if (User->getOpcode() == ISD::BR) {
15681         SDValue FalseBB = User->getOperand(1);
15682         SDNode *NewBR =
15683           DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
15684         assert(NewBR == User);
15685         (void)NewBR;
15686
15687         SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15688                                   Cond.getOperand(0), Cond.getOperand(1));
15689         Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15690         CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15691         Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
15692                             Chain, Dest, CC, Cmp);
15693         CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
15694         Cond = Cmp;
15695         addTest = false;
15696         Dest = FalseBB;
15697       }
15698     }
15699   }
15700   }
15701
15702   if (addTest) {
15703     // Look past the truncate if the high bits are known zero.
15704     if (isTruncWithZeroHighBitsInput(Cond, DAG))
15705       Cond = Cond.getOperand(0);
15706
15707     // We know the result of AND is compared against zero. Try to match
15708     // it to BT.
15709     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15710       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
15711         CC = NewSetCC.getOperand(0);
15712         Cond = NewSetCC.getOperand(1);
15713         addTest = false;
15714       }
15715     }
15716   }
15717
15718   if (addTest) {
15719     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
15720     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
15721     Cond = EmitTest(Cond, X86Cond, dl, DAG);
15722   }
15723   Cond = ConvertCmpIfNecessary(Cond, DAG);
15724   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
15725                      Chain, Dest, CC, Cond);
15726 }
15727
15728 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
15729 // Calls to _alloca are needed to probe the stack when allocating more than 4k
15730 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
15731 // that the guard pages used by the OS virtual memory manager are allocated in
15732 // correct sequence.
15733 SDValue
15734 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
15735                                            SelectionDAG &DAG) const {
15736   MachineFunction &MF = DAG.getMachineFunction();
15737   bool SplitStack = MF.shouldSplitStack();
15738   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
15739                SplitStack;
15740   SDLoc dl(Op);
15741
15742   // Get the inputs.
15743   SDNode *Node = Op.getNode();
15744   SDValue Chain = Op.getOperand(0);
15745   SDValue Size = Op.getOperand(1);
15746   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
15747   EVT VT = Node->getValueType(0);
15748
15749   // Chain the dynamic stack allocation so that it doesn't modify the stack
15750   // pointer when other instructions are using the stack.
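  // For example (illustrative): a 64KiB alloca on Windows cannot simply
  // subtract 0x10000 from the stack pointer; the probe walks the allocation
  // one 4KiB page at a time so each guard page is faulted in, in order.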
15751   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
15752
15753   bool Is64Bit = Subtarget->is64Bit();
15754   MVT SPTy = getPointerTy(DAG.getDataLayout());
15755
15756   SDValue Result;
15757   if (!Lower) {
15758     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15759     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
15760     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
15761            " not tell us which reg is the stack pointer!");
15762     EVT VT = Node->getValueType(0);
15763     SDValue Tmp3 = Node->getOperand(2);
15764
15765     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
15766     Chain = SP.getValue(1);
15767     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
15768     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
15769     unsigned StackAlign = TFI.getStackAlignment();
15770     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
15771     if (Align > StackAlign)
15772       Result = DAG.getNode(ISD::AND, dl, VT, Result,
15773                            DAG.getConstant(-(uint64_t)Align, dl, VT));
15774     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
15775   } else if (SplitStack) {
15776     MachineRegisterInfo &MRI = MF.getRegInfo();
15777
15778     if (Is64Bit) {
15779       // The 64-bit implementation of segmented stacks needs to clobber both r10
15780       // and r11. This makes it impossible to use it along with nested parameters.
15781       const Function *F = MF.getFunction();
15782
15783       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
15784            I != E; ++I)
15785         if (I->hasNestAttr())
15786           report_fatal_error("Cannot use segmented stacks with functions that "
15787                              "have nested arguments.");
15788     }
15789
15790     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
15791     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
15792     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
15793     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
15794                          DAG.getRegister(Vreg, SPTy));
15795   } else {
15796     SDValue Flag;
15797     const unsigned Reg = (Subtarget->isTarget64BitLP64() ?
X86::RAX : X86::EAX); 15798 15799 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 15800 Flag = Chain.getValue(1); 15801 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 15802 15803 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 15804 15805 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 15806 unsigned SPReg = RegInfo->getStackRegister(); 15807 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); 15808 Chain = SP.getValue(1); 15809 15810 if (Align) { 15811 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 15812 DAG.getConstant(-(uint64_t)Align, dl, VT)); 15813 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); 15814 } 15815 15816 Result = SP; 15817 } 15818 15819 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 15820 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); 15821 15822 SDValue Ops[2] = {Result, Chain}; 15823 return DAG.getMergeValues(Ops, dl); 15824 } 15825 15826 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 15827 MachineFunction &MF = DAG.getMachineFunction(); 15828 auto PtrVT = getPointerTy(MF.getDataLayout()); 15829 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 15830 15831 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 15832 SDLoc DL(Op); 15833 15834 if (!Subtarget->is64Bit() || 15835 Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { 15836 // vastart just stores the address of the VarArgsFrameIndex slot into the 15837 // memory location argument. 15838 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 15839 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 15840 MachinePointerInfo(SV), false, false, 0); 15841 } 15842 15843 // __va_list_tag: 15844 // gp_offset (0 - 6 * 8) 15845 // fp_offset (48 - 48 + 8 * 16) 15846 // overflow_arg_area (point to parameters coming in memory). 15847 // reg_save_area 15848 SmallVector<SDValue, 8> MemOps; 15849 SDValue FIN = Op.getOperand(1); 15850 // Store gp_offset 15851 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 15852 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 15853 DL, MVT::i32), 15854 FIN, MachinePointerInfo(SV), false, false, 0); 15855 MemOps.push_back(Store); 15856 15857 // Store fp_offset 15858 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); 15859 Store = DAG.getStore(Op.getOperand(0), DL, 15860 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, 15861 MVT::i32), 15862 FIN, MachinePointerInfo(SV, 4), false, false, 0); 15863 MemOps.push_back(Store); 15864 15865 // Store ptr to overflow_arg_area 15866 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); 15867 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 15868 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 15869 MachinePointerInfo(SV, 8), 15870 false, false, 0); 15871 MemOps.push_back(Store); 15872 15873 // Store ptr to reg_save_area. 15874 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( 15875 Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); 15876 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); 15877 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( 15878 SV, Subtarget->isTarget64BitLP64() ? 
16 : 12), false, false, 0); 15879 MemOps.push_back(Store); 15880 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 15881 } 15882 15883 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 15884 assert(Subtarget->is64Bit() && 15885 "LowerVAARG only handles 64-bit va_arg!"); 15886 assert(Op.getNode()->getNumOperands() == 4); 15887 15888 MachineFunction &MF = DAG.getMachineFunction(); 15889 if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) 15890 // The Win64 ABI uses char* instead of a structure. 15891 return DAG.expandVAArg(Op.getNode()); 15892 15893 SDValue Chain = Op.getOperand(0); 15894 SDValue SrcPtr = Op.getOperand(1); 15895 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 15896 unsigned Align = Op.getConstantOperandVal(3); 15897 SDLoc dl(Op); 15898 15899 EVT ArgVT = Op.getNode()->getValueType(0); 15900 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 15901 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 15902 uint8_t ArgMode; 15903 15904 // Decide which area this value should be read from. 15905 // TODO: Implement the AMD64 ABI in its entirety. This simple 15906 // selection mechanism works only for the basic types. 15907 if (ArgVT == MVT::f80) { 15908 llvm_unreachable("va_arg for f80 not yet implemented"); 15909 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 15910 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 15911 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 15912 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 15913 } else { 15914 llvm_unreachable("Unhandled argument type in LowerVAARG"); 15915 } 15916 15917 if (ArgMode == 2) { 15918 // Sanity Check: Make sure using fp_offset makes sense. 15919 assert(!Subtarget->useSoftFloat() && 15920 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && 15921 Subtarget->hasSSE1()); 15922 } 15923 15924 // Insert VAARG_64 node into the DAG 15925 // VAARG_64 returns two values: Variable Argument Address, Chain 15926 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), 15927 DAG.getConstant(ArgMode, dl, MVT::i8), 15928 DAG.getConstant(Align, dl, MVT::i32)}; 15929 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); 15930 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 15931 VTs, InstOps, MVT::i64, 15932 MachinePointerInfo(SV), 15933 /*Align=*/0, 15934 /*Volatile=*/false, 15935 /*ReadMem=*/true, 15936 /*WriteMem=*/true); 15937 Chain = VAARG.getValue(1); 15938 15939 // Load the next argument and return it 15940 return DAG.getLoad(ArgVT, dl, 15941 Chain, 15942 VAARG, 15943 MachinePointerInfo(), 15944 false, false, false, 0); 15945 } 15946 15947 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 15948 SelectionDAG &DAG) { 15949 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, 15950 // where a va_list is still an i8*. 15951 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 15952 if (Subtarget->isCallingConvWin64( 15953 DAG.getMachineFunction().getFunction()->getCallingConv())) 15954 // Probably a Win64 va_copy. 
15955 return DAG.expandVACopy(Op.getNode()); 15956 15957 SDValue Chain = Op.getOperand(0); 15958 SDValue DstPtr = Op.getOperand(1); 15959 SDValue SrcPtr = Op.getOperand(2); 15960 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 15961 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 15962 SDLoc DL(Op); 15963 15964 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 15965 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, 15966 false, false, 15967 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 15968 } 15969 15970 // getTargetVShiftByConstNode - Handle vector element shifts where the shift 15971 // amount is a constant. Takes immediate version of shift as input. 15972 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, 15973 SDValue SrcOp, uint64_t ShiftAmt, 15974 SelectionDAG &DAG) { 15975 MVT ElementType = VT.getVectorElementType(); 15976 15977 // Fold this packed shift into its first operand if ShiftAmt is 0. 15978 if (ShiftAmt == 0) 15979 return SrcOp; 15980 15981 // Check for ShiftAmt >= element width 15982 if (ShiftAmt >= ElementType.getSizeInBits()) { 15983 if (Opc == X86ISD::VSRAI) 15984 ShiftAmt = ElementType.getSizeInBits() - 1; 15985 else 15986 return DAG.getConstant(0, dl, VT); 15987 } 15988 15989 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) 15990 && "Unknown target vector shift-by-constant node"); 15991 15992 // Fold this packed vector shift into a build vector if SrcOp is a 15993 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 15994 if (VT == SrcOp.getSimpleValueType() && 15995 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { 15996 SmallVector<SDValue, 8> Elts; 15997 unsigned NumElts = SrcOp->getNumOperands(); 15998 ConstantSDNode *ND; 15999 16000 switch(Opc) { 16001 default: llvm_unreachable(nullptr); 16002 case X86ISD::VSHLI: 16003 for (unsigned i=0; i!=NumElts; ++i) { 16004 SDValue CurrentOp = SrcOp->getOperand(i); 16005 if (CurrentOp->getOpcode() == ISD::UNDEF) { 16006 Elts.push_back(CurrentOp); 16007 continue; 16008 } 16009 ND = cast<ConstantSDNode>(CurrentOp); 16010 const APInt &C = ND->getAPIntValue(); 16011 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); 16012 } 16013 break; 16014 case X86ISD::VSRLI: 16015 for (unsigned i=0; i!=NumElts; ++i) { 16016 SDValue CurrentOp = SrcOp->getOperand(i); 16017 if (CurrentOp->getOpcode() == ISD::UNDEF) { 16018 Elts.push_back(CurrentOp); 16019 continue; 16020 } 16021 ND = cast<ConstantSDNode>(CurrentOp); 16022 const APInt &C = ND->getAPIntValue(); 16023 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); 16024 } 16025 break; 16026 case X86ISD::VSRAI: 16027 for (unsigned i=0; i!=NumElts; ++i) { 16028 SDValue CurrentOp = SrcOp->getOperand(i); 16029 if (CurrentOp->getOpcode() == ISD::UNDEF) { 16030 Elts.push_back(CurrentOp); 16031 continue; 16032 } 16033 ND = cast<ConstantSDNode>(CurrentOp); 16034 const APInt &C = ND->getAPIntValue(); 16035 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); 16036 } 16037 break; 16038 } 16039 16040 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); 16041 } 16042 16043 return DAG.getNode(Opc, dl, VT, SrcOp, 16044 DAG.getConstant(ShiftAmt, dl, MVT::i8)); 16045 } 16046 16047 // getTargetVShiftNode - Handle vector element shifts where the shift amount 16048 // may or may not be a constant. Takes immediate version of shift as input. 
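// A sketch of both paths (illustrative values): for a v4i32 shift,
//   getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v4i32, V, C)   ; C constant
// folds via getTargetVShiftByConstNode above, while a variable i32 amount is
// placed in the low 64 bits of a 128-bit vector and shifted with the
// non-immediate opcode (X86ISD::VSHL here).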
16049 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16050                                    SDValue SrcOp, SDValue ShAmt,
16051                                    SelectionDAG &DAG) {
16052   MVT SVT = ShAmt.getSimpleValueType();
16053   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16054
16055   // Catch shift-by-constant.
16056   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16057     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16058                                       CShAmt->getZExtValue(), DAG);
16059
16060   // Change opcode to the non-immediate version.
16061   switch (Opc) {
16062   default: llvm_unreachable("Unknown target vector shift node");
16063   case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16064   case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16065   case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16066   }
16067
16068   const X86Subtarget &Subtarget =
16069       static_cast<const X86Subtarget &>(DAG.getSubtarget());
16070   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16071       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16072     // Let the shuffle legalizer expand this shift amount node.
16073     SDValue Op0 = ShAmt.getOperand(0);
16074     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16075     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16076   } else {
16077     // Need to build a vector containing the shift amount.
16078     // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
16079     SmallVector<SDValue, 4> ShOps;
16080     ShOps.push_back(ShAmt);
16081     if (SVT == MVT::i32) {
16082       ShOps.push_back(DAG.getConstant(0, dl, SVT));
16083       ShOps.push_back(DAG.getUNDEF(SVT));
16084     }
16085     ShOps.push_back(DAG.getUNDEF(SVT));
16086
16087     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16088     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16089   }
16090
16091   // The return type has to be a 128-bit type with the same element
16092   // type as the input type.
16093   MVT EltVT = VT.getVectorElementType();
16094   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16095
16096   ShAmt = DAG.getBitcast(ShVT, ShAmt);
16097   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16098 }
16099
16100 /// \brief Return Mask with the necessary casting or extending
16101 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
16102 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
16103                            const X86Subtarget *Subtarget,
16104                            SelectionDAG &DAG, SDLoc dl) {
16105
16106   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
16107     // The mask should be extended.
16108     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
16109                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
16110   }
16111
16112   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
16113     if (MaskVT == MVT::v64i1) {
16114       assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
16115       // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
16116       SDValue Lo, Hi;
16117       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
16118                        DAG.getConstant(0, dl, MVT::i32));
16119       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
16120                        DAG.getConstant(1, dl, MVT::i32));
16121
16122       Lo = DAG.getBitcast(MVT::v32i1, Lo);
16123       Hi = DAG.getBitcast(MVT::v32i1, Hi);
16124
16125       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
16126     } else {
16127       // MaskVT requires fewer than 64 bits. Truncate the mask (this should
16128       // always succeed) and bitcast.
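      // E.g. (illustrative) an i64 mask narrowed to MaskVT == v8i1 becomes
      //   (v8i1 (bitcast (i8 (trunc %mask))))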
16129       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
16130       return DAG.getBitcast(MaskVT,
16131                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
16132     }
16133
16134   } else {
16135     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
16136                                      Mask.getSimpleValueType().getSizeInBits());
16137     // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
16138     // are extracted with EXTRACT_SUBVECTOR.
16139     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16140                        DAG.getBitcast(BitcastVT, Mask),
16141                        DAG.getIntPtrConstant(0, dl));
16142   }
16143 }
16144
16145 /// \brief Return (and \p Op, \p Mask) for compare instructions or
16146 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16147 /// necessary casting or extending for \p Mask when lowering masking intrinsics
16148 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16149                                     SDValue PreservedSrc,
16150                                     const X86Subtarget *Subtarget,
16151                                     SelectionDAG &DAG) {
16152   MVT VT = Op.getSimpleValueType();
16153   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
16154   unsigned OpcodeSelect = ISD::VSELECT;
16155   SDLoc dl(Op);
16156
16157   if (isAllOnesConstant(Mask))
16158     return Op;
16159
16160   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
16161
16162   switch (Op.getOpcode()) {
16163   default: break;
16164   case X86ISD::PCMPEQM:
16165   case X86ISD::PCMPGTM:
16166   case X86ISD::CMPM:
16167   case X86ISD::CMPMU:
16168     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
16169   case X86ISD::VFPCLASS:
16170   case X86ISD::VFPCLASSS:
16171     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
16172   case X86ISD::VTRUNC:
16173   case X86ISD::VTRUNCS:
16174   case X86ISD::VTRUNCUS:
16175     // We can't use ISD::VSELECT here because it is not always "Legal"
16176     // for the destination type. For example, vpmovqb requires only AVX512,
16177     // while a vselect operating on byte elements requires BWI.
16178     OpcodeSelect = X86ISD::SELECT;
16179     break;
16180   }
16181   if (PreservedSrc.getOpcode() == ISD::UNDEF)
16182     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16183   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
16184 }
16185
16186 /// \brief Creates an SDNode for a predicated scalar operation.
16187 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
16188 /// The mask comes as MVT::i8 and should be truncated
16189 /// to MVT::i1 while lowering masking intrinsics.
16190 /// The main difference between ScalarMaskingNode and VectorMaskingNode is
16191 /// that the former uses "X86select" instead of "vselect"; we just can't
16192 /// create the "vselect" node for a scalar instruction.
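/// A sketch of the emitted pattern for a masked scalar operation:
///   %m   = (i1 (trunc i8 %mask))
///   %res = (X86ISD::SELECT %m, %op, %preservedsrc)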
16193 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, 16194 SDValue PreservedSrc, 16195 const X86Subtarget *Subtarget, 16196 SelectionDAG &DAG) { 16197 if (isAllOnesConstant(Mask)) 16198 return Op; 16199 16200 MVT VT = Op.getSimpleValueType(); 16201 SDLoc dl(Op); 16202 // The mask should be of type MVT::i1 16203 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); 16204 16205 if (Op.getOpcode() == X86ISD::FSETCC) 16206 return DAG.getNode(ISD::AND, dl, VT, Op, IMask); 16207 if (Op.getOpcode() == X86ISD::VFPCLASS || 16208 Op.getOpcode() == X86ISD::VFPCLASSS) 16209 return DAG.getNode(ISD::OR, dl, VT, Op, IMask); 16210 16211 if (PreservedSrc.getOpcode() == ISD::UNDEF) 16212 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); 16213 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); 16214 } 16215 16216 static int getSEHRegistrationNodeSize(const Function *Fn) { 16217 if (!Fn->hasPersonalityFn()) 16218 report_fatal_error( 16219 "querying registration node size for function without personality"); 16220 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See 16221 // WinEHStatePass for the full struct definition. 16222 switch (classifyEHPersonality(Fn->getPersonalityFn())) { 16223 case EHPersonality::MSVC_X86SEH: return 24; 16224 case EHPersonality::MSVC_CXX: return 16; 16225 default: break; 16226 } 16227 report_fatal_error( 16228 "can only recover FP for 32-bit MSVC EH personality functions"); 16229 } 16230 16231 /// When the MSVC runtime transfers control to us, either to an outlined 16232 /// function or when returning to a parent frame after catching an exception, we 16233 /// recover the parent frame pointer by doing arithmetic on the incoming EBP. 16234 /// Here's the math: 16235 /// RegNodeBase = EntryEBP - RegNodeSize 16236 /// ParentFP = RegNodeBase - ParentFrameOffset 16237 /// Subtracting RegNodeSize takes us to the offset of the registration node, and 16238 /// subtracting the offset (negative on x86) takes us back to the parent FP. 16239 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, 16240 SDValue EntryEBP) { 16241 MachineFunction &MF = DAG.getMachineFunction(); 16242 SDLoc dl; 16243 16244 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16245 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); 16246 16247 // It's possible that the parent function no longer has a personality function 16248 // if the exceptional code was optimized away, in which case we just return 16249 // the incoming EBP. 16250 if (!Fn->hasPersonalityFn()) 16251 return EntryEBP; 16252 16253 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH 16254 // registration, or the .set_setframe offset. 16255 MCSymbol *OffsetSym = 16256 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( 16257 GlobalValue::getRealLinkageName(Fn->getName())); 16258 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); 16259 SDValue ParentFrameOffset = 16260 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); 16261 16262 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after 16263 // prologue to RBP in the parent function. 
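  // For the 32-bit path below, a worked example with illustrative values
  // (EntryEBP = 0x1000, RegNodeSize = 24 for SEH, ParentFrameOffset = -64):
  //   RegNodeBase = 0x1000 - 24    = 0x0fe8
  //   ParentFP    = 0x0fe8 - (-64) = 0x1028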
16264   const X86Subtarget &Subtarget =
16265       static_cast<const X86Subtarget &>(DAG.getSubtarget());
16266   if (Subtarget.is64Bit())
16267     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
16268
16269   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
16270   // RegNodeBase = EntryEBP - RegNodeSize
16271   // ParentFP = RegNodeBase - ParentFrameOffset
16272   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
16273                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
16274   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
16275 }
16276
16277 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
16278                                        SelectionDAG &DAG) {
16279   SDLoc dl(Op);
16280   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
16281   MVT VT = Op.getSimpleValueType();
16282   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
16283   if (IntrData) {
16284     switch(IntrData->Type) {
16285     case INTR_TYPE_1OP:
16286       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
16287     case INTR_TYPE_2OP:
16288       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
16289                          Op.getOperand(2));
16290     case INTR_TYPE_2OP_IMM8:
16291       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
16292                          DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
16293     case INTR_TYPE_3OP:
16294       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
16295                          Op.getOperand(2), Op.getOperand(3));
16296     case INTR_TYPE_4OP:
16297       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
16298                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
16299     case INTR_TYPE_1OP_MASK_RM: {
16300       SDValue Src = Op.getOperand(1);
16301       SDValue PassThru = Op.getOperand(2);
16302       SDValue Mask = Op.getOperand(3);
16303       SDValue RoundingMode;
16304       // We always add the rounding mode to the Node.
16305       // If the rounding mode is not specified, we add the
16306       // "current direction" mode.
16307       if (Op.getNumOperands() == 4)
16308         RoundingMode =
16309           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
16310       else
16311         RoundingMode = Op.getOperand(4);
16312       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
16313       if (IntrWithRoundingModeOpcode != 0)
16314         if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
16315             X86::STATIC_ROUNDING::CUR_DIRECTION)
16316           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
16317                                       dl, Op.getValueType(), Src, RoundingMode),
16318                                       Mask, PassThru, Subtarget, DAG);
16319       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
16320                                               RoundingMode),
16321                                   Mask, PassThru, Subtarget, DAG);
16322     }
16323     case INTR_TYPE_1OP_MASK: {
16324       SDValue Src = Op.getOperand(1);
16325       SDValue PassThru = Op.getOperand(2);
16326       SDValue Mask = Op.getOperand(3);
16327       // We add rounding mode to the Node when
16328       //   - RM Opcode is specified and
16329       //   - RM is not "current direction".
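      // Sketch with a hypothetical intrinsic:
      //   llvm.x86.avx512.mask.foo(%src, %passthru, %mask [, i32 %rnd])
      // lowers to
      //   (vselect %mask, (Opc1 %src, %rnd), %passthru)  if %rnd != CUR_DIRECTION
      //   (vselect %mask, (Opc0 %src), %passthru)        otherwise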
16330       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
16331       if (IntrWithRoundingModeOpcode != 0) {
16332         SDValue Rnd = Op.getOperand(4);
16333         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
16334         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
16335           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
16336                                                   dl, Op.getValueType(),
16337                                                   Src, Rnd),
16338                                       Mask, PassThru, Subtarget, DAG);
16339         }
16340       }
16341       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
16342                                   Mask, PassThru, Subtarget, DAG);
16343     }
16344     case INTR_TYPE_SCALAR_MASK: {
16345       SDValue Src1 = Op.getOperand(1);
16346       SDValue Src2 = Op.getOperand(2);
16347       SDValue passThru = Op.getOperand(3);
16348       SDValue Mask = Op.getOperand(4);
16349       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
16350                                   Mask, passThru, Subtarget, DAG);
16351     }
16352     case INTR_TYPE_SCALAR_MASK_RM: {
16353       SDValue Src1 = Op.getOperand(1);
16354       SDValue Src2 = Op.getOperand(2);
16355       SDValue Src0 = Op.getOperand(3);
16356       SDValue Mask = Op.getOperand(4);
16357       // There are 2 kinds of intrinsics in this group:
16358       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
16359       // (2) With rounding mode and sae - 7 operands.
16360       if (Op.getNumOperands() == 6) {
16361         SDValue Sae = Op.getOperand(5);
16362         unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
16363         return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
16364                                                 Sae),
16365                                     Mask, Src0, Subtarget, DAG);
16366       }
16367       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
16368       SDValue RoundingMode = Op.getOperand(5);
16369       SDValue Sae = Op.getOperand(6);
16370       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
16371                                               RoundingMode, Sae),
16372                                   Mask, Src0, Subtarget, DAG);
16373     }
16374     case INTR_TYPE_2OP_MASK:
16375     case INTR_TYPE_2OP_IMM8_MASK: {
16376       SDValue Src1 = Op.getOperand(1);
16377       SDValue Src2 = Op.getOperand(2);
16378       SDValue PassThru = Op.getOperand(3);
16379       SDValue Mask = Op.getOperand(4);
16380
16381       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
16382         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
16383
16384       // We specify 2 possible opcodes for intrinsics with rounding modes.
16385       // First, we check if the intrinsic may have a non-default rounding mode
16386       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
16387       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
16388       if (IntrWithRoundingModeOpcode != 0) {
16389         SDValue Rnd = Op.getOperand(5);
16390         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
16391         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
16392           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
16393                                                   dl, Op.getValueType(),
16394                                                   Src1, Src2, Rnd),
16395                                       Mask, PassThru, Subtarget, DAG);
16396         }
16397       }
16398       // TODO: Intrinsics should have fast-math-flags to propagate.
16399       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
16400                                   Mask, PassThru, Subtarget, DAG);
16401     }
16402     case INTR_TYPE_2OP_MASK_RM: {
16403       SDValue Src1 = Op.getOperand(1);
16404       SDValue Src2 = Op.getOperand(2);
16405       SDValue PassThru = Op.getOperand(3);
16406       SDValue Mask = Op.getOperand(4);
16407       // We specify 2 possible modes for intrinsics, with/without rounding
16408       // modes.
16409       // First, we check if the intrinsic has a rounding mode (6 operands);
16410       // if not, we set the rounding mode to "current".
16411       SDValue Rnd;
16412       if (Op.getNumOperands() == 6)
16413         Rnd = Op.getOperand(5);
16414       else
16415         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
16416       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
16417                                               Src1, Src2, Rnd),
16418                                   Mask, PassThru, Subtarget, DAG);
16419     }
16420     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
16421       SDValue Src1 = Op.getOperand(1);
16422       SDValue Src2 = Op.getOperand(2);
16423       SDValue Src3 = Op.getOperand(3);
16424       SDValue PassThru = Op.getOperand(4);
16425       SDValue Mask = Op.getOperand(5);
16426       SDValue Sae = Op.getOperand(6);
16427
16428       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
16429                                               Src2, Src3, Sae),
16430                                   Mask, PassThru, Subtarget, DAG);
16431     }
16432     case INTR_TYPE_3OP_MASK_RM: {
16433       SDValue Src1 = Op.getOperand(1);
16434       SDValue Src2 = Op.getOperand(2);
16435       SDValue Imm = Op.getOperand(3);
16436       SDValue PassThru = Op.getOperand(4);
16437       SDValue Mask = Op.getOperand(5);
16438       // We specify 2 possible modes for intrinsics, with/without rounding
16439       // modes.
16440       // First, we check if the intrinsic has a rounding mode (7 operands);
16441       // if not, we set the rounding mode to "current".
16442       SDValue Rnd;
16443       if (Op.getNumOperands() == 7)
16444         Rnd = Op.getOperand(6);
16445       else
16446         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
16447       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
16448                                               Src1, Src2, Imm, Rnd),
16449                                   Mask, PassThru, Subtarget, DAG);
16450     }
16451     case INTR_TYPE_3OP_IMM8_MASK:
16452     case INTR_TYPE_3OP_MASK:
16453     case INSERT_SUBVEC: {
16454       SDValue Src1 = Op.getOperand(1);
16455       SDValue Src2 = Op.getOperand(2);
16456       SDValue Src3 = Op.getOperand(3);
16457       SDValue PassThru = Op.getOperand(4);
16458       SDValue Mask = Op.getOperand(5);
16459
16460       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
16461         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
16462       else if (IntrData->Type == INSERT_SUBVEC) {
16463         // The imm should be adapted to ISD::INSERT_SUBVECTOR behavior.
16464         assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
16465         unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
16466         Imm *= Src2.getSimpleValueType().getVectorNumElements();
16467         Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
16468       }
16469
16470       // We specify 2 possible opcodes for intrinsics with rounding modes.
16471       // First, we check if the intrinsic may have a non-default rounding mode
16472       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
16473 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 16474 if (IntrWithRoundingModeOpcode != 0) { 16475 SDValue Rnd = Op.getOperand(6); 16476 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); 16477 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { 16478 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 16479 dl, Op.getValueType(), 16480 Src1, Src2, Src3, Rnd), 16481 Mask, PassThru, Subtarget, DAG); 16482 } 16483 } 16484 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 16485 Src1, Src2, Src3), 16486 Mask, PassThru, Subtarget, DAG); 16487 } 16488 case VPERM_3OP_MASKZ: 16489 case VPERM_3OP_MASK:{ 16490 // Src2 is the PassThru 16491 SDValue Src1 = Op.getOperand(1); 16492 SDValue Src2 = Op.getOperand(2); 16493 SDValue Src3 = Op.getOperand(3); 16494 SDValue Mask = Op.getOperand(4); 16495 MVT VT = Op.getSimpleValueType(); 16496 SDValue PassThru = SDValue(); 16497 16498 // set PassThru element 16499 if (IntrData->Type == VPERM_3OP_MASKZ) 16500 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 16501 else 16502 PassThru = DAG.getBitcast(VT, Src2); 16503 16504 // Swap Src1 and Src2 in the node creation 16505 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, 16506 dl, Op.getValueType(), 16507 Src2, Src1, Src3), 16508 Mask, PassThru, Subtarget, DAG); 16509 } 16510 case FMA_OP_MASK3: 16511 case FMA_OP_MASKZ: 16512 case FMA_OP_MASK: { 16513 SDValue Src1 = Op.getOperand(1); 16514 SDValue Src2 = Op.getOperand(2); 16515 SDValue Src3 = Op.getOperand(3); 16516 SDValue Mask = Op.getOperand(4); 16517 MVT VT = Op.getSimpleValueType(); 16518 SDValue PassThru = SDValue(); 16519 16520 // set PassThru element 16521 if (IntrData->Type == FMA_OP_MASKZ) 16522 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 16523 else if (IntrData->Type == FMA_OP_MASK3) 16524 PassThru = Src3; 16525 else 16526 PassThru = Src1; 16527 16528 // We specify 2 possible opcodes for intrinsics with rounding modes. 16529 // First, we check if the intrinsic may have non-default rounding mode, 16530 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 16531 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 16532 if (IntrWithRoundingModeOpcode != 0) { 16533 SDValue Rnd = Op.getOperand(5); 16534 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 16535 X86::STATIC_ROUNDING::CUR_DIRECTION) 16536 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 16537 dl, Op.getValueType(), 16538 Src1, Src2, Src3, Rnd), 16539 Mask, PassThru, Subtarget, DAG); 16540 } 16541 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, 16542 dl, Op.getValueType(), 16543 Src1, Src2, Src3), 16544 Mask, PassThru, Subtarget, DAG); 16545 } 16546 case TERLOG_OP_MASK: 16547 case TERLOG_OP_MASKZ: { 16548 SDValue Src1 = Op.getOperand(1); 16549 SDValue Src2 = Op.getOperand(2); 16550 SDValue Src3 = Op.getOperand(3); 16551 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); 16552 SDValue Mask = Op.getOperand(5); 16553 MVT VT = Op.getSimpleValueType(); 16554 SDValue PassThru = Src1; 16555 // Set PassThru element. 
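      // (For reference: Src4 is the vpternlog truth-table immediate; e.g. an
      // immediate of 0xE8 computes the bitwise majority (A&B)|(A&C)|(B&C).)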
16556 if (IntrData->Type == TERLOG_OP_MASKZ) 16557 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 16558 16559 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 16560 Src1, Src2, Src3, Src4), 16561 Mask, PassThru, Subtarget, DAG); 16562 } 16563 case FPCLASS: { 16564 // FPclass intrinsics with mask 16565 SDValue Src1 = Op.getOperand(1); 16566 MVT VT = Src1.getSimpleValueType(); 16567 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 16568 SDValue Imm = Op.getOperand(2); 16569 SDValue Mask = Op.getOperand(3); 16570 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 16571 Mask.getSimpleValueType().getSizeInBits()); 16572 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); 16573 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, 16574 DAG.getTargetConstant(0, dl, MaskVT), 16575 Subtarget, DAG); 16576 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, 16577 DAG.getUNDEF(BitcastVT), FPclassMask, 16578 DAG.getIntPtrConstant(0, dl)); 16579 return DAG.getBitcast(Op.getValueType(), Res); 16580 } 16581 case FPCLASSS: { 16582 SDValue Src1 = Op.getOperand(1); 16583 SDValue Imm = Op.getOperand(2); 16584 SDValue Mask = Op.getOperand(3); 16585 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); 16586 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, 16587 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); 16588 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); 16589 } 16590 case CMP_MASK: 16591 case CMP_MASK_CC: { 16592 // Comparison intrinsics with masks. 16593 // Example of transformation: 16594 // (i8 (int_x86_avx512_mask_pcmpeq_q_128 16595 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> 16596 // (i8 (bitcast 16597 // (v8i1 (insert_subvector undef, 16598 // (v2i1 (and (PCMPEQM %a, %b), 16599 // (extract_subvector 16600 // (v8i1 (bitcast %mask)), 0))), 0)))) 16601 MVT VT = Op.getOperand(1).getSimpleValueType(); 16602 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 16603 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); 16604 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 16605 Mask.getSimpleValueType().getSizeInBits()); 16606 SDValue Cmp; 16607 if (IntrData->Type == CMP_MASK_CC) { 16608 SDValue CC = Op.getOperand(3); 16609 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); 16610 // We specify 2 possible opcodes for intrinsics with rounding modes. 16611 // First, we check if the intrinsic may have non-default rounding mode, 16612 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
16613 if (IntrData->Opc1 != 0) { 16614 SDValue Rnd = Op.getOperand(5); 16615 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 16616 X86::STATIC_ROUNDING::CUR_DIRECTION) 16617 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), 16618 Op.getOperand(2), CC, Rnd); 16619 } 16620 //default rounding mode 16621 if(!Cmp.getNode()) 16622 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), 16623 Op.getOperand(2), CC); 16624 16625 } else { 16626 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); 16627 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), 16628 Op.getOperand(2)); 16629 } 16630 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, 16631 DAG.getTargetConstant(0, dl, 16632 MaskVT), 16633 Subtarget, DAG); 16634 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, 16635 DAG.getUNDEF(BitcastVT), CmpMask, 16636 DAG.getIntPtrConstant(0, dl)); 16637 return DAG.getBitcast(Op.getValueType(), Res); 16638 } 16639 case CMP_MASK_SCALAR_CC: { 16640 SDValue Src1 = Op.getOperand(1); 16641 SDValue Src2 = Op.getOperand(2); 16642 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); 16643 SDValue Mask = Op.getOperand(4); 16644 16645 SDValue Cmp; 16646 if (IntrData->Opc1 != 0) { 16647 SDValue Rnd = Op.getOperand(5); 16648 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 16649 X86::STATIC_ROUNDING::CUR_DIRECTION) 16650 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); 16651 } 16652 //default rounding mode 16653 if(!Cmp.getNode()) 16654 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); 16655 16656 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, 16657 DAG.getTargetConstant(0, dl, 16658 MVT::i1), 16659 Subtarget, DAG); 16660 16661 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, 16662 DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), 16663 DAG.getValueType(MVT::i1)); 16664 } 16665 case COMI: { // Comparison intrinsics 16666 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; 16667 SDValue LHS = Op.getOperand(1); 16668 SDValue RHS = Op.getOperand(2); 16669 unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG); 16670 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 16671 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); 16672 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 16673 DAG.getConstant(X86CC, dl, MVT::i8), Cond); 16674 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 16675 } 16676 case COMI_RM: { // Comparison intrinsics with Sae 16677 SDValue LHS = Op.getOperand(1); 16678 SDValue RHS = Op.getOperand(2); 16679 SDValue CC = Op.getOperand(3); 16680 SDValue Sae = Op.getOperand(4); 16681 auto ComiType = TranslateX86ConstCondToX86CC(CC); 16682 // choose between ordered and unordered (comi/ucomi) 16683 unsigned comiOp = std::get<0>(ComiType) ? 
                                            IntrData->Opc0 : IntrData->Opc1;
16684     SDValue Cond;
16685     if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
16686         X86::STATIC_ROUNDING::CUR_DIRECTION)
16687       Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
16688     else
16689       Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
16690     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16691         DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
16692     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
16693   }
16694   case VSHIFT:
16695     return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
16696                                Op.getOperand(1), Op.getOperand(2), DAG);
16697   case VSHIFT_MASK:
16698     return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
16699                                                     Op.getSimpleValueType(),
16700                                                     Op.getOperand(1),
16701                                                     Op.getOperand(2), DAG),
16702                                 Op.getOperand(4), Op.getOperand(3), Subtarget,
16703                                 DAG);
16704   case COMPRESS_EXPAND_IN_REG: {
16705     SDValue Mask = Op.getOperand(3);
16706     SDValue DataToCompress = Op.getOperand(1);
16707     SDValue PassThru = Op.getOperand(2);
16708     if (isAllOnesConstant(Mask)) // return data as is
16709       return Op.getOperand(1);
16710
16711     return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
16712                                             DataToCompress),
16713                                 Mask, PassThru, Subtarget, DAG);
16714   }
16715   case BROADCASTM: {
16716     SDValue Mask = Op.getOperand(1);
16717     MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
16718     Mask = DAG.getBitcast(MaskVT, Mask);
16719     return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
16720   }
16721   case BLEND: {
16722     SDValue Mask = Op.getOperand(3);
16723     MVT VT = Op.getSimpleValueType();
16724     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
16725     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
16726     return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
16727                        Op.getOperand(2));
16728   }
16729   case KUNPCK: {
16730     MVT VT = Op.getSimpleValueType();
16731     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
16732
16733     SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
16734     SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
16735     // Arguments should be swapped.
16736     SDValue Res = DAG.getNode(IntrData->Opc0, dl,
16737                               MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
16738                               Src2, Src1);
16739     return DAG.getBitcast(VT, Res);
16740   }
16741   default:
16742     break;
16743   }
16744   }
16745
16746   switch (IntNo) {
16747   default: return SDValue();    // Don't custom lower most intrinsics.
16748
16749   case Intrinsic::x86_avx2_permd:
16750   case Intrinsic::x86_avx2_permps:
16751     // Operands intentionally swapped. Mask is last operand to intrinsic,
16752     // but second operand for node/instruction.
16753     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
16754                        Op.getOperand(2), Op.getOperand(1));
16755
16756   // ptest and testp intrinsics. The intrinsics these come from are designed
16757   // to return an integer value, not just an instruction, so lower them to the
16758   // ptest or testp pattern and a setcc for the result.
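  // For example (sketch), _mm_testz_si128 -> int_x86_sse41_ptestz becomes:
  //   %flags = (X86ISD::PTEST %a, %b)                ; sets ZF/CF
  //   %res   = (zext (X86ISD::SETCC COND_E, %flags) to i32)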
16759 case Intrinsic::x86_sse41_ptestz: 16760 case Intrinsic::x86_sse41_ptestc: 16761 case Intrinsic::x86_sse41_ptestnzc: 16762 case Intrinsic::x86_avx_ptestz_256: 16763 case Intrinsic::x86_avx_ptestc_256: 16764 case Intrinsic::x86_avx_ptestnzc_256: 16765 case Intrinsic::x86_avx_vtestz_ps: 16766 case Intrinsic::x86_avx_vtestc_ps: 16767 case Intrinsic::x86_avx_vtestnzc_ps: 16768 case Intrinsic::x86_avx_vtestz_pd: 16769 case Intrinsic::x86_avx_vtestc_pd: 16770 case Intrinsic::x86_avx_vtestnzc_pd: 16771 case Intrinsic::x86_avx_vtestz_ps_256: 16772 case Intrinsic::x86_avx_vtestc_ps_256: 16773 case Intrinsic::x86_avx_vtestnzc_ps_256: 16774 case Intrinsic::x86_avx_vtestz_pd_256: 16775 case Intrinsic::x86_avx_vtestc_pd_256: 16776 case Intrinsic::x86_avx_vtestnzc_pd_256: { 16777 bool IsTestPacked = false; 16778 unsigned X86CC; 16779 switch (IntNo) { 16780 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 16781 case Intrinsic::x86_avx_vtestz_ps: 16782 case Intrinsic::x86_avx_vtestz_pd: 16783 case Intrinsic::x86_avx_vtestz_ps_256: 16784 case Intrinsic::x86_avx_vtestz_pd_256: 16785 IsTestPacked = true; // Fallthrough 16786 case Intrinsic::x86_sse41_ptestz: 16787 case Intrinsic::x86_avx_ptestz_256: 16788 // ZF = 1 16789 X86CC = X86::COND_E; 16790 break; 16791 case Intrinsic::x86_avx_vtestc_ps: 16792 case Intrinsic::x86_avx_vtestc_pd: 16793 case Intrinsic::x86_avx_vtestc_ps_256: 16794 case Intrinsic::x86_avx_vtestc_pd_256: 16795 IsTestPacked = true; // Fallthrough 16796 case Intrinsic::x86_sse41_ptestc: 16797 case Intrinsic::x86_avx_ptestc_256: 16798 // CF = 1 16799 X86CC = X86::COND_B; 16800 break; 16801 case Intrinsic::x86_avx_vtestnzc_ps: 16802 case Intrinsic::x86_avx_vtestnzc_pd: 16803 case Intrinsic::x86_avx_vtestnzc_ps_256: 16804 case Intrinsic::x86_avx_vtestnzc_pd_256: 16805 IsTestPacked = true; // Fallthrough 16806 case Intrinsic::x86_sse41_ptestnzc: 16807 case Intrinsic::x86_avx_ptestnzc_256: 16808 // ZF and CF = 0 16809 X86CC = X86::COND_A; 16810 break; 16811 } 16812 16813 SDValue LHS = Op.getOperand(1); 16814 SDValue RHS = Op.getOperand(2); 16815 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 16816 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 16817 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); 16818 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 16819 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 16820 } 16821 case Intrinsic::x86_avx512_kortestz_w: 16822 case Intrinsic::x86_avx512_kortestc_w: { 16823 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? 
X86::COND_E: X86::COND_B; 16824 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); 16825 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); 16826 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); 16827 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); 16828 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); 16829 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 16830 } 16831 16832 case Intrinsic::x86_sse42_pcmpistria128: 16833 case Intrinsic::x86_sse42_pcmpestria128: 16834 case Intrinsic::x86_sse42_pcmpistric128: 16835 case Intrinsic::x86_sse42_pcmpestric128: 16836 case Intrinsic::x86_sse42_pcmpistrio128: 16837 case Intrinsic::x86_sse42_pcmpestrio128: 16838 case Intrinsic::x86_sse42_pcmpistris128: 16839 case Intrinsic::x86_sse42_pcmpestris128: 16840 case Intrinsic::x86_sse42_pcmpistriz128: 16841 case Intrinsic::x86_sse42_pcmpestriz128: { 16842 unsigned Opcode; 16843 unsigned X86CC; 16844 switch (IntNo) { 16845 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 16846 case Intrinsic::x86_sse42_pcmpistria128: 16847 Opcode = X86ISD::PCMPISTRI; 16848 X86CC = X86::COND_A; 16849 break; 16850 case Intrinsic::x86_sse42_pcmpestria128: 16851 Opcode = X86ISD::PCMPESTRI; 16852 X86CC = X86::COND_A; 16853 break; 16854 case Intrinsic::x86_sse42_pcmpistric128: 16855 Opcode = X86ISD::PCMPISTRI; 16856 X86CC = X86::COND_B; 16857 break; 16858 case Intrinsic::x86_sse42_pcmpestric128: 16859 Opcode = X86ISD::PCMPESTRI; 16860 X86CC = X86::COND_B; 16861 break; 16862 case Intrinsic::x86_sse42_pcmpistrio128: 16863 Opcode = X86ISD::PCMPISTRI; 16864 X86CC = X86::COND_O; 16865 break; 16866 case Intrinsic::x86_sse42_pcmpestrio128: 16867 Opcode = X86ISD::PCMPESTRI; 16868 X86CC = X86::COND_O; 16869 break; 16870 case Intrinsic::x86_sse42_pcmpistris128: 16871 Opcode = X86ISD::PCMPISTRI; 16872 X86CC = X86::COND_S; 16873 break; 16874 case Intrinsic::x86_sse42_pcmpestris128: 16875 Opcode = X86ISD::PCMPESTRI; 16876 X86CC = X86::COND_S; 16877 break; 16878 case Intrinsic::x86_sse42_pcmpistriz128: 16879 Opcode = X86ISD::PCMPISTRI; 16880 X86CC = X86::COND_E; 16881 break; 16882 case Intrinsic::x86_sse42_pcmpestriz128: 16883 Opcode = X86ISD::PCMPESTRI; 16884 X86CC = X86::COND_E; 16885 break; 16886 } 16887 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 16888 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 16889 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); 16890 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 16891 DAG.getConstant(X86CC, dl, MVT::i8), 16892 SDValue(PCMP.getNode(), 1)); 16893 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 16894 } 16895 16896 case Intrinsic::x86_sse42_pcmpistri128: 16897 case Intrinsic::x86_sse42_pcmpestri128: { 16898 unsigned Opcode; 16899 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 16900 Opcode = X86ISD::PCMPISTRI; 16901 else 16902 Opcode = X86ISD::PCMPESTRI; 16903 16904 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 16905 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 16906 return DAG.getNode(Opcode, dl, VTs, NewOps); 16907 } 16908 16909 case Intrinsic::x86_seh_lsda: { 16910 // Compute the symbol for the LSDA. We know it'll get emitted later. 
16911 MachineFunction &MF = DAG.getMachineFunction(); 16912 SDValue Op1 = Op.getOperand(1); 16913 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); 16914 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( 16915 GlobalValue::getRealLinkageName(Fn->getName())); 16916 16917 // Generate a simple absolute symbol reference. This intrinsic is only 16918 // supported on 32-bit Windows, which isn't PIC. 16919 SDValue Result = DAG.getMCSymbol(LSDASym, VT); 16920 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); 16921 } 16922 16923 case Intrinsic::x86_seh_recoverfp: { 16924 SDValue FnOp = Op.getOperand(1); 16925 SDValue IncomingFPOp = Op.getOperand(2); 16926 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); 16927 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); 16928 if (!Fn) 16929 report_fatal_error( 16930 "llvm.x86.seh.recoverfp must take a function as the first argument"); 16931 return recoverFramePointer(DAG, Fn, IncomingFPOp); 16932 } 16933 16934 case Intrinsic::localaddress: { 16935 // Returns one of the stack, base, or frame pointer registers, depending on 16936 // which is used to reference local variables. 16937 MachineFunction &MF = DAG.getMachineFunction(); 16938 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 16939 unsigned Reg; 16940 if (RegInfo->hasBasePointer(MF)) 16941 Reg = RegInfo->getBaseRegister(); 16942 else // This function handles the SP or FP case. 16943 Reg = RegInfo->getPtrSizedFrameRegister(MF); 16944 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 16945 } 16946 } 16947 } 16948 16949 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 16950 SDValue Src, SDValue Mask, SDValue Base, 16951 SDValue Index, SDValue ScaleOp, SDValue Chain, 16952 const X86Subtarget * Subtarget) { 16953 SDLoc dl(Op); 16954 auto *C = cast<ConstantSDNode>(ScaleOp); 16955 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); 16956 MVT MaskVT = MVT::getVectorVT(MVT::i1, 16957 Index.getSimpleValueType().getVectorNumElements()); 16958 SDValue MaskInReg; 16959 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 16960 if (MaskC) 16961 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); 16962 else { 16963 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 16964 Mask.getSimpleValueType().getSizeInBits()); 16965 16966 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements 16967 // are extracted by EXTRACT_SUBVECTOR. 
16968 MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 16969 DAG.getBitcast(BitcastVT, Mask), 16970 DAG.getIntPtrConstant(0, dl)); 16971 } 16972 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 16973 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); 16974 SDValue Segment = DAG.getRegister(0, MVT::i32); 16975 if (Src.getOpcode() == ISD::UNDEF) 16976 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); 16977 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 16978 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 16979 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 16980 return DAG.getMergeValues(RetOps, dl); 16981 } 16982 16983 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 16984 SDValue Src, SDValue Mask, SDValue Base, 16985 SDValue Index, SDValue ScaleOp, SDValue Chain) { 16986 SDLoc dl(Op); 16987 auto *C = cast<ConstantSDNode>(ScaleOp); 16988 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); 16989 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); 16990 SDValue Segment = DAG.getRegister(0, MVT::i32); 16991 MVT MaskVT = MVT::getVectorVT(MVT::i1, 16992 Index.getSimpleValueType().getVectorNumElements()); 16993 SDValue MaskInReg; 16994 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 16995 if (MaskC) 16996 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); 16997 else { 16998 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 16999 Mask.getSimpleValueType().getSizeInBits()); 17000 17001 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements 17002 // are extracted by EXTRACT_SUBVECTOR. 17003 MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 17004 DAG.getBitcast(BitcastVT, Mask), 17005 DAG.getIntPtrConstant(0, dl)); 17006 } 17007 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 17008 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 17009 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 17010 return SDValue(Res, 1); 17011 } 17012 17013 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 17014 SDValue Mask, SDValue Base, SDValue Index, 17015 SDValue ScaleOp, SDValue Chain) { 17016 SDLoc dl(Op); 17017 auto *C = cast<ConstantSDNode>(ScaleOp); 17018 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); 17019 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); 17020 SDValue Segment = DAG.getRegister(0, MVT::i32); 17021 MVT MaskVT = 17022 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); 17023 SDValue MaskInReg; 17024 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 17025 if (MaskC) 17026 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); 17027 else 17028 MaskInReg = DAG.getBitcast(MaskVT, Mask); 17029 //SDVTList VTs = DAG.getVTList(MVT::Other); 17030 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 17031 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); 17032 return SDValue(Res, 0); 17033 } 17034 17035 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that 17036 // read performance monitor counters (x86_rdpmc). 
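// The lowered DAG corresponds to the instruction sequence:
//   mov ecx, <counter index>
//   rdpmc                  ; returns the counter value in EDX:EAX
// with the two halves then merged into an i64 by shift/or on 64-bit targets
// or by a BUILD_PAIR on 32-bit targets.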
17037 static void getReadPerformanceCounter(SDNode *N, SDLoc DL, 17038 SelectionDAG &DAG, const X86Subtarget *Subtarget, 17039 SmallVectorImpl<SDValue> &Results) { 17040 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 17041 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 17042 SDValue LO, HI; 17043 17044 // The ECX register is used to select the index of the performance counter 17045 // to read. 17046 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, 17047 N->getOperand(2)); 17048 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); 17049 17050 // Reads the content of a 64-bit performance counter and returns it in the 17051 // registers EDX:EAX. 17052 if (Subtarget->is64Bit()) { 17053 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 17054 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 17055 LO.getValue(2)); 17056 } else { 17057 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 17058 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 17059 LO.getValue(2)); 17060 } 17061 Chain = HI.getValue(1); 17062 17063 if (Subtarget->is64Bit()) { 17064 // The EAX register is loaded with the low-order 32 bits. The EDX register 17065 // is loaded with the supported high-order bits of the counter. 17066 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 17067 DAG.getConstant(32, DL, MVT::i8)); 17068 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 17069 Results.push_back(Chain); 17070 return; 17071 } 17072 17073 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 17074 SDValue Ops[] = { LO, HI }; 17075 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 17076 Results.push_back(Pair); 17077 Results.push_back(Chain); 17078 } 17079 17080 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that 17081 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is 17082 // also used to custom lower READCYCLECOUNTER nodes. 17083 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, 17084 SelectionDAG &DAG, const X86Subtarget *Subtarget, 17085 SmallVectorImpl<SDValue> &Results) { 17086 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 17087 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); 17088 SDValue LO, HI; 17089 17090 // The processor's time-stamp counter (a 64-bit MSR) is stored into the 17091 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR 17092 // and the EAX register is loaded with the low-order 32 bits. 17093 if (Subtarget->is64Bit()) { 17094 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 17095 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 17096 LO.getValue(2)); 17097 } else { 17098 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 17099 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 17100 LO.getValue(2)); 17101 } 17102 SDValue Chain = HI.getValue(1); 17103 17104 if (Opcode == X86ISD::RDTSCP_DAG) { 17105 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 17106 17107 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into 17108 // the ECX register. Add 'ecx' explicitly to the chain. 17109 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, 17110 HI.getValue(2)); 17111 // Explicitly store the content of ECX at the location passed in input 17112 // to the 'rdtscp' intrinsic. 
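    // (IA32_TSC_AUX is typically initialized by the OS with the processor ID,
    //  which lets callers detect migration to another core between reads.)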
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo(), false, false, 0);
  }

  if (Subtarget->is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// \brief Lower intrinsics of the TRUNCATE_TO_MEM type: returns a truncating
/// store or, when the mask is not all ones, a truncating masked store.
static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue &Op,
                                              SelectionDAG &DAG,
                                              MVT ElementType) {
  SDLoc dl(Op);
  SDValue Mask = Op.getOperand(4);
  SDValue DataToTruncate = Op.getOperand(3);
  SDValue Addr = Op.getOperand(2);
  SDValue Chain = Op.getOperand(0);

  MVT VT = DataToTruncate.getSimpleValueType();
  MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());

  if (isAllOnesConstant(Mask)) // return just a truncate store
    return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
                             MachinePointerInfo(), SVT, false, false,
                             SVT.getScalarSizeInBits()/8);

  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                   Mask.getSimpleValueType().getSizeInBits());
  // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements of the bitcast mask
  // are extracted with EXTRACT_SUBVECTOR.
  SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                              DAG.getBitcast(BitcastVT, Mask),
                              DAG.getIntPtrConstant(0, dl));

  MachineMemOperand *MMO = DAG.getMachineFunction().
17190 getMachineMemOperand(MachinePointerInfo(), 17191 MachineMemOperand::MOStore, SVT.getStoreSize(), 17192 SVT.getScalarSizeInBits()/8); 17193 17194 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, 17195 VMask, SVT, MMO, true); 17196 } 17197 17198 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, 17199 SelectionDAG &DAG) { 17200 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 17201 17202 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); 17203 if (!IntrData) { 17204 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) 17205 return MarkEHRegistrationNode(Op, DAG); 17206 return SDValue(); 17207 } 17208 17209 SDLoc dl(Op); 17210 switch(IntrData->Type) { 17211 default: llvm_unreachable("Unknown Intrinsic Type"); 17212 case RDSEED: 17213 case RDRAND: { 17214 // Emit the node with the right value type. 17215 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 17216 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 17217 17218 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. 17219 // Otherwise return the value from Rand, which is always 0, casted to i32. 17220 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 17221 DAG.getConstant(1, dl, Op->getValueType(1)), 17222 DAG.getConstant(X86::COND_B, dl, MVT::i32), 17223 SDValue(Result.getNode(), 1) }; 17224 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 17225 DAG.getVTList(Op->getValueType(1), MVT::Glue), 17226 Ops); 17227 17228 // Return { result, isValid, chain }. 17229 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 17230 SDValue(Result.getNode(), 2)); 17231 } 17232 case GATHER: { 17233 //gather(v1, mask, index, base, scale); 17234 SDValue Chain = Op.getOperand(0); 17235 SDValue Src = Op.getOperand(2); 17236 SDValue Base = Op.getOperand(3); 17237 SDValue Index = Op.getOperand(4); 17238 SDValue Mask = Op.getOperand(5); 17239 SDValue Scale = Op.getOperand(6); 17240 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, 17241 Chain, Subtarget); 17242 } 17243 case SCATTER: { 17244 //scatter(base, mask, index, v1, scale); 17245 SDValue Chain = Op.getOperand(0); 17246 SDValue Base = Op.getOperand(2); 17247 SDValue Mask = Op.getOperand(3); 17248 SDValue Index = Op.getOperand(4); 17249 SDValue Src = Op.getOperand(5); 17250 SDValue Scale = Op.getOperand(6); 17251 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, 17252 Scale, Chain); 17253 } 17254 case PREFETCH: { 17255 SDValue Hint = Op.getOperand(6); 17256 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); 17257 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); 17258 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); 17259 SDValue Chain = Op.getOperand(0); 17260 SDValue Mask = Op.getOperand(2); 17261 SDValue Index = Op.getOperand(3); 17262 SDValue Base = Op.getOperand(4); 17263 SDValue Scale = Op.getOperand(5); 17264 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); 17265 } 17266 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). 17267 case RDTSC: { 17268 SmallVector<SDValue, 2> Results; 17269 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, 17270 Results); 17271 return DAG.getMergeValues(Results, dl); 17272 } 17273 // Read Performance Monitoring Counters. 
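  // The i32 operand selects the counter; getReadPerformanceCounter moves it
  // into ECX before emitting RDPMC.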
17274 case RDPMC: { 17275 SmallVector<SDValue, 2> Results; 17276 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); 17277 return DAG.getMergeValues(Results, dl); 17278 } 17279 // XTEST intrinsics. 17280 case XTEST: { 17281 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 17282 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 17283 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17284 DAG.getConstant(X86::COND_NE, dl, MVT::i8), 17285 InTrans); 17286 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); 17287 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), 17288 Ret, SDValue(InTrans.getNode(), 1)); 17289 } 17290 // ADC/ADCX/SBB 17291 case ADX: { 17292 SmallVector<SDValue, 2> Results; 17293 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 17294 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); 17295 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), 17296 DAG.getConstant(-1, dl, MVT::i8)); 17297 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), 17298 Op.getOperand(4), GenCF.getValue(1)); 17299 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), 17300 Op.getOperand(5), MachinePointerInfo(), 17301 false, false, 0); 17302 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17303 DAG.getConstant(X86::COND_B, dl, MVT::i8), 17304 Res.getValue(1)); 17305 Results.push_back(SetCC); 17306 Results.push_back(Store); 17307 return DAG.getMergeValues(Results, dl); 17308 } 17309 case COMPRESS_TO_MEM: { 17310 SDLoc dl(Op); 17311 SDValue Mask = Op.getOperand(4); 17312 SDValue DataToCompress = Op.getOperand(3); 17313 SDValue Addr = Op.getOperand(2); 17314 SDValue Chain = Op.getOperand(0); 17315 17316 MVT VT = DataToCompress.getSimpleValueType(); 17317 if (isAllOnesConstant(Mask)) // return just a store 17318 return DAG.getStore(Chain, dl, DataToCompress, Addr, 17319 MachinePointerInfo(), false, false, 17320 VT.getScalarSizeInBits()/8); 17321 17322 SDValue Compressed = 17323 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), 17324 Mask, DAG.getUNDEF(VT), Subtarget, DAG); 17325 return DAG.getStore(Chain, dl, Compressed, Addr, 17326 MachinePointerInfo(), false, false, 17327 VT.getScalarSizeInBits()/8); 17328 } 17329 case TRUNCATE_TO_MEM_VI8: 17330 return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); 17331 case TRUNCATE_TO_MEM_VI16: 17332 return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); 17333 case TRUNCATE_TO_MEM_VI32: 17334 return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); 17335 case EXPAND_FROM_MEM: { 17336 SDLoc dl(Op); 17337 SDValue Mask = Op.getOperand(4); 17338 SDValue PassThru = Op.getOperand(3); 17339 SDValue Addr = Op.getOperand(2); 17340 SDValue Chain = Op.getOperand(0); 17341 MVT VT = Op.getSimpleValueType(); 17342 17343 if (isAllOnesConstant(Mask)) // return just a load 17344 return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, 17345 false, VT.getScalarSizeInBits()/8); 17346 17347 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), 17348 false, false, false, 17349 VT.getScalarSizeInBits()/8); 17350 17351 SDValue Results[] = { 17352 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), 17353 Mask, PassThru, Subtarget, DAG), Chain}; 17354 return DAG.getMergeValues(Results, dl); 17355 } 17356 } 17357 } 17358 17359 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 17360 SelectionDAG &DAG) const { 
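  // The operand is a constant depth: depth 0 returns this function's own
  // return address, while a positive depth walks the saved-frame-pointer
  // chain via FRAMEADDR and loads the return address stored just above the
  // frame pointer at that depth.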
17361 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 17362 MFI->setReturnAddressIsTaken(true); 17363 17364 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 17365 return SDValue(); 17366 17367 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 17368 SDLoc dl(Op); 17369 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 17370 17371 if (Depth > 0) { 17372 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 17373 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 17374 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); 17375 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 17376 DAG.getNode(ISD::ADD, dl, PtrVT, 17377 FrameAddr, Offset), 17378 MachinePointerInfo(), false, false, false, 0); 17379 } 17380 17381 // Just load the return address. 17382 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 17383 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 17384 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 17385 } 17386 17387 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 17388 MachineFunction &MF = DAG.getMachineFunction(); 17389 MachineFrameInfo *MFI = MF.getFrameInfo(); 17390 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 17391 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 17392 EVT VT = Op.getValueType(); 17393 17394 MFI->setFrameAddressIsTaken(true); 17395 17396 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { 17397 // Depth > 0 makes no sense on targets which use Windows unwind codes. It 17398 // is not possible to crawl up the stack without looking at the unwind codes 17399 // simultaneously. 17400 int FrameAddrIndex = FuncInfo->getFAIndex(); 17401 if (!FrameAddrIndex) { 17402 // Set up a frame object for the return address. 17403 unsigned SlotSize = RegInfo->getSlotSize(); 17404 FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( 17405 SlotSize, /*Offset=*/0, /*IsImmutable=*/false); 17406 FuncInfo->setFAIndex(FrameAddrIndex); 17407 } 17408 return DAG.getFrameIndex(FrameAddrIndex, VT); 17409 } 17410 17411 unsigned FrameReg = 17412 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); 17413 SDLoc dl(Op); // FIXME probably not meaningful 17414 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 17415 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 17416 (FrameReg == X86::EBP && VT == MVT::i32)) && 17417 "Invalid Frame Register!"); 17418 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 17419 while (Depth--) 17420 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 17421 MachinePointerInfo(), 17422 false, false, false, 0); 17423 return FrameAddr; 17424 } 17425 17426 // FIXME? Maybe this could be a TableGen attribute on some registers and 17427 // this table could be generated automatically from RegInfo. 
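// This hook implements the llvm.read_register / llvm.write_register
// intrinsics; only the stack and frame pointer names are accepted.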
17428 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, 17429 SelectionDAG &DAG) const { 17430 const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); 17431 const MachineFunction &MF = DAG.getMachineFunction(); 17432 17433 unsigned Reg = StringSwitch<unsigned>(RegName) 17434 .Case("esp", X86::ESP) 17435 .Case("rsp", X86::RSP) 17436 .Case("ebp", X86::EBP) 17437 .Case("rbp", X86::RBP) 17438 .Default(0); 17439 17440 if (Reg == X86::EBP || Reg == X86::RBP) { 17441 if (!TFI.hasFP(MF)) 17442 report_fatal_error("register " + StringRef(RegName) + 17443 " is allocatable: function has no frame pointer"); 17444 #ifndef NDEBUG 17445 else { 17446 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 17447 unsigned FrameReg = 17448 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); 17449 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && 17450 "Invalid Frame Register!"); 17451 } 17452 #endif 17453 } 17454 17455 if (Reg) 17456 return Reg; 17457 17458 report_fatal_error("Invalid register name global variable"); 17459 } 17460 17461 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 17462 SelectionDAG &DAG) const { 17463 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 17464 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); 17465 } 17466 17467 unsigned X86TargetLowering::getExceptionPointerRegister( 17468 const Constant *PersonalityFn) const { 17469 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) 17470 return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; 17471 17472 return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; 17473 } 17474 17475 unsigned X86TargetLowering::getExceptionSelectorRegister( 17476 const Constant *PersonalityFn) const { 17477 // Funclet personalities don't use selectors (the runtime does the selection). 17478 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); 17479 return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; 17480 } 17481 17482 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 17483 SDValue Chain = Op.getOperand(0); 17484 SDValue Offset = Op.getOperand(1); 17485 SDValue Handler = Op.getOperand(2); 17486 SDLoc dl (Op); 17487 17488 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 17489 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 17490 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 17491 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 17492 (FrameReg == X86::EBP && PtrVT == MVT::i32)) && 17493 "Invalid Frame Register!"); 17494 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); 17495 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; 17496 17497 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, 17498 DAG.getIntPtrConstant(RegInfo->getSlotSize(), 17499 dl)); 17500 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); 17501 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 17502 false, false, 0); 17503 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 17504 17505 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, 17506 DAG.getRegister(StoreAddrReg, PtrVT)); 17507 } 17508 17509 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 17510 SelectionDAG &DAG) const { 17511 SDLoc DL(Op); 17512 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 17513 DAG.getVTList(MVT::i32, MVT::Other), 17514 Op.getOperand(0), Op.getOperand(1)); 17515 } 17516 17517 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 17518 SelectionDAG &DAG) const { 17519 SDLoc DL(Op); 17520 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 17521 Op.getOperand(0), Op.getOperand(1)); 17522 } 17523 17524 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 17525 return Op.getOperand(0); 17526 } 17527 17528 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 17529 SelectionDAG &DAG) const { 17530 SDValue Root = Op.getOperand(0); 17531 SDValue Trmp = Op.getOperand(1); // trampoline 17532 SDValue FPtr = Op.getOperand(2); // nested function 17533 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 17534 SDLoc dl (Op); 17535 17536 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 17537 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 17538 17539 if (Subtarget->is64Bit()) { 17540 SDValue OutChains[6]; 17541 17542 // Large code-model. 17543 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 17544 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 17545 17546 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 17547 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 17548 17549 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 17550 17551 // Load the pointer to the nested function into R11. 17552 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 17553 SDValue Addr = Trmp; 17554 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), 17555 Addr, MachinePointerInfo(TrmpAddr), 17556 false, false, 0); 17557 17558 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 17559 DAG.getConstant(2, dl, MVT::i64)); 17560 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 17561 MachinePointerInfo(TrmpAddr, 2), 17562 false, false, 2); 17563 17564 // Load the 'nest' parameter value into R10. 17565 // R10 is specified in X86CallingConv.td 17566 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 17567 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 17568 DAG.getConstant(10, dl, MVT::i64)); 17569 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), 17570 Addr, MachinePointerInfo(TrmpAddr, 10), 17571 false, false, 0); 17572 17573 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 17574 DAG.getConstant(12, dl, MVT::i64)); 17575 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 17576 MachinePointerInfo(TrmpAddr, 12), 17577 false, false, 2); 17578 17579 // Jump to the nested function. 17580 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
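    // The finished 23-byte trampoline image is:
    //   0:  49 BB <8-byte fn ptr>   movabs r11, <nested function>
    //   10: 49 BA <8-byte nest>     movabs r10, <nest argument>
    //   20: 49 FF E3                jmpq   *r11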
17581 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 17582 DAG.getConstant(20, dl, MVT::i64)); 17583 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), 17584 Addr, MachinePointerInfo(TrmpAddr, 20), 17585 false, false, 0); 17586 17587 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 17588 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 17589 DAG.getConstant(22, dl, MVT::i64)); 17590 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), 17591 Addr, MachinePointerInfo(TrmpAddr, 22), 17592 false, false, 0); 17593 17594 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 17595 } else { 17596 const Function *Func = 17597 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 17598 CallingConv::ID CC = Func->getCallingConv(); 17599 unsigned NestReg; 17600 17601 switch (CC) { 17602 default: 17603 llvm_unreachable("Unsupported calling convention"); 17604 case CallingConv::C: 17605 case CallingConv::X86_StdCall: { 17606 // Pass 'nest' parameter in ECX. 17607 // Must be kept in sync with X86CallingConv.td 17608 NestReg = X86::ECX; 17609 17610 // Check that ECX wasn't needed by an 'inreg' parameter. 17611 FunctionType *FTy = Func->getFunctionType(); 17612 const AttributeSet &Attrs = Func->getAttributes(); 17613 17614 if (!Attrs.isEmpty() && !Func->isVarArg()) { 17615 unsigned InRegCount = 0; 17616 unsigned Idx = 1; 17617 17618 for (FunctionType::param_iterator I = FTy->param_begin(), 17619 E = FTy->param_end(); I != E; ++I, ++Idx) 17620 if (Attrs.hasAttribute(Idx, Attribute::InReg)) { 17621 auto &DL = DAG.getDataLayout(); 17622 // FIXME: should only count parameters that are lowered to integers. 17623 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; 17624 } 17625 17626 if (InRegCount > 2) { 17627 report_fatal_error("Nest register in use - reduce number of inreg" 17628 " parameters!"); 17629 } 17630 } 17631 break; 17632 } 17633 case CallingConv::X86_FastCall: 17634 case CallingConv::X86_ThisCall: 17635 case CallingConv::Fast: 17636 // Pass 'nest' parameter in EAX. 17637 // Must be kept in sync with X86CallingConv.td 17638 NestReg = X86::EAX; 17639 break; 17640 } 17641 17642 SDValue OutChains[4]; 17643 SDValue Addr, Disp; 17644 17645 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 17646 DAG.getConstant(10, dl, MVT::i32)); 17647 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 17648 17649 // This is storing the opcode for MOV32ri. 17650 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 17651 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 17652 OutChains[0] = DAG.getStore(Root, dl, 17653 DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), 17654 Trmp, MachinePointerInfo(TrmpAddr), 17655 false, false, 0); 17656 17657 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 17658 DAG.getConstant(1, dl, MVT::i32)); 17659 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 17660 MachinePointerInfo(TrmpAddr, 1), 17661 false, false, 1); 17662 17663 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
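    // The finished 10-byte trampoline image is:
    //   0: B8+r <4-byte nest>   mov <NestReg>, <nest argument>
    //   5: E9 <4-byte rel32>    jmp <nested function>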
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of the FP control word (FPCW), read
   below with FNSTCW, and has the following settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

   FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

   To perform the conversion, we do:
     (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
   e.g. FPCW bits 11:10 == 01 (round to -inf) gives ((0 | 2) + 1) & 3 == 3.
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
    DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                           MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
                            MachinePointerInfo(), false, false, false, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, DL, MVT::i16,
                DAG.getNode(ISD::ADD, DL, MVT::i16,
                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, DL, MVT::i16)),
                DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
/// instruction.
//
// 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
//    to a 512-bit vector.
// 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
//    split the vector, perform the operation on its Lo and Hi parts, and
//    concatenate the results.
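//    For example, v16i8 CTLZ becomes
//    sub(trunc(ctlz(zext v16i8 to v16i32)), 32 - 8).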
static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  if (EltVT == MVT::i64 || EltVT == MVT::i32) {
    // Extend to a 512-bit vector.
    assert((VT.is256BitVector() || VT.is128BitVector()) &&
           "Unsupported value type for operation");

    MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
    SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
                                 DAG.getUNDEF(NewVT),
                                 Op.getOperand(0),
                                 DAG.getIntPtrConstant(0, dl));
    SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);

    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  if (16 < NumElems) {
    // Split the vector; its Lo and Hi parts will be handled in the next
    // iteration.
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
    MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);

    Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
    Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);

    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
  }

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);

  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vplzcntd instruction.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}

static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector() && Subtarget->hasAVX512())
    return LowerVectorCTLZ_AVX512(Op, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If the source is zero (i.e. bsr sets ZF), select 2*NumBits-1 instead.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);

  // Finally xor with NumBits-1.
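  // For a nonzero source, bsr returns the index of the highest set bit, so
  // ctlz = (NumBits - 1) - bsr; since bsr is in [0, NumBits-1] and NumBits-1
  // is all ones, the subtraction reduces to this xor. The 2*NumBits-1 value
  // selected for a zero source maps to NumBits the same way.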
17838 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, 17839 DAG.getConstant(NumBits - 1, dl, OpVT)); 17840 17841 if (VT == MVT::i8) 17842 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 17843 return Op; 17844 } 17845 17846 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, 17847 SelectionDAG &DAG) { 17848 MVT VT = Op.getSimpleValueType(); 17849 EVT OpVT = VT; 17850 unsigned NumBits = VT.getSizeInBits(); 17851 SDLoc dl(Op); 17852 17853 if (VT.isVector() && Subtarget->hasAVX512()) 17854 return LowerVectorCTLZ_AVX512(Op, DAG); 17855 17856 Op = Op.getOperand(0); 17857 if (VT == MVT::i8) { 17858 // Zero extend to i32 since there is not an i8 bsr. 17859 OpVT = MVT::i32; 17860 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 17861 } 17862 17863 // Issue a bsr (scan bits in reverse). 17864 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 17865 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 17866 17867 // And xor with NumBits-1. 17868 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, 17869 DAG.getConstant(NumBits - 1, dl, OpVT)); 17870 17871 if (VT == MVT::i8) 17872 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 17873 return Op; 17874 } 17875 17876 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 17877 MVT VT = Op.getSimpleValueType(); 17878 unsigned NumBits = VT.getScalarSizeInBits(); 17879 SDLoc dl(Op); 17880 17881 if (VT.isVector()) { 17882 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17883 17884 SDValue N0 = Op.getOperand(0); 17885 SDValue Zero = DAG.getConstant(0, dl, VT); 17886 17887 // lsb(x) = (x & -x) 17888 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, 17889 DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); 17890 17891 // cttz_undef(x) = (width - 1) - ctlz(lsb) 17892 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && 17893 TLI.isOperationLegal(ISD::CTLZ, VT)) { 17894 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); 17895 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, 17896 DAG.getNode(ISD::CTLZ, dl, VT, LSB)); 17897 } 17898 17899 // cttz(x) = ctpop(lsb - 1) 17900 SDValue One = DAG.getConstant(1, dl, VT); 17901 return DAG.getNode(ISD::CTPOP, dl, VT, 17902 DAG.getNode(ISD::SUB, dl, VT, LSB, One)); 17903 } 17904 17905 assert(Op.getOpcode() == ISD::CTTZ && 17906 "Only scalar CTTZ requires custom lowering"); 17907 17908 // Issue a bsf (scan bits forward) which also sets EFLAGS. 17909 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 17910 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); 17911 17912 // If src is zero (i.e. bsf sets ZF), returns NumBits. 17913 SDValue Ops[] = { 17914 Op, 17915 DAG.getConstant(NumBits, dl, VT), 17916 DAG.getConstant(X86::COND_E, dl, MVT::i8), 17917 Op.getValue(1) 17918 }; 17919 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); 17920 } 17921 17922 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 17923 // ones, and then concatenate the result back. 
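// For example, a v8i32 add becomes two v4i32 adds on the extracted low and
// high halves, whose results are glued back together with CONCAT_VECTORS.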
17924 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 17925 MVT VT = Op.getSimpleValueType(); 17926 17927 assert(VT.is256BitVector() && VT.isInteger() && 17928 "Unsupported value type for operation"); 17929 17930 unsigned NumElems = VT.getVectorNumElements(); 17931 SDLoc dl(Op); 17932 17933 // Extract the LHS vectors 17934 SDValue LHS = Op.getOperand(0); 17935 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 17936 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 17937 17938 // Extract the RHS vectors 17939 SDValue RHS = Op.getOperand(1); 17940 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 17941 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 17942 17943 MVT EltVT = VT.getVectorElementType(); 17944 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 17945 17946 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 17947 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 17948 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 17949 } 17950 17951 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 17952 if (Op.getValueType() == MVT::i1) 17953 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), 17954 Op.getOperand(0), Op.getOperand(1)); 17955 assert(Op.getSimpleValueType().is256BitVector() && 17956 Op.getSimpleValueType().isInteger() && 17957 "Only handle AVX 256-bit vector integer operation"); 17958 return Lower256IntArith(Op, DAG); 17959 } 17960 17961 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 17962 if (Op.getValueType() == MVT::i1) 17963 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), 17964 Op.getOperand(0), Op.getOperand(1)); 17965 assert(Op.getSimpleValueType().is256BitVector() && 17966 Op.getSimpleValueType().isInteger() && 17967 "Only handle AVX 256-bit vector integer operation"); 17968 return Lower256IntArith(Op, DAG); 17969 } 17970 17971 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { 17972 assert(Op.getSimpleValueType().is256BitVector() && 17973 Op.getSimpleValueType().isInteger() && 17974 "Only handle AVX 256-bit vector integer operation"); 17975 return Lower256IntArith(Op, DAG); 17976 } 17977 17978 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 17979 SelectionDAG &DAG) { 17980 SDLoc dl(Op); 17981 MVT VT = Op.getSimpleValueType(); 17982 17983 if (VT == MVT::i1) 17984 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); 17985 17986 // Decompose 256-bit ops into smaller 128-bit ops. 17987 if (VT.is256BitVector() && !Subtarget->hasInt256()) 17988 return Lower256IntArith(Op, DAG); 17989 17990 SDValue A = Op.getOperand(0); 17991 SDValue B = Op.getOperand(1); 17992 17993 // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector 17994 // pairs, multiply and truncate. 
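  // (SSE/AVX have no byte multiply instruction, so i8 multiplies must be
  //  synthesized from 16-bit multiplies.)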
17995 if (VT == MVT::v16i8 || VT == MVT::v32i8) { 17996 if (Subtarget->hasInt256()) { 17997 if (VT == MVT::v32i8) { 17998 MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2); 17999 SDValue Lo = DAG.getIntPtrConstant(0, dl); 18000 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); 18001 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo); 18002 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo); 18003 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi); 18004 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi); 18005 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 18006 DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo), 18007 DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi)); 18008 } 18009 18010 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); 18011 return DAG.getNode( 18012 ISD::TRUNCATE, dl, VT, 18013 DAG.getNode(ISD::MUL, dl, ExVT, 18014 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), 18015 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); 18016 } 18017 18018 assert(VT == MVT::v16i8 && 18019 "Pre-AVX2 support only supports v16i8 multiplication"); 18020 MVT ExVT = MVT::v8i16; 18021 18022 // Extract the lo parts and sign extend to i16 18023 SDValue ALo, BLo; 18024 if (Subtarget->hasSSE41()) { 18025 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); 18026 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); 18027 } else { 18028 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, 18029 -1, 4, -1, 5, -1, 6, -1, 7}; 18030 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 18031 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 18032 ALo = DAG.getBitcast(ExVT, ALo); 18033 BLo = DAG.getBitcast(ExVT, BLo); 18034 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); 18035 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); 18036 } 18037 18038 // Extract the hi parts and sign extend to i16 18039 SDValue AHi, BHi; 18040 if (Subtarget->hasSSE41()) { 18041 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, 18042 -1, -1, -1, -1, -1, -1, -1, -1}; 18043 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 18044 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 18045 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); 18046 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); 18047 } else { 18048 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, 18049 -1, 12, -1, 13, -1, 14, -1, 15}; 18050 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 18051 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 18052 AHi = DAG.getBitcast(ExVT, AHi); 18053 BHi = DAG.getBitcast(ExVT, BHi); 18054 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); 18055 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); 18056 } 18057 18058 // Multiply, mask the lower 8bits of the lo/hi results and pack 18059 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); 18060 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); 18061 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); 18062 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); 18063 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); 18064 } 18065 18066 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 18067 if (VT == MVT::v4i32) { 18068 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 18069 "Should not custom lower when pmuldq is available!"); 18070 18071 // Extract the odd parts. 
18072 static const int UnpackMask[] = { 1, -1, 3, -1 }; 18073 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 18074 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 18075 18076 // Multiply the even parts. 18077 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 18078 // Now multiply odd parts. 18079 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 18080 18081 Evens = DAG.getBitcast(VT, Evens); 18082 Odds = DAG.getBitcast(VT, Odds); 18083 18084 // Merge the two vectors back together with a shuffle. This expands into 2 18085 // shuffles. 18086 static const int ShufMask[] = { 0, 4, 2, 6 }; 18087 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 18088 } 18089 18090 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 18091 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 18092 18093 // Ahi = psrlqi(a, 32); 18094 // Bhi = psrlqi(b, 32); 18095 // 18096 // AloBlo = pmuludq(a, b); 18097 // AloBhi = pmuludq(a, Bhi); 18098 // AhiBlo = pmuludq(Ahi, b); 18099 18100 // AloBhi = psllqi(AloBhi, 32); 18101 // AhiBlo = psllqi(AhiBlo, 32); 18102 // return AloBlo + AloBhi + AhiBlo; 18103 18104 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 18105 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 18106 18107 SDValue AhiBlo = Ahi; 18108 SDValue AloBhi = Bhi; 18109 // Bit cast to 32-bit vectors for MULUDQ 18110 MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 18111 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; 18112 A = DAG.getBitcast(MulVT, A); 18113 B = DAG.getBitcast(MulVT, B); 18114 Ahi = DAG.getBitcast(MulVT, Ahi); 18115 Bhi = DAG.getBitcast(MulVT, Bhi); 18116 18117 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 18118 // After shifting right const values the result may be all-zero. 
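  // This computes a*b mod 2^64 as
  //   lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32),
  // with each partial product formed by a 32x32->64 PMULUDQ; the checks below
  // skip a cross term whose 32-bit half is known to be zero.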
  if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
    AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
  }
  if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
    AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
  }

  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}

SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
                                             SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), false, false, 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
    Entry.isSExt = false;
    Entry.isZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC),
               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
               Callee, std::move(Args), 0)
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}

static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);

  assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget->hasInt256()));

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at even positions.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);

  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

  // Shuffle it back into the right order.
  SDValue Highs, Lows;
  if (VT == MVT::v8i32) {
    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  } else {
    const int HighMask[] = {1, 5, 3, 7};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 4, 2, 6};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  }

  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
  if (IsSigned && !Subtarget->hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;

  if (VT.is512BitVector() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
    return true;

  bool LShift = VT.is128BitVector() ||
    (VT.is256BitVector() && Subtarget->hasInt256());

  bool AShift = LShift && (Subtarget->hasVLX() ||
    (VT != MVT::v2i64 && VT != MVT::v4i64));
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with the shift-immediate forms.
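// (For example, PSLLW takes the count for all lanes from the low 64 bits of
// an XMM register, so a uniform variable amount can reuse these forms.)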
18280 static 18281 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, 18282 unsigned Opcode) { 18283 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); 18284 } 18285 18286 // Return true if the required (according to Opcode) variable-shift form is 18287 // natively supported by the Subtarget 18288 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, 18289 unsigned Opcode) { 18290 18291 if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) 18292 return false; 18293 18294 // vXi16 supported only on AVX-512, BWI 18295 if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI()) 18296 return false; 18297 18298 if (VT.is512BitVector() || Subtarget->hasVLX()) 18299 return true; 18300 18301 bool LShift = VT.is128BitVector() || VT.is256BitVector(); 18302 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; 18303 return (Opcode == ISD::SRA) ? AShift : LShift; 18304 } 18305 18306 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 18307 const X86Subtarget *Subtarget) { 18308 MVT VT = Op.getSimpleValueType(); 18309 SDLoc dl(Op); 18310 SDValue R = Op.getOperand(0); 18311 SDValue Amt = Op.getOperand(1); 18312 18313 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : 18314 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; 18315 18316 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { 18317 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); 18318 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); 18319 SDValue Ex = DAG.getBitcast(ExVT, R); 18320 18321 if (ShiftAmt >= 32) { 18322 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. 18323 SDValue Upper = 18324 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); 18325 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 18326 ShiftAmt - 32, DAG); 18327 if (VT == MVT::v2i64) 18328 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); 18329 if (VT == MVT::v4i64) 18330 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, 18331 {9, 1, 11, 3, 13, 5, 15, 7}); 18332 } else { 18333 // SRA upper i32, SHL whole i64 and select lower i32. 18334 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 18335 ShiftAmt, DAG); 18336 SDValue Lower = 18337 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); 18338 Lower = DAG.getBitcast(ExVT, Lower); 18339 if (VT == MVT::v2i64) 18340 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); 18341 if (VT == MVT::v4i64) 18342 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, 18343 {8, 1, 10, 3, 12, 5, 14, 7}); 18344 } 18345 return DAG.getBitcast(VT, Ex); 18346 }; 18347 18348 // Optimize shl/srl/sra with constant shift amount. 18349 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { 18350 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { 18351 uint64_t ShiftAmt = ShiftConst->getZExtValue(); 18352 18353 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) 18354 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); 18355 18356 // i64 SRA needs to be performed as partial shifts. 
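      // (Before AVX-512 there is no vector i64 arithmetic right shift; XOP
      //  provides one, hence the hasXOP check below, and otherwise
      //  ArithmeticShiftRight64 emulates it with i32 shifts and shuffles.)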
18357 if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && 18358 Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) 18359 return ArithmeticShiftRight64(ShiftAmt); 18360 18361 if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { 18362 unsigned NumElts = VT.getVectorNumElements(); 18363 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); 18364 18365 // Simple i8 add case 18366 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) 18367 return DAG.getNode(ISD::ADD, dl, VT, R, R); 18368 18369 // ashr(R, 7) === cmp_slt(R, 0) 18370 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { 18371 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 18372 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 18373 } 18374 18375 // XOP can shift v16i8 directly instead of as shift v8i16 + mask. 18376 if (VT == MVT::v16i8 && Subtarget->hasXOP()) 18377 return SDValue(); 18378 18379 if (Op.getOpcode() == ISD::SHL) { 18380 // Make a large shift. 18381 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, 18382 R, ShiftAmt, DAG); 18383 SHL = DAG.getBitcast(VT, SHL); 18384 // Zero out the rightmost bits. 18385 SmallVector<SDValue, 32> V( 18386 NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); 18387 return DAG.getNode(ISD::AND, dl, VT, SHL, 18388 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 18389 } 18390 if (Op.getOpcode() == ISD::SRL) { 18391 // Make a large shift. 18392 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, 18393 R, ShiftAmt, DAG); 18394 SRL = DAG.getBitcast(VT, SRL); 18395 // Zero out the leftmost bits. 18396 SmallVector<SDValue, 32> V( 18397 NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); 18398 return DAG.getNode(ISD::AND, dl, VT, SRL, 18399 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 18400 } 18401 if (Op.getOpcode() == ISD::SRA) { 18402 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) 18403 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 18404 SmallVector<SDValue, 32> V(NumElts, 18405 DAG.getConstant(128 >> ShiftAmt, dl, 18406 MVT::i8)); 18407 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); 18408 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 18409 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 18410 return Res; 18411 } 18412 llvm_unreachable("Unknown shift opcode."); 18413 } 18414 } 18415 } 18416 18417 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 18418 if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && 18419 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { 18420 18421 // Peek through any splat that was introduced for i64 shift vectorization. 18422 int SplatIndex = -1; 18423 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) 18424 if (SVN->isSplat()) { 18425 SplatIndex = SVN->getSplatIndex(); 18426 Amt = Amt.getOperand(0); 18427 assert(SplatIndex < (int)VT.getVectorNumElements() && 18428 "Splat shuffle referencing second operand"); 18429 } 18430 18431 if (Amt.getOpcode() != ISD::BITCAST || 18432 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) 18433 return SDValue(); 18434 18435 Amt = Amt.getOperand(0); 18436 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 18437 VT.getVectorNumElements(); 18438 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 18439 uint64_t ShiftAmt = 0; 18440 unsigned BaseOp = (SplatIndex < 0 ? 
0 : SplatIndex * Ratio); 18441 for (unsigned i = 0; i != Ratio; ++i) { 18442 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); 18443 if (!C) 18444 return SDValue(); 18445 // 6 == Log2(64) 18446 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 18447 } 18448 18449 // Check remaining shift amounts (if not a splat). 18450 if (SplatIndex < 0) { 18451 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 18452 uint64_t ShAmt = 0; 18453 for (unsigned j = 0; j != Ratio; ++j) { 18454 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); 18455 if (!C) 18456 return SDValue(); 18457 // 6 == Log2(64) 18458 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); 18459 } 18460 if (ShAmt != ShiftAmt) 18461 return SDValue(); 18462 } 18463 } 18464 18465 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) 18466 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); 18467 18468 if (Op.getOpcode() == ISD::SRA) 18469 return ArithmeticShiftRight64(ShiftAmt); 18470 } 18471 18472 return SDValue(); 18473 } 18474 18475 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, 18476 const X86Subtarget* Subtarget) { 18477 MVT VT = Op.getSimpleValueType(); 18478 SDLoc dl(Op); 18479 SDValue R = Op.getOperand(0); 18480 SDValue Amt = Op.getOperand(1); 18481 18482 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : 18483 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; 18484 18485 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : 18486 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; 18487 18488 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { 18489 SDValue BaseShAmt; 18490 MVT EltVT = VT.getVectorElementType(); 18491 18492 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { 18493 // Check if this build_vector node is doing a splat. 18494 // If so, then set BaseShAmt equal to the splat value. 18495 BaseShAmt = BV->getSplatValue(); 18496 if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) 18497 BaseShAmt = SDValue(); 18498 } else { 18499 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) 18500 Amt = Amt.getOperand(0); 18501 18502 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); 18503 if (SVN && SVN->isSplat()) { 18504 unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); 18505 SDValue InVec = Amt.getOperand(0); 18506 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 18507 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && 18508 "Unexpected shuffle index found!"); 18509 BaseShAmt = InVec.getOperand(SplatIdx); 18510 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 18511 if (ConstantSDNode *C = 18512 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 18513 if (C->getZExtValue() == SplatIdx) 18514 BaseShAmt = InVec.getOperand(1); 18515 } 18516 } 18517 18518 if (!BaseShAmt) 18519 // Avoid introducing an extract element from a shuffle. 
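// (The extract below reads from the shuffle's input operand InVec rather
// than from the shuffle itself, so no shuffle + extract pair is created.)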
18520 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, 18521 DAG.getIntPtrConstant(SplatIdx, dl)); 18522 } 18523 } 18524 18525 if (BaseShAmt.getNode()) { 18526 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); 18527 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) 18528 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); 18529 else if (EltVT.bitsLT(MVT::i32)) 18530 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); 18531 18532 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); 18533 } 18534 } 18535 18536 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 18537 if (!Subtarget->is64Bit() && VT == MVT::v2i64 && 18538 Amt.getOpcode() == ISD::BITCAST && 18539 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 18540 Amt = Amt.getOperand(0); 18541 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 18542 VT.getVectorNumElements(); 18543 std::vector<SDValue> Vals(Ratio); 18544 for (unsigned i = 0; i != Ratio; ++i) 18545 Vals[i] = Amt.getOperand(i); 18546 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 18547 for (unsigned j = 0; j != Ratio; ++j) 18548 if (Vals[j] != Amt.getOperand(i + j)) 18549 return SDValue(); 18550 } 18551 18552 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) 18553 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); 18554 } 18555 return SDValue(); 18556 } 18557 18558 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, 18559 SelectionDAG &DAG) { 18560 MVT VT = Op.getSimpleValueType(); 18561 SDLoc dl(Op); 18562 SDValue R = Op.getOperand(0); 18563 SDValue Amt = Op.getOperand(1); 18564 18565 assert(VT.isVector() && "Custom lowering only for vector shifts!"); 18566 assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); 18567 18568 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) 18569 return V; 18570 18571 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) 18572 return V; 18573 18574 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) 18575 return Op; 18576 18577 // XOP has 128-bit variable logical/arithmetic shifts. 18578 // +ve/-ve Amt = shift left/right. 18579 if (Subtarget->hasXOP() && 18580 (VT == MVT::v2i64 || VT == MVT::v4i32 || 18581 VT == MVT::v8i16 || VT == MVT::v16i8)) { 18582 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { 18583 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); 18584 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); 18585 } 18586 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) 18587 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); 18588 if (Op.getOpcode() == ISD::SRA) 18589 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); 18590 } 18591 18592 // 2i64 vector logical shifts can efficiently avoid scalarization - do the 18593 // shifts per-lane and then shuffle the partial results back together. 18594 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { 18595 // Splat the shift amounts so the scalar shifts above will catch it. 
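// For example, (shl v2i64 %R, <a, b>) becomes shl(R, <a, a>) and
// shl(R, <b, b>) - both splats that the scalar lowerings above can catch -
// and the final {0, 3} shuffle takes lane 0 from the first result and
// lane 1 from the second.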
18596 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); 18597 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); 18598 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); 18599 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); 18600 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); 18601 } 18602 18603 // i64 vector arithmetic shift can be emulated with the transform: 18604 // M = lshr(SIGN_BIT, Amt) 18605 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) 18606 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && 18607 Op.getOpcode() == ISD::SRA) { 18608 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); 18609 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); 18610 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 18611 R = DAG.getNode(ISD::XOR, dl, VT, R, M); 18612 R = DAG.getNode(ISD::SUB, dl, VT, R, M); 18613 return R; 18614 } 18615 18616 // If possible, lower this packed shift into a vector multiply instead of 18617 // expanding it into a sequence of scalar shifts. 18618 // Do this only if the vector shift count is a constant build_vector. 18619 if (Op.getOpcode() == ISD::SHL && 18620 (VT == MVT::v8i16 || VT == MVT::v4i32 || 18621 (Subtarget->hasInt256() && VT == MVT::v16i16)) && 18622 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 18623 SmallVector<SDValue, 8> Elts; 18624 MVT SVT = VT.getVectorElementType(); 18625 unsigned SVTBits = SVT.getSizeInBits(); 18626 APInt One(SVTBits, 1); 18627 unsigned NumElems = VT.getVectorNumElements(); 18628 18629 for (unsigned i=0; i !=NumElems; ++i) { 18630 SDValue Op = Amt->getOperand(i); 18631 if (Op->getOpcode() == ISD::UNDEF) { 18632 Elts.push_back(Op); 18633 continue; 18634 } 18635 18636 ConstantSDNode *ND = cast<ConstantSDNode>(Op); 18637 APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); 18638 uint64_t ShAmt = C.getZExtValue(); 18639 if (ShAmt >= SVTBits) { 18640 Elts.push_back(DAG.getUNDEF(SVT)); 18641 continue; 18642 } 18643 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); 18644 } 18645 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); 18646 return DAG.getNode(ISD::MUL, dl, VT, R, BV); 18647 } 18648 18649 // Lower SHL with variable shift amount. 18650 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 18651 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); 18652 18653 Op = DAG.getNode(ISD::ADD, dl, VT, Op, 18654 DAG.getConstant(0x3f800000U, dl, VT)); 18655 Op = DAG.getBitcast(MVT::v4f32, Op); 18656 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 18657 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 18658 } 18659 18660 // If possible, lower this shift as a sequence of two shifts by 18661 // constant plus a MOVSS/MOVSD instead of scalarizing it. 18662 // Example: 18663 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) 18664 // 18665 // Could be rewritten as: 18666 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) 18667 // 18668 // The advantage is that the two shifts from the example would be 18669 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing 18670 // the vector shift into four scalar shifts plus four pairs of vector 18671 // insert/extract. 18672 if ((VT == MVT::v8i16 || VT == MVT::v4i32) && 18673 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 18674 unsigned TargetOpcode = X86ISD::MOVSS; 18675 bool CanBeSimplified; 18676 // The splat value for the first packed shift (the 'X' from the example). 
18677 SDValue Amt1 = Amt->getOperand(0); 18678 // The splat value for the second packed shift (the 'Y' from the example). 18679 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : 18680 Amt->getOperand(2); 18681 18682 // See if it is possible to replace this node with a sequence of 18683 // two shifts followed by a MOVSS/MOVSD 18684 if (VT == MVT::v4i32) { 18685 // Check if it is legal to use a MOVSS. 18686 CanBeSimplified = Amt2 == Amt->getOperand(2) && 18687 Amt2 == Amt->getOperand(3); 18688 if (!CanBeSimplified) { 18689 // Otherwise, check if we can still simplify this node using a MOVSD. 18690 CanBeSimplified = Amt1 == Amt->getOperand(1) && 18691 Amt->getOperand(2) == Amt->getOperand(3); 18692 TargetOpcode = X86ISD::MOVSD; 18693 Amt2 = Amt->getOperand(2); 18694 } 18695 } else { 18696 // Do similar checks for the case where the machine value type 18697 // is MVT::v8i16. 18698 CanBeSimplified = Amt1 == Amt->getOperand(1); 18699 for (unsigned i=3; i != 8 && CanBeSimplified; ++i) 18700 CanBeSimplified = Amt2 == Amt->getOperand(i); 18701 18702 if (!CanBeSimplified) { 18703 TargetOpcode = X86ISD::MOVSD; 18704 CanBeSimplified = true; 18705 Amt2 = Amt->getOperand(4); 18706 for (unsigned i=0; i != 4 && CanBeSimplified; ++i) 18707 CanBeSimplified = Amt1 == Amt->getOperand(i); 18708 for (unsigned j=4; j != 8 && CanBeSimplified; ++j) 18709 CanBeSimplified = Amt2 == Amt->getOperand(j); 18710 } 18711 } 18712 18713 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && 18714 isa<ConstantSDNode>(Amt2)) { 18715 // Replace this node with two shifts followed by a MOVSS/MOVSD. 18716 MVT CastVT = MVT::v4i32; 18717 SDValue Splat1 = 18718 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); 18719 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); 18720 SDValue Splat2 = 18721 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); 18722 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); 18723 if (TargetOpcode == X86ISD::MOVSD) 18724 CastVT = MVT::v2i64; 18725 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); 18726 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); 18727 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, 18728 BitCast1, DAG); 18729 return DAG.getBitcast(VT, Result); 18730 } 18731 } 18732 18733 // v4i32 Non Uniform Shifts. 18734 // If the shift amount is constant we can shift each lane using the SSE2 18735 // immediate shifts, else we need to zero-extend each lane to the lower i64 18736 // and shift using the SSE2 variable shifts. 18737 // The separate results can then be blended together. 18738 if (VT == MVT::v4i32) { 18739 unsigned Opc = Op.getOpcode(); 18740 SDValue Amt0, Amt1, Amt2, Amt3; 18741 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 18742 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); 18743 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); 18744 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); 18745 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); 18746 } else { 18747 // ISD::SHL is handled above but we include it here for completeness. 
18748     switch (Opc) {
18749     default:
18750       llvm_unreachable("Unknown target vector shift node");
18751     case ISD::SHL:
18752       Opc = X86ISD::VSHL;
18753       break;
18754     case ISD::SRL:
18755       Opc = X86ISD::VSRL;
18756       break;
18757     case ISD::SRA:
18758       Opc = X86ISD::VSRA;
18759       break;
18760     }
18761     // The SSE2 shifts use the lower i64 as the same shift amount for
18762     // all lanes and the upper i64 is ignored. These shuffle masks
18763     // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
18764     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
18765     Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
18766     Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
18767     Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
18768     Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
18769   }
18770
18771   SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
18772   SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
18773   SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
18774   SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
18775   SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
18776   SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
18777   return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
18778 }
18779
18780   if (VT == MVT::v16i8 ||
18781       (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
18782     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
18783     unsigned ShiftOpcode = Op->getOpcode();
18784
18785     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
18786       // On SSE41 targets we make use of the fact that VSELECT lowers
18787       // to PBLENDVB which selects bytes based just on the sign bit.
18788       if (Subtarget->hasSSE41()) {
18789         V0 = DAG.getBitcast(VT, V0);
18790         V1 = DAG.getBitcast(VT, V1);
18791         Sel = DAG.getBitcast(VT, Sel);
18792         return DAG.getBitcast(SelVT,
18793                               DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
18794       }
18795       // On pre-SSE41 targets we test for the sign bit by comparing to
18796       // zero - a negative value will set all bits of the lanes to true
18797       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
18798       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
18799       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
18800       return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
18801     };
18802
18803     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
18804     // We can safely do this using i16 shifts as we're only interested in
18805     // the 3 lower bits of each byte.
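    // As an illustration: a byte amount of 6 (0b00000110) becomes 0b11000000
    // after the << 5. Its sign bit is set, so the first PBLENDVB round takes
    // the shift-by-4 result; after "a += a" the next bit reaches the sign
    // position and the shift-by-2 round fires too, a total shift of 4 + 2 = 6.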
18806     Amt = DAG.getBitcast(ExtVT, Amt);
18807     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
18808     Amt = DAG.getBitcast(VT, Amt);
18809
18810     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
18811       // r = VSELECT(r, shift(r, 4), a);
18812       SDValue M =
18813           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
18814       R = SignBitSelect(VT, Amt, M, R);
18815
18816       // a += a
18817       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
18818
18819       // r = VSELECT(r, shift(r, 2), a);
18820       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
18821       R = SignBitSelect(VT, Amt, M, R);
18822
18823       // a += a
18824       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
18825
18826       // return VSELECT(r, shift(r, 1), a);
18827       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
18828       R = SignBitSelect(VT, Amt, M, R);
18829       return R;
18830     }
18831
18832     if (Op->getOpcode() == ISD::SRA) {
18833       // For SRA we need to unpack each byte to the higher byte of an i16
18834       // vector so we can correctly sign-extend. We don't care what happens
18835       // to the lower byte.
18836       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
18837       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
18838       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
18839       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
18840       ALo = DAG.getBitcast(ExtVT, ALo);
18841       AHi = DAG.getBitcast(ExtVT, AHi);
18842       RLo = DAG.getBitcast(ExtVT, RLo);
18843       RHi = DAG.getBitcast(ExtVT, RHi);
18844
18845       // r = VSELECT(r, shift(r, 4), a);
18846       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
18847                                 DAG.getConstant(4, dl, ExtVT));
18848       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
18849                                 DAG.getConstant(4, dl, ExtVT));
18850       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
18851       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
18852
18853       // a += a
18854       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
18855       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
18856
18857       // r = VSELECT(r, shift(r, 2), a);
18858       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
18859                         DAG.getConstant(2, dl, ExtVT));
18860       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
18861                         DAG.getConstant(2, dl, ExtVT));
18862       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
18863       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
18864
18865       // a += a
18866       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
18867       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
18868
18869       // r = VSELECT(r, shift(r, 1), a);
18870       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
18871                         DAG.getConstant(1, dl, ExtVT));
18872       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
18873                         DAG.getConstant(1, dl, ExtVT));
18874       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
18875       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
18876
18877       // Logical shift the result back to the lower byte, leaving a zero
18878       // upper byte,
18879       // meaning that we can safely pack with PACKUSWB.
18880       RLo =
18881           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
18882       RHi =
18883           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
18884       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
18885     }
18886   }
18887
18888   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18889   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18890   // solution better.
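  // e.g. on AVX2 a v8i16 shift below becomes: sign- or zero-extend to v8i32,
  // one variable shift (VPSLLVD/VPSRLVD/VPSRAVD), then truncate back to v8i16.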
18891 if (Subtarget->hasInt256() && VT == MVT::v8i16) { 18892 MVT ExtVT = MVT::v8i32; 18893 unsigned ExtOpc = 18894 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 18895 R = DAG.getNode(ExtOpc, dl, ExtVT, R); 18896 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); 18897 return DAG.getNode(ISD::TRUNCATE, dl, VT, 18898 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); 18899 } 18900 18901 if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { 18902 MVT ExtVT = MVT::v8i32; 18903 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); 18904 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); 18905 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); 18906 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); 18907 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); 18908 ALo = DAG.getBitcast(ExtVT, ALo); 18909 AHi = DAG.getBitcast(ExtVT, AHi); 18910 RLo = DAG.getBitcast(ExtVT, RLo); 18911 RHi = DAG.getBitcast(ExtVT, RHi); 18912 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); 18913 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); 18914 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); 18915 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); 18916 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); 18917 } 18918 18919 if (VT == MVT::v8i16) { 18920 unsigned ShiftOpcode = Op->getOpcode(); 18921 18922 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { 18923 // On SSE41 targets we make use of the fact that VSELECT lowers 18924 // to PBLENDVB which selects bytes based just on the sign bit. 18925 if (Subtarget->hasSSE41()) { 18926 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); 18927 V0 = DAG.getBitcast(ExtVT, V0); 18928 V1 = DAG.getBitcast(ExtVT, V1); 18929 Sel = DAG.getBitcast(ExtVT, Sel); 18930 return DAG.getBitcast( 18931 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); 18932 } 18933 // On pre-SSE41 targets we splat the sign bit - a negative value will 18934 // set all bits of the lanes to true and VSELECT uses that in 18935 // its OR(AND(V0,C),AND(V1,~C)) lowering. 18936 SDValue C = 18937 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); 18938 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); 18939 }; 18940 18941 // Turn 'a' into a mask suitable for VSELECT: a = a << 12; 18942 if (Subtarget->hasSSE41()) { 18943 // On SSE41 targets we need to replicate the shift mask in both 18944 // bytes for PBLENDVB. 
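      // e.g. with a 4-bit amount 0bXYZW in an i16 lane, (Amt << 4) | (Amt << 12)
      // places bit X in the sign bit of both the low and the high byte of the
      // lane, which is what the byte-wise PBLENDVB rounds below test.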
18945 Amt = DAG.getNode( 18946 ISD::OR, dl, VT, 18947 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), 18948 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); 18949 } else { 18950 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); 18951 } 18952 18953 // r = VSELECT(r, shift(r, 8), a); 18954 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); 18955 R = SignBitSelect(Amt, M, R); 18956 18957 // a += a 18958 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 18959 18960 // r = VSELECT(r, shift(r, 4), a); 18961 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); 18962 R = SignBitSelect(Amt, M, R); 18963 18964 // a += a 18965 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 18966 18967 // r = VSELECT(r, shift(r, 2), a); 18968 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); 18969 R = SignBitSelect(Amt, M, R); 18970 18971 // a += a 18972 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 18973 18974 // return VSELECT(r, shift(r, 1), a); 18975 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); 18976 R = SignBitSelect(Amt, M, R); 18977 return R; 18978 } 18979 18980 // Decompose 256-bit shifts into smaller 128-bit shifts. 18981 if (VT.is256BitVector()) { 18982 unsigned NumElems = VT.getVectorNumElements(); 18983 MVT EltVT = VT.getVectorElementType(); 18984 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 18985 18986 // Extract the two vectors 18987 SDValue V1 = Extract128BitVector(R, 0, DAG, dl); 18988 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); 18989 18990 // Recreate the shift amount vectors 18991 SDValue Amt1, Amt2; 18992 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 18993 // Constant shift amount 18994 SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); 18995 ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); 18996 ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); 18997 18998 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); 18999 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); 19000 } else { 19001 // Variable shift amount 19002 Amt1 = Extract128BitVector(Amt, 0, DAG, dl); 19003 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); 19004 } 19005 19006 // Issue new vector shifts for the smaller types 19007 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 19008 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 19009 19010 // Concatenate the result back 19011 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 19012 } 19013 19014 return SDValue(); 19015 } 19016 19017 static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, 19018 SelectionDAG &DAG) { 19019 MVT VT = Op.getSimpleValueType(); 19020 SDLoc DL(Op); 19021 SDValue R = Op.getOperand(0); 19022 SDValue Amt = Op.getOperand(1); 19023 19024 assert(VT.isVector() && "Custom lowering only for vector rotates!"); 19025 assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); 19026 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); 19027 19028 // XOP has 128-bit vector variable + immediate rotates. 19029 // +ve/-ve Amt = rotate left/right. 19030 19031 // Split 256-bit integers. 19032 if (VT.is256BitVector()) 19033 return Lower256IntArith(Op, DAG); 19034 19035 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); 19036 19037 // Attempt to rotate by immediate. 
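  // e.g. (rotl v4i32 %R, <8, 8, 8, 8>) takes the splat-constant path below and
  // becomes a single XOP VPROTD-by-immediate; non-splat or variable amounts
  // fall through to the per-element VPROT form.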
19038   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
19039     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
19040       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
19041       assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
19042       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
19043                          DAG.getConstant(RotateAmt, DL, MVT::i8));
19044     }
19045   }
19046
19047   // Use general rotate by variable (per-element).
19048   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
19049 }
19050
19051 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19052   // Lower the "add/sub/mul with overflow" instruction into a regular
19053   // instruction plus a "setcc" instruction that checks the overflow flag.
19054   // The "brcond" lowering looks for this combo and may remove the "setcc"
19055   // instruction if the "setcc" has only one use.
19056   SDNode *N = Op.getNode();
19057   SDValue LHS = N->getOperand(0);
19058   SDValue RHS = N->getOperand(1);
19059   unsigned BaseOp = 0;
19060   unsigned Cond = 0;
19061   SDLoc DL(Op);
19062   switch (Op.getOpcode()) {
19063   default: llvm_unreachable("Unknown ovf instruction!");
19064   case ISD::SADDO:
19065     // An add of one will be selected as an INC. Note that INC doesn't
19066     // set CF, so we can't do this for UADDO.
19067     if (isOneConstant(RHS)) {
19068       BaseOp = X86ISD::INC;
19069       Cond = X86::COND_O;
19070       break;
19071     }
19072     BaseOp = X86ISD::ADD;
19073     Cond = X86::COND_O;
19074     break;
19075   case ISD::UADDO:
19076     BaseOp = X86ISD::ADD;
19077     Cond = X86::COND_B;
19078     break;
19079   case ISD::SSUBO:
19080     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19081     // set CF, so we can't do this for USUBO.
19082     if (isOneConstant(RHS)) {
19083       BaseOp = X86ISD::DEC;
19084       Cond = X86::COND_O;
19085       break;
19086     }
19087     BaseOp = X86ISD::SUB;
19088     Cond = X86::COND_O;
19089     break;
19090   case ISD::USUBO:
19091     BaseOp = X86ISD::SUB;
19092     Cond = X86::COND_B;
19093     break;
19094   case ISD::SMULO:
19095     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19096     Cond = X86::COND_O;
19097     break;
19098   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19099     if (N->getValueType(0) == MVT::i8) {
19100       BaseOp = X86ISD::UMUL8;
19101       Cond = X86::COND_O;
19102       break;
19103     }
19104     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19105                                  MVT::i32);
19106     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19107
19108     SDValue SetCC =
19109         DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19110                     DAG.getConstant(X86::COND_O, DL, MVT::i32),
19111                     SDValue(Sum.getNode(), 2));
19112
19113     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19114   }
19115   }
19116
19117   // Also sets EFLAGS.
19118   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19119   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19120
19121   SDValue SetCC =
19122       DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19123                   DAG.getConstant(Cond, DL, MVT::i32),
19124                   SDValue(Sum.getNode(), 1));
19125
19126   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19127 }
19128
19129 /// Returns true if the operand type is exactly twice the native width, and
19130 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19131 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19132 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19133 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { 19134 unsigned OpWidth = MemType->getPrimitiveSizeInBits(); 19135 19136 if (OpWidth == 64) 19137 return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b 19138 else if (OpWidth == 128) 19139 return Subtarget->hasCmpxchg16b(); 19140 else 19141 return false; 19142 } 19143 19144 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 19145 return needsCmpXchgNb(SI->getValueOperand()->getType()); 19146 } 19147 19148 // Note: this turns large loads into lock cmpxchg8b/16b. 19149 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. 19150 TargetLowering::AtomicExpansionKind 19151 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 19152 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); 19153 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg 19154 : AtomicExpansionKind::None; 19155 } 19156 19157 TargetLowering::AtomicExpansionKind 19158 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 19159 unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; 19160 Type *MemType = AI->getType(); 19161 19162 // If the operand is too big, we must see if cmpxchg8/16b is available 19163 // and default to library calls otherwise. 19164 if (MemType->getPrimitiveSizeInBits() > NativeWidth) { 19165 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg 19166 : AtomicExpansionKind::None; 19167 } 19168 19169 AtomicRMWInst::BinOp Op = AI->getOperation(); 19170 switch (Op) { 19171 default: 19172 llvm_unreachable("Unknown atomic operation"); 19173 case AtomicRMWInst::Xchg: 19174 case AtomicRMWInst::Add: 19175 case AtomicRMWInst::Sub: 19176 // It's better to use xadd, xsub or xchg for these in all cases. 19177 return AtomicExpansionKind::None; 19178 case AtomicRMWInst::Or: 19179 case AtomicRMWInst::And: 19180 case AtomicRMWInst::Xor: 19181 // If the atomicrmw's result isn't actually used, we can just add a "lock" 19182 // prefix to a normal instruction for these operations. 19183 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg 19184 : AtomicExpansionKind::None; 19185 case AtomicRMWInst::Nand: 19186 case AtomicRMWInst::Max: 19187 case AtomicRMWInst::Min: 19188 case AtomicRMWInst::UMax: 19189 case AtomicRMWInst::UMin: 19190 // These always require a non-trivial set of data operations on x86. We must 19191 // use a cmpxchg loop. 19192 return AtomicExpansionKind::CmpXChg; 19193 } 19194 } 19195 19196 static bool hasMFENCE(const X86Subtarget& Subtarget) { 19197 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 19198 // no-sse2). There isn't any reason to disable it if the target processor 19199 // supports it. 19200 return Subtarget.hasSSE2() || Subtarget.is64Bit(); 19201 } 19202 19203 LoadInst * 19204 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { 19205 unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; 19206 Type *MemType = AI->getType(); 19207 // Accesses larger than the native width are turned into cmpxchg/libcalls, so 19208 // there is no benefit in turning such RMWs into loads, and it is actually 19209 // harmful as it introduces a mfence. 
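  // e.g. on i386 an idempotent 64-bit atomicrmw is already expanded via a
  // cmpxchg8b loop, so rewriting it into a fenced load here would only add an
  // mfence on top of that; bail out instead.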
19210 if (MemType->getPrimitiveSizeInBits() > NativeWidth) 19211 return nullptr; 19212 19213 auto Builder = IRBuilder<>(AI); 19214 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 19215 auto SynchScope = AI->getSynchScope(); 19216 // We must restrict the ordering to avoid generating loads with Release or 19217 // ReleaseAcquire orderings. 19218 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); 19219 auto Ptr = AI->getPointerOperand(); 19220 19221 // Before the load we need a fence. Here is an example lifted from 19222 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence 19223 // is required: 19224 // Thread 0: 19225 // x.store(1, relaxed); 19226 // r1 = y.fetch_add(0, release); 19227 // Thread 1: 19228 // y.fetch_add(42, acquire); 19229 // r2 = x.load(relaxed); 19230 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is 19231 // lowered to just a load without a fence. A mfence flushes the store buffer, 19232 // making the optimization clearly correct. 19233 // FIXME: it is required if isAtLeastRelease(Order) but it is not clear 19234 // otherwise, we might be able to be more aggressive on relaxed idempotent 19235 // rmw. In practice, they do not look useful, so we don't try to be 19236 // especially clever. 19237 if (SynchScope == SingleThread) 19238 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at 19239 // the IR level, so we must wrap it in an intrinsic. 19240 return nullptr; 19241 19242 if (!hasMFENCE(*Subtarget)) 19243 // FIXME: it might make sense to use a locked operation here but on a 19244 // different cache-line to prevent cache-line bouncing. In practice it 19245 // is probably a small win, and x86 processors without mfence are rare 19246 // enough that we do not bother. 19247 return nullptr; 19248 19249 Function *MFence = 19250 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); 19251 Builder.CreateCall(MFence, {}); 19252 19253 // Finally we can emit the atomic load. 19254 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, 19255 AI->getType()->getPrimitiveSizeInBits()); 19256 Loaded->setAtomic(Order, SynchScope); 19257 AI->replaceAllUsesWith(Loaded); 19258 AI->eraseFromParent(); 19259 return Loaded; 19260 } 19261 19262 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 19263 SelectionDAG &DAG) { 19264 SDLoc dl(Op); 19265 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 19266 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 19267 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 19268 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 19269 19270 // The only fence that needs an instruction is a sequentially-consistent 19271 // cross-thread fence. 19272 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 19273 if (hasMFENCE(*Subtarget)) 19274 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 19275 19276 SDValue Chain = Op.getOperand(0); 19277 SDValue Zero = DAG.getConstant(0, dl, MVT::i32); 19278 SDValue Ops[] = { 19279 DAG.getRegister(X86::ESP, MVT::i32), // Base 19280 DAG.getTargetConstant(1, dl, MVT::i8), // Scale 19281 DAG.getRegister(0, MVT::i32), // Index 19282 DAG.getTargetConstant(0, dl, MVT::i32), // Disp 19283 DAG.getRegister(0, MVT::i32), // Segment. 
19284       Zero,
19285       Chain
19286     };
19287     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19288     return SDValue(Res, 0);
19289   }
19290
19291   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19292   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19293 }
19294
19295 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19296                              SelectionDAG &DAG) {
19297   MVT T = Op.getSimpleValueType();
19298   SDLoc DL(Op);
19299   unsigned Reg = 0;
19300   unsigned size = 0;
19301   switch (T.SimpleTy) {
19302   default: llvm_unreachable("Invalid value type!");
19303   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19304   case MVT::i16: Reg = X86::AX;  size = 2; break;
19305   case MVT::i32: Reg = X86::EAX; size = 4; break;
19306   case MVT::i64:
19307     assert(Subtarget->is64Bit() && "Node not type legal!");
19308     Reg = X86::RAX; size = 8;
19309     break;
19310   }
19311   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19312                                   Op.getOperand(2), SDValue());
19313   SDValue Ops[] = { cpIn.getValue(0),
19314                     Op.getOperand(1),
19315                     Op.getOperand(3),
19316                     DAG.getTargetConstant(size, DL, MVT::i8),
19317                     cpIn.getValue(1) };
19318   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19319   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19320   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19321                                            Ops, T, MMO);
19322
19323   SDValue cpOut =
19324       DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19325   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19326                                       MVT::i32, cpOut.getValue(2));
19327   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19328                                 DAG.getConstant(X86::COND_E, DL, MVT::i8),
19329                                 EFLAGS);
19330
19331   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19332   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19333   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19334   return SDValue();
19335 }
19336
19337 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19338                             SelectionDAG &DAG) {
19339   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19340   MVT DstVT = Op.getSimpleValueType();
19341
19342   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19343     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19344     if (DstVT != MVT::f64)
19345       // This conversion needs to be expanded.
19346       return SDValue();
19347
19348     SDValue InVec = Op->getOperand(0);
19349     SDLoc dl(Op);
19350     unsigned NumElts = SrcVT.getVectorNumElements();
19351     MVT SVT = SrcVT.getVectorElementType();
19352
19353     // Widen the input vector in the case of MVT::v2i32.
19354     // Example: from MVT::v2i32 to MVT::v4i32.
19355     SmallVector<SDValue, 16> Elts;
19356     for (unsigned i = 0, e = NumElts; i != e; ++i)
19357       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19358                                  DAG.getIntPtrConstant(i, dl)));
19359
19360     // Explicitly mark the extra elements as Undef.
19361     Elts.append(NumElts, DAG.getUNDEF(SVT));
19362
19363     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19364     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19365     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
19366     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19367                        DAG.getIntPtrConstant(0, dl));
19368   }
19369
19370   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19371          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19372   assert((DstVT == MVT::i64 ||
19373           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19374          "Unexpected custom BITCAST");
19375   // i64 <=> MMX conversions are Legal.
19376   if (SrcVT==MVT::i64 && DstVT.isVector())
19377     return Op;
19378   if (DstVT==MVT::i64 && SrcVT.isVector())
19379     return Op;
19380   // MMX <=> MMX conversions are Legal.
19381   if (SrcVT.isVector() && DstVT.isVector())
19382     return Op;
19383   // All other conversions need to be expanded.
19384   return SDValue();
19385 }
19386
19387 /// Compute the horizontal sum of bytes in V for the elements of VT.
19388 ///
19389 /// Requires V to be a byte vector and VT to be an integer vector type with
19390 /// wider elements than V's type. The width of the elements of VT determines
19391 /// how many bytes of V are summed horizontally to produce each element of the
19392 /// result.
19393 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
19394                                       const X86Subtarget *Subtarget,
19395                                       SelectionDAG &DAG) {
19396   SDLoc DL(V);
19397   MVT ByteVecVT = V.getSimpleValueType();
19398   MVT EltVT = VT.getVectorElementType();
19399   int NumElts = VT.getVectorNumElements();
19400   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
19401          "Expected value to have byte element type.");
19402   assert(EltVT != MVT::i8 &&
19403          "Horizontal byte sum only makes sense for wider elements!");
19404   unsigned VecSize = VT.getSizeInBits();
19405   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
19406
19407   // The PSADBW instruction horizontally adds all bytes and leaves the result
19408   // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
19409   if (EltVT == MVT::i64) {
19410     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
19411     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
19412     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
19413     return DAG.getBitcast(VT, V);
19414   }
19415
19416   if (EltVT == MVT::i32) {
19417     // We unpack the low half and high half into i32s interleaved with zeros so
19418     // that we can use PSADBW to horizontally sum them. The most useful part of
19419     // this is that it lines up the results of two PSADBW instructions to be
19420     // two v2i64 vectors which, concatenated, are the 4 population counts. We
19421     // can then use PACKUSWB to shrink and concatenate them into a v4i32 again.
19422     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
19423     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
19424     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
19425
19426     // Do the horizontal sums into two v2i64s.
19427     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
19428     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
19429     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
19430                       DAG.getBitcast(ByteVecVT, Low), Zeros);
19431     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
19432                        DAG.getBitcast(ByteVecVT, High), Zeros);
19433
19434     // Merge them together.
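    // Note: after the interleave with zeros, each 8-byte PSADBW group holds
    // the four bytes of a single i32 element plus four zeros, so each i64 sum
    // is at most 4 * 8 = 32 for pop counts and the unsigned saturation in the
    // PACKUS below never fires.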
19435     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
19436     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
19437                     DAG.getBitcast(ShortVecVT, Low),
19438                     DAG.getBitcast(ShortVecVT, High));
19439
19440     return DAG.getBitcast(VT, V);
19441   }
19442
19443   // The only element type left is i16.
19444   assert(EltVT == MVT::i16 && "Unknown how to handle type");
19445
19446   // To obtain the pop count for each i16 element starting from the pop count
19447   // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
19448   // i16s right by 8. It is important to shift as i16s as i8 vector shift isn't
19449   // directly supported.
19450   SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
19451   SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
19452   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
19453   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
19454                   DAG.getBitcast(ByteVecVT, V));
19455   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
19456 }
19457
19458 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
19459                                         const X86Subtarget *Subtarget,
19460                                         SelectionDAG &DAG) {
19461   MVT VT = Op.getSimpleValueType();
19462   MVT EltVT = VT.getVectorElementType();
19463   unsigned VecSize = VT.getSizeInBits();
19464
19465   // Implement a lookup table in register by using an algorithm based on:
19466   // http://wm.ite.pl/articles/sse-popcount.html
19467   //
19468   // The general idea is that each nibble of every byte in the input vector is
19469   // an index into an in-register, pre-computed pop count table. We then split
19470   // up the input vector into two new ones: (1) a vector with only the
19471   // shifted-right higher nibbles for each byte and (2) a vector with the lower
19472   // nibbles (and masked out higher ones) for each byte. PSHUFB is used
19473   // separately with both to index the in-register table. Next, both are added
19474   // and the result is an i8 vector where each element contains the pop count
19475   // for its input byte.
19476   //
19477   // To obtain the pop count for elements != i8, we follow up with the same
19478   // approach and use additional tricks as described below.
19479   //
19480   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
19481                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
19482                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
19483                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
19484
19485   int NumByteElts = VecSize / 8;
19486   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
19487   SDValue In = DAG.getBitcast(ByteVecVT, Op);
19488   SmallVector<SDValue, 16> LUTVec;
19489   for (int i = 0; i < NumByteElts; ++i)
19490     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19491   SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
19492   SmallVector<SDValue, 16> Mask0F(NumByteElts,
19493                                   DAG.getConstant(0x0F, DL, MVT::i8));
19494   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
19495
19496   // High nibbles
19497   SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
19498   SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
19499   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
19500
19501   // Low nibbles
19502   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
19503
19504   // The input vector is used as the shuffle mask that indexes elements into
19505   // the LUT. After counting low and high nibbles, add the vector to obtain
19506   // the final pop count per i8 element.
19506   SDValue HighPopCnt =
19507       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
19508   SDValue LowPopCnt =
19509       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
19510   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
19511
19512   if (EltVT == MVT::i8)
19513     return PopCnt;
19514
19515   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
19516 }
19517
19518 static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
19519                                        const X86Subtarget *Subtarget,
19520                                        SelectionDAG &DAG) {
19521   MVT VT = Op.getSimpleValueType();
19522   assert(VT.is128BitVector() &&
19523          "Only 128-bit vector bitmath lowering supported.");
19524
19525   int VecSize = VT.getSizeInBits();
19526   MVT EltVT = VT.getVectorElementType();
19527   int Len = EltVT.getSizeInBits();
19528
19529   // This is the vectorized version of the "best" algorithm from
19530   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19531   // with a minor tweak to use a series of adds + shifts instead of vector
19532   // multiplications. Implemented for all integer vector types. We only use
19533   // this when we don't have SSSE3, which allows a LUT-based lowering that is
19534   // much faster, even faster than using native popcnt instructions.
19535
19536   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
19537     MVT VT = V.getSimpleValueType();
19538     SmallVector<SDValue, 32> Shifters(
19539         VT.getVectorNumElements(),
19540         DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
19541     return DAG.getNode(OpCode, DL, VT, V,
19542                        DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
19543   };
19544   auto GetMask = [&](SDValue V, APInt Mask) {
19545     MVT VT = V.getSimpleValueType();
19546     SmallVector<SDValue, 32> Masks(
19547         VT.getVectorNumElements(),
19548         DAG.getConstant(Mask, DL, VT.getVectorElementType()));
19549     return DAG.getNode(ISD::AND, DL, VT, V,
19550                        DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
19551   };
19552
19553   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
19554   // x86, so set the SRL type to have elements at least i16 wide. This is
19555   // correct because all of our SRLs are followed immediately by a mask anyway
19556   // that handles any bits that sneak into the high bits of the byte elements.
19557   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
19558
19559   SDValue V = Op;
19560
19561   // v = v - ((v >> 1) & 0x55555555...)
19562   SDValue Srl =
19563       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
19564   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
19565   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
19566
19567   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19568   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
19569   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
19570   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
19571   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
19572
19573   // v = (v + (v >> 4)) & 0x0F0F0F0F...
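  // Worked example on one byte, v = 0b11011010 (five bits set):
  //   v - ((v >> 1) & 0x55)          -> 0b10010101  (2-bit counts 2,1,1,1)
  //   (v & 0x33) + ((v >> 2) & 0x33) -> 0b00110010  (nibble counts 3,2)
  //   (v + (v >> 4)) & 0x0F          -> 0b00000101  (= 5)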
19574 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); 19575 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); 19576 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); 19577 19578 // At this point, V contains the byte-wise population count, and we are 19579 // merely doing a horizontal sum if necessary to get the wider element 19580 // counts. 19581 if (EltVT == MVT::i8) 19582 return V; 19583 19584 return LowerHorizontalByteSum( 19585 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, 19586 DAG); 19587 } 19588 19589 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, 19590 SelectionDAG &DAG) { 19591 MVT VT = Op.getSimpleValueType(); 19592 // FIXME: Need to add AVX-512 support here! 19593 assert((VT.is256BitVector() || VT.is128BitVector()) && 19594 "Unknown CTPOP type to handle"); 19595 SDLoc DL(Op.getNode()); 19596 SDValue Op0 = Op.getOperand(0); 19597 19598 if (!Subtarget->hasSSSE3()) { 19599 // We can't use the fast LUT approach, so fall back on vectorized bitmath. 19600 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); 19601 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); 19602 } 19603 19604 if (VT.is256BitVector() && !Subtarget->hasInt256()) { 19605 unsigned NumElems = VT.getVectorNumElements(); 19606 19607 // Extract each 128-bit vector, compute pop count and concat the result. 19608 SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL); 19609 SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); 19610 19611 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, 19612 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), 19613 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); 19614 } 19615 19616 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); 19617 } 19618 19619 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, 19620 SelectionDAG &DAG) { 19621 assert(Op.getSimpleValueType().isVector() && 19622 "We only do custom lowering for vector population count."); 19623 return LowerVectorCTPOP(Op, Subtarget, DAG); 19624 } 19625 19626 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 19627 SDNode *Node = Op.getNode(); 19628 SDLoc dl(Node); 19629 EVT T = Node->getValueType(0); 19630 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 19631 DAG.getConstant(0, dl, T), Node->getOperand(2)); 19632 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 19633 cast<AtomicSDNode>(Node)->getMemoryVT(), 19634 Node->getOperand(0), 19635 Node->getOperand(1), negOp, 19636 cast<AtomicSDNode>(Node)->getMemOperand(), 19637 cast<AtomicSDNode>(Node)->getOrdering(), 19638 cast<AtomicSDNode>(Node)->getSynchScope()); 19639 } 19640 19641 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 19642 SDNode *Node = Op.getNode(); 19643 SDLoc dl(Node); 19644 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 19645 19646 // Convert seq_cst store -> xchg 19647 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 19648 // FIXME: On 32-bit, store -> fist or movq would be more efficient 19649 // (The only way to get a 16-byte store is cmpxchg16b) 19650 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 
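  // e.g. a seq_cst i32 atomic store becomes an XCHG (implicitly locked, a
  // full barrier), and an i64 atomic store on i386 becomes an ATOMIC_SWAP
  // that is in turn expanded through cmpxchg8b.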
19651 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 19652 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 19653 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 19654 cast<AtomicSDNode>(Node)->getMemoryVT(), 19655 Node->getOperand(0), 19656 Node->getOperand(1), Node->getOperand(2), 19657 cast<AtomicSDNode>(Node)->getMemOperand(), 19658 cast<AtomicSDNode>(Node)->getOrdering(), 19659 cast<AtomicSDNode>(Node)->getSynchScope()); 19660 return Swap.getValue(1); 19661 } 19662 // Other atomic stores have a simple pattern. 19663 return Op; 19664 } 19665 19666 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 19667 MVT VT = Op.getNode()->getSimpleValueType(0); 19668 19669 // Let legalize expand this if it isn't a legal type yet. 19670 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 19671 return SDValue(); 19672 19673 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 19674 19675 unsigned Opc; 19676 bool ExtraOp = false; 19677 switch (Op.getOpcode()) { 19678 default: llvm_unreachable("Invalid code"); 19679 case ISD::ADDC: Opc = X86ISD::ADD; break; 19680 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 19681 case ISD::SUBC: Opc = X86ISD::SUB; break; 19682 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 19683 } 19684 19685 if (!ExtraOp) 19686 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 19687 Op.getOperand(1)); 19688 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 19689 Op.getOperand(1), Op.getOperand(2)); 19690 } 19691 19692 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, 19693 SelectionDAG &DAG) { 19694 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); 19695 19696 // For MacOSX, we want to call an alternative entry point: __sincos_stret, 19697 // which returns the values as { float, float } (in XMM0) or 19698 // { double, double } (which is returned in XMM0, XMM1). 19699 SDLoc dl(Op); 19700 SDValue Arg = Op.getOperand(0); 19701 EVT ArgVT = Arg.getValueType(); 19702 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 19703 19704 TargetLowering::ArgListTy Args; 19705 TargetLowering::ArgListEntry Entry; 19706 19707 Entry.Node = Arg; 19708 Entry.Ty = ArgTy; 19709 Entry.isSExt = false; 19710 Entry.isZExt = false; 19711 Args.push_back(Entry); 19712 19713 bool isF64 = ArgVT == MVT::f64; 19714 // Only optimize x86_64 for now. i386 is a bit messy. For f32, 19715 // the small struct {f32, f32} is returned in (eax, edx). For f64, 19716 // the results are returned via SRet in memory. 19717 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; 19718 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19719 SDValue Callee = 19720 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); 19721 19722 Type *RetTy = isF64 19723 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) 19724 : (Type*)VectorType::get(ArgTy, 4); 19725 19726 TargetLowering::CallLoweringInfo CLI(DAG); 19727 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 19728 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); 19729 19730 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); 19731 19732 if (isF64) 19733 // Returned in xmm0 and xmm1. 19734 return CallResult.first; 19735 19736 // Returned in bits 0:31 and 32:64 xmm0. 
19737 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 19738 CallResult.first, DAG.getIntPtrConstant(0, dl)); 19739 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 19740 CallResult.first, DAG.getIntPtrConstant(1, dl)); 19741 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 19742 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); 19743 } 19744 19745 /// Widen a vector input to a vector of NVT. The 19746 /// input vector must have the same element type as NVT. 19747 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, 19748 bool FillWithZeroes = false) { 19749 // Check if InOp already has the right width. 19750 MVT InVT = InOp.getSimpleValueType(); 19751 if (InVT == NVT) 19752 return InOp; 19753 19754 if (InOp.isUndef()) 19755 return DAG.getUNDEF(NVT); 19756 19757 assert(InVT.getVectorElementType() == NVT.getVectorElementType() && 19758 "input and widen element type must match"); 19759 19760 unsigned InNumElts = InVT.getVectorNumElements(); 19761 unsigned WidenNumElts = NVT.getVectorNumElements(); 19762 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && 19763 "Unexpected request for vector widening"); 19764 19765 EVT EltVT = NVT.getVectorElementType(); 19766 19767 SDLoc dl(InOp); 19768 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && 19769 InOp.getNumOperands() == 2) { 19770 SDValue N1 = InOp.getOperand(1); 19771 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || 19772 N1.isUndef()) { 19773 InOp = InOp.getOperand(0); 19774 InVT = InOp.getSimpleValueType(); 19775 InNumElts = InVT.getVectorNumElements(); 19776 } 19777 } 19778 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || 19779 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { 19780 SmallVector<SDValue, 16> Ops; 19781 for (unsigned i = 0; i < InNumElts; ++i) 19782 Ops.push_back(InOp.getOperand(i)); 19783 19784 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : 19785 DAG.getUNDEF(EltVT); 19786 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) 19787 Ops.push_back(FillVal); 19788 return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); 19789 } 19790 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : 19791 DAG.getUNDEF(NVT); 19792 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, 19793 InOp, DAG.getIntPtrConstant(0, dl)); 19794 } 19795 19796 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, 19797 SelectionDAG &DAG) { 19798 assert(Subtarget->hasAVX512() && 19799 "MGATHER/MSCATTER are supported on AVX-512 arch only"); 19800 19801 // X86 scatter kills mask register, so its type should be added to 19802 // the list of return values. 19803 // If the "scatter" has 2 return values, it is already handled. 19804 if (Op.getNode()->getNumValues() == 2) 19805 return Op; 19806 19807 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); 19808 SDValue Src = N->getValue(); 19809 MVT VT = Src.getSimpleValueType(); 19810 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); 19811 SDLoc dl(Op); 19812 19813 SDValue NewScatter; 19814 SDValue Index = N->getIndex(); 19815 SDValue Mask = N->getMask(); 19816 SDValue Chain = N->getChain(); 19817 SDValue BasePtr = N->getBasePtr(); 19818 MVT MemVT = N->getMemoryVT().getSimpleVT(); 19819 MVT IndexVT = Index.getSimpleValueType(); 19820 MVT MaskVT = Mask.getSimpleValueType(); 19821 19822 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { 19823 // The v2i32 value was promoted to v2i64. 
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with a shuffle.
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
           "Unexpected memory type");
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // The mask may be <2 x i64> or <2 x i1> at this point.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    VT = MVT::v4i32;
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors: either the data or the index
    // must be 512 bits wide. If both index and data are 256-bit here, but
    // the vector contains 8 elements, we just sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimal number of elements in a scatter is 8.
      NumElts = 8;
      // Index.
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask.
      // At this point the mask operand has been promoted.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point, truncate it to an i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by the scatter; add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
                                    N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 0);
}

static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
                          SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
    // This operation is legal for targets with VLX; without VLX the vector
    // must be widened to 512 bits.
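    // (For example, a v8f32 masked load becomes a v16f32 one, since
    // 512 / 32 == 16. The extra mask lanes are filled with zeroes below, so
    // the widened load never touches additional memory.)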
    unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
    SDValue Src0 = N->getSrc0();
    Src0 = ExtendToType(Src0, WideDataVT, DAG);
    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
    SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                        N->getBasePtr(), Mask, Src0,
                                        N->getMemoryVT(), N->getMemOperand(),
                                        N->getExtensionType());

    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewLoad.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
    return DAG.getMergeValues(RetOps, dl);
  }
  return Op;
}

static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
    // This operation is legal for targets with VLX; without VLX the vector
    // must be widened to 512 bits.
    unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
    DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
    return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                              Mask, N->getMemoryVT(), N->getMemOperand(),
                              N->isTruncatingStore());
  }
  return Op;
}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget->hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors: either the data or the index
    // must be 512 bits wide.
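    // (This mirrors the index/mask/data widening performed for scatter in
    // LowerMSCATTER above.)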
    // If both index and data are 256-bit here, but the vector contains
    // 8 elements, we just sign-extend the index.
    if (NumElts == 8) {
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                        N->getOperand(3), Index };
      DAG.UpdateNodeOperands(N, Ops);
      return Op;
    }

    // The minimal number of elements in a gather is 8.
    NumElts = 8;
    // Index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
    Index = ExtendToType(Index, NewIndexVT, DAG);
    if (IndexVT.getScalarType() == MVT::i32)
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

    // Mask.
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point the mask operand has been promoted.
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

    // The pass-thru value.
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
    Src0 = ExtendToType(Src0, NewVT, DAG);

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
                                            N->getMemoryVT(), dl, Ops,
                                            N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
    return DAG.getMergeValues(RetOps, dl);
  }
  return Op;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
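  // (This is identical to LowerGC_TRANSITION_START above; both transitions
  // currently lower to a single NOOP.)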
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op, DAG);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::ADD:                return LowerADD(Op, DAG);
  case ISD::SUB:                return LowerSUB(Op, DAG);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_START:
    return LowerGC_TRANSITION_START(Op, DAG);
  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
  }
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    auto InVTSize = InVT.getSizeInBits();
    const unsigned RegSize =
        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
    assert((Subtarget->hasAVX512() || RegSize < 512) &&
           "512-bit vector requires AVX512");
    assert((Subtarget->hasAVX2() || RegSize < 256) &&
           "256-bit vector requires AVX2");

    auto ElemVT = InVT.getVectorElementType();
    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
                                  RegSize / ElemVT.getSizeInBits());
    assert(RegSize % InVT.getSizeInBits() == 0);
    unsigned NumConcat = RegSize / InVT.getSizeInBits();

    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
    Ops[0] = N->getOperand(0);
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
    Ops[0] = N->getOperand(1);
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                                  DAG.getIntPtrConstant(0, dl)));
    return;
  }
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
    return;
  }
  case ISD::SIGN_EXTEND_INREG:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
    return;
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
    Results.push_back(V);
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    std::pair<SDValue, SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode()) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      if (StackSlot.getNode())
        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                      MachinePointerInfo(),
                                      false, false, false, 0));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
        N->getValueType(0) != MVT::v2f32)
      return;
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
                                 N->getOperand(0));
    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                     MVT::f64);
    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
    // TODO: Are there any fast-math-flags to propagate here?
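    // (This is the standard bias trick: 0x4330000000000000 is the bit
    // pattern of 2^52, so OR'ing a zero-extended 32-bit value into the low
    // mantissa bits yields exactly 2^52 + x; the FSUB below subtracts the
    // bias again, leaving x converted to double.)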
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
  }
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::FP_EXTEND: {
    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
    // No other ValueType for FP_EXTEND should reach this point.
    assert(N->getValueType(0) == MVT::v2f32 &&
           "Do not know how to legalize this Node");
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default : llvm_unreachable("Do not know how to custom type "
                               "legalize this intrinsic operation!");
    case Intrinsic::x86_rdtsc:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdtscp:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdpmc:
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
    }
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
      Results.push_back(V);
    return;
  }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
  }
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, dl, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, dl, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, dl, HalfT));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
                               Regs64bit ? X86::RBX : X86::EBX,
                               swapInL, cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
                               Regs64bit ? X86::RCX : X86::ECX,
                               swapInH, swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
                                  X86ISD::LCMPXCHG8_DAG;
    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0) };

    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                        MVT::i32, cpOutH.getValue(2));
    SDValue Success =
        DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                    DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
    Results.push_back(Success);
    Results.push_back(EFLAGS.getValue(1));
    return;
  }
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
    break;
  }
  case ISD::BITCAST: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
    EVT DstVT = N->getValueType(0);
    EVT SrcVT = N->getOperand(0)->getValueType(0);

    if (SrcVT != MVT::f64 ||
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
      return;

    unsigned NumElts = DstVT.getVectorNumElements();
    EVT SVT = DstVT.getVectorElementType();
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   MVT::v2f64, N->getOperand(0));
    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

    if (ExperimentalVectorWideningLegalization) {
      // If we are legalizing vectors by widening, we already have the
      // desired legal vector type, just return it.
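      // (For an f64 -> v2i32 bitcast, for instance, ToVecInt is already the
      // v4i32 value that widening legalization expects for v2i32.)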
      Results.push_back(ToVecInt);
      return;
    }

    SmallVector<SDValue, 8> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

    Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
  }
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((X86ISD::NodeType)Opcode) {
  case X86ISD::FIRST_NUMBER: break;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FANDN: return "X86ISD::FANDN";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
  case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::CMPM: return "X86ISD::CMPM";
  case X86ISD::CMPMU: return "X86ISD::CMPMU";
  case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCC: return "X86ISD::FSETCC";
  case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::IRET: return "X86ISD::IRET";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
  case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
  case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::ANDNP: return "X86ISD::ANDNP";
  case X86ISD::PSIGN: return "X86ISD::PSIGN";
  case X86ISD::BLENDI: return "X86ISD::BLENDI";
  case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
  case X86ISD::ADDUS: return "X86ISD::ADDUS";
  case X86ISD::SUBUS: return "X86ISD::SUBUS";
  case X86ISD::HADD: return "X86ISD::HADD";
  case X86ISD::HSUB: return "X86ISD::HSUB";
  case X86ISD::FHADD: return "X86ISD::FHADD";
  case X86ISD::FHSUB: return "X86ISD::FHSUB";
  case X86ISD::ABS: return "X86ISD::ABS";
  case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
  case X86ISD::FMAXC: return "X86ISD::FMAXC";
  case X86ISD::FMINC: return "X86ISD::FMINC";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
  case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT: return "X86ISD::VZEXT";
  case X86ISD::VSEXT: return "X86ISD::VSEXT";
  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
  case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
  case X86ISD::VINSERT: return "X86ISD::VINSERT";
  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
  case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
  case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::VSRA: return "X86ISD::VSRA";
  case X86ISD::VSHLI: return "X86ISD::VSHLI";
  case X86ISD::VSRLI: return "X86ISD::VSRLI";
  case X86ISD::VSRAI: return "X86ISD::VSRAI";
  case X86ISD::CMPP: return "X86ISD::CMPP";
  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
  case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
  case X86ISD::SBB: return "X86ISD::SBB";
  case X86ISD::SMUL: return "X86ISD::SMUL";
  case X86ISD::UMUL: return "X86ISD::UMUL";
  case X86ISD::SMUL8: return "X86ISD::SMUL8";
  case X86ISD::UMUL8: return "X86ISD::UMUL8";
  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
  case X86ISD::INC: return "X86ISD::INC";
  case X86ISD::DEC: return "X86ISD::DEC";
  case X86ISD::OR: return "X86ISD::OR";
  case X86ISD::XOR: return "X86ISD::XOR";
  case X86ISD::AND: return "X86ISD::AND";
return "X86ISD::AND"; 20534 case X86ISD::BEXTR: return "X86ISD::BEXTR"; 20535 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 20536 case X86ISD::PTEST: return "X86ISD::PTEST"; 20537 case X86ISD::TESTP: return "X86ISD::TESTP"; 20538 case X86ISD::TESTM: return "X86ISD::TESTM"; 20539 case X86ISD::TESTNM: return "X86ISD::TESTNM"; 20540 case X86ISD::KORTEST: return "X86ISD::KORTEST"; 20541 case X86ISD::KTEST: return "X86ISD::KTEST"; 20542 case X86ISD::PACKSS: return "X86ISD::PACKSS"; 20543 case X86ISD::PACKUS: return "X86ISD::PACKUS"; 20544 case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; 20545 case X86ISD::VALIGN: return "X86ISD::VALIGN"; 20546 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 20547 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 20548 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 20549 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 20550 case X86ISD::SHUF128: return "X86ISD::SHUF128"; 20551 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 20552 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 20553 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 20554 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 20555 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 20556 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 20557 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 20558 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 20559 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 20560 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 20561 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 20562 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 20563 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 20564 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; 20565 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; 20566 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; 20567 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; 20568 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; 20569 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 20570 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 20571 case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; 20572 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; 20573 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 20574 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; 20575 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; 20576 case X86ISD::VRANGE: return "X86ISD::VRANGE"; 20577 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 20578 case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; 20579 case X86ISD::PSADBW: return "X86ISD::PSADBW"; 20580 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; 20581 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 20582 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 20583 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 20584 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 20585 case X86ISD::MFENCE: return "X86ISD::MFENCE"; 20586 case X86ISD::SFENCE: return "X86ISD::SFENCE"; 20587 case X86ISD::LFENCE: return "X86ISD::LFENCE"; 20588 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 20589 case X86ISD::SAHF: return "X86ISD::SAHF"; 20590 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 20591 case X86ISD::RDSEED: return "X86ISD::RDSEED"; 20592 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; 20593 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; 20594 case X86ISD::VPROT: return "X86ISD::VPROT"; 20595 case X86ISD::VPROTI: return "X86ISD::VPROTI"; 20596 case X86ISD::VPSHA: return "X86ISD::VPSHA"; 20597 case X86ISD::VPSHL: return "X86ISD::VPSHL"; 20598 case X86ISD::VPCOM: return "X86ISD::VPCOM"; 
  case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
  case X86ISD::FMADD: return "X86ISD::FMADD";
  case X86ISD::FMSUB: return "X86ISD::FMSUB";
  case X86ISD::FNMADD: return "X86ISD::FNMADD";
  case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
  case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
  case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
  case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
  case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
  case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
  case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
  case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
  case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
  case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
  case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST: return "X86ISD::XTEST";
  case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
  case X86ISD::EXPAND: return "X86ISD::EXPAND";
  case X86ISD::SELECT: return "X86ISD::SELECT";
  case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
  case X86ISD::RCP28: return "X86ISD::RCP28";
  case X86ISD::EXP2: return "X86ISD::EXP2";
  case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
  case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
  case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
  case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
  case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
  case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
  case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
  case X86ISD::SCALEF: return "X86ISD::SCALEF";
  case X86ISD::ADDS: return "X86ISD::ADDS";
  case X86ISD::SUBS: return "X86ISD::SUBS";
  case X86ISD::AVG: return "X86ISD::AVG";
  case X86ISD::MULHRS: return "X86ISD::MULHRS";
  case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
  case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
  case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
  case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
  case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
  case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
  }
  return nullptr;
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
        Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
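    // (The PIC base register effectively occupies the base-register slot of
    // the x86 addressing mode, which can encode only one base register.)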
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default:  // Other stuff never works.
    return false;
  }

  return true;
}

bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  unsigned Bits = Ty->getScalarSizeInBits();

  // 8-bit shifts are always expensive, but versions with a scalar amount
  // aren't particularly cheaper than those without.
  if (Bits == 8)
    return false;

  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that
  // make variable shifts just as cheap as scalar ones.
  if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
    return false;

  // Otherwise, it's significantly cheaper to shift by a scalar amount than
  // by a fully general vector.
  return true;
}

bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
}

bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
  return isInt<32>(Imm);
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
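  // (For example, `movl (%rdi), %eax` already clears bits 63:32 of %rax, so
  // no separate zero-extension instruction is needed.)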
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // X86 has 8, 16, and 32-bit zero-extending loads.
    return true;
  }

  return false;
}

bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }

bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!Subtarget->hasAnyFMA())
    return false;

  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (!VT.isSimple())
    return false;

  // Not for i1 vectors.
  if (VT.getSimpleVT().getScalarType() == MVT::i1)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSimpleVT().getSizeInBits() == 64)
    return false;

  // We only care that the types being shuffled are legal. The lowering can
  // handle any possible shuffle mask that results.
  return isTypeLegal(VT.getSimpleVT());
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  // Just delegate to the generic legality check; clear masks aren't special.
  return isShuffleMaskLegal(Mask, VT);
}

//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI->getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # on abort, branch to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB.
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
      .addReg(X86::EAX);

  MI->eraseFromParent();
  return sinkMBB;
}

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
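// All PCMP*STRM variants implicitly define XMM0, so the custom inserter
// below emits the real instruction and then copies XMM0 into the pseudo's
// result register; EmitPCMPSTRI further down does the same with ECX.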
static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI->getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
  }

  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI->getNumOperands();
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  if (MI->hasOneMemOperand())
    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  BuildMI(*BB, MI, dl,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
      .addReg(X86::XMM0);

  MI->eraseFromParent();
  return BB;
}

// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern.
static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI->getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
  }

  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  if (MI->hasOneMemOperand())
    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  BuildMI(*BB, MI, dl,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
      .addReg(X86::ECX);

  MI->eraseFromParent();
  return BB;
}

static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
                                      const X86Subtarget *Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
      .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
      .addReg(MI->getOperand(ValOps + 1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output  : destination address (reg)
  // 1-5) Input   : va_list address (addr, i64mem)
  // 6  ) ArgSize : Size (in bytes) of vararg type
  // 7  ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align   : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32 gp_offset
  //   i32 fp_offset
  //   i64 overflow_area (address)
  //   i64 reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
  // Align ArgSize to a multiple of 8.
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;   // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //       /     \
    // offsetMBB  overflowMBB
    //       \     /
    //        endMBB

    // Registers for the PHI in endMBB.
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks.
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB.
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB.
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register.
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .addOperand(Base)
        .addOperand(Scale)
        .addOperand(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .addOperand(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max.
    // Fall through to "offsetMBB" otherwise.
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
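    // (reg_save_area lives at byte offset 16 in the va_list: after the two
    // i32 offset fields and the i64 overflow_area pointer, hence the
    // addDisp(Disp, 16) below.)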
21154 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 21155 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 21156 .addOperand(Base) 21157 .addOperand(Scale) 21158 .addOperand(Index) 21159 .addDisp(Disp, 16) 21160 .addOperand(Segment) 21161 .setMemRefs(MMOBegin, MMOEnd); 21162 21163 // Zero-extend the offset 21164 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 21165 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 21166 .addImm(0) 21167 .addReg(OffsetReg) 21168 .addImm(X86::sub_32bit); 21169 21170 // Add the offset to the reg_save_area to get the final address. 21171 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 21172 .addReg(OffsetReg64) 21173 .addReg(RegSaveReg); 21174 21175 // Compute the offset for the next argument 21176 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 21177 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 21178 .addReg(OffsetReg) 21179 .addImm(UseFPOffset ? 16 : 8); 21180 21181 // Store it back into the va_list. 21182 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 21183 .addOperand(Base) 21184 .addOperand(Scale) 21185 .addOperand(Index) 21186 .addDisp(Disp, UseFPOffset ? 4 : 0) 21187 .addOperand(Segment) 21188 .addReg(NextOffsetReg) 21189 .setMemRefs(MMOBegin, MMOEnd); 21190 21191 // Jump to endMBB 21192 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) 21193 .addMBB(endMBB); 21194 } 21195 21196 // 21197 // Emit code to use overflow area 21198 // 21199 21200 // Load the overflow_area address into a register. 21201 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 21202 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 21203 .addOperand(Base) 21204 .addOperand(Scale) 21205 .addOperand(Index) 21206 .addDisp(Disp, 8) 21207 .addOperand(Segment) 21208 .setMemRefs(MMOBegin, MMOEnd); 21209 21210 // If we need to align it, do so. Otherwise, just copy the address 21211 // to OverflowDestReg. 21212 if (NeedsAlign) { 21213 // Align the overflow address 21214 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 21215 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 21216 21217 // aligned_addr = (addr + (align-1)) & ~(align-1) 21218 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 21219 .addReg(OverflowAddrReg) 21220 .addImm(Align-1); 21221 21222 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 21223 .addReg(TmpReg) 21224 .addImm(~(uint64_t)(Align-1)); 21225 } else { 21226 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 21227 .addReg(OverflowAddrReg); 21228 } 21229 21230 // Compute the next overflow address after this argument. 21231 // (the overflow address should be kept 8-byte aligned) 21232 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 21233 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 21234 .addReg(OverflowDestReg) 21235 .addImm(ArgSizeA8); 21236 21237 // Store the new overflow address. 21238 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 21239 .addOperand(Base) 21240 .addOperand(Scale) 21241 .addOperand(Index) 21242 .addDisp(Disp, 8) 21243 .addOperand(Segment) 21244 .addReg(NextAddrReg) 21245 .setMemRefs(MMOBegin, MMOEnd); 21246 21247 // If we branched, emit the PHI to the front of endMBB. 
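  // That is, roughly:
  //   DestReg = PHI [OffsetDestReg, offsetMBB], [OverflowDestReg, overflowMBB]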
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them;
  // however, this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI->getNumOperands() <= 3 ||
          !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
          MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
         && "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
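  // They land 16 bytes apart, starting at VarArgsFPOffset within the register
  // save frame object, i.e. roughly:
  //   movaps %xmm0, VarArgsFPOffset(FI)
  //   movaps %xmm1, VarArgsFPOffset+16(FI)
  //   ...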
21317 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { 21318 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 21319 MachineMemOperand *MMO = F->getMachineMemOperand( 21320 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), 21321 MachineMemOperand::MOStore, 21322 /*Size=*/16, /*Align=*/16); 21323 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 21324 .addFrameIndex(RegSaveFrameIndex) 21325 .addImm(/*Scale=*/1) 21326 .addReg(/*IndexReg=*/0) 21327 .addImm(/*Disp=*/Offset) 21328 .addReg(/*Segment=*/0) 21329 .addReg(MI->getOperand(i).getReg()) 21330 .addMemOperand(MMO); 21331 } 21332 21333 MI->eraseFromParent(); // The pseudo instruction is gone now. 21334 21335 return EndMBB; 21336 } 21337 21338 // The EFLAGS operand of SelectItr might be missing a kill marker 21339 // because there were multiple uses of EFLAGS, and ISel didn't know 21340 // which to mark. Figure out whether SelectItr should have had a 21341 // kill marker, and set it if it should. Returns the correct kill 21342 // marker value. 21343 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 21344 MachineBasicBlock* BB, 21345 const TargetRegisterInfo* TRI) { 21346 // Scan forward through BB for a use/def of EFLAGS. 21347 MachineBasicBlock::iterator miI(std::next(SelectItr)); 21348 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 21349 const MachineInstr& mi = *miI; 21350 if (mi.readsRegister(X86::EFLAGS)) 21351 return false; 21352 if (mi.definesRegister(X86::EFLAGS)) 21353 break; // Should have kill-flag - update below. 21354 } 21355 21356 // If we hit the end of the block, check whether EFLAGS is live into a 21357 // successor. 21358 if (miI == BB->end()) { 21359 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 21360 sEnd = BB->succ_end(); 21361 sItr != sEnd; ++sItr) { 21362 MachineBasicBlock* succ = *sItr; 21363 if (succ->isLiveIn(X86::EFLAGS)) 21364 return false; 21365 } 21366 } 21367 21368 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 21369 // out. SelectMI should have a kill flag on EFLAGS. 21370 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 21371 return true; 21372 } 21373 21374 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded 21375 // together with other CMOV pseudo-opcodes into a single basic-block with 21376 // conditional jump around it. 21377 static bool isCMOVPseudo(MachineInstr *MI) { 21378 switch (MI->getOpcode()) { 21379 case X86::CMOV_FR32: 21380 case X86::CMOV_FR64: 21381 case X86::CMOV_GR8: 21382 case X86::CMOV_GR16: 21383 case X86::CMOV_GR32: 21384 case X86::CMOV_RFP32: 21385 case X86::CMOV_RFP64: 21386 case X86::CMOV_RFP80: 21387 case X86::CMOV_V2F64: 21388 case X86::CMOV_V2I64: 21389 case X86::CMOV_V4F32: 21390 case X86::CMOV_V4F64: 21391 case X86::CMOV_V4I64: 21392 case X86::CMOV_V16F32: 21393 case X86::CMOV_V8F32: 21394 case X86::CMOV_V8F64: 21395 case X86::CMOV_V8I64: 21396 case X86::CMOV_V8I1: 21397 case X86::CMOV_V16I1: 21398 case X86::CMOV_V32I1: 21399 case X86::CMOV_V64I1: 21400 return true; 21401 21402 default: 21403 return false; 21404 } 21405 } 21406 21407 MachineBasicBlock * 21408 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 21409 MachineBasicBlock *BB) const { 21410 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 21411 DebugLoc DL = MI->getDebugLoc(); 21412 21413 // To "insert" a SELECT_CC instruction, we actually have to insert the 21414 // diamond control-flow pattern. 
The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();

  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
  // as described above, by inserting a BB, and then making a PHI at the join
  // point to select the true and false operands of the CMOV in the PHI.
  //
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all of which are based
  // on the same condition setting (or the exact opposite condition setting).
  // In this case we can lower all the CMOVs using a single inserted BB, and
  // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here is that in a case like:
  //
  //   t2 = CMOV cond1 t1, f1
  //   t3 = CMOV cond1 t2, f2
  //
  // when rewriting this into PHIs, we have to perform some renaming on the
  // temps since you cannot have a PHI operand refer to a PHI result earlier
  // in the same block. The "simple" but wrong lowering would be:
  //
  //   t2 = PHI t1(BB1), f1(BB2)
  //   t3 = PHI t2(BB1), f2(BB2)
  //
  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
  // renaming is to note that on the path through BB1, t2 is really just a
  // copy of t1, and do that renaming, properly generating:
  //
  //   t2 = PHI t1(BB1), f1(BB2)
  //   t3 = PHI t1(BB1), f2(BB2)
  //
  // In case 2, we lower cascaded CMOVs such as
  //
  //   (CMOV (CMOV F, T, cc1), T, cc2)
  //
  // to two successive branches. For that, we look for another CMOV as the
  // following instruction.
  //
  // Without this, we would add a PHI between the two jumps, which ends up
  // creating a few copies all around. For instance, for
  //
  //   (sitofp (zext (fcmp une)))
  //
  // we would generate:
  //
  //   ucomiss %xmm1, %xmm0
  //   movss  <1.0f>, %xmm0
  //   movaps %xmm0, %xmm1
  //   jne .LBB5_2
  //   xorps %xmm1, %xmm1
  // .LBB5_2:
  //   jp .LBB5_4
  //   movaps %xmm1, %xmm0
  // .LBB5_4:
  //   retq
  //
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // B: empty
  // C: Z = PHI [X, A], [Y, B]
  // D: empty
  // E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // D: empty
  // E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //   ucomiss %xmm1, %xmm0
  //   movss  <1.0f>, %xmm0
  //   jne .LBB5_4
  //   jp .LBB5_4
  //   xorps %xmm0, %xmm0
  // .LBB5_4:
  //   retq
  //
  MachineInstr *CascadedCMOV = nullptr;
  MachineInstr *LastCMOV = MI;
  X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
      std::next(MachineBasicBlock::iterator(MI));

  // Check for case 1, where there are multiple CMOVs with the same condition
  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
  // number of jumps the most.

  if (isCMOVPseudo(MI)) {
    // See if we have a string of CMOVS with the same condition.
    while (NextMIIt != BB->end() &&
           isCMOVPseudo(NextMIIt) &&
           (NextMIIt->getOperand(3).getImm() == CC ||
            NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }

  // This checks for case 2, but we only do so if we didn't already find
  // case 1, as indicated by LastCMOV == MI.
  if (LastCMOV == MI &&
      NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
    CascadedCMOV = &*NextMIIt;
  }

  MachineBasicBlock *jcc1MBB = nullptr;

  // If we have a cascaded CMOV, we lower it to two successive branches to
  // the same block. EFLAGS is used by both, so mark it as live in the second.
  if (CascadedCMOV) {
    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
  }

  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();

  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  if (CascadedCMOV) {
    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
    BB->addSuccessor(jcc1MBB);

    // In that case, jcc1MBB will itself fall through to copy0MBB, and
    // jump to the sinkMBB.
    jcc1MBB->addSuccessor(copy0MBB);
    jcc1MBB->addSuccessor(sinkMBB);
  } else {
    BB->addSuccessor(copy0MBB);
  }

  // The true block target of the first (or only) branch is always sinkMBB.
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
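  // For example, with CC == X86::COND_E this emits "JE_1 sinkMBB"; in the
  // cascaded case a second Jcc to sinkMBB follows in jcc1MBB just below.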
  unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  if (CascadedCMOV) {
    unsigned Opc2 = X86::GetCondBranchFromCond(
        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
  MachineInstrBuilder MIB;

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into that PHI.

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are generating has the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (MIIt->getOperand(3).getImm() == OppCC)
      std::swap(Op1Reg, Op2Reg);

    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
      Op1Reg = RegRewriteTable[Op1Reg].first;

    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
      Op2Reg = RegRewriteTable[Op2Reg].second;

    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
                  TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg).addMBB(copy0MBB)
              .addReg(Op2Reg).addMBB(thisMBB);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // If we have a cascaded CMOV, the second Jcc provides the same incoming
  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
  if (CascadedCMOV) {
    MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
    // Copy the PHI result to the register defined by the second CMOV.
    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
            DL, TII->get(TargetOpcode::COPY),
            CascadedCMOV->getOperand(0).getReg())
        .addReg(MI->getOperand(0).getReg());
    CascadedCMOV->eraseFromParent();
  }

  // Now remove the CMOV(s).
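  // After the erase loop below, the two-CMOV example above is left with just
  // the renamed PHIs in sinkMBB, e.g.:
  //   t2 = PHI t1(copy0MBB), f1(thisMBB)
  //   t3 = PHI t1(copy0MBB), f2(thisMBB)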
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
    (MIIt++)->eraseFromParent();

  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
                                       MachineBasicBlock *BB) const {
  // Combine the following atomic floating-point modification pattern:
  //   a.store(reg OP a.load(acquire), release)
  // Transform it into:
  //   OPss (%gpr), %xmm
  //   movss %xmm, (%gpr)
  // or the 'sd' equivalent for 64-bit operations.
  unsigned MOp, FOp;
  switch (MI->getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
  case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
  }
  const X86InstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  MachineOperand MSrc = MI->getOperand(0);
  unsigned VSrc = MI->getOperand(5).getReg();
  const MachineOperand &Disp = MI->getOperand(3);
  MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
  bool hasDisp = Disp.isGlobal() || Disp.isImm();
  if (hasDisp && MSrc.isReg())
    MSrc.setIsKill(false);
  MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
      .addOperand(/*Base=*/MSrc)
      .addImm(/*Scale=*/1)
      .addReg(/*Index=*/0)
      .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
      .addReg(/*Segment=*/0);
  MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
                              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
      .addReg(VSrc)
      .addOperand(/*Base=*/MSrc)
      .addImm(/*Scale=*/1)
      .addReg(/*Index=*/0)
      .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
      .addReg(/*Segment=*/0);
  MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  assert(MF->shouldSplitStack());

  const bool Is64Bit = Subtarget->is64Bit();
  const bool IsLP64 = Subtarget->isTarget64BitLP64();

  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
      getRegClassFor(getPointerTy(MF->getDataLayout()));

  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
           sizeVReg = MI->getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;

  MachineFunction::iterator MBBIter = ++BB->getIterator();

  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
  MF->insert(MBBIter, continueMBB);

  continueMBB->splice(continueMBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add code to the main basic block to check if the stack limit has been hit,
  // and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
      .addReg(tmpSPVReg).addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
      .addReg(SPLimitVReg);
  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

  // bumpMBB simply decreases the stack pointer, since we know the current
  // stacklet has enough space.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
      .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
      .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // mallocMBB calls into a routine in libgcc to allocate more space from the
  // heap.
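  // In effect (a sketch; the 32-bit path below passes the size on the stack
  // instead of in a register):
  //   new_space = __morestack_allocate_stack_space(size);  // result in RAX/EAX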
21804 const uint32_t *RegMask = 21805 Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); 21806 if (IsLP64) { 21807 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 21808 .addReg(sizeVReg); 21809 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 21810 .addExternalSymbol("__morestack_allocate_stack_space") 21811 .addRegMask(RegMask) 21812 .addReg(X86::RDI, RegState::Implicit) 21813 .addReg(X86::RAX, RegState::ImplicitDefine); 21814 } else if (Is64Bit) { 21815 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) 21816 .addReg(sizeVReg); 21817 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 21818 .addExternalSymbol("__morestack_allocate_stack_space") 21819 .addRegMask(RegMask) 21820 .addReg(X86::EDI, RegState::Implicit) 21821 .addReg(X86::EAX, RegState::ImplicitDefine); 21822 } else { 21823 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 21824 .addImm(12); 21825 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 21826 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 21827 .addExternalSymbol("__morestack_allocate_stack_space") 21828 .addRegMask(RegMask) 21829 .addReg(X86::EAX, RegState::ImplicitDefine); 21830 } 21831 21832 if (!Is64Bit) 21833 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 21834 .addImm(16); 21835 21836 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 21837 .addReg(IsLP64 ? X86::RAX : X86::EAX); 21838 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); 21839 21840 // Set up the CFG correctly. 21841 BB->addSuccessor(bumpMBB); 21842 BB->addSuccessor(mallocMBB); 21843 mallocMBB->addSuccessor(continueMBB); 21844 bumpMBB->addSuccessor(continueMBB); 21845 21846 // Take care of the PHI nodes. 21847 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 21848 MI->getOperand(0).getReg()) 21849 .addReg(mallocPtrVReg).addMBB(mallocMBB) 21850 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 21851 21852 // Delete the original pseudo instruction. 21853 MI->eraseFromParent(); 21854 21855 // And we're done. 21856 return continueMBB; 21857 } 21858 21859 MachineBasicBlock * 21860 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 21861 MachineBasicBlock *BB) const { 21862 assert(!Subtarget->isTargetMachO()); 21863 DebugLoc DL = MI->getDebugLoc(); 21864 MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( 21865 *BB->getParent(), *BB, MI, DL, false); 21866 MachineBasicBlock *ResumeBB = ResumeMI->getParent(); 21867 MI->eraseFromParent(); // The pseudo instruction is gone now. 21868 return ResumeBB; 21869 } 21870 21871 MachineBasicBlock * 21872 X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, 21873 MachineBasicBlock *BB) const { 21874 MachineFunction *MF = BB->getParent(); 21875 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 21876 MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); 21877 DebugLoc DL = MI->getDebugLoc(); 21878 21879 assert(!isAsynchronousEHPersonality( 21880 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && 21881 "SEH does not use catchret!"); 21882 21883 // Only 32-bit EH needs to worry about manually restoring stack pointers. 21884 if (!Subtarget->is32Bit()) 21885 return BB; 21886 21887 // C++ EH creates a new target block to hold the restore code, and wires up 21888 // the new block to the return destination with a normal JMP_4. 
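  // RestoreMBB itself will then hold just:
  //   EH_RESTORE            ; re-establish ESP from the frame
  //   JMP_4 <TargetMBB>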
21889 MachineBasicBlock *RestoreMBB = 21890 MF->CreateMachineBasicBlock(BB->getBasicBlock()); 21891 assert(BB->succ_size() == 1); 21892 MF->insert(std::next(BB->getIterator()), RestoreMBB); 21893 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); 21894 BB->addSuccessor(RestoreMBB); 21895 MI->getOperand(0).setMBB(RestoreMBB); 21896 21897 auto RestoreMBBI = RestoreMBB->begin(); 21898 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); 21899 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); 21900 return BB; 21901 } 21902 21903 MachineBasicBlock * 21904 X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, 21905 MachineBasicBlock *BB) const { 21906 MachineFunction *MF = BB->getParent(); 21907 const Constant *PerFn = MF->getFunction()->getPersonalityFn(); 21908 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); 21909 // Only 32-bit SEH requires special handling for catchpad. 21910 if (IsSEH && Subtarget->is32Bit()) { 21911 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 21912 DebugLoc DL = MI->getDebugLoc(); 21913 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); 21914 } 21915 MI->eraseFromParent(); 21916 return BB; 21917 } 21918 21919 MachineBasicBlock * 21920 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 21921 MachineBasicBlock *BB) const { 21922 // This is pretty easy. We're taking the value that we received from 21923 // our load from the relocation, sticking it in either RDI (x86-64) 21924 // or EAX and doing an indirect call. The return value will then 21925 // be in the normal return register. 21926 MachineFunction *F = BB->getParent(); 21927 const X86InstrInfo *TII = Subtarget->getInstrInfo(); 21928 DebugLoc DL = MI->getDebugLoc(); 21929 21930 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 21931 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 21932 21933 // Get a register mask for the lowered call. 21934 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 21935 // proper register mask. 21936 const uint32_t *RegMask = 21937 Subtarget->is64Bit() ? 
21938 Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : 21939 Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); 21940 if (Subtarget->is64Bit()) { 21941 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 21942 TII->get(X86::MOV64rm), X86::RDI) 21943 .addReg(X86::RIP) 21944 .addImm(0).addReg(0) 21945 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 21946 MI->getOperand(3).getTargetFlags()) 21947 .addReg(0); 21948 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 21949 addDirectMem(MIB, X86::RDI); 21950 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 21951 } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { 21952 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 21953 TII->get(X86::MOV32rm), X86::EAX) 21954 .addReg(0) 21955 .addImm(0).addReg(0) 21956 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 21957 MI->getOperand(3).getTargetFlags()) 21958 .addReg(0); 21959 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 21960 addDirectMem(MIB, X86::EAX); 21961 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 21962 } else { 21963 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 21964 TII->get(X86::MOV32rm), X86::EAX) 21965 .addReg(TII->getGlobalBaseReg(F)) 21966 .addImm(0).addReg(0) 21967 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 21968 MI->getOperand(3).getTargetFlags()) 21969 .addReg(0); 21970 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 21971 addDirectMem(MIB, X86::EAX); 21972 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 21973 } 21974 21975 MI->eraseFromParent(); // The pseudo instruction is gone now. 21976 return BB; 21977 } 21978 21979 MachineBasicBlock * 21980 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, 21981 MachineBasicBlock *MBB) const { 21982 DebugLoc DL = MI->getDebugLoc(); 21983 MachineFunction *MF = MBB->getParent(); 21984 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 21985 MachineRegisterInfo &MRI = MF->getRegInfo(); 21986 21987 const BasicBlock *BB = MBB->getBasicBlock(); 21988 MachineFunction::iterator I = ++MBB->getIterator(); 21989 21990 // Memory Reference 21991 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 21992 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 21993 21994 unsigned DstReg; 21995 unsigned MemOpndSlot = 0; 21996 21997 unsigned CurOp = 0; 21998 21999 DstReg = MI->getOperand(CurOp++).getReg(); 22000 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 22001 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 22002 unsigned mainDstReg = MRI.createVirtualRegister(RC); 22003 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 22004 22005 MemOpndSlot = CurOp; 22006 22007 MVT PVT = getPointerTy(MF->getDataLayout()); 22008 assert((PVT == MVT::i64 || PVT == MVT::i32) && 22009 "Invalid Pointer Size!"); 22010 22011 // For v = setjmp(buf), we generate 22012 // 22013 // thisMBB: 22014 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB 22015 // SjLjSetup restoreMBB 22016 // 22017 // mainMBB: 22018 // v_main = 0 22019 // 22020 // sinkMBB: 22021 // v = phi(main, restore) 22022 // 22023 // restoreMBB: 22024 // if base pointer being used, load it from frame 22025 // v_restore = 1 22026 22027 MachineBasicBlock *thisMBB = MBB; 22028 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 22029 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 22030 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 22031 MF->insert(I, mainMBB); 22032 MF->insert(I, sinkMBB); 
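  // restoreMBB is only reached via the longjmp edge, so park it at the end of
  // the function rather than in the fall-through chain.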
22033 MF->push_back(restoreMBB); 22034 restoreMBB->setHasAddressTaken(); 22035 22036 MachineInstrBuilder MIB; 22037 22038 // Transfer the remainder of BB and its successor edges to sinkMBB. 22039 sinkMBB->splice(sinkMBB->begin(), MBB, 22040 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 22041 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 22042 22043 // thisMBB: 22044 unsigned PtrStoreOpc = 0; 22045 unsigned LabelReg = 0; 22046 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 22047 Reloc::Model RM = MF->getTarget().getRelocationModel(); 22048 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && 22049 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); 22050 22051 // Prepare IP either in reg or imm. 22052 if (!UseImmLabel) { 22053 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 22054 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 22055 LabelReg = MRI.createVirtualRegister(PtrRC); 22056 if (Subtarget->is64Bit()) { 22057 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 22058 .addReg(X86::RIP) 22059 .addImm(0) 22060 .addReg(0) 22061 .addMBB(restoreMBB) 22062 .addReg(0); 22063 } else { 22064 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 22065 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 22066 .addReg(XII->getGlobalBaseReg(MF)) 22067 .addImm(0) 22068 .addReg(0) 22069 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) 22070 .addReg(0); 22071 } 22072 } else 22073 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; 22074 // Store IP 22075 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 22076 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 22077 if (i == X86::AddrDisp) 22078 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); 22079 else 22080 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 22081 } 22082 if (!UseImmLabel) 22083 MIB.addReg(LabelReg); 22084 else 22085 MIB.addMBB(restoreMBB); 22086 MIB.setMemRefs(MMOBegin, MMOEnd); 22087 // Setup 22088 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 22089 .addMBB(restoreMBB); 22090 22091 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 22092 MIB.addRegMask(RegInfo->getNoPreservedMask()); 22093 thisMBB->addSuccessor(mainMBB); 22094 thisMBB->addSuccessor(restoreMBB); 22095 22096 // mainMBB: 22097 // EAX = 0 22098 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 22099 mainMBB->addSuccessor(sinkMBB); 22100 22101 // sinkMBB: 22102 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 22103 TII->get(X86::PHI), DstReg) 22104 .addReg(mainDstReg).addMBB(mainMBB) 22105 .addReg(restoreDstReg).addMBB(restoreMBB); 22106 22107 // restoreMBB: 22108 if (RegInfo->hasBasePointer(*MF)) { 22109 const bool Uses64BitFramePtr = 22110 Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); 22111 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); 22112 X86FI->setRestoreBasePointer(MF); 22113 unsigned FramePtr = RegInfo->getFrameRegister(*MF); 22114 unsigned BasePtr = RegInfo->getBaseRegister(); 22115 unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64rm : X86::MOV32rm; 22116 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), 22117 FramePtr, true, X86FI->getRestoreBasePointerOffset()) 22118 .setMIFlag(MachineInstr::FrameSetup); 22119 } 22120 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 22121 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); 22122 restoreMBB->addSuccessor(sinkMBB); 22123 22124 MI->eraseFromParent(); 22125 return sinkMBB; 22126 } 22127 22128 MachineBasicBlock * 22129 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, 22130 MachineBasicBlock *MBB) const { 22131 DebugLoc DL = MI->getDebugLoc(); 22132 MachineFunction *MF = MBB->getParent(); 22133 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 22134 MachineRegisterInfo &MRI = MF->getRegInfo(); 22135 22136 // Memory Reference 22137 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 22138 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 22139 22140 MVT PVT = getPointerTy(MF->getDataLayout()); 22141 assert((PVT == MVT::i64 || PVT == MVT::i32) && 22142 "Invalid Pointer Size!"); 22143 22144 const TargetRegisterClass *RC = 22145 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; 22146 unsigned Tmp = MRI.createVirtualRegister(RC); 22147 // Since FP is only updated here but NOT referenced, it's treated as GPR. 22148 const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); 22149 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 22150 unsigned SP = RegInfo->getStackRegister(); 22151 22152 MachineInstrBuilder MIB; 22153 22154 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 22155 const int64_t SPOffset = 2 * PVT.getStoreSize(); 22156 22157 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 22158 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; 22159 22160 // Reload FP 22161 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 22162 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 22163 MIB.addOperand(MI->getOperand(i)); 22164 MIB.setMemRefs(MMOBegin, MMOEnd); 22165 // Reload IP 22166 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 22167 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 22168 if (i == X86::AddrDisp) 22169 MIB.addDisp(MI->getOperand(i), LabelOffset); 22170 else 22171 MIB.addOperand(MI->getOperand(i)); 22172 } 22173 MIB.setMemRefs(MMOBegin, MMOEnd); 22174 // Reload SP 22175 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 22176 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 22177 if (i == X86::AddrDisp) 22178 MIB.addDisp(MI->getOperand(i), SPOffset); 22179 else 22180 MIB.addOperand(MI->getOperand(i)); 22181 } 22182 MIB.setMemRefs(MMOBegin, MMOEnd); 22183 // Jump 22184 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 22185 22186 MI->eraseFromParent(); 22187 return MBB; 22188 } 22189 22190 // Replace 213-type (isel default) FMA3 instructions with 231-type for 22191 // accumulator loops. Writing back to the accumulator allows the coalescer 22192 // to remove extra copies in the loop. 22193 // FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). 22194 MachineBasicBlock * 22195 X86TargetLowering::emitFMA3Instr(MachineInstr *MI, 22196 MachineBasicBlock *MBB) const { 22197 MachineOperand &AddendOp = MI->getOperand(3); 22198 22199 // Bail out early if the addend isn't a register - we can't switch these. 
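  // (The 231 form ties the addend to the destination register, so it has to
  // be a virtual register we can rewrite.)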
22200 if (!AddendOp.isReg()) 22201 return MBB; 22202 22203 MachineFunction &MF = *MBB->getParent(); 22204 MachineRegisterInfo &MRI = MF.getRegInfo(); 22205 22206 // Check whether the addend is defined by a PHI: 22207 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); 22208 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); 22209 if (!AddendDef.isPHI()) 22210 return MBB; 22211 22212 // Look for the following pattern: 22213 // loop: 22214 // %addend = phi [%entry, 0], [%loop, %result] 22215 // ... 22216 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend 22217 22218 // Replace with: 22219 // loop: 22220 // %addend = phi [%entry, 0], [%loop, %result] 22221 // ... 22222 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 22223 22224 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { 22225 assert(AddendDef.getOperand(i).isReg()); 22226 MachineOperand PHISrcOp = AddendDef.getOperand(i); 22227 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); 22228 if (&PHISrcInst == MI) { 22229 // Found a matching instruction. 22230 unsigned NewFMAOpc = 0; 22231 switch (MI->getOpcode()) { 22232 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; 22233 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; 22234 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; 22235 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; 22236 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; 22237 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; 22238 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; 22239 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; 22240 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; 22241 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; 22242 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; 22243 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; 22244 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; 22245 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; 22246 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; 22247 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; 22248 case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; 22249 case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; 22250 case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; 22251 case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; 22252 22253 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; 22254 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; 22255 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; 22256 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; 22257 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; 22258 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; 22259 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; 22260 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; 22261 case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; 22262 case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; 22263 case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; 22264 case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; 22265 default: llvm_unreachable("Unrecognized FMA 
variant."); 22266 } 22267 22268 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 22269 MachineInstrBuilder MIB = 22270 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) 22271 .addOperand(MI->getOperand(0)) 22272 .addOperand(MI->getOperand(3)) 22273 .addOperand(MI->getOperand(2)) 22274 .addOperand(MI->getOperand(1)); 22275 MBB->insert(MachineBasicBlock::iterator(MI), MIB); 22276 MI->eraseFromParent(); 22277 } 22278 } 22279 22280 return MBB; 22281 } 22282 22283 MachineBasicBlock * 22284 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 22285 MachineBasicBlock *BB) const { 22286 switch (MI->getOpcode()) { 22287 default: llvm_unreachable("Unexpected instr type to insert"); 22288 case X86::TAILJMPd64: 22289 case X86::TAILJMPr64: 22290 case X86::TAILJMPm64: 22291 case X86::TAILJMPd64_REX: 22292 case X86::TAILJMPr64_REX: 22293 case X86::TAILJMPm64_REX: 22294 llvm_unreachable("TAILJMP64 would not be touched here."); 22295 case X86::TCRETURNdi64: 22296 case X86::TCRETURNri64: 22297 case X86::TCRETURNmi64: 22298 return BB; 22299 case X86::WIN_ALLOCA: 22300 return EmitLoweredWinAlloca(MI, BB); 22301 case X86::CATCHRET: 22302 return EmitLoweredCatchRet(MI, BB); 22303 case X86::CATCHPAD: 22304 return EmitLoweredCatchPad(MI, BB); 22305 case X86::SEG_ALLOCA_32: 22306 case X86::SEG_ALLOCA_64: 22307 return EmitLoweredSegAlloca(MI, BB); 22308 case X86::TLSCall_32: 22309 case X86::TLSCall_64: 22310 return EmitLoweredTLSCall(MI, BB); 22311 case X86::CMOV_FR32: 22312 case X86::CMOV_FR64: 22313 case X86::CMOV_FR128: 22314 case X86::CMOV_GR8: 22315 case X86::CMOV_GR16: 22316 case X86::CMOV_GR32: 22317 case X86::CMOV_RFP32: 22318 case X86::CMOV_RFP64: 22319 case X86::CMOV_RFP80: 22320 case X86::CMOV_V2F64: 22321 case X86::CMOV_V2I64: 22322 case X86::CMOV_V4F32: 22323 case X86::CMOV_V4F64: 22324 case X86::CMOV_V4I64: 22325 case X86::CMOV_V16F32: 22326 case X86::CMOV_V8F32: 22327 case X86::CMOV_V8F64: 22328 case X86::CMOV_V8I64: 22329 case X86::CMOV_V8I1: 22330 case X86::CMOV_V16I1: 22331 case X86::CMOV_V32I1: 22332 case X86::CMOV_V64I1: 22333 return EmitLoweredSelect(MI, BB); 22334 22335 case X86::RELEASE_FADD32mr: 22336 case X86::RELEASE_FADD64mr: 22337 return EmitLoweredAtomicFP(MI, BB); 22338 22339 case X86::FP32_TO_INT16_IN_MEM: 22340 case X86::FP32_TO_INT32_IN_MEM: 22341 case X86::FP32_TO_INT64_IN_MEM: 22342 case X86::FP64_TO_INT16_IN_MEM: 22343 case X86::FP64_TO_INT32_IN_MEM: 22344 case X86::FP64_TO_INT64_IN_MEM: 22345 case X86::FP80_TO_INT16_IN_MEM: 22346 case X86::FP80_TO_INT32_IN_MEM: 22347 case X86::FP80_TO_INT64_IN_MEM: { 22348 MachineFunction *F = BB->getParent(); 22349 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 22350 DebugLoc DL = MI->getDebugLoc(); 22351 22352 // Change the floating point control register to use "round towards zero" 22353 // mode when truncating to an integer value. 22354 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 22355 addFrameReference(BuildMI(*BB, MI, DL, 22356 TII->get(X86::FNSTCW16m)), CWFrameIdx); 22357 22358 // Load the old value of the high byte of the control word... 22359 unsigned OldCW = 22360 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 22361 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 22362 CWFrameIdx); 22363 22364 // Set the high part to be round to zero... 22365 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 22366 .addImm(0xC7F); 22367 22368 // Reload the modified control word now... 
22369 addFrameReference(BuildMI(*BB, MI, DL, 22370 TII->get(X86::FLDCW16m)), CWFrameIdx); 22371 22372 // Restore the memory image of control word to original value 22373 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 22374 .addReg(OldCW); 22375 22376 // Get the X86 opcode to use. 22377 unsigned Opc; 22378 switch (MI->getOpcode()) { 22379 default: llvm_unreachable("illegal opcode!"); 22380 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 22381 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 22382 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 22383 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 22384 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 22385 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 22386 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 22387 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 22388 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 22389 } 22390 22391 X86AddressMode AM; 22392 MachineOperand &Op = MI->getOperand(0); 22393 if (Op.isReg()) { 22394 AM.BaseType = X86AddressMode::RegBase; 22395 AM.Base.Reg = Op.getReg(); 22396 } else { 22397 AM.BaseType = X86AddressMode::FrameIndexBase; 22398 AM.Base.FrameIndex = Op.getIndex(); 22399 } 22400 Op = MI->getOperand(1); 22401 if (Op.isImm()) 22402 AM.Scale = Op.getImm(); 22403 Op = MI->getOperand(2); 22404 if (Op.isImm()) 22405 AM.IndexReg = Op.getImm(); 22406 Op = MI->getOperand(3); 22407 if (Op.isGlobal()) { 22408 AM.GV = Op.getGlobal(); 22409 } else { 22410 AM.Disp = Op.getImm(); 22411 } 22412 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 22413 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 22414 22415 // Reload the original control word now. 22416 addFrameReference(BuildMI(*BB, MI, DL, 22417 TII->get(X86::FLDCW16m)), CWFrameIdx); 22418 22419 MI->eraseFromParent(); // The pseudo instruction is gone now. 22420 return BB; 22421 } 22422 // String/text processing lowering. 22423 case X86::PCMPISTRM128REG: 22424 case X86::VPCMPISTRM128REG: 22425 case X86::PCMPISTRM128MEM: 22426 case X86::VPCMPISTRM128MEM: 22427 case X86::PCMPESTRM128REG: 22428 case X86::VPCMPESTRM128REG: 22429 case X86::PCMPESTRM128MEM: 22430 case X86::VPCMPESTRM128MEM: 22431 assert(Subtarget->hasSSE42() && 22432 "Target must have SSE4.2 or AVX features enabled"); 22433 return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); 22434 22435 // String/text processing lowering. 22436 case X86::PCMPISTRIREG: 22437 case X86::VPCMPISTRIREG: 22438 case X86::PCMPISTRIMEM: 22439 case X86::VPCMPISTRIMEM: 22440 case X86::PCMPESTRIREG: 22441 case X86::VPCMPESTRIREG: 22442 case X86::PCMPESTRIMEM: 22443 case X86::VPCMPESTRIMEM: 22444 assert(Subtarget->hasSSE42() && 22445 "Target must have SSE4.2 or AVX features enabled"); 22446 return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); 22447 22448 // Thread synchronization. 
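  // (EmitMonitor just moves the pseudo's operands into RAX/EAX, ECX and EDX
  // and then issues the real MONITORrrr, which reads them implicitly.)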
22449 case X86::MONITOR: 22450 return EmitMonitor(MI, BB, Subtarget); 22451 22452 // xbegin 22453 case X86::XBEGIN: 22454 return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); 22455 22456 case X86::VASTART_SAVE_XMM_REGS: 22457 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 22458 22459 case X86::VAARG_64: 22460 return EmitVAARG64WithCustomInserter(MI, BB); 22461 22462 case X86::EH_SjLj_SetJmp32: 22463 case X86::EH_SjLj_SetJmp64: 22464 return emitEHSjLjSetJmp(MI, BB); 22465 22466 case X86::EH_SjLj_LongJmp32: 22467 case X86::EH_SjLj_LongJmp64: 22468 return emitEHSjLjLongJmp(MI, BB); 22469 22470 case TargetOpcode::STATEPOINT: 22471 // As an implementation detail, STATEPOINT shares the STACKMAP format at 22472 // this point in the process. We diverge later. 22473 return emitPatchPoint(MI, BB); 22474 22475 case TargetOpcode::STACKMAP: 22476 case TargetOpcode::PATCHPOINT: 22477 return emitPatchPoint(MI, BB); 22478 22479 case X86::VFMADDPDr213r: 22480 case X86::VFMADDPSr213r: 22481 case X86::VFMADDSDr213r: 22482 case X86::VFMADDSSr213r: 22483 case X86::VFMSUBPDr213r: 22484 case X86::VFMSUBPSr213r: 22485 case X86::VFMSUBSDr213r: 22486 case X86::VFMSUBSSr213r: 22487 case X86::VFNMADDPDr213r: 22488 case X86::VFNMADDPSr213r: 22489 case X86::VFNMADDSDr213r: 22490 case X86::VFNMADDSSr213r: 22491 case X86::VFNMSUBPDr213r: 22492 case X86::VFNMSUBPSr213r: 22493 case X86::VFNMSUBSDr213r: 22494 case X86::VFNMSUBSSr213r: 22495 case X86::VFMADDSUBPDr213r: 22496 case X86::VFMADDSUBPSr213r: 22497 case X86::VFMSUBADDPDr213r: 22498 case X86::VFMSUBADDPSr213r: 22499 case X86::VFMADDPDr213rY: 22500 case X86::VFMADDPSr213rY: 22501 case X86::VFMSUBPDr213rY: 22502 case X86::VFMSUBPSr213rY: 22503 case X86::VFNMADDPDr213rY: 22504 case X86::VFNMADDPSr213rY: 22505 case X86::VFNMSUBPDr213rY: 22506 case X86::VFNMSUBPSr213rY: 22507 case X86::VFMADDSUBPDr213rY: 22508 case X86::VFMADDSUBPSr213rY: 22509 case X86::VFMSUBADDPDr213rY: 22510 case X86::VFMSUBADDPSr213rY: 22511 return emitFMA3Instr(MI, BB); 22512 } 22513 } 22514 22515 //===----------------------------------------------------------------------===// 22516 // X86 Optimization Hooks 22517 //===----------------------------------------------------------------------===// 22518 22519 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 22520 APInt &KnownZero, 22521 APInt &KnownOne, 22522 const SelectionDAG &DAG, 22523 unsigned Depth) const { 22524 unsigned BitWidth = KnownZero.getBitWidth(); 22525 unsigned Opc = Op.getOpcode(); 22526 assert((Opc >= ISD::BUILTIN_OP_END || 22527 Opc == ISD::INTRINSIC_WO_CHAIN || 22528 Opc == ISD::INTRINSIC_W_CHAIN || 22529 Opc == ISD::INTRINSIC_VOID) && 22530 "Should use MaskedValueIsZero if you don't know whether Op" 22531 " is a target node!"); 22532 22533 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 22534 switch (Opc) { 22535 default: break; 22536 case X86ISD::ADD: 22537 case X86ISD::SUB: 22538 case X86ISD::ADC: 22539 case X86ISD::SBB: 22540 case X86ISD::SMUL: 22541 case X86ISD::UMUL: 22542 case X86ISD::INC: 22543 case X86ISD::DEC: 22544 case X86ISD::OR: 22545 case X86ISD::XOR: 22546 case X86ISD::AND: 22547 // These nodes' second result is a boolean. 
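    // Only bit 0 of that boolean result can be set, so all the bits above it
    // are known zero; fall through and treat it like SETCC.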
22548 if (Op.getResNo() == 0) 22549 break; 22550 // Fallthrough 22551 case X86ISD::SETCC: 22552 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 22553 break; 22554 case ISD::INTRINSIC_WO_CHAIN: { 22555 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 22556 unsigned NumLoBits = 0; 22557 switch (IntId) { 22558 default: break; 22559 case Intrinsic::x86_sse_movmsk_ps: 22560 case Intrinsic::x86_avx_movmsk_ps_256: 22561 case Intrinsic::x86_sse2_movmsk_pd: 22562 case Intrinsic::x86_avx_movmsk_pd_256: 22563 case Intrinsic::x86_mmx_pmovmskb: 22564 case Intrinsic::x86_sse2_pmovmskb_128: 22565 case Intrinsic::x86_avx2_pmovmskb: { 22566 // High bits of movmskp{s|d}, pmovmskb are known zero. 22567 switch (IntId) { 22568 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 22569 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 22570 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 22571 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 22572 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 22573 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 22574 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 22575 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 22576 } 22577 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 22578 break; 22579 } 22580 } 22581 break; 22582 } 22583 } 22584 } 22585 22586 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( 22587 SDValue Op, 22588 const SelectionDAG &, 22589 unsigned Depth) const { 22590 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 22591 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 22592 return Op.getValueType().getScalarSizeInBits(); 22593 22594 // Fallback case. 22595 return 1; 22596 } 22597 22598 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 22599 /// node is a GlobalAddress + offset. 
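/// For example, (X86ISD::Wrapper (GlobalAddress @g + 8)) yields GA = @g and
/// Offset = 8.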
22600 bool X86TargetLowering::isGAPlusOffset(SDNode *N, 22601 const GlobalValue* &GA, 22602 int64_t &Offset) const { 22603 if (N->getOpcode() == X86ISD::Wrapper) { 22604 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 22605 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 22606 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 22607 return true; 22608 } 22609 } 22610 return TargetLowering::isGAPlusOffset(N, GA, Offset); 22611 } 22612 22613 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 22614 /// same as extracting the high 128-bit part of 256-bit vector and then 22615 /// inserting the result into the low part of a new 256-bit vector 22616 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 22617 EVT VT = SVOp->getValueType(0); 22618 unsigned NumElems = VT.getVectorNumElements(); 22619 22620 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 22621 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 22622 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 22623 SVOp->getMaskElt(j) >= 0) 22624 return false; 22625 22626 return true; 22627 } 22628 22629 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 22630 /// same as extracting the low 128-bit part of 256-bit vector and then 22631 /// inserting the result into the high part of a new 256-bit vector 22632 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 22633 EVT VT = SVOp->getValueType(0); 22634 unsigned NumElems = VT.getVectorNumElements(); 22635 22636 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 22637 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 22638 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 22639 SVOp->getMaskElt(j) >= 0) 22640 return false; 22641 22642 return true; 22643 } 22644 22645 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 22646 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 22647 TargetLowering::DAGCombinerInfo &DCI, 22648 const X86Subtarget* Subtarget) { 22649 SDLoc dl(N); 22650 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 22651 SDValue V1 = SVOp->getOperand(0); 22652 SDValue V2 = SVOp->getOperand(1); 22653 MVT VT = SVOp->getSimpleValueType(0); 22654 unsigned NumElems = VT.getVectorNumElements(); 22655 22656 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 22657 V2.getOpcode() == ISD::CONCAT_VECTORS) { 22658 // 22659 // 0,0,0,... 22660 // | 22661 // V UNDEF BUILD_VECTOR UNDEF 22662 // \ / \ / 22663 // CONCAT_VECTOR CONCAT_VECTOR 22664 // \ / 22665 // \ / 22666 // RESULT: V + zero extended 22667 // 22668 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 22669 V2.getOperand(1).getOpcode() != ISD::UNDEF || 22670 V1.getOperand(1).getOpcode() != ISD::UNDEF) 22671 return SDValue(); 22672 22673 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 22674 return SDValue(); 22675 22676 // To match the shuffle mask, the first half of the mask should 22677 // be exactly the first vector, and all the rest a splat with the 22678 // first element of the second one. 22679 for (unsigned i = 0; i != NumElems/2; ++i) 22680 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 22681 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 22682 return SDValue(); 22683 22684 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
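    // Roughly: (shuffle (concat (load p), undef), (concat 0, undef), <0,1,4,4>)
    //   --> (vzext_load p), provided the load result has no other uses.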
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
      if (Ld->hasNUsesOfValue(1, 0)) {
        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
        SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                  Ld->getMemoryVT(),
                                  Ld->getPointerInfo(),
                                  Ld->getAlignment(),
                                  false/*isVolatile*/, true/*ReadMem*/,
                                  false/*WriteMem*/);

        // Make sure the newly-created LOAD is in the same position as Ld in
        // terms of dependency. We create a TokenFactor for Ld and ResNode,
        // and update uses of Ld's output chain to use the TokenFactor.
        if (Ld->hasAnyUseOfValue(1)) {
          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
                                 SDValue(ResNode.getNode(), 1));
        }

        return DAG.getBitcast(VT, ResNode);
      }
    }

    // Emit a zeroed vector and insert the desired subvector into its
    // first half.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

  //===--------------------------------------------------------------------===//
  // Combine some shuffles into subvector extracts and inserts:
  //

  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (isShuffleHigh128VectorInsertLow(SVOp)) {
    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (isShuffleLow128VectorInsertHigh(SVOp)) {
    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

  return SDValue();
}

/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
                                   int Depth, bool HasPSHUFB, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");

  // Find the operand that enters the chain. Note that multiple uses are OK
  // here, we're not going to remove the operand we find.
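  // (The loop below also looks through bitcasts: they never change the
  // underlying bytes being shuffled, only how those bytes are typed.)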
  SDValue Input = Op.getOperand(0);
  while (Input.getOpcode() == ISD::BITCAST)
    Input = Input.getOperand(0);

  MVT VT = Input.getSimpleValueType();
  MVT RootVT = Root.getSimpleValueType();
  SDLoc DL(Root);

  if (Mask.size() == 1) {
    int Index = Mask[0];
    assert((Index >= 0 || Index == SM_SentinelUndef ||
            Index == SM_SentinelZero) &&
           "Invalid shuffle index found!");

    // We may end up with an accumulated mask of size 1 as a result of
    // widening of shuffle operands (see function canWidenShuffleElements).
    // If the only shuffle index is equal to SM_SentinelZero then propagate
    // a zero vector. Otherwise, the combined shuffle mask is a no-op shuffle
    // mask, and therefore the entire chain of shuffles can be folded away.
    if (Index == SM_SentinelZero)
      DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
    else
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
                    /*AddTo*/ true);
    return true;
  }

  // Use the float domain if the operand type is a floating point type.
  bool FloatDomain = VT.isFloatingPoint();

  // For floating point shuffles, we don't have free copies in the shuffle
  // instructions or the ability to load as part of the instruction, so
  // canonicalize their shuffles to UNPCK or MOV variants.
  //
  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
  // vectors because it can have a load folded into it that UNPCK cannot. This
  // doesn't preclude something switching to the shorter encoding post-RA.
  //
  // FIXME: Should teach these routines about AVX vector widths.
  if (FloatDomain && VT.is128BitVector()) {
    if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
      bool Lo = Mask.equals({0, 0});
      unsigned Shuffle;
      MVT ShuffleVT;
      // Check if we have SSE3 which will let us use MOVDDUP. That instruction
      // is no slower than UNPCKLPD but has the option to fold the input operand
      // into even an unaligned memory load.
      if (Lo && Subtarget->hasSSE3()) {
        Shuffle = X86ISD::MOVDDUP;
        ShuffleVT = MVT::v2f64;
      } else {
        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
        // than the UNPCK variants.
        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
        ShuffleVT = MVT::v4f32;
      }
      if (Depth == 1 && Root->getOpcode() == Shuffle)
        return false; // Nothing to do!
      Op = DAG.getBitcast(ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
      if (Shuffle == X86ISD::MOVDDUP)
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
      else
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
      DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
    if (Subtarget->hasSSE3() &&
        (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
      bool Lo = Mask.equals({0, 0, 2, 2});
      unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
      MVT ShuffleVT = MVT::v4f32;
      if (Depth == 1 && Root->getOpcode() == Shuffle)
        return false; // Nothing to do!
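      // For illustration: a v4f32 mask <0,0,2,2> duplicates the even lanes,
      // which is exactly MOVSLDUP; <1,1,3,3> duplicates the odd lanes and
      // maps to MOVSHDUP.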
      Op = DAG.getBitcast(ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
      DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
    if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
      bool Lo = Mask.equals({0, 0, 1, 1});
      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      MVT ShuffleVT = MVT::v4f32;
      if (Depth == 1 && Root->getOpcode() == Shuffle)
        return false; // Nothing to do!
      Op = DAG.getBitcast(ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
      DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
  }

  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
  // variants as none of these have single-instruction variants that are
  // superior to the UNPCK formulation.
  if (!FloatDomain && VT.is128BitVector() &&
      (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
       Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
       Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
       Mask.equals(
           {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
    bool Lo = Mask[0] == 0;
    unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
    if (Depth == 1 && Root->getOpcode() == Shuffle)
      return false; // Nothing to do!
    MVT ShuffleVT;
    switch (Mask.size()) {
    case 8:
      ShuffleVT = MVT::v8i16;
      break;
    case 16:
      ShuffleVT = MVT::v16i8;
      break;
    default:
      llvm_unreachable("Impossible mask size!");
    }
    Op = DAG.getBitcast(ShuffleVT, Input);
    DCI.AddToWorklist(Op.getNode());
    Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
    DCI.AddToWorklist(Op.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                  /*AddTo*/ true);
    return true;
  }

  // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return false;

  // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
  // can replace them with a single PSHUFB instruction profitably. Intel's
  // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
  // in practice PSHUFB tends to be *very* fast so we're more aggressive.
  if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
    SmallVector<SDValue, 16> PSHUFBMask;
    int NumBytes = VT.getSizeInBits() / 8;
    int Ratio = NumBytes / Mask.size();
    for (int i = 0; i < NumBytes; ++i) {
      if (Mask[i / Ratio] == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      int M = Mask[i / Ratio] != SM_SentinelZero
                  ? Ratio * Mask[i / Ratio] + i % Ratio
                  : 255;
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
    Op = DAG.getBitcast(ByteVT, Input);
    DCI.AddToWorklist(Op.getNode());
    SDValue PSHUFBMaskOp =
        DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
    Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
    DCI.AddToWorklist(Op.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                  /*AddTo*/ true);
    return true;
  }

  // Failed to find any combines.
  return false;
}

/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available. We do this as the last combining step
///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
///    use a register or have to read from memory and so is slightly (but only
///    slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
                                          ArrayRef<int> RootMask,
                                          int Depth, bool HasPSHUFB,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget *Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;

  // Directly rip through bitcasts to find the underlying operand.
  while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
    Op = Op.getOperand(0);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return false; // Bail if we hit a non-vector.
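  // For illustration (hypothetical masks): composing a root mask <1,0> over
  // two 64-bit lanes with an op's dword mask <2,3,0,1> works element by
  // element at dword granularity in the merge loop below and yields
  // <0,1,2,3> -- the two half-swaps cancel, and the resulting no-op mask can
  // then be folded away entirely.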

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  if (!isTargetShuffle(Op.getOpcode()))
    return false;
  SmallVector<int, 16> OpMask;
  bool IsUnary;
  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
  // We can only combine unary shuffles for which we can decode the mask.
  if (!HaveMask || !IsUnary)
    return false;

  assert(VT.getVectorNumElements() == OpMask.size() &&
         "Different mask size from vector size!");
  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");
  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
  assert(((RootRatio == 1 && OpRatio == 1) ||
          (RootRatio == 1) != (OpRatio == 1)) &&
         "Must not have a ratio for both incoming and op masks!");

  SmallVector<int, 16> Mask;
  Mask.reserve(std::max(OpMask.size(), RootMask.size()));

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
  for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
    int RootIdx = i / RootRatio;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask.push_back(RootMask[RootIdx]);
      continue;
    }

    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
    int OpIdx = RootMaskedIdx / OpRatio;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask.push_back(OpMask[OpIdx]);
      continue;
    }

    // Ok, we have non-zero lanes, map them through.
    Mask.push_back(OpMask[OpIdx] * OpRatio +
                   RootMaskedIdx % OpRatio);
  }

  // See if we can recurse into the operand to combine more things.
  switch (Op.getOpcode()) {
  case X86ISD::PSHUFB:
    HasPSHUFB = true;
    // Fallthrough
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
    if (Op.getOperand(0).hasOneUse() &&
        combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
                                      HasPSHUFB, DAG, DCI, Subtarget))
      return true;
    break;

  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
    assert(Op.getOperand(0) == Op.getOperand(1) &&
           "We only combine unary shuffles!");
    // We can't check for single use, we have to check that this shuffle is the
    // only user.
    if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
        combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
                                      HasPSHUFB, DAG, DCI, Subtarget))
      return true;
    break;
  }

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below.
  // All this does is detect masks with sequential pairs of elements, and
  // shrink them to the half-width mask. It does this in a loop so it will
  // reduce the size of the mask to the minimal width mask which performs an
  // equivalent shuffle.
  SmallVector<int, 16> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
    WidenedMask.clear();
  }

  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
                                Subtarget);
}

/// \brief Get the PSHUF-style mask from a PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  bool IsUnary;
  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}

/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
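      // For illustration: a pshufd mask like <0,1,3,2> qualifies -- the low
      // dwords stay put and the high dwords permute only among themselves,
      // so a preceding pshuflw touches words this pshufd never moves.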
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          // Fallthrough!
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}

/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
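/// For example, in (pshuflw (pshufhw X)) the pshufhw only permutes the high
/// half, so the walk skips it as a no-op and keeps looking for another
/// pshuflw whose mask can be merged with ours.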
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                        SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
  unsigned CombineOpcode = N.getOpcode();

  // Walk up a single-use chain looking for a combinable shuffle.
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;

  // Combine away the bottom node as its shuffle will be accumulated into
  // a preceding shuffle.
  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Record the old value.
  SDValue Old = V;

  // Merge this node's mask and our incoming mask (adjusted to account for all
  // the pshufd instructions encountered).
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
    // Replace the combinable shuffle with the combined one, updating all users
    // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}

/// \brief Try to combine x86 target specific shuffles.
static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //     undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      return SDValue();

    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    if (Op0.getOpcode() == ISD::UNDEF &&
        Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  default:
    return SDValue();
  }

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      DCI.AddToWorklist(V.getNode());
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      DCI.AddToWorklist(V.getNode());
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = V.getOperand(0);
      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
        D = D.getOperand(0);
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }

    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
      return NewN;

    break;
  }

  return SDValue();
}

/// \brief Try to combine a shuffle into a target-specific add-sub node.
///
/// We combine this directly on the abstract vector shuffle nodes so it is
/// easier to generically match. We also insert dummy vector shuffle nodes for
/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();

  auto *SVN = cast<ShuffleVectorSDNode>(N);
  SmallVector<int, 8> Mask;
  for (int M : SVN->getMask())
    Mask.push_back(M);

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  // We require the first shuffle operand to be the FSUB node, and the second to
  // be the FADD node.
  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(V1, V2);
  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
    return SDValue();

  // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())
    return SDValue();

  // Ensure that both operations have the same operands. Note that we can
  // commute the FADD operands.
  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
    return SDValue();

  // We're looking for blends between FADD and FSUB nodes. We insist on these
  // nodes being lined up in a specific expected pattern.
  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
    return SDValue();

  // Only specific types are legal at this point, assert so we notice if and
  // when these change.
  assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
          VT == MVT::v4f64) &&
         "Unknown vector type encountered!");

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}

/// PerformShuffleCombine - Performs several different shuffle combines.
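/// In rough order: ADDSUB formation, the 256-bit combines above, folding a
/// shuffle of a bitcast binop into a binop of bitcasts, merging consecutive
/// loads into a single wide load, and finally the target-specific and
/// recursive x86 shuffle combines.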
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // Don't create instructions with illegal types after legalize types has run.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
    return SDValue();

  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
  if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
    if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
      return AddSub;

  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode.
  if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE)
    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
      N0.getOpcode() == ISD::BITCAST) {
    SDValue BC0 = N0.getOperand(0);
    EVT SVT = BC0.getValueType();
    unsigned Opcode = BC0.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    if (BC0.hasOneUse() && SVT.isVector() &&
        SVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
      switch (Opcode) {
      default : break;
      case ISD::ADD :
      case ISD::FADD :
      case ISD::SUB :
      case ISD::FSUB :
      case ISD::MUL :
      case ISD::FMUL :
        CanFold = true;
      }

      unsigned SVTNumElts = SVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) < 0;

      if (CanFold) {
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
      }
    }
  }

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));

  if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
    return LD;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Shuffle =
        PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
    if (Shuffle.getNode())
      return Shuffle;

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
                                      /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}

/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
/// specific shuffle of a load can be folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                                TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();

  if (InVec.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!InVec.hasOneUse())
      return SDValue();
    EVT BCVT = InVec.getOperand(0).getValueType();
    if (!BCVT.isVector() ||
        BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
      return SDValue();
    InVec = InVec.getOperand(0);
  }

  EVT CurrentVT = InVec.getValueType();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
                            ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against an out-of-range extract index.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                         : InVec.getOperand(1);

  // If both inputs to the shuffle are the same, allow 2 uses.
  unsigned AllowedUses = InVec.getNumOperands() > 1 &&
                         InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  EVT EltVT = N->getValueType(0);
  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job.
  SDLoc dl(N);

  // Create shuffle node taking into account the case that it's a unary shuffle.
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
                                   : InVec.getOperand(1);
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
                                 InVec.getOperand(0), Shuffle,
                                 &ShuffleMask[0]);
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}

static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
  // special and don't usually play with other vector types, it's better to
  // handle them early to be sure we emit efficient code by avoiding
  // store-load conversions.
  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
      N0.getValueType() == MVT::v2i32 &&
      isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType() == MVT::i32)
      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
  }

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand and one constant operand into a floating-point
  // logic operation. This may create a load of the constant, but that is
  // cheaper than materializing the constant in an integer register and
  // transferring it to an SSE register or transferring the SSE operand to
  // integer register and back.
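  // For illustration: (f32 bitcast (i32 and (i32 bitcast f32:X), C)) becomes
  // (X86ISD::FAND X, (f32 bitcast C)), keeping the value in the SSE domain.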
  unsigned FPOpcode;
  switch (N0.getOpcode()) {
  case ISD::AND: FPOpcode = X86ISD::FAND; break;
  case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
  default: return SDValue();
  }
  if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
       (Subtarget->hasSSE2() && VT == MVT::f64)) &&
      isa<ConstantSDNode>(N0.getOperand(1)) &&
      N0.getOperand(0).getOpcode() == ISD::BITCAST &&
      N0.getOperand(0).getOperand(0).getValueType() == VT) {
    SDValue N000 = N0.getOperand(0).getOperand(0);
    SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
    return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
  }

  return SDValue();
}

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// into a somewhat faster sequence. For i686, the best sequence is apparently
/// storing the value and loading scalars back, while for x64 we should
/// use 64-bit extracts and shifts.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                TargetLowering::DAGCombinerInfo &DCI) {
  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
    return NewOp;

  SDValue InputVector = N->getOperand(0);
  SDLoc dl(InputVector);
  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      N->getValueType(0) == MVT::i32 &&
      InputVector.getValueType() == MVT::v2i32) {

    // The bitcast source is a direct mmx result.
    SDValue MMXSrc = InputVector.getNode()->getOperand(0);
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
                         N->getValueType(0),
                         InputVector.getNode()->getOperand(0));

    // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
    if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
        MMXSrc.getValueType() == MVT::i64) {
      SDValue MMXSrcOp = MMXSrc.getOperand(0);
      if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
          MMXSrcOp.getValueType() == MVT::v1i64 &&
          MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
        return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
                           N->getValueType(0), MMXSrcOp.getOperand(0));
    }
  }

  EVT VT = N->getValueType(0);

  if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
      InputVector.getOpcode() == ISD::BITCAST &&
      isa<ConstantSDNode>(InputVector.getOperand(0))) {
    uint64_t ExtractedElt =
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    uint64_t InputValue =
        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
    return DAG.getConstant(Res, dl, MVT::i1);
  }
  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
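  // That is, every use must look like (sext/zext (extract_elt V, i)) with a
  // constant i, and together the uses must cover all four elements.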
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
        1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  // If 64-bit shifts are legal, use the extract-shift sequence,
  // otherwise bounce the vector off the cache.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vals[4];

  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
    auto &DL = DAG.getDataLayout();
    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                     DAG.getConstant(0, dl, VecIdxTy));
    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                  DAG.getConstant(1, dl, VecIdxTy));

    SDValue ShAmt = DAG.getConstant(
        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
  } else {
    // Store the value to a temporary stack slot.
    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                              MachinePointerInfo(), false, false, 0);

    EVT ElementType = InputVector.getValueType().getVectorElementType();
    unsigned EltSize = ElementType.getSizeInBits() / 8;

    // Replace each use (extract) with a load of the appropriate element.
    for (unsigned i = 0; i < 4; ++i) {
      uint64_t Offset = EltSize * i;
      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

      SDValue ScalarAddr =
          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

      // Load the scalar.
      Vals[i] = DAG.getLoad(ElementType, dl, Ch,
                            ScalarAddr, MachinePointerInfo(),
                            false, false, false, 0);
    }
  }

  // Replace the extracts
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    SDValue Idx = Extract->getOperand(1);
    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}

static SDValue
transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  SDLoc dl(N);
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
    SDValue CondSrc = Cond->getOperand(0);
    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
      Cond = CondSrc->getOperand(0);
  }

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();

  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
    return SDValue();

  unsigned MaskValue = 0;
  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> ShuffleMask(NumElems, -1);
  for (unsigned i = 0; i < NumElems; ++i) {
    // Be sure we emit undef where we can.
    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
      ShuffleMask[i] = -1;
    else
      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
    return SDValue();
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
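  // For illustration: (select (setcc x, y, setolt), x, y) can become
  // (X86ISD::FMIN x, y) below, while the ule/uge cases first swap the
  // operands so the resulting node still honors the NaN and -0.0 caveats.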
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
      VT != MVT::f80 && VT != MVT::f128 &&
      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
      (Subtarget->hasSSE2() ||
       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
      // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  EVT CondVT = Cond.getValueType();
  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
      CondVT.getVectorElementType() == MVT::i1) {
    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
    // lowering on KNL. In this case we convert it to
    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
    // The same situation holds for all 128- and 256-bit vectors of i8 and i16.
    // Since SKX these selects have a proper lowering.
    EVT OpVT = LHS.getValueType();
    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
        (OpVT.getVectorElementType() == MVT::i8 ||
         OpVT.getVectorElementType() == MVT::i16) &&
        !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
      DCI.AddToWorklist(Cond.getNode());
      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
    }
  }
  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, DL, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, DL, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, DL, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction. This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base(    , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base(    , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base(    , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, DL, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, DL,
                                                 Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
  // the need for an extra compare against zero. e.g.
  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
  //   subl   %esi, %edi
  //   testl  %edi, %edi
  //   movl   $0, %eax
  //   cmovgl %edi, %eax
  // =>
  //   xorl   %eax, %eax
  //   subl   %esi, %edi
  //   cmovsl %eax, %edi
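  // In C terms the pattern being canonicalized is, e.g.:
  //   int f(int x, int y) { return x - y > 0 ? x - y : 0; }
  // where switching to >= lets the cmov test the sign flag that the
  // subtraction already set, instead of requiring a separate test.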
  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    switch (CC) {
    default: break;
    case ISD::SETLT:
    case ISD::SETGT: {
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
    }
    }
  }

  // Early exit check.
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Match VSELECTs into subs with unsigned saturation.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Check if one of the arms of the VSELECT is a zero vector. If it's on
    // the left side, invert the predicate to simplify the logic below.
    SDValue Other;
    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
      Other = RHS;
      CC = ISD::getSetCCInverse(CC, true);
    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
      Other = LHS;
    }

    if (Other.getNode() && Other->getNumOperands() == 2 &&
        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
      SDValue CondRHS = Cond->getOperand(1);

      // Look for a general sub with unsigned saturation first.
      // x >= y ? x-y : 0 --> subus x, y
      // x >  y ? x-y : 0 --> subus x, y
      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
              // If the RHS is a constant we have to reverse the const
              // canonicalization.
              // x > C-1 ? x+(-C) : 0 --> subus x, C
              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
                  CondRHSConst->getAPIntValue() ==
                      (-OpRHSConst->getAPIntValue() - 1))
                return DAG.getNode(
                    X86ISD::SUBUS, DL, VT, OpLHS,
                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

          // Another special case: If C was a sign bit, the sub has been
          // canonicalized into a xor.
          // FIXME: Would it be better to use computeKnownBits to determine
          //        whether it's safe to decanonicalize the xor?
          // x s< 0 ? x^C : 0 --> subus x, C
          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
              OpRHSConst->getAPIntValue().isSignBit())
            // Note that we have to rebuild the RHS constant here to ensure we
            // don't rely on particular values of undef lanes.
            return DAG.getNode(
                X86ISD::SUBUS, DL, VT, OpLHS,
                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
        }
    }
  }

  // Simplify vector selection if the condition value type matches the vselect
  // operand type.
  if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
    assert(Cond.getValueType().isVector() &&
           "vector select expects a vector selector!");

    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

    // Try to invert the condition if the true value is not all 1s and the
    // false value is not all 0s.
    if (!TValIsAllOnes && !FValIsAllZeros &&
        // Check if the selector will be produced by CMPP*/PCMP*.
        Cond.getOpcode() == ISD::SETCC &&
        // Check if SETCC has already been promoted.
        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
            CondVT) {
      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

      if (TValIsAllZeros || FValIsAllOnes) {
        SDValue CC = Cond.getOperand(2);
        ISD::CondCode NewCC =
            ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                 Cond.getOperand(0).getValueType().isInteger());
        Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                            NewCC);
        std::swap(LHS, RHS);
        TValIsAllOnes = FValIsAllOnes;
        FValIsAllZeros = TValIsAllZeros;
      }
    }

    if (TValIsAllOnes || FValIsAllZeros) {
      SDValue Ret;

      if (TValIsAllOnes && FValIsAllZeros)
        Ret = Cond;
      else if (TValIsAllOnes)
        Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
                          DAG.getBitcast(CondVT, RHS));
      else if (FValIsAllZeros)
        Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
                          DAG.getBitcast(CondVT, LHS));

      return DAG.getBitcast(VT, Ret);
    }
  }

  // We should generate an X86ISD::BLENDI from a vselect if its argument
  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
  // constants. This specific pattern gets generated when we split a
  // selector for a 512-bit vector on a machine without AVX512 (but with
  // 256-bit vectors), during legalization:
  //
  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
  //
  // If we find this pattern and the build_vectors are built from
  // constants, we translate the vselect into a shuffle_vector that we
  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
  if ((N->getOpcode() == ISD::VSELECT ||
       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
      !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
    SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
    if (Shuffle.getNode())
      return Shuffle;
  }

  // If this is a *dynamic* select (non-constant condition) and we can match
  // this node with one of the variable blend instructions, restructure the
  // condition so that the blends can use the high bit of each element and use
  // SimplifyDemandedBits to simplify the condition operand.
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
      !DCI.isBeforeLegalize() &&
      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
    unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();

    // Don't optimize vector selects that map to mask-registers.
    if (BitWidth == 1)
      return SDValue();

    // We can only handle the cases where VSELECT is directly legal on the
    // subtarget. We custom lower VSELECT nodes with constant conditions and
    // this makes it hard to see whether a dynamic VSELECT will correctly
    // lower, so we both check the operation's status and explicitly handle the
    // cases where a *dynamic* blend will fail even though a constant-condition
    // blend could be custom lowered.
    // FIXME: We should find a better way to handle this class of problems.
    // Potentially, we should combine constant-condition vselect nodes
    // pre-legalization into shuffles and not mark as many types as custom
    // lowered.
    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
      return SDValue();
    // FIXME: We don't support i16-element blends currently. We could and
    // should support them by making *all* the bits in the condition be set
    // rather than just the high bit and using an i8-element blend.
    if (VT.getVectorElementType() == MVT::i16)
      return SDValue();
    // Dynamic blending was only available from SSE4.1 onward.
    if (VT.is128BitVector() && !Subtarget->hasSSE41())
      return SDValue();
    // Byte blends are only available in AVX2.
    if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
      return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
                                 TLO)) {
      // If we changed the computation somewhere in the DAG, this change
      // will affect all users of Cond.
      // Make sure it is fine and update all the nodes so that we do not
      // use the generic VSELECT anymore. Otherwise, we may perform
      // wrong optimizations as we messed up with the actual expectation
      // for the vector boolean values.
      if (Cond != TLO.Old) {
        // Check all uses of the condition operand to check whether it will be
        // consumed by non-BLEND instructions, which may depend on all bits
        // being set properly.
        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
             I != E; ++I)
          if (I->getOpcode() != ISD::VSELECT)
            // TODO: Add other opcodes eventually lowered into BLEND.
            return SDValue();

        // Update all the users of the condition before committing the change,
        // so that the VSELECT optimizations that expect the correct vector
        // boolean value will not be triggered.
        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
             I != E; ++I)
          DAG.ReplaceAllUsesOfValueWith(
              SDValue(*I, 0),
              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
                          Cond, I->getOperand(1), I->getOperand(2)));
        DCI.CommitTargetLoweringOpt(TLO);
        return SDValue();
      }
      // At this point, only Cond is changed. Change the condition
      // just for N to keep the opportunity to optimize all other
      // users their own way.
      DAG.ReplaceAllUsesOfValueWith(
          SDValue(N, 0),
          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
                      TLO.New, N->getOperand(1), N->getOperand(2)));
      return SDValue();
    }
  }

  return SDValue();
}

// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // Quit unless this is a CMP, or a SUB whose value result is unused.
  if (Cmp.getOpcode() != X86ISD::CMP &&
      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // a SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = nullptr;
  bool needOppositeCond = (CC == X86::COND_E);
  bool checkAgainstTrue = false; // Is it a comparison against 1?

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if neither operand is a constant.
    return SDValue();

  if (C->getZExtValue() == 1) {
    needOppositeCond = !needOppositeCond;
    checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
    return SDValue();

  bool truncatedToBoolWithAnd = false;
  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
         SetCC.getOpcode() == ISD::TRUNCATE ||
         SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
      if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
      if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx == -1)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC_CARRY:
    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
    // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    // FALL THROUGH
  case X86ISD::SETCC:
    // Set the condition code or opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether the false/true values are canonical, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if the true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if the false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
      // Skip 'zext' or 'trunc' node.
      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
          Op.getOpcode() == ISD::TRUNCATE)
        Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if the false cond is
      // found.
      if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
    // Quit if the false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, the opposite cond is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}

/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
///   (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
  }

  isAnd = false;

  SDValue SetCC0, SetCC1;
  switch (Cond->getOpcode()) {
  default: return false;
  case ISD::AND:
  case X86ISD::AND:
    isAnd = true;
    // fallthru
  case ISD::OR:
  case X86ISD::OR:
    SetCC0 = Cond->getOperand(0);
    SetCC1 = Cond->getOperand(1);
    break;
  }

  // Make sure we have SETCC nodes, using the same flags value.
  if (SetCC0.getOpcode() != X86ISD::SETCC ||
      SetCC1.getOpcode() != X86ISD::SETCC ||
      SetCC0->getOperand(1) != SetCC1->getOperand(1))
    return false;

  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
  Flags = SetCC0->getOperand(1);
  return true;
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  SDLoc DL(N);

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  if (CC == X86::COND_E || CC == X86::COND_NE) {
    switch (Cond.getOpcode()) {
    default: break;
    case X86ISD::BSR:
    case X86ISD::BSF:
      // If the operand of BSR / BSF is proven never zero, then ZF cannot be
      // set.
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
        return (CC == X86::COND_E) ? FalseOp : TrueOp;
    }
  }

  SDValue Flags;

  Flags = checkBoolTestSetCCCombine(Cond, CC);
  if (Flags.getNode() &&
      // Extra check as FCMOV only supports a subset of X86 cond.
      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
    SDValue Ops[] = { FalseOp, TrueOp,
                      DAG.getConstant(CC, DL, MVT::i8), Flags };
    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations. Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, DL, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, DL, MVT::i8));
        if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, DL, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction. This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, DL, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, DL, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2) // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }

  // Handle these cases:
  // (select (x != c), e, c) -> (select (x != c), e, x),
  // (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that the conditional-move from a
  // constant needs two instructions, while a conditional-move from a
  // register needs only one.
  //
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
  //         some instruction-combining opportunities. This optimization
  //         needs to be postponed as late as possible.
  //
  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.
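    // For example, (select (x == 42), 42, y) can become
    // (select (x == 42), x, y): on the true path x is known to equal 42,
    // so the cmov can read x instead of materializing the constant first.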

    ConstantSDNode *CmpAgainst = nullptr;
    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
        !isa<ConstantSDNode>(Cond.getOperand(0))) {

      if (CC == X86::COND_NE &&
          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }

      if (CC == X86::COND_E &&
          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                          DAG.getConstant(CC, DL, MVT::i8), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      }
    }
  }

  // Fold and/or of setcc's to double CMOV:
  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
  //
  // This combine lets us generate:
  //   cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
  // When we can use CMOV, or when there is no mispredict, this improves
  // throughput and reduces register pressure.
  //
  if (CC == X86::COND_NE) {
    SDValue Flags;
    X86::CondCode CC0, CC1;
    bool isAndSetCC;
    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
      if (isAndSetCC) {
        std::swap(FalseOp, TrueOp);
        CC0 = X86::GetOppositeBranchCondition(CC0);
        CC1 = X86::GetOppositeBranchCondition(CC1);
      }

      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
                        Flags};
      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
      return CMOV;
    }
  }

  return SDValue();
}

/// PerformMulCombine - Optimize a single multiply with a constant into two
/// operations in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
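/// For example, a multiply by 45 can be decomposed as x*45 = (x*9)*5 and
/// emitted as two LEAs:
///   leal (%rdi,%rdi,8), %eax    # eax = x*9
///   leal (%rax,%rax,4), %eax    # eax = (x*9)*5 = x*45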
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64 && VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }

  SDLoc DL(N);
  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add.
      std::swap(MulAmt1, MulAmt2);

    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, DL, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, DL, VT));
  }

  if (!NewMul) {
    assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
           && "Both cases that could cause potential overflows should have "
              "already been handled.");
    if (isPowerOf2_64(MulAmt - 1))
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                       DAG.getConstant(Log2_64(MulAmt - 1), DL,
                                                       MVT::i8)));

    else if (isPowerOf2_64(MulAmt + 1))
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(ISD::SUB, DL, VT,
                           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                       DAG.getConstant(Log2_64(MulAmt + 1),
                                                       DL, MVT::i8)),
                           N->getOperand(0));
  }

  if (NewMul)
    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);

  return SDValue();
}

static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    APInt ShAmt = N1C->getAPIntValue();
    Mask = Mask.shl(ShAmt);
    bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure the transform is
    // semantics-preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation if the setcc_c was zero extended.
    // Consider the following example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
      assert(N0.getValueType().isVector() && "Invalid vector shift type");
      // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an
      // ADD of two values.
      if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }

  return SDValue();
}

static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Size = VT.getSizeInBits();

  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
  // depending on the sign of (SarConst - [56,48,32,24,16])

  // sexts on X86 are MOVs. The MOVs have the same code size
  // as the above SHIFTs (only a shift by 1 has smaller code size).
  // However the MOVs have 2 advantages over a SHIFT:
  // 1. MOVs can write to a register that differs from the source
  // 2. MOVs accept memory operands
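  //
  // For example, with an i32 value and SVT == i8 (ShlConst == 24):
  //   (sra (shl x, 24), 25) -> (sra (sext_inreg x, i8), 1)
  //   (sra (shl x, 24), 22) -> (shl (sext_inreg x, i8), 2)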

  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
  EVT CVT = N1.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT SVT : MVT::integer_valuetypes()) {
    unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext, and ShlConst values that
    // are not one of [56,48,32,24,16].
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}

/// \brief Returns a vector of 0s if the input node is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
      (!Subtarget->hasInt256() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
      APInt ShiftAmt = AmtSplat->getAPIntValue();
      unsigned MaxAmount =
          VT.getSimpleVT().getVectorElementType().getSizeInBits();

      // SSE2/AVX2 logical shifts always return a vector of 0s
      // if the shift amount is bigger than or equal to
      // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
      if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
    }

  return SDValue();
}

/// PerformShiftCombine - Combine shifts.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  if (N->getOpcode() == ISD::SHL)
    if (SDValue V = PerformSHLCombine(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = PerformSRACombine(N, DAG))
      return V;

  // Try to fold this logical shift into a zero vector.
  if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
      return V;

  return SDValue();
}

// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
// and friends. Likewise for OR -> CMPNEQSS.
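// For example, a oeq comparison is lowered as (and (setcc COND_E),
// (setcc COND_NP)) over a single ucomiss; rewriting it to cmpeqss yields an
// all-ones/all-zeros mask whose low bit already accounts for the unordered
// case.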
static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget *Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget->hasAVX512()) {
            SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
                                         CMP01,
                                         DAG.getConstant(x86cc, DL, MVT::i8));
            if (N->getValueType(0) != MVT::i1)
              return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
                                 FSetCC);
            return FSetCC;
          }
          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                              CMP00.getValueType(), CMP00,
                                              CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget->is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));
            IntVT = MVT::i32;
          }

          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}

/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
/// so it can be folded inside ANDNP.
static bool CanFoldXORWithAllOnes(const SDNode *N) {
  EVT VT = N->getValueType(0);

  // Match direct AllOnes for 128- and 256-bit vectors.
  if (ISD::isBuildVectorAllOnes(N))
    return true;

  // Look through a bit convert.
  if (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  // Sometimes the operand may come from an insert_subvector building a
  // 256-bit allones vector.
  if (VT.is256BitVector() &&
      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue V1 = N->getOperand(0);
    SDValue V2 = N->getOperand(1);

    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
        ISD::isBuildVectorAllOnes(V2.getNode()))
      return true;
  }

  return false;
}

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow->getValueType(0);
  if (!NarrowVT.is128BitVector())
    return SDValue();

  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();

  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  // The type of the truncated inputs.
  EVT WideVT = N0->getOperand(0)->getValueType(0);
  if (WideVT != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
  ConstantSDNode *RHSConstSplat = nullptr;
  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
    RHSConstSplat = RHSBV->getConstantSplatNode();
  if (!RHSTrunc && !RHSConstSplat)
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
    return SDValue();

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSConstSplat) {
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
                     SDValue(RHSConstSplat, 0));
    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
  } else if (RHSTrunc) {
    N1 = N1->getOperand(0);
  }

  // Generate the wide operation.
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND: {
    unsigned InBits = NarrowVT.getScalarSizeInBits();
    APInt Mask = APInt::getAllOnesValue(InBits);
    Mask = Mask.zext(VT.getScalarSizeInBits());
    return DAG.getNode(ISD::AND, DL, VT,
                       Op, DAG.getConstant(Mask, DL, VT));
  }
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  default:
    llvm_unreachable("Unexpected opcode");
  }
}

static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // A vector zext_in_reg may be represented as a shuffle,
  // feeding into a bitcast (this represents anyext) feeding into
  // an and with a mask.
  // We'd like to try to combine that into a shuffle with zero
  // plus a bitcast, removing the and.
  if (N0.getOpcode() != ISD::BITCAST ||
      N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();

  // The other side of the AND should be a splat of 2^C - 1, where C
  // is the number of bits in the source type.
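  // For example, a v4i8 -> v4i32 zext may appear here as a byte shuffle
  // <0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u>, a bitcast to v4i32, and an 'and'
  // with a splat of 255 (2^8 - 1).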
  if (N1.getOpcode() == ISD::BITCAST)
    N1 = N1.getOperand(0);
  if (N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();
  BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);

  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
  EVT SrcType = Shuffle->getValueType(0);

  // We expect a single-source shuffle.
  if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
    return SDValue();

  unsigned SrcSize = SrcType.getScalarSizeInBits();

  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!Vector->isConstantSplat(SplatValue, SplatUndef,
                               SplatBitSize, HasAnyUndefs))
    return SDValue();

  unsigned ResSize = N1.getValueType().getScalarSizeInBits();
  // Make sure the splat matches the mask we expect.
  if (SplatBitSize > ResSize ||
      (SplatValue + 1).exactLogBase2() != (int)SrcSize)
    return SDValue();

  // Make sure the input and output sizes make sense.
  if (SrcSize >= ResSize || ResSize % SrcSize)
    return SDValue();

  // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
  // The number of u's between each two values depends on the ratio between
  // the source and dest type.
  unsigned ZextRatio = ResSize / SrcSize;
  bool IsZext = true;
  for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
    if (i % ZextRatio) {
      if (Shuffle->getMaskElt(i) > 0) {
        // Expected undef
        IsZext = false;
        break;
      }
    } else {
      if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
        // Expected element number
        IsZext = false;
        break;
      }
    }
  }

  if (!IsZext)
    return SDValue();

  // OK, perform the transformation: replace the shuffle with
  // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
  // (instead of undef), where the k elements come from the zero vector.
  SmallVector<int, 8> Mask;
  unsigned NumElems = SrcType.getVectorNumElements();
  for (unsigned i = 0; i < NumElems; ++i)
    if (i % ZextRatio)
      Mask.push_back(NumElems);
    else
      Mask.push_back(i / ZextRatio);

  SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
      Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
  return DAG.getBitcast(N0.getValueType(), NewShuffle);
}

/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
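/// For example, (and (bitcast X), (bitcast Y)) with f32 inputs X and Y would
/// otherwise force both values through GPRs; emitting
/// (bitcast (X86ISD::FAND X, Y)) keeps them in XMM registers.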
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
  unsigned FPOpcode = ISD::DELETED_NODE;
  if (N->getOpcode() == ISD::AND)
    FPOpcode = X86ISD::FAND;
  else if (N->getOpcode() == ISD::OR)
    FPOpcode = X86ISD::FOR;
  else if (N->getOpcode() == ISD::XOR)
    FPOpcode = X86ISD::FXOR;

  assert(FPOpcode != ISD::DELETED_NODE &&
         "Unexpected input node for FP logic conversion");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
      ((Subtarget->hasSSE1() && VT == MVT::i32) ||
       (Subtarget->hasSSE2() && VT == MVT::i64))) {
    SDValue N00 = N0.getOperand(0);
    SDValue N10 = N1.getOperand(0);
    EVT N00Type = N00.getValueType();
    EVT N10Type = N10.getValueType();
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }
  return SDValue();
}

static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
    return Zext;

  if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Create BEXTR instructions
  // BEXTR is ((X >> imm) & (2**size-1))
  if (VT == MVT::i32 || VT == MVT::i64) {
    // Check for BEXTR.
    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (MaskNode && ShiftNode) {
        uint64_t Mask = MaskNode->getZExtValue();
        uint64_t Shift = ShiftNode->getZExtValue();
        if (isMask_64(Mask)) {
          uint64_t MaskSize = countPopulation(Mask);
          if (Shift + MaskSize <= VT.getSizeInBits())
            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                               DAG.getConstant(Shift | (MaskSize << 8), DL,
                                               VT));
        }
      }
    } // BEXTR

    return SDValue();
  }

  // Want to form ANDNP nodes:
  // 1) In the hopes of then easily combining them with OR and AND nodes
  //    to form PBLEND/PSIGN.
  // 2) To match ANDN packed intrinsics.
  if (VT != MVT::v2i64 && VT != MVT::v4i64)
    return SDValue();

  // Check LHS for vnot.
  if (N0.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot.
  if (N1.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // Look for psign/blend.
  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
    if (!Subtarget->hasSSSE3() ||
        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
      return SDValue();

    // Canonicalize pandn to RHS.
    if (N0.getOpcode() == X86ISD::ANDNP)
      std::swap(N0, N1);
    // or (and (m, y), (pandn m, x))
    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
      SDValue Mask = N1.getOperand(0);
      SDValue X = N1.getOperand(1);
      SDValue Y;
      if (N0.getOperand(0) == Mask)
        Y = N0.getOperand(1);
      if (N0.getOperand(1) == Mask)
        Y = N0.getOperand(0);

      // Check to see if the mask appeared in both the AND and ANDNP.
      if (!Y.getNode())
        return SDValue();

      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
      // Look through mask bitcast.
      if (Mask.getOpcode() == ISD::BITCAST)
        Mask = Mask.getOperand(0);
      if (X.getOpcode() == ISD::BITCAST)
        X = X.getOperand(0);
      if (Y.getOpcode() == ISD::BITCAST)
        Y = Y.getOperand(0);

      EVT MaskVT = Mask.getValueType();

      // Validate that the Mask operand is a vector sra node.
      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
      // there is no psrai.b
      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
      unsigned SraAmt = ~0;
      if (Mask.getOpcode() == ISD::SRA) {
        if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
          if (auto *AmtConst = AmtBV->getConstantSplatNode())
            SraAmt = AmtConst->getZExtValue();
      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
        SDValue SraC = Mask.getOperand(1);
        SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
      }
      if ((SraAmt + 1) != EltBits)
        return SDValue();

      SDLoc DL(N);

      // Now we know we at least have a pblendvb with the mask val. See if
      // we can form a psignb/w/d.
      // psign = x.type == y.type == mask.type && y = sub(0, x);
      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
               "Unsupported VT for PSIGN");
        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
        return DAG.getBitcast(VT, Mask);
      }
      // PBLENDVB is only available on SSE 4.1.
      if (!Subtarget->hasSSE41())
        return SDValue();

      MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

      X = DAG.getBitcast(BlendVT, X);
      Y = DAG.getBitcast(BlendVT, Y);
      Mask = DAG.getBitcast(BlendVT, Mask);
      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
      return DAG.getBitcast(VT, Mask);
    }
  }

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/or that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget->isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }

  return SDValue();
}

// Generate NEG and CMOV for integer abs.
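// The pattern matched below is the classic branchless abs idiom, e.g. for
// i32:
//   y = x >> 31;       // arithmetic shift: y is 0 or -1
//   abs = (x + y) ^ y;
// which is rewritten as NEG plus CMOV, reusing the flags the NEG produces.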
25682 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
25683 EVT VT = N->getValueType(0);
25684
25685 // Since X86 does not have CMOV for 8-bit integers, we don't convert
25686 // 8-bit integer abs to NEG and CMOV.
25687 if (VT.isInteger() && VT.getSizeInBits() == 8)
25688 return SDValue();
25689
25690 SDValue N0 = N->getOperand(0);
25691 SDValue N1 = N->getOperand(1);
25692 SDLoc DL(N);
25693
25694 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
25695 // and change it to SUB and CMOV.
25696 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
25697 N0.getOpcode() == ISD::ADD &&
25698 N0.getOperand(1) == N1 &&
25699 N1.getOpcode() == ISD::SRA &&
25700 N1.getOperand(0) == N0.getOperand(0))
25701 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
25702 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
25703 // Generate SUB & CMOV.
25704 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
25705 DAG.getConstant(0, DL, VT), N0.getOperand(0));
25706
25707 SDValue Ops[] = { N0.getOperand(0), Neg,
25708 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
25709 SDValue(Neg.getNode(), 1) };
25710 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
25711 }
25712 return SDValue();
25713 }
25714
25715 // Try to turn tests against the signbit in the form of:
25716 // XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
25717 // into:
25718 // SETGT(X, -1)
25719 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
25720 // This is only worth doing if the output type is i8.
25721 if (N->getValueType(0) != MVT::i8)
25722 return SDValue();
25723
25724 SDValue N0 = N->getOperand(0);
25725 SDValue N1 = N->getOperand(1);
25726
25727 // We should be performing an xor against a truncated shift.
25728 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
25729 return SDValue();
25730
25731 // Make sure we are performing an xor against one.
25732 if (!isOneConstant(N1))
25733 return SDValue();
25734
25735 // SetCC on x86 zero extends so only act on this if it's a logical shift.
25736 SDValue Shift = N0.getOperand(0);
25737 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
25738 return SDValue();
25739
25740 // Make sure we are truncating from one of i16, i32 or i64.
25741 EVT ShiftTy = Shift.getValueType();
25742 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
25743 return SDValue();
25744
25745 // Make sure the shift amount extracts the sign bit.
25746 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
25747 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
25748 return SDValue();
25749
25750 // Create a greater-than comparison against -1.
25751 // N.B. Using SETGE against 0 works but we want a canonical looking
25752 // comparison, and using SETGT matches up with what TranslateX86CC does.
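// E.g. for i32 this replaces ((X >> 31) ^ 1) with the comparison X > -1,
// which later selection can reduce to a single sign test and set (sketch;
// the exact instructions depend on instruction selection).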
25753 SDLoc DL(N);
25754 SDValue ShiftOp = Shift.getOperand(0);
25755 EVT ShiftOpTy = ShiftOp.getValueType();
25756 SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
25757 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
25758 return Cond;
25759 }
25760
25761 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
25762 TargetLowering::DAGCombinerInfo &DCI,
25763 const X86Subtarget *Subtarget) {
25764 if (DCI.isBeforeLegalizeOps())
25765 return SDValue();
25766
25767 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
25768 return RV;
25769
25770 if (Subtarget->hasCMov())
25771 if (SDValue RV = performIntegerAbsCombine(N, DAG))
25772 return RV;
25773
25774 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
25775 return FPLogic;
25776
25777 return SDValue();
25778 }
25779
25780 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
25781 /// which is c = (a + b + 1) / 2, and replaces this operation with the
25782 /// efficient X86ISD::AVG instruction.
25783 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
25784 const X86Subtarget *Subtarget, SDLoc DL) {
25785 if (!VT.isVector() || !VT.isSimple())
25786 return SDValue();
25787 EVT InVT = In.getValueType();
25788 unsigned NumElems = VT.getVectorNumElements();
25789
25790 EVT ScalarVT = VT.getVectorElementType();
25791 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
25792 isPowerOf2_32(NumElems)))
25793 return SDValue();
25794
25795 // InScalarVT is the intermediate type in the AVG pattern and it should be
25796 // wider than the original input type (i8/i16).
25797 EVT InScalarVT = InVT.getVectorElementType();
25798 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
25799 return SDValue();
25800
25801 if (Subtarget->hasAVX512()) {
25802 if (VT.getSizeInBits() > 512)
25803 return SDValue();
25804 } else if (Subtarget->hasAVX2()) {
25805 if (VT.getSizeInBits() > 256)
25806 return SDValue();
25807 } else {
25808 if (VT.getSizeInBits() > 128)
25809 return SDValue();
25810 }
25811
25812 // Detect the following pattern:
25813 //
25814 // %1 = zext <N x i8> %a to <N x i32>
25815 // %2 = zext <N x i8> %b to <N x i32>
25816 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
25817 // %4 = add nuw nsw <N x i32> %3, %2
25818 // %5 = lshr <N x i32> %4, <i32 1 x N>
25819 // %6 = trunc <N x i32> %5 to <N x i8>
25820 //
25821 // In AVX512, the last instruction can also be a trunc store.
25822
25823 if (In.getOpcode() != ISD::SRL)
25824 return SDValue();
25825
25826 // A lambda checking whether the given SDValue is a constant vector with
25827 // every element in the range [Min, Max].
25828 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
25829 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
25830 if (!BV || !BV->isConstant())
25831 return false;
25832 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
25833 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
25834 if (!C)
25835 return false;
25836 uint64_t Val = C->getZExtValue();
25837 if (Val < Min || Val > Max)
25838 return false;
25839 }
25840 return true;
25841 };
25842
25843 // Check that the vector is logically shifted right by one (the divide by 2).
25844 auto LHS = In.getOperand(0);
25845 auto RHS = In.getOperand(1);
25846 if (!IsConstVectorInRange(RHS, 1, 1))
25847 return SDValue();
25848 if (LHS.getOpcode() != ISD::ADD)
25849 return SDValue();
25850
25851 // Detect a pattern of a + b + 1 where the order doesn't matter.
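// That is, after reassociation the add tree may appear as (a + b) + 1,
// (a + 1) + b, or a + (b + 1); the cases below handle each of these shapes.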
25852 SDValue Operands[3];
25853 Operands[0] = LHS.getOperand(0);
25854 Operands[1] = LHS.getOperand(1);
25855
25856 // Take care of the case when one of the operands is a constant vector whose
25857 // element is in the range [1, 256] for i8 or [1, 65536] for i16.
25858 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
25859 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
25860 Operands[0].getOperand(0).getValueType() == VT) {
25861 // The pattern is detected. Subtract one from the constant vector, then
25862 // demote it and emit the X86ISD::AVG instruction.
25863 SDValue One = DAG.getConstant(1, DL, InScalarVT);
25864 SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
25865 SmallVector<SDValue, 8>(NumElems, One));
25866 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
25867 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
25868 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
25869 Operands[1]);
25870 }
25871
25872 if (Operands[0].getOpcode() == ISD::ADD)
25873 std::swap(Operands[0], Operands[1]);
25874 else if (Operands[1].getOpcode() != ISD::ADD)
25875 return SDValue();
25876 Operands[2] = Operands[1].getOperand(0);
25877 Operands[1] = Operands[1].getOperand(1);
25878
25879 // Now we have three operands of two additions. Check that one of them is a
25880 // constant vector with ones, and the other two are promoted from i8/i16.
25881 for (int i = 0; i < 3; ++i) {
25882 if (!IsConstVectorInRange(Operands[i], 1, 1))
25883 continue;
25884 std::swap(Operands[i], Operands[2]);
25885
25886 // Check if Operands[0] and Operands[1] are results of type promotion.
25887 for (int j = 0; j < 2; ++j)
25888 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
25889 Operands[j].getOperand(0).getValueType() != VT)
25890 return SDValue();
25891
25892 // The pattern is detected; emit the X86ISD::AVG instruction.
25893 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
25894 Operands[1].getOperand(0));
25895 }
25896
25897 return SDValue();
25898 }
25899
25900 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
25901 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
25902 TargetLowering::DAGCombinerInfo &DCI,
25903 const X86Subtarget *Subtarget) {
25904 LoadSDNode *Ld = cast<LoadSDNode>(N);
25905 EVT RegVT = Ld->getValueType(0);
25906 EVT MemVT = Ld->getMemoryVT();
25907 SDLoc dl(Ld);
25908 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25909
25910 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
25911 // into two 16-byte operations.
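// E.g. a v8f32 load with 1-byte alignment on such a chip becomes two v4f32
// loads (the second at Ptr+16) joined by a TokenFactor and reassembled with
// Insert128BitVector (an illustrative summary of the code below).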
25912 ISD::LoadExtType Ext = Ld->getExtensionType(); 25913 bool Fast; 25914 unsigned AddressSpace = Ld->getAddressSpace(); 25915 unsigned Alignment = Ld->getAlignment(); 25916 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && 25917 Ext == ISD::NON_EXTLOAD && 25918 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, 25919 AddressSpace, Alignment, &Fast) && !Fast) { 25920 unsigned NumElems = RegVT.getVectorNumElements(); 25921 if (NumElems < 2) 25922 return SDValue(); 25923 25924 SDValue Ptr = Ld->getBasePtr(); 25925 SDValue Increment = 25926 DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); 25927 25928 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 25929 NumElems/2); 25930 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 25931 Ld->getPointerInfo(), Ld->isVolatile(), 25932 Ld->isNonTemporal(), Ld->isInvariant(), 25933 Alignment); 25934 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 25935 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 25936 Ld->getPointerInfo(), Ld->isVolatile(), 25937 Ld->isNonTemporal(), Ld->isInvariant(), 25938 std::min(16U, Alignment)); 25939 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 25940 Load1.getValue(1), 25941 Load2.getValue(1)); 25942 25943 SDValue NewVec = DAG.getUNDEF(RegVT); 25944 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); 25945 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); 25946 return DCI.CombineTo(N, NewVec, TF, true); 25947 } 25948 25949 return SDValue(); 25950 } 25951 25952 /// PerformMLOADCombine - Resolve extending loads 25953 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, 25954 TargetLowering::DAGCombinerInfo &DCI, 25955 const X86Subtarget *Subtarget) { 25956 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); 25957 if (Mld->getExtensionType() != ISD::SEXTLOAD) 25958 return SDValue(); 25959 25960 EVT VT = Mld->getValueType(0); 25961 unsigned NumElems = VT.getVectorNumElements(); 25962 EVT LdVT = Mld->getMemoryVT(); 25963 SDLoc dl(Mld); 25964 25965 assert(LdVT != VT && "Cannot extend to the same type"); 25966 unsigned ToSz = VT.getVectorElementType().getSizeInBits(); 25967 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); 25968 // From, To sizes and ElemCount must be pow of two 25969 assert (isPowerOf2_32(NumElems * FromSz * ToSz) && 25970 "Unexpected size for extending masked load"); 25971 25972 unsigned SizeRatio = ToSz / FromSz; 25973 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); 25974 25975 // Create a type on which we perform the shuffle 25976 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 25977 LdVT.getScalarType(), NumElems*SizeRatio); 25978 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 25979 25980 // Convert Src0 value 25981 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); 25982 if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { 25983 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 25984 for (unsigned i = 0; i != NumElems; ++i) 25985 ShuffleVec[i] = i * SizeRatio; 25986 25987 // Can't shuffle using an illegal type. 
25988 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && 25989 "WideVecVT should be legal"); 25990 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, 25991 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); 25992 } 25993 // Prepare the new mask 25994 SDValue NewMask; 25995 SDValue Mask = Mld->getMask(); 25996 if (Mask.getValueType() == VT) { 25997 // Mask and original value have the same type 25998 NewMask = DAG.getBitcast(WideVecVT, Mask); 25999 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 26000 for (unsigned i = 0; i != NumElems; ++i) 26001 ShuffleVec[i] = i * SizeRatio; 26002 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) 26003 ShuffleVec[i] = NumElems * SizeRatio; 26004 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, 26005 DAG.getConstant(0, dl, WideVecVT), 26006 &ShuffleVec[0]); 26007 } 26008 else { 26009 assert(Mask.getValueType().getVectorElementType() == MVT::i1); 26010 unsigned WidenNumElts = NumElems*SizeRatio; 26011 unsigned MaskNumElts = VT.getVectorNumElements(); 26012 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 26013 WidenNumElts); 26014 26015 unsigned NumConcat = WidenNumElts / MaskNumElts; 26016 SmallVector<SDValue, 16> Ops(NumConcat); 26017 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); 26018 Ops[0] = Mask; 26019 for (unsigned i = 1; i != NumConcat; ++i) 26020 Ops[i] = ZeroVal; 26021 26022 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); 26023 } 26024 26025 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), 26026 Mld->getBasePtr(), NewMask, WideSrc0, 26027 Mld->getMemoryVT(), Mld->getMemOperand(), 26028 ISD::NON_EXTLOAD); 26029 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); 26030 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); 26031 } 26032 /// PerformMSTORECombine - Resolve truncating stores 26033 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, 26034 const X86Subtarget *Subtarget) { 26035 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); 26036 if (!Mst->isTruncatingStore()) 26037 return SDValue(); 26038 26039 EVT VT = Mst->getValue().getValueType(); 26040 unsigned NumElems = VT.getVectorNumElements(); 26041 EVT StVT = Mst->getMemoryVT(); 26042 SDLoc dl(Mst); 26043 26044 assert(StVT != VT && "Cannot truncate to the same type"); 26045 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 26046 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 26047 26048 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26049 26050 // The truncating store is legal in some cases. For example 26051 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw 26052 // are designated for truncate store. 26053 // In this case we don't need any further transformations. 26054 if (TLI.isTruncStoreLegal(VT, StVT)) 26055 return SDValue(); 26056 26057 // From, To sizes and ElemCount must be pow of two 26058 assert (isPowerOf2_32(NumElems * FromSz * ToSz) && 26059 "Unexpected size for truncating masked store"); 26060 // We are going to use the original vector elt for storing. 26061 // Accumulated smaller vector elements must be a multiple of the store size. 
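// (E.g. truncating v8i32 -> v8i8: FromSz = 32, ToSz = 8, SizeRatio = 4; the
// value is reinterpreted as v32i8 and the shuffle below gathers every 4th
// byte into the low eight lanes.)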
26062 assert (((NumElems * FromSz) % ToSz) == 0 && 26063 "Unexpected ratio for truncating masked store"); 26064 26065 unsigned SizeRatio = FromSz / ToSz; 26066 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 26067 26068 // Create a type on which we perform the shuffle 26069 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 26070 StVT.getScalarType(), NumElems*SizeRatio); 26071 26072 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 26073 26074 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); 26075 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 26076 for (unsigned i = 0; i != NumElems; ++i) 26077 ShuffleVec[i] = i * SizeRatio; 26078 26079 // Can't shuffle using an illegal type. 26080 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && 26081 "WideVecVT should be legal"); 26082 26083 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 26084 DAG.getUNDEF(WideVecVT), 26085 &ShuffleVec[0]); 26086 26087 SDValue NewMask; 26088 SDValue Mask = Mst->getMask(); 26089 if (Mask.getValueType() == VT) { 26090 // Mask and original value have the same type 26091 NewMask = DAG.getBitcast(WideVecVT, Mask); 26092 for (unsigned i = 0; i != NumElems; ++i) 26093 ShuffleVec[i] = i * SizeRatio; 26094 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) 26095 ShuffleVec[i] = NumElems*SizeRatio; 26096 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, 26097 DAG.getConstant(0, dl, WideVecVT), 26098 &ShuffleVec[0]); 26099 } 26100 else { 26101 assert(Mask.getValueType().getVectorElementType() == MVT::i1); 26102 unsigned WidenNumElts = NumElems*SizeRatio; 26103 unsigned MaskNumElts = VT.getVectorNumElements(); 26104 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 26105 WidenNumElts); 26106 26107 unsigned NumConcat = WidenNumElts / MaskNumElts; 26108 SmallVector<SDValue, 16> Ops(NumConcat); 26109 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); 26110 Ops[0] = Mask; 26111 for (unsigned i = 1; i != NumConcat; ++i) 26112 Ops[i] = ZeroVal; 26113 26114 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); 26115 } 26116 26117 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, 26118 Mst->getBasePtr(), NewMask, StVT, 26119 Mst->getMemOperand(), false); 26120 } 26121 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 26122 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 26123 const X86Subtarget *Subtarget) { 26124 StoreSDNode *St = cast<StoreSDNode>(N); 26125 EVT VT = St->getValue().getValueType(); 26126 EVT StVT = St->getMemoryVT(); 26127 SDLoc dl(St); 26128 SDValue StoredVal = St->getOperand(1); 26129 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26130 26131 // If we are saving a concatenation of two XMM registers and 32-byte stores 26132 // are slow, such as on Sandy Bridge, perform two 16-byte stores. 
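// E.g. on such a target a v8f32 store with 16-byte alignment is emitted as
// two v4f32 stores, the second at Ptr+16 (sketch of the code below).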
26133 bool Fast; 26134 unsigned AddressSpace = St->getAddressSpace(); 26135 unsigned Alignment = St->getAlignment(); 26136 if (VT.is256BitVector() && StVT == VT && 26137 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 26138 AddressSpace, Alignment, &Fast) && !Fast) { 26139 unsigned NumElems = VT.getVectorNumElements(); 26140 if (NumElems < 2) 26141 return SDValue(); 26142 26143 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); 26144 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); 26145 26146 SDValue Stride = 26147 DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); 26148 SDValue Ptr0 = St->getBasePtr(); 26149 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 26150 26151 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 26152 St->getPointerInfo(), St->isVolatile(), 26153 St->isNonTemporal(), Alignment); 26154 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 26155 St->getPointerInfo(), St->isVolatile(), 26156 St->isNonTemporal(), 26157 std::min(16U, Alignment)); 26158 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 26159 } 26160 26161 // Optimize trunc store (of multiple scalars) to shuffle and store. 26162 // First, pack all of the elements in one place. Next, store to memory 26163 // in fewer chunks. 26164 if (St->isTruncatingStore() && VT.isVector()) { 26165 // Check if we can detect an AVG pattern from the truncation. If yes, 26166 // replace the trunc store by a normal store with the result of X86ISD::AVG 26167 // instruction. 26168 SDValue Avg = 26169 detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); 26170 if (Avg.getNode()) 26171 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), 26172 St->getPointerInfo(), St->isVolatile(), 26173 St->isNonTemporal(), St->getAlignment()); 26174 26175 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26176 unsigned NumElems = VT.getVectorNumElements(); 26177 assert(StVT != VT && "Cannot truncate to the same type"); 26178 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 26179 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 26180 26181 // The truncating store is legal in some cases. For example 26182 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw 26183 // are designated for truncate store. 26184 // In this case we don't need any further transformations. 26185 if (TLI.isTruncStoreLegal(VT, StVT)) 26186 return SDValue(); 26187 26188 // From, To sizes and ElemCount must be pow of two 26189 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 26190 // We are going to use the original vector elt for storing. 26191 // Accumulated smaller vector elements must be a multiple of the store size. 26192 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 26193 26194 unsigned SizeRatio = FromSz / ToSz; 26195 26196 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 26197 26198 // Create a type on which we perform the shuffle 26199 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 26200 StVT.getScalarType(), NumElems*SizeRatio); 26201 26202 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 26203 26204 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); 26205 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 26206 for (unsigned i = 0; i != NumElems; ++i) 26207 ShuffleVec[i] = i * SizeRatio; 26208 26209 // Can't shuffle using an illegal type. 
26210 if (!TLI.isTypeLegal(WideVecVT)) 26211 return SDValue(); 26212 26213 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 26214 DAG.getUNDEF(WideVecVT), 26215 &ShuffleVec[0]); 26216 // At this point all of the data is stored at the bottom of the 26217 // register. We now need to save it to mem. 26218 26219 // Find the largest store unit 26220 MVT StoreType = MVT::i8; 26221 for (MVT Tp : MVT::integer_valuetypes()) { 26222 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 26223 StoreType = Tp; 26224 } 26225 26226 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 26227 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 26228 (64 <= NumElems * ToSz)) 26229 StoreType = MVT::f64; 26230 26231 // Bitcast the original vector into a vector of store-size units 26232 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 26233 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 26234 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 26235 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); 26236 SmallVector<SDValue, 8> Chains; 26237 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, 26238 TLI.getPointerTy(DAG.getDataLayout())); 26239 SDValue Ptr = St->getBasePtr(); 26240 26241 // Perform one or more big stores into memory. 26242 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 26243 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 26244 StoreType, ShuffWide, 26245 DAG.getIntPtrConstant(i, dl)); 26246 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 26247 St->getPointerInfo(), St->isVolatile(), 26248 St->isNonTemporal(), St->getAlignment()); 26249 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 26250 Chains.push_back(Ch); 26251 } 26252 26253 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 26254 } 26255 26256 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 26257 // the FP state in cases where an emms may be missing. 26258 // A preferable solution to the general problem is to figure out the right 26259 // places to insert EMMS. This qualifies as a quick hack. 26260 26261 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 26262 if (VT.getSizeInBits() != 64) 26263 return SDValue(); 26264 26265 const Function *F = DAG.getMachineFunction().getFunction(); 26266 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); 26267 bool F64IsLegal = 26268 !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2(); 26269 if ((VT.isVector() || 26270 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 26271 isa<LoadSDNode>(St->getValue()) && 26272 !cast<LoadSDNode>(St->getValue())->isVolatile() && 26273 St->getChain().hasOneUse() && !St->isVolatile()) { 26274 SDNode* LdVal = St->getValue().getNode(); 26275 LoadSDNode *Ld = nullptr; 26276 int TokenFactorIndex = -1; 26277 SmallVector<SDValue, 8> Ops; 26278 SDNode* ChainVal = St->getChain().getNode(); 26279 // Must be a store of a load. We currently handle two cases: the load 26280 // is a direct child, and it's under an intervening TokenFactor. It is 26281 // possible to dig deeper under nested TokenFactors. 
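// Concretely, the two accepted shapes are:
//   St->getChain() == Ld                          (direct child)
//   St->getChain() == TokenFactor(..., Ld, ...)   (one TokenFactor deep)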
26282 if (ChainVal == LdVal) 26283 Ld = cast<LoadSDNode>(St->getChain()); 26284 else if (St->getValue().hasOneUse() && 26285 ChainVal->getOpcode() == ISD::TokenFactor) { 26286 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 26287 if (ChainVal->getOperand(i).getNode() == LdVal) { 26288 TokenFactorIndex = i; 26289 Ld = cast<LoadSDNode>(St->getValue()); 26290 } else 26291 Ops.push_back(ChainVal->getOperand(i)); 26292 } 26293 } 26294 26295 if (!Ld || !ISD::isNormalLoad(Ld)) 26296 return SDValue(); 26297 26298 // If this is not the MMX case, i.e. we are just turning i64 load/store 26299 // into f64 load/store, avoid the transformation if there are multiple 26300 // uses of the loaded value. 26301 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 26302 return SDValue(); 26303 26304 SDLoc LdDL(Ld); 26305 SDLoc StDL(N); 26306 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 26307 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 26308 // pair instead. 26309 if (Subtarget->is64Bit() || F64IsLegal) { 26310 MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 26311 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 26312 Ld->getPointerInfo(), Ld->isVolatile(), 26313 Ld->isNonTemporal(), Ld->isInvariant(), 26314 Ld->getAlignment()); 26315 SDValue NewChain = NewLd.getValue(1); 26316 if (TokenFactorIndex != -1) { 26317 Ops.push_back(NewChain); 26318 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 26319 } 26320 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 26321 St->getPointerInfo(), 26322 St->isVolatile(), St->isNonTemporal(), 26323 St->getAlignment()); 26324 } 26325 26326 // Otherwise, lower to two pairs of 32-bit loads / stores. 26327 SDValue LoAddr = Ld->getBasePtr(); 26328 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 26329 DAG.getConstant(4, LdDL, MVT::i32)); 26330 26331 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 26332 Ld->getPointerInfo(), 26333 Ld->isVolatile(), Ld->isNonTemporal(), 26334 Ld->isInvariant(), Ld->getAlignment()); 26335 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 26336 Ld->getPointerInfo().getWithOffset(4), 26337 Ld->isVolatile(), Ld->isNonTemporal(), 26338 Ld->isInvariant(), 26339 MinAlign(Ld->getAlignment(), 4)); 26340 26341 SDValue NewChain = LoLd.getValue(1); 26342 if (TokenFactorIndex != -1) { 26343 Ops.push_back(LoLd); 26344 Ops.push_back(HiLd); 26345 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 26346 } 26347 26348 LoAddr = St->getBasePtr(); 26349 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 26350 DAG.getConstant(4, StDL, MVT::i32)); 26351 26352 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 26353 St->getPointerInfo(), 26354 St->isVolatile(), St->isNonTemporal(), 26355 St->getAlignment()); 26356 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 26357 St->getPointerInfo().getWithOffset(4), 26358 St->isVolatile(), 26359 St->isNonTemporal(), 26360 MinAlign(St->getAlignment(), 4)); 26361 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 26362 } 26363 26364 // This is similar to the above case, but here we handle a scalar 64-bit 26365 // integer store that is extracted from a vector on a 32-bit target. 26366 // If we have SSE2, then we can treat it like a floating-point double 26367 // to get past legalization. 
The execution dependencies fixup pass will 26368 // choose the optimal machine instruction for the store if this really is 26369 // an integer or v2f32 rather than an f64. 26370 if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() && 26371 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 26372 SDValue OldExtract = St->getOperand(1); 26373 SDValue ExtOp0 = OldExtract.getOperand(0); 26374 unsigned VecSize = ExtOp0.getValueSizeInBits(); 26375 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); 26376 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); 26377 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 26378 BitCast, OldExtract.getOperand(1)); 26379 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), 26380 St->getPointerInfo(), St->isVolatile(), 26381 St->isNonTemporal(), St->getAlignment()); 26382 } 26383 26384 return SDValue(); 26385 } 26386 26387 /// Return 'true' if this vector operation is "horizontal" 26388 /// and return the operands for the horizontal operation in LHS and RHS. A 26389 /// horizontal operation performs the binary operation on successive elements 26390 /// of its first operand, then on successive elements of its second operand, 26391 /// returning the resulting values in a vector. For example, if 26392 /// A = < float a0, float a1, float a2, float a3 > 26393 /// and 26394 /// B = < float b0, float b1, float b2, float b3 > 26395 /// then the result of doing a horizontal operation on A and B is 26396 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 26397 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 26398 /// A horizontal-op B, for some already available A and B, and if so then LHS is 26399 /// set to A, RHS to B, and the routine returns 'true'. 26400 /// Note that the binary operation should have the property that if one of the 26401 /// operands is UNDEF then the result is UNDEF. 26402 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 26403 // Look for the following pattern: if 26404 // A = < float a0, float a1, float a2, float a3 > 26405 // B = < float b0, float b1, float b2, float b3 > 26406 // and 26407 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 26408 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 26409 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 26410 // which is A horizontal-op B. 26411 26412 // At least one of the operands should be a vector shuffle. 26413 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 26414 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 26415 return false; 26416 26417 MVT VT = LHS.getSimpleValueType(); 26418 26419 assert((VT.is128BitVector() || VT.is256BitVector()) && 26420 "Unsupported vector type for horizontal add/sub"); 26421 26422 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 26423 // operate independently on 128-bit lanes. 26424 unsigned NumElts = VT.getVectorNumElements(); 26425 unsigned NumLanes = VT.getSizeInBits()/128; 26426 unsigned NumLaneElts = NumElts / NumLanes; 26427 assert((NumLaneElts % 2 == 0) && 26428 "Vector type should have an even number of elements in each lane"); 26429 unsigned HalfLaneElts = NumLaneElts/2; 26430 26431 // View LHS in the form 26432 // LHS = VECTOR_SHUFFLE A, B, LMask 26433 // If LHS is not a shuffle then pretend it is the shuffle 26434 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 26435 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 26436 // type VT. 
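// Worked example for v8f32: the per-lane check below accepts
//   LMask = <0,2,8,10,4,6,12,14>, RMask = <1,3,9,11,5,7,13,15>,
// i.e. horizontal-op semantics applied independently to each 128-bit lane.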
26437 SDValue A, B; 26438 SmallVector<int, 16> LMask(NumElts); 26439 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 26440 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 26441 A = LHS.getOperand(0); 26442 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 26443 B = LHS.getOperand(1); 26444 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 26445 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 26446 } else { 26447 if (LHS.getOpcode() != ISD::UNDEF) 26448 A = LHS; 26449 for (unsigned i = 0; i != NumElts; ++i) 26450 LMask[i] = i; 26451 } 26452 26453 // Likewise, view RHS in the form 26454 // RHS = VECTOR_SHUFFLE C, D, RMask 26455 SDValue C, D; 26456 SmallVector<int, 16> RMask(NumElts); 26457 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 26458 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 26459 C = RHS.getOperand(0); 26460 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 26461 D = RHS.getOperand(1); 26462 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 26463 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 26464 } else { 26465 if (RHS.getOpcode() != ISD::UNDEF) 26466 C = RHS; 26467 for (unsigned i = 0; i != NumElts; ++i) 26468 RMask[i] = i; 26469 } 26470 26471 // Check that the shuffles are both shuffling the same vectors. 26472 if (!(A == C && B == D) && !(A == D && B == C)) 26473 return false; 26474 26475 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 26476 if (!A.getNode() && !B.getNode()) 26477 return false; 26478 26479 // If A and B occur in reverse order in RHS, then "swap" them (which means 26480 // rewriting the mask). 26481 if (A != C) 26482 ShuffleVectorSDNode::commuteMask(RMask); 26483 26484 // At this point LHS and RHS are equivalent to 26485 // LHS = VECTOR_SHUFFLE A, B, LMask 26486 // RHS = VECTOR_SHUFFLE A, B, RMask 26487 // Check that the masks correspond to performing a horizontal operation. 26488 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 26489 for (unsigned i = 0; i != NumLaneElts; ++i) { 26490 int LIdx = LMask[i+l], RIdx = RMask[i+l]; 26491 26492 // Ignore any UNDEF components. 26493 if (LIdx < 0 || RIdx < 0 || 26494 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 26495 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 26496 continue; 26497 26498 // Check that successive elements are being operated on. If not, this is 26499 // not a horizontal operation. 26500 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs 26501 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; 26502 if (!(LIdx == Index && RIdx == Index + 1) && 26503 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 26504 return false; 26505 } 26506 } 26507 26508 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 26509 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 26510 return true; 26511 } 26512 26513 /// Do target-specific dag combines on floating point adds. 26514 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 26515 const X86Subtarget *Subtarget) { 26516 EVT VT = N->getValueType(0); 26517 SDValue LHS = N->getOperand(0); 26518 SDValue RHS = N->getOperand(1); 26519 26520 // Try to synthesize horizontal adds from adds of shuffles. 
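// E.g. (v4f32 fadd (shuffle A,B,<0,2,4,6>), (shuffle A,B,<1,3,5,7>))
// becomes (X86ISD::FHADD A, B), a single HADDPS.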
26521 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
26522 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
26523 isHorizontalBinOp(LHS, RHS, true))
26524 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
26525 return SDValue();
26526 }
26527
26528 /// Do target-specific dag combines on floating point subs.
26529 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
26530 const X86Subtarget *Subtarget) {
26531 EVT VT = N->getValueType(0);
26532 SDValue LHS = N->getOperand(0);
26533 SDValue RHS = N->getOperand(1);
26534
26535 // Try to synthesize horizontal subs from subs of shuffles.
26536 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
26537 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
26538 isHorizontalBinOp(LHS, RHS, false))
26539 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
26540 return SDValue();
26541 }
26542
26543 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
26544 static SDValue
26545 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
26546 SmallVector<SDValue, 8> &Regs) {
26547 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
26548 Regs[0].getValueType() == MVT::v2i64));
26549 EVT OutVT = N->getValueType(0);
26550 EVT OutSVT = OutVT.getVectorElementType();
26551 EVT InVT = Regs[0].getValueType();
26552 EVT InSVT = InVT.getVectorElementType();
26553 SDLoc DL(N);
26554
26555 // First, use a mask to unset all bits that won't appear in the result.
26556 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
26557 "OutSVT can only be either i8 or i16.");
26558 SDValue MaskVal =
26559 DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
26560 SDValue MaskVec = DAG.getNode(
26561 ISD::BUILD_VECTOR, DL, InVT,
26562 SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
26563 for (auto &Reg : Regs)
26564 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
26565
26566 MVT UnpackedVT, PackedVT;
26567 if (OutSVT == MVT::i8) {
26568 UnpackedVT = MVT::v8i16;
26569 PackedVT = MVT::v16i8;
26570 } else {
26571 UnpackedVT = MVT::v4i32;
26572 PackedVT = MVT::v8i16;
26573 }
26574
26575 // Each iteration halves the element size, packing pairs of registers.
26576 auto RegNum = Regs.size();
26577 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
26578 j < e; j *= 2, RegNum /= 2) {
26579 for (unsigned i = 0; i < RegNum; i++)
26580 Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
26581 for (unsigned i = 0; i < RegNum / 2; i++)
26582 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
26583 Regs[i * 2 + 1]);
26584 }
26585
26586 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
26587 // and then extract a subvector as the result since v8i8 is not a legal type.
26588 if (OutVT == MVT::v8i8) {
26589 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
26590 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
26591 DAG.getIntPtrConstant(0, DL));
26592 return Regs[0];
26593 } else if (RegNum > 1) {
26594 Regs.resize(RegNum);
26595 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
26596 } else
26597 return Regs[0];
26598 }
26599
26600 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
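/// Each 32-bit lane is first sign-filled via (x << 16) >> 16 so that PACKSS's
/// signed saturation reproduces a plain truncation of the low 16 bits.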
26601 static SDValue
26602 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
26603 SmallVector<SDValue, 8> &Regs) {
26604 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
26605 EVT OutVT = N->getValueType(0);
26606 SDLoc DL(N);
26607
26608 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
26609 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
26610 for (auto &Reg : Regs) {
26611 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
26612 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
26613 }
26614
26615 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
26616 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
26617 Regs[i * 2 + 1]);
26618
26619 if (Regs.size() > 2) {
26620 Regs.resize(Regs.size() / 2);
26621 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
26622 } else
26623 return Regs[0];
26624 }
26625
26626 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
26627 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
26628 /// legalization the truncation will be translated into a BUILD_VECTOR with each
26629 /// element extracted from a vector and then truncated, and it is difficult to
26630 /// do this optimization based on that form.
26631 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
26632 const X86Subtarget *Subtarget) {
26633 EVT OutVT = N->getValueType(0);
26634 if (!OutVT.isVector())
26635 return SDValue();
26636
26637 SDValue In = N->getOperand(0);
26638 if (!In.getValueType().isSimple())
26639 return SDValue();
26640
26641 EVT InVT = In.getValueType();
26642 unsigned NumElems = OutVT.getVectorNumElements();
26643
26644 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
26645 // SSE2, and we need to take care of it specially.
26646 // AVX512 provides vpmovdb.
26647 if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
26648 return SDValue();
26649
26650 EVT OutSVT = OutVT.getVectorElementType();
26651 EVT InSVT = InVT.getVectorElementType();
26652 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
26653 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
26654 NumElems >= 8))
26655 return SDValue();
26656
26657 // SSSE3's pshufb results in fewer instructions in the cases below.
26658 if (Subtarget->hasSSSE3() && NumElems == 8 &&
26659 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
26660 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
26661 return SDValue();
26662
26663 SDLoc DL(N);
26664
26665 // Split a long vector into vectors of legal type.
26666 unsigned RegNum = InVT.getSizeInBits() / 128;
26667 SmallVector<SDValue, 8> SubVec(RegNum);
26668 if (InSVT == MVT::i32) {
26669 for (unsigned i = 0; i < RegNum; i++)
26670 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
26671 DAG.getIntPtrConstant(i * 4, DL));
26672 } else {
26673 for (unsigned i = 0; i < RegNum; i++)
26674 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
26675 DAG.getIntPtrConstant(i * 2, DL));
26676 }
26677
26678 // SSE2 provides PACKUS only for 2 x v8i16 -> v16i8, and SSE4.1 provides
26679 // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
26680 // to truncate 2 x v4i32 to v8i16.
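// E.g. v8i32 -> v8i16 on SSE4.1 masks each v4i32 half to 0xFFFF and emits a
// single PACKUSDW; without SSE4.1 the same truncation takes the PACKSS path.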
26681 if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
26682 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
26683 else if (InSVT == MVT::i32)
26684 return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
26685 else
26686 return SDValue();
26687 }
26688
26689 static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
26690 const X86Subtarget *Subtarget) {
26691 // Try to detect the AVG pattern first.
26692 SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
26693 Subtarget, SDLoc(N));
26694 if (Avg.getNode())
26695 return Avg;
26696
26697 return combineVectorTruncation(N, DAG, Subtarget);
26698 }
26699
26700 /// Do target-specific dag combines on floating point negations.
26701 static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
26702 const X86Subtarget *Subtarget) {
26703 EVT VT = N->getValueType(0);
26704 EVT SVT = VT.getScalarType();
26705 SDValue Arg = N->getOperand(0);
26706 SDLoc DL(N);
26707
26708 // Let legalize expand this if it isn't a legal type yet.
26709 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
26710 return SDValue();
26711
26712 // If we're negating a FMUL node on a target with FMA, then we can avoid the
26713 // use of a constant by performing (-0 - A*B) instead.
26714 // FIXME: Check rounding control flags as well once they become available.
26715 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
26716 Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
26717 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
26718 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
26719 Arg.getOperand(1), Zero);
26720 }
26721
26722 // If we're negating a FMA node, then we can adjust the
26723 // instruction to include the extra negation.
26724 if (Arg.hasOneUse()) {
26725 switch (Arg.getOpcode()) {
26726 case X86ISD::FMADD:
26727 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
26728 Arg.getOperand(1), Arg.getOperand(2));
26729 case X86ISD::FMSUB:
26730 return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
26731 Arg.getOperand(1), Arg.getOperand(2));
26732 case X86ISD::FNMADD:
26733 return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
26734 Arg.getOperand(1), Arg.getOperand(2));
26735 case X86ISD::FNMSUB:
26736 return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
26737 Arg.getOperand(1), Arg.getOperand(2));
26738 }
26739 }
26740 return SDValue();
26741 }
26742
26743 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
26744 const X86Subtarget *Subtarget) {
26745 EVT VT = N->getValueType(0);
26746 if (VT.is512BitVector() && !Subtarget->hasDQI()) {
26747 // VXORPS, VORPS, VANDPS, VANDNPS are supported only under the DQ extension.
26748 // These logic operations may be executed in the integer domain.
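// E.g. (v16f32 X86ISD::FXOR a, b) is rewritten below as
// bitcast(v16i32 xor(bitcast a, bitcast b)), using an integer op such as
// VPXORD, which AVX-512F provides for zmm even without DQ.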
26749 SDLoc dl(N); 26750 MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); 26751 MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); 26752 26753 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); 26754 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); 26755 unsigned IntOpcode = 0; 26756 switch (N->getOpcode()) { 26757 default: llvm_unreachable("Unexpected FP logic op"); 26758 case X86ISD::FOR: IntOpcode = ISD::OR; break; 26759 case X86ISD::FXOR: IntOpcode = ISD::XOR; break; 26760 case X86ISD::FAND: IntOpcode = ISD::AND; break; 26761 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; 26762 } 26763 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); 26764 return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); 26765 } 26766 return SDValue(); 26767 } 26768 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. 26769 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, 26770 const X86Subtarget *Subtarget) { 26771 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 26772 26773 // F[X]OR(0.0, x) -> x 26774 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 26775 if (C->getValueAPF().isPosZero()) 26776 return N->getOperand(1); 26777 26778 // F[X]OR(x, 0.0) -> x 26779 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 26780 if (C->getValueAPF().isPosZero()) 26781 return N->getOperand(0); 26782 26783 return lowerX86FPLogicOp(N, DAG, Subtarget); 26784 } 26785 26786 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. 26787 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 26788 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 26789 26790 // Only perform optimizations if UnsafeMath is used. 26791 if (!DAG.getTarget().Options.UnsafeFPMath) 26792 return SDValue(); 26793 26794 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 26795 // into FMINC and FMAXC, which are Commutative operations. 26796 unsigned NewOp = 0; 26797 switch (N->getOpcode()) { 26798 default: llvm_unreachable("unknown opcode"); 26799 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 26800 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 26801 } 26802 26803 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 26804 N->getOperand(0), N->getOperand(1)); 26805 } 26806 26807 static SDValue performFMaxNumCombine(SDNode *N, SelectionDAG &DAG, 26808 const X86Subtarget *Subtarget) { 26809 // This takes at least 3 instructions, so favor a library call when 26810 // minimizing code size. 26811 if (DAG.getMachineFunction().getFunction()->optForMinSize()) 26812 return SDValue(); 26813 26814 EVT VT = N->getValueType(0); 26815 26816 // TODO: Check for global or instruction-level "nnan". In that case, we 26817 // should be able to lower to FMAX/FMIN alone. 26818 // TODO: If an operand is already known to be a NaN or not a NaN, this 26819 // should be an optional swap and FMAX/FMIN. 26820 // TODO: Allow f64, vectors, and fminnum. 
26821 26822 if (VT != MVT::f32 || !Subtarget->hasSSE1() || Subtarget->useSoftFloat()) 26823 return SDValue(); 26824 26825 SDValue Op0 = N->getOperand(0); 26826 SDValue Op1 = N->getOperand(1); 26827 SDLoc DL(N); 26828 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( 26829 DAG.getDataLayout(), *DAG.getContext(), VT); 26830 26831 // There are 4 possibilities involving NaN inputs, and these are the required 26832 // outputs: 26833 // Op1 26834 // Num NaN 26835 // ---------------- 26836 // Num | Max | Op0 | 26837 // Op0 ---------------- 26838 // NaN | Op1 | NaN | 26839 // ---------------- 26840 // 26841 // The SSE FP max/min instructions were not designed for this case, but rather 26842 // to implement: 26843 // Max = Op1 > Op0 ? Op1 : Op0 26844 // 26845 // So they always return Op0 if either input is a NaN. However, we can still 26846 // use those instructions for fmaxnum by selecting away a NaN input. 26847 26848 // If either operand is NaN, the 2nd source operand (Op0) is passed through. 26849 SDValue Max = DAG.getNode(X86ISD::FMAX, DL, VT, Op1, Op0); 26850 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); 26851 26852 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands 26853 // are NaN, the NaN value of Op1 is the result. 26854 return DAG.getNode(ISD::SELECT, DL, VT, IsOp0Nan, Op1, Max); 26855 } 26856 26857 /// Do target-specific dag combines on X86ISD::FAND nodes. 26858 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, 26859 const X86Subtarget *Subtarget) { 26860 // FAND(0.0, x) -> 0.0 26861 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 26862 if (C->getValueAPF().isPosZero()) 26863 return N->getOperand(0); 26864 26865 // FAND(x, 0.0) -> 0.0 26866 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 26867 if (C->getValueAPF().isPosZero()) 26868 return N->getOperand(1); 26869 26870 return lowerX86FPLogicOp(N, DAG, Subtarget); 26871 } 26872 26873 /// Do target-specific dag combines on X86ISD::FANDN nodes 26874 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, 26875 const X86Subtarget *Subtarget) { 26876 // FANDN(0.0, x) -> x 26877 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 26878 if (C->getValueAPF().isPosZero()) 26879 return N->getOperand(1); 26880 26881 // FANDN(x, 0.0) -> 0.0 26882 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 26883 if (C->getValueAPF().isPosZero()) 26884 return N->getOperand(1); 26885 26886 return lowerX86FPLogicOp(N, DAG, Subtarget); 26887 } 26888 26889 static SDValue PerformBTCombine(SDNode *N, 26890 SelectionDAG &DAG, 26891 TargetLowering::DAGCombinerInfo &DCI) { 26892 // BT ignores high bits in the bit index operand. 
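// E.g. for a BT with an i32 bit-index operand only the low 5 bits are
// demanded, so a mask such as (and x, 31) feeding the index can be
// simplified away here.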
26893 SDValue Op1 = N->getOperand(1);
26894 if (Op1.hasOneUse()) {
26895 unsigned BitWidth = Op1.getValueSizeInBits();
26896 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
26897 APInt KnownZero, KnownOne;
26898 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
26899 !DCI.isBeforeLegalizeOps());
26900 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26901 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
26902 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
26903 DCI.CommitTargetLoweringOpt(TLO);
26904 }
26905 return SDValue();
26906 }
26907
26908 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
26909 SDValue Op = N->getOperand(0);
26910 if (Op.getOpcode() == ISD::BITCAST)
26911 Op = Op.getOperand(0);
26912 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
26913 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
26914 VT.getVectorElementType().getSizeInBits() ==
26915 OpVT.getVectorElementType().getSizeInBits()) {
26916 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
26917 }
26918 return SDValue();
26919 }
26920
26921 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
26922 const X86Subtarget *Subtarget) {
26923 EVT VT = N->getValueType(0);
26924 if (!VT.isVector())
26925 return SDValue();
26926
26927 SDValue N0 = N->getOperand(0);
26928 SDValue N1 = N->getOperand(1);
26929 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
26930 SDLoc dl(N);
26931
26932 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
26933 // AVX2, since there is no arithmetic shift right operation on vectors
26934 // with 64-bit elements.
26935 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
26936 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
26937 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
26938 N0.getOpcode() == ISD::SIGN_EXTEND)) {
26939 SDValue N00 = N0.getOperand(0);
26940
26941 // EXTLOAD has a better solution on AVX2: it may be replaced with an
26942 // X86ISD::VSEXT node.
26943 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
26944 if (!ISD::isNormalLoad(N00.getNode()))
26945 return SDValue();
26946
26947 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
26948 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
26949 N00, N1);
26950 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
26951 }
26952 }
26953 return SDValue();
26954 }
26955
26956 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
26957 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
26958 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
26959 /// eliminate extend, add, and shift instructions.
26960 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
26961 const X86Subtarget *Subtarget) {
26962 // TODO: This should be valid for other integer types.
26963 EVT VT = Sext->getValueType(0);
26964 if (VT != MVT::i64)
26965 return SDValue();
26966
26967 // We need an 'add nsw' feeding into the 'sext'.
26968 SDValue Add = Sext->getOperand(0);
26969 if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
26970 return SDValue();
26971
26972 // Having a constant operand to the 'add' ensures that we are not increasing
26973 // the instruction count because the constant is extended for free below.
26974 // A constant operand can also become the displacement field of an LEA.
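// E.g. (i64 sext (i32 add nsw X, 5)) becomes (i64 add nsw (sext X), 5); a
// later add or shift user can then fold the whole expression into a single
// LEA such as lea 5(%rax,%rcx,4) (hypothetical registers, for illustration).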
26975 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); 26976 if (!AddOp1) 26977 return SDValue(); 26978 26979 // Don't make the 'add' bigger if there's no hope of combining it with some 26980 // other 'add' or 'shl' instruction. 26981 // TODO: It may be profitable to generate simpler LEA instructions in place 26982 // of single 'add' instructions, but the cost model for selecting an LEA 26983 // currently has a high threshold. 26984 bool HasLEAPotential = false; 26985 for (auto *User : Sext->uses()) { 26986 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { 26987 HasLEAPotential = true; 26988 break; 26989 } 26990 } 26991 if (!HasLEAPotential) 26992 return SDValue(); 26993 26994 // Everything looks good, so pull the 'sext' ahead of the 'add'. 26995 int64_t AddConstant = AddOp1->getSExtValue(); 26996 SDValue AddOp0 = Add.getOperand(0); 26997 SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); 26998 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); 26999 27000 // The wider add is guaranteed to not wrap because both operands are 27001 // sign-extended. 27002 SDNodeFlags Flags; 27003 Flags.setNoSignedWrap(true); 27004 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); 27005 } 27006 27007 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 27008 TargetLowering::DAGCombinerInfo &DCI, 27009 const X86Subtarget *Subtarget) { 27010 SDValue N0 = N->getOperand(0); 27011 EVT VT = N->getValueType(0); 27012 EVT SVT = VT.getScalarType(); 27013 EVT InVT = N0.getValueType(); 27014 EVT InSVT = InVT.getScalarType(); 27015 SDLoc DL(N); 27016 27017 // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> 27018 // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) 27019 // This exposes the sext to the sdivrem lowering, so that it directly extends 27020 // from AH (which we otherwise need to do contortions to access). 27021 if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && 27022 InVT == MVT::i8 && VT == MVT::i32) { 27023 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); 27024 SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, 27025 N0.getOperand(0), N0.getOperand(1)); 27026 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); 27027 return R.getValue(1); 27028 } 27029 27030 if (!DCI.isBeforeLegalizeOps()) { 27031 if (InVT == MVT::i1) { 27032 SDValue Zero = DAG.getConstant(0, DL, VT); 27033 SDValue AllOnes = 27034 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); 27035 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); 27036 } 27037 return SDValue(); 27038 } 27039 27040 if (VT.isVector() && Subtarget->hasSSE2()) { 27041 auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { 27042 EVT InVT = N.getValueType(); 27043 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), 27044 Size / InVT.getScalarSizeInBits()); 27045 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), 27046 DAG.getUNDEF(InVT)); 27047 Opnds[0] = N; 27048 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); 27049 }; 27050 27051 // If target-size is less than 128-bits, extend to a type that would extend 27052 // to 128 bits, extend that and extract the original target vector. 
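// Worked example: v2i16 -> v2i32. Scale = 128/64 = 2, so the source is
// widened to v4i16 (undef upper half), sign-extended to v4i32, and the low
// v2i32 subvector extracted.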
    if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) &&
        (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
        (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
      unsigned Scale = 128 / VT.getSizeInBits();
      EVT ExVT =
          EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
      SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
      SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                         DAG.getIntPtrConstant(0, DL));
    }

    // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG
    // which ensures lowering to X86ISD::VSEXT (pmovsx*).
    if (VT.getSizeInBits() == 128 &&
        (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
        (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
      SDValue ExOp = ExtendVecSize(DL, N0, 128);
      return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
    }

    // On pre-AVX2 targets, split into 128-bit nodes of
    // ISD::SIGN_EXTEND_VECTOR_INREG.
    if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
        (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
        (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
      unsigned NumVecs = VT.getSizeInBits() / 128;
      unsigned NumSubElts = 128 / SVT.getSizeInBits();
      EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
      EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

      SmallVector<SDValue, 8> Opnds;
      for (unsigned i = 0, Offset = 0; i != NumVecs;
           ++i, Offset += NumSubElts) {
        SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                     DAG.getIntPtrConstant(Offset, DL));
        SrcVec = ExtendVecSize(DL, SrcVec, 128);
        SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
        Opnds.push_back(SrcVec);
      }
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
    }
  }

  if (Subtarget->hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  bool NegA = (A.getOpcode() == ISD::FNEG);
  bool NegB = (B.getOpcode() == ISD::FNEG);
  bool NegC = (C.getOpcode() == ISD::FNEG);

  // The multiplication is negated when exactly one of A and B is negated.
  bool NegMul = (NegA != NegB);
  if (NegA)
    A = A.getOperand(0);
  if (NegB)
    B = B.getOperand(0);
  if (NegC)
    C = C.getOperand(0);

  unsigned Opcode;
  if (!NegMul)
    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  return DAG.getNode(Opcode, dl, VT, A, B, C);
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  // (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
  // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
  // This exposes the zext to the udivrem lowering, so that it directly extends
  // from AH (which we otherwise need to do contortions to access).
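  // On x86, the 8-bit remainder of a DIV/IDIV is produced in AH; reading it
  // as a wider value normally needs an extra move or shift out of AX, which
  // the combined node avoids by extending straight from the high register.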
  if (N0.getOpcode() == ISD::UDIVREM &&
      N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
      (VT == MVT::i32 || VT == MVT::i64)) {
    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
    SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
                            N0.getOperand(0), N0.getOperand(1));
    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
    return R.getValue(1);
  }

  return SDValue();
}

// Optimize x == -y --> x+y == 0
//          x != -y --> x+y != 0
static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget* Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
                                 LHS.getOperand(1));
      return DAG.getSetCC(DL, N->getValueType(0), addV,
                          DAG.getConstant(0, DL, addV.getValueType()), CC);
    }
  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
                                 RHS.getOperand(1));
      return DAG.getSetCC(DL, N->getValueType(0), addV,
                          DAG.getConstant(0, DL, addV.getValueType()), CC);
    }

  if (VT.getScalarType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!IsSEXT0 || !IsVZero1) {
      // Swap the operands and update the condition code.
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);

      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
    }

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  return SDValue();
}

static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
  SDValue V0 = N->getOperand(0);
  SDValue V1 = N->getOperand(1);
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
  // operands and changing the mask to 1. This saves us a bunch of
  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
  // x86InstrInfo knows how to commute this back after instruction selection
  // if it would help register allocation.
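  //
  // e.g. (v2f64 blendi V0, V1, 2) selects <V0[0],V1[1]>; after swapping the
  // operands, (v2f64 blendi V1, V0, 1) selects exactly the same elements.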

  // TODO: If optimizing for size or a processor that doesn't suffer from
  // partial register update stalls, this should be transformed into a MOVSD
  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

  if (VT == MVT::v2f64)
    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
        SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
      }

  return SDValue();
}

static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }
  return SDValue();
}

// Helper function of PerformSETCCCombine. It materializes "setb reg"
// as "sbb reg,reg", since it can be extended without a zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
                               MVT VT) {
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                   EFLAGS),
                       DAG.getConstant(1, DL, VT));
  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                 EFLAGS));
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  if (CC == X86::COND_A) {
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because the Cmp
    // instruction cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
    }
  }

  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
  // a zext and produces an all-ones bit which is more useful than 0/1 in some
  // cases.
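  // e.g. SETB writes 0 or 1 into a byte register, while SBB %reg,%reg turns
  // the carry flag into 0 or all-ones directly at the full register width.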
  if (CC == X86::COND_B)
    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));

  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
  }

  return SDValue();
}

// Optimize branch condition evaluation.
static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
                       Flags);
  }

  return SDValue();
}

static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
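    // (SourceConst folds the unary op applied to the constant vector in the
    // result type VT; bitcasting it back to IntVT keeps the replacement AND
    // in the integer domain.)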
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);

    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  return SDValue();
}

static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16.
    if (VT == MVT::f16)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget->is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                 X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
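  // i.e. (adc 0, 0, CF) --> (and (setcc_carry CF), 1), assuming the EFLAGS
  // result of the ADC is unused.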
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this
      // when it's dead right now.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

// fold (add Y, (sete  X, 0)) -> adc  0, Y
//      (add Y, (setne X, 0)) -> sbb -1, Y
//      (sub (sete  X, 0), Y) -> sbb  0, Y
//      (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // Look through ZExts.
  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
    return SDValue();

  SDValue SetCC = Ext.getOperand(0);
  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = SetCC.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue CmpOp0 = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
                               DAG.getConstant(1, DL, CmpOp0.getValueType()));

  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
  if (CC == X86::COND_NE)
    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
                       DL, OtherVal.getValueType(), OtherVal,
                       DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
                       NewCmp);
  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
                     DL, OtherVal.getValueType(), OtherVal,
                     DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
}

/// PerformAddCombine - Do target-specific DAG combines on integer adds.
static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate, then add one to the LHS of the sub so we can turn
    // X-Y into X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

/// performVZEXTCombine - Performs combines on X86ISD::VZEXT nodes.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  MVT VT = N->getSimpleValueType(0);
  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();

  // (vzext (bitcast (vzext x))) -> (vzext x)
  SDValue V = Op;
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  //   (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
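      // (only the low element feeds the vzext, so it is safe to keep just
      // the low chunk of the original vector).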
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND:
    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
  case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
  case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
  case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
  case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
  case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
  case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
  case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
  case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
  case ISD::FMAXNUM: return performFMaxNumCombine(N, DAG, Subtarget);
  case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget);
  case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
  case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
  case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
  case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
  case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::PALIGNR:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE:
    return PerformShuffleCombine(N, DAG, DCI, Subtarget);
  case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
  case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
  case ISD::MGATHER:
  case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for the DAG combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then
    // it might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than it being
        // promoted as an operand) is when its only use is liveout.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
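    // e.g. promoting (i16 add (load p), imm) to i32 can stop the i16 load
    // from folding into the arithmetic instruction when the result also
    // feeds a store.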
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  std::string AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
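      // LowerToByteSwap rewrites the call to the equivalent llvm.bswap
      // intrinsic, which the backend can then select to a native BSWAP.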
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    // fallthrough
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':
  case 'Y':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
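/// For example, the 'I' constraint only accepts immediates in [0,31] (shift
/// counts for 32-bit operations), so any other operand falls through to the
/// generic target-independent handling.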
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(
            Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
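      // In 64-bit mode any GR8/GR16/GR32/GR64 register satisfies 'q'; in
      // 32-bit mode only the ABCD registers do, so fall through to 'Q' below.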
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (Res.second->hasType(VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
      Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
    unsigned Size = VT.getSizeInBits();
    MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8
                                  : Size == 16 ? MVT::i16
                                  : Size == 32 ? MVT::i32
                                  : Size == 64 ? MVT::i64
                                  : MVT::Other;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass
                 : SimpleTy == MVT::i16 ? &X86::GR16RegClass
                 : SimpleTy == MVT::i32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
             Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
             Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
             Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
             Class == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}

int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e., inst (reg1).
  // E.g.,
  //   vaddps (%rsi, %rdx), %ymm0, %ymm1
  // requires two allocations (one for the load, one for the computation),
  // whereas:
  //   vaddps (%rsi), %ymm0, %ymm1
  // requires just one allocation, freeing allocations for other operations
  // and leaving fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because, for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  //   vmovaps %ymm1, (%r8)       can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
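  // (The alternative sequence here is the multiply-by-magic-constant
  // expansion the generic DAG combiner uses for division by a constant.)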
  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
                                   Attribute::MinSize);
  return OptSize && !VT.isVector();
}

void X86TargetLowering::markInRegArguments(SelectionDAG &DAG,
                                           TargetLowering::ArgListTy &Args) const {
  // The MCU psABI requires some arguments to be passed in-register.
  // For regular calls, the inreg arguments are marked by the front-end.
  // However, for compiler generated library calls, we have to patch this
  // up here.
  if (!Subtarget->isTargetMCU() || !Args.size())
    return;

  unsigned FreeRegs = 3;
  for (auto &Arg : Args) {
    // For library functions, we do not expect any fancy types.
    unsigned Size = DAG.getDataLayout().getTypeSizeInBits(Arg.Ty);
    unsigned SizeInRegs = (Size + 31) / 32;
    if (SizeInRegs > 2 || SizeInRegs > FreeRegs)
      continue;

    Arg.isInReg = true;
    FreeRegs -= SizeInRegs;
    if (!FreeRegs)
      break;
  }
}