//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "X86IntrinsicsInfo.h"
#include <bitset>
#include <numeric>
#include <cctype>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 16);
  }

  if (Subtarget.isTargetKnownWindowsMSVC()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
    else
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
    setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
    setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
  } else {
    setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuations, user-level threading, etc. As a result, no
  // other SjLj exception interfaces are implemented; please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool , VT, Custom);
    setOperationAction(ISD::JumpTable , VT, Custom);
    setOperationAction(ISD::GlobalAddress , VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol , VT, Custom);
    setOperationAction(ISD::BlockAddress , VT, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
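
  // Added note: in 64-bit mode VAARG/VACOPY get custom lowering below because
  // the SysV x86-64 va_list is a structure (gp/fp offsets, overflow area,
  // register save area) rather than a plain pointer, so the generic
  // pointer-bump expansion used for 32-bit mode is not sufficient.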

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN , VT, Expand);
      setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64, Expand);
      setOperationAction(ISD::FCOS , MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN , VT, Expand);
        setOperationAction(ISD::FCOS , VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS , MVT::f128, Custom);
      setOperationAction(ISD::FNEG , MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80, Expand);
      setOperationAction(ISD::FCOS , MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    // ISD::CTTZ v2i64 - scalarization is faster.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget.is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    // ISD::CTLZ v4i32 - scalarization is faster.
    // ISD::CTLZ v2i64 - scalarization is faster.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
    // as we end up splitting the 256-bit vectors.
    for (auto VT : { MVT::v32i8, MVT::v16i16 })
      setOperationAction(ISD::CTLZ, VT, Custom);

    if (HasInt256)
      for (auto VT : { MVT::v8i32, MVT::v4i64 })
        setOperationAction(ISD::CTLZ, VT, Custom);

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);

      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }
    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
    setOperationAction(ISD::SETCC, MVT::i1, Custom);
    setOperationAction(ISD::SETCCE, MVT::i1, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
    setOperationAction(ISD::XOR, MVT::i1, Legal);
    setOperationAction(ISD::OR, MVT::i1, Legal);
    setOperationAction(ISD::AND, MVT::i1, Legal);
    setOperationAction(ISD::SUB, MVT::i1, Custom);
    setOperationAction(ISD::ADD, MVT::i1, Custom);
    setOperationAction(ISD::MUL, MVT::i1, Custom);

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
      setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
      setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
      setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
    }
    setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
      if (Subtarget.hasVLX()) {
        setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
        setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
        setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
        setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
        setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
        setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);

      // FIXME: These operations are available on SSE/AVX2 as well; add the
      // relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    }
    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i1, Custom);

    setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
    setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
setOperationAction(ISD::UMIN, MVT::v16i32, Legal); 1319 setOperationAction(ISD::UMIN, MVT::v8i64, Legal); 1320 1321 setOperationAction(ISD::ADD, MVT::v8i1, Expand); 1322 setOperationAction(ISD::ADD, MVT::v16i1, Expand); 1323 setOperationAction(ISD::SUB, MVT::v8i1, Expand); 1324 setOperationAction(ISD::SUB, MVT::v16i1, Expand); 1325 setOperationAction(ISD::MUL, MVT::v8i1, Expand); 1326 setOperationAction(ISD::MUL, MVT::v16i1, Expand); 1327 1328 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 1329 1330 for (auto VT : { MVT::v16i32, MVT::v8i64 }) { 1331 setOperationAction(ISD::SRL, VT, Custom); 1332 setOperationAction(ISD::SHL, VT, Custom); 1333 setOperationAction(ISD::SRA, VT, Custom); 1334 setOperationAction(ISD::AND, VT, Legal); 1335 setOperationAction(ISD::OR, VT, Legal); 1336 setOperationAction(ISD::XOR, VT, Legal); 1337 setOperationAction(ISD::CTPOP, VT, Custom); 1338 setOperationAction(ISD::CTTZ, VT, Custom); 1339 } 1340 1341 if (Subtarget.hasCDI()) { 1342 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); 1343 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); 1344 1345 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); 1346 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); 1347 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); 1348 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); 1349 1350 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); 1351 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); 1352 1353 if (Subtarget.hasVLX()) { 1354 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); 1355 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); 1356 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); 1357 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); 1358 } else { 1359 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); 1360 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); 1361 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); 1362 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); 1363 } 1364 1365 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); 1366 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); 1367 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 1368 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 1369 } // Subtarget.hasCDI() 1370 1371 if (Subtarget.hasDQI()) { 1372 if (Subtarget.hasVLX()) { 1373 setOperationAction(ISD::MUL, MVT::v2i64, Legal); 1374 setOperationAction(ISD::MUL, MVT::v4i64, Legal); 1375 } 1376 setOperationAction(ISD::MUL, MVT::v8i64, Legal); 1377 } 1378 // Custom lower several nodes. 1379 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, 1380 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { 1381 setOperationAction(ISD::MGATHER, VT, Custom); 1382 setOperationAction(ISD::MSCATTER, VT, Custom); 1383 } 1384 // Extract subvector is special because the value type 1385 // (result) is 256-bit but the source is 512-bit wide. 1386 // 128-bit was made Custom under AVX1. 
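    // Rough sketch of the node shape this covers (for illustration only):
    //   t0: v16f32 = ...
    //   t1: v8f32 = extract_subvector t0, Constant:i64<8>
    // i.e. pulling a 256-bit half out of a 512-bit register, which should be
    // lowered to a register-to-register subvector extract (e.g. a
    // VEXTRACTF64x4-style instruction) rather than going through memory.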
1387 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, 1388 MVT::v8f32, MVT::v4f64 }) 1389 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1390 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, 1391 MVT::v16i1, MVT::v32i1, MVT::v64i1 }) 1392 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 1393 1394 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { 1395 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1396 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1397 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1398 setOperationAction(ISD::VSELECT, VT, Legal); 1399 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1400 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1401 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1402 setOperationAction(ISD::MLOAD, VT, Legal); 1403 setOperationAction(ISD::MSTORE, VT, Legal); 1404 setOperationAction(ISD::MGATHER, VT, Legal); 1405 setOperationAction(ISD::MSCATTER, VT, Custom); 1406 } 1407 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { 1408 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64); 1409 } 1410 }// has AVX-512 1411 1412 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { 1413 addRegisterClass(MVT::v32i16, &X86::VR512RegClass); 1414 addRegisterClass(MVT::v64i8, &X86::VR512RegClass); 1415 1416 addRegisterClass(MVT::v32i1, &X86::VK32RegClass); 1417 addRegisterClass(MVT::v64i1, &X86::VK64RegClass); 1418 1419 setOperationAction(ISD::ADD, MVT::v32i1, Expand); 1420 setOperationAction(ISD::ADD, MVT::v64i1, Expand); 1421 setOperationAction(ISD::SUB, MVT::v32i1, Expand); 1422 setOperationAction(ISD::SUB, MVT::v64i1, Expand); 1423 setOperationAction(ISD::MUL, MVT::v32i1, Expand); 1424 setOperationAction(ISD::MUL, MVT::v64i1, Expand); 1425 1426 setOperationAction(ISD::SETCC, MVT::v32i1, Custom); 1427 setOperationAction(ISD::SETCC, MVT::v64i1, Custom); 1428 setOperationAction(ISD::MUL, MVT::v32i16, Legal); 1429 setOperationAction(ISD::MUL, MVT::v64i8, Custom); 1430 setOperationAction(ISD::MULHS, MVT::v32i16, Legal); 1431 setOperationAction(ISD::MULHU, MVT::v32i16, Legal); 1432 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); 1433 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); 1434 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); 1435 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); 1436 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); 1437 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); 1438 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); 1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); 1440 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); 1441 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); 1442 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); 1443 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); 1444 setOperationAction(ISD::SELECT, MVT::v32i1, Custom); 1445 setOperationAction(ISD::SELECT, MVT::v64i1, Custom); 1446 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); 1447 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); 1448 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); 1449 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); 1450 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); 1451 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); 1452 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); 1453 
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); 1454 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); 1455 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); 1456 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); 1457 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); 1458 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); 1459 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); 1460 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); 1461 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); 1462 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); 1463 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); 1464 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); 1465 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); 1466 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom); 1467 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom); 1468 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand); 1469 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand); 1470 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); 1471 1472 setOperationAction(ISD::SMAX, MVT::v64i8, Legal); 1473 setOperationAction(ISD::SMAX, MVT::v32i16, Legal); 1474 setOperationAction(ISD::UMAX, MVT::v64i8, Legal); 1475 setOperationAction(ISD::UMAX, MVT::v32i16, Legal); 1476 setOperationAction(ISD::SMIN, MVT::v64i8, Legal); 1477 setOperationAction(ISD::SMIN, MVT::v32i16, Legal); 1478 setOperationAction(ISD::UMIN, MVT::v64i8, Legal); 1479 setOperationAction(ISD::UMIN, MVT::v32i16, Legal); 1480 1481 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); 1482 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); 1483 if (Subtarget.hasVLX()) 1484 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); 1485 1486 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom; 1487 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { 1488 setOperationAction(ISD::MLOAD, VT, Action); 1489 setOperationAction(ISD::MSTORE, VT, Action); 1490 } 1491 1492 if (Subtarget.hasCDI()) { 1493 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); 1494 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); 1495 } 1496 1497 for (auto VT : { MVT::v64i8, MVT::v32i16 }) { 1498 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1499 setOperationAction(ISD::VSELECT, VT, Legal); 1500 setOperationAction(ISD::SRL, VT, Custom); 1501 setOperationAction(ISD::SHL, VT, Custom); 1502 setOperationAction(ISD::SRA, VT, Custom); 1503 setOperationAction(ISD::MLOAD, VT, Legal); 1504 setOperationAction(ISD::MSTORE, VT, Legal); 1505 setOperationAction(ISD::CTPOP, VT, Custom); 1506 setOperationAction(ISD::CTTZ, VT, Custom); 1507 1508 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); 1509 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); 1510 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); 1511 } 1512 1513 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { 1514 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); 1515 if (Subtarget.hasVLX()) { 1516 // FIXME. This commands are available on SSE/AVX2, add relevant patterns. 
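        // Hedged example of what the calls below enable with BWI+VLX: for a
        // byte-to-word extending load such as
        //   %b = load <16 x i8>, <16 x i8>* %p
        //   %w = sext <16 x i8> %b to <16 x i16>
        // the load and extend can stay fused and select to a single
        // VPMOVSXBW/VPMOVZXBW-style instruction with a memory operand.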
1517 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal); 1518 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal); 1519 } 1520 } 1521 } 1522 1523 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { 1524 addRegisterClass(MVT::v4i1, &X86::VK4RegClass); 1525 addRegisterClass(MVT::v2i1, &X86::VK2RegClass); 1526 1527 setOperationAction(ISD::ADD, MVT::v2i1, Expand); 1528 setOperationAction(ISD::ADD, MVT::v4i1, Expand); 1529 setOperationAction(ISD::SUB, MVT::v2i1, Expand); 1530 setOperationAction(ISD::SUB, MVT::v4i1, Expand); 1531 setOperationAction(ISD::MUL, MVT::v2i1, Expand); 1532 setOperationAction(ISD::MUL, MVT::v4i1, Expand); 1533 1534 setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); 1535 setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); 1536 setOperationAction(ISD::SETCC, MVT::v4i1, Custom); 1537 setOperationAction(ISD::SETCC, MVT::v2i1, Custom); 1538 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); 1539 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 1540 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); 1541 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); 1542 setOperationAction(ISD::SELECT, MVT::v4i1, Custom); 1543 setOperationAction(ISD::SELECT, MVT::v2i1, Custom); 1544 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); 1545 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); 1546 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); 1547 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); 1548 setOperationAction(ISD::VSELECT, MVT::v2i1, Expand); 1549 setOperationAction(ISD::VSELECT, MVT::v4i1, Expand); 1550 1551 for (auto VT : { MVT::v4i32, MVT::v8i32 }) { 1552 setOperationAction(ISD::AND, VT, Legal); 1553 setOperationAction(ISD::OR, VT, Legal); 1554 setOperationAction(ISD::XOR, VT, Legal); 1555 } 1556 1557 for (auto VT : { MVT::v2i64, MVT::v4i64 }) { 1558 setOperationAction(ISD::SMAX, VT, Legal); 1559 setOperationAction(ISD::UMAX, VT, Legal); 1560 setOperationAction(ISD::SMIN, VT, Legal); 1561 setOperationAction(ISD::UMIN, VT, Legal); 1562 } 1563 } 1564 1565 // We want to custom lower some of our intrinsics. 1566 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1567 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 1568 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1569 if (!Subtarget.is64Bit()) { 1570 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); 1571 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1572 } 1573 1574 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1575 // handle type legalization for these operations here. 1576 // 1577 // FIXME: We really should do custom legalization for addition and 1578 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1579 // than generic legalization for 64-bit multiplication-with-overflow, though. 1580 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { 1581 if (VT == MVT::i64 && !Subtarget.is64Bit()) 1582 continue; 1583 // Add/Sub/Mul with overflow operations are custom lowered. 1584 setOperationAction(ISD::SADDO, VT, Custom); 1585 setOperationAction(ISD::UADDO, VT, Custom); 1586 setOperationAction(ISD::SSUBO, VT, Custom); 1587 setOperationAction(ISD::USUBO, VT, Custom); 1588 setOperationAction(ISD::SMULO, VT, Custom); 1589 setOperationAction(ISD::UMULO, VT, Custom); 1590 } 1591 1592 if (!Subtarget.is64Bit()) { 1593 // These libcalls are not available in 32-bit. 
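    // Hedged illustration: clearing the names below means that an i128 shift
    // on a 32-bit target is expanded by the legalizer into operations on
    // smaller integer pieces instead of emitting a call to a runtime helper
    // such as __ashlti3, which 32-bit runtimes generally do not provide.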
1594 setLibcallName(RTLIB::SHL_I128, nullptr); 1595 setLibcallName(RTLIB::SRL_I128, nullptr); 1596 setLibcallName(RTLIB::SRA_I128, nullptr); 1597 } 1598 1599 // Combine sin / cos into one node or libcall if possible. 1600 if (Subtarget.hasSinCos()) { 1601 setLibcallName(RTLIB::SINCOS_F32, "sincosf"); 1602 setLibcallName(RTLIB::SINCOS_F64, "sincos"); 1603 if (Subtarget.isTargetDarwin()) { 1604 // For MacOSX, we don't want the normal expansion of a libcall to sincos. 1605 // We want to issue a libcall to __sincos_stret to avoid memory traffic. 1606 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1607 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1608 } 1609 } 1610 1611 if (Subtarget.isTargetWin64()) { 1612 setOperationAction(ISD::SDIV, MVT::i128, Custom); 1613 setOperationAction(ISD::UDIV, MVT::i128, Custom); 1614 setOperationAction(ISD::SREM, MVT::i128, Custom); 1615 setOperationAction(ISD::UREM, MVT::i128, Custom); 1616 setOperationAction(ISD::SDIVREM, MVT::i128, Custom); 1617 setOperationAction(ISD::UDIVREM, MVT::i128, Custom); 1618 } 1619 1620 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` 1621 // is. We should promote the value to 64-bits to solve this. 1622 // This is what the CRT headers do - `fmodf` is an inline header 1623 // function casting to f64 and calling `fmod`. 1624 if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC()) 1625 for (ISD::NodeType Op : 1626 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, 1627 ISD::FLOG10, ISD::FPOW, ISD::FSIN}) 1628 if (isOperationExpand(Op, MVT::f32)) 1629 setOperationAction(Op, MVT::f32, Promote); 1630 1631 // We have target-specific dag combine patterns for the following nodes: 1632 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1633 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1634 setTargetDAGCombine(ISD::BITCAST); 1635 setTargetDAGCombine(ISD::VSELECT); 1636 setTargetDAGCombine(ISD::SELECT); 1637 setTargetDAGCombine(ISD::SHL); 1638 setTargetDAGCombine(ISD::SRA); 1639 setTargetDAGCombine(ISD::SRL); 1640 setTargetDAGCombine(ISD::OR); 1641 setTargetDAGCombine(ISD::AND); 1642 setTargetDAGCombine(ISD::ADD); 1643 setTargetDAGCombine(ISD::FADD); 1644 setTargetDAGCombine(ISD::FSUB); 1645 setTargetDAGCombine(ISD::FNEG); 1646 setTargetDAGCombine(ISD::FMA); 1647 setTargetDAGCombine(ISD::FMINNUM); 1648 setTargetDAGCombine(ISD::FMAXNUM); 1649 setTargetDAGCombine(ISD::SUB); 1650 setTargetDAGCombine(ISD::LOAD); 1651 setTargetDAGCombine(ISD::MLOAD); 1652 setTargetDAGCombine(ISD::STORE); 1653 setTargetDAGCombine(ISD::MSTORE); 1654 setTargetDAGCombine(ISD::TRUNCATE); 1655 setTargetDAGCombine(ISD::ZERO_EXTEND); 1656 setTargetDAGCombine(ISD::ANY_EXTEND); 1657 setTargetDAGCombine(ISD::SIGN_EXTEND); 1658 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 1659 setTargetDAGCombine(ISD::SINT_TO_FP); 1660 setTargetDAGCombine(ISD::UINT_TO_FP); 1661 setTargetDAGCombine(ISD::SETCC); 1662 setTargetDAGCombine(ISD::MUL); 1663 setTargetDAGCombine(ISD::XOR); 1664 setTargetDAGCombine(ISD::MSCATTER); 1665 setTargetDAGCombine(ISD::MGATHER); 1666 1667 computeRegisterProperties(Subtarget.getRegisterInfo()); 1668 1669 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1670 MaxStoresPerMemsetOptSize = 8; 1671 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1672 MaxStoresPerMemcpyOptSize = 4; 1673 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1674 MaxStoresPerMemmoveOptSize = 4; 1675 setPrefLoopAlignment(4); // 2^4 bytes. 
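  // Worked example of the thresholds above (assumed-typical, not taken from a
  // specific test): with MaxStoresPerMemset == 16 and SSE2 available, a call
  // like
  //   call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 128, i32 16, i1 false)
  // fits in eight 16-byte stores and is expanded inline; a memset that would
  // need more stores than the limit is left as a call to the libc routine.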
1676 1677 // An out-of-order CPU can speculatively execute past a predictable branch, 1678 // but a conditional move could be stalled by an expensive earlier operation. 1679 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); 1680 EnableExtLdPromotion = true; 1681 setPrefFunctionAlignment(4); // 2^4 bytes. 1682 1683 verifyIntrinsicTables(); 1684 } 1685 1686 // This has so far only been implemented for 64-bit MachO. 1687 bool X86TargetLowering::useLoadStackGuardNode() const { 1688 return Subtarget.isTargetMachO() && Subtarget.is64Bit(); 1689 } 1690 1691 TargetLoweringBase::LegalizeTypeAction 1692 X86TargetLowering::getPreferredVectorAction(EVT VT) const { 1693 if (ExperimentalVectorWideningLegalization && 1694 VT.getVectorNumElements() != 1 && 1695 VT.getVectorElementType().getSimpleVT() != MVT::i1) 1696 return TypeWidenVector; 1697 1698 return TargetLoweringBase::getPreferredVectorAction(VT); 1699 } 1700 1701 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, 1702 LLVMContext& Context, 1703 EVT VT) const { 1704 if (!VT.isVector()) 1705 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8; 1706 1707 if (VT.isSimple()) { 1708 MVT VVT = VT.getSimpleVT(); 1709 const unsigned NumElts = VVT.getVectorNumElements(); 1710 MVT EltVT = VVT.getVectorElementType(); 1711 if (VVT.is512BitVector()) { 1712 if (Subtarget.hasAVX512()) 1713 if (EltVT == MVT::i32 || EltVT == MVT::i64 || 1714 EltVT == MVT::f32 || EltVT == MVT::f64) 1715 switch(NumElts) { 1716 case 8: return MVT::v8i1; 1717 case 16: return MVT::v16i1; 1718 } 1719 if (Subtarget.hasBWI()) 1720 if (EltVT == MVT::i8 || EltVT == MVT::i16) 1721 switch(NumElts) { 1722 case 32: return MVT::v32i1; 1723 case 64: return MVT::v64i1; 1724 } 1725 } 1726 1727 if (Subtarget.hasBWI() && Subtarget.hasVLX()) 1728 return MVT::getVectorVT(MVT::i1, NumElts); 1729 1730 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) { 1731 EVT LegalVT = getTypeToTransformTo(Context, VT); 1732 EltVT = LegalVT.getVectorElementType().getSimpleVT(); 1733 } 1734 1735 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32) 1736 switch(NumElts) { 1737 case 2: return MVT::v2i1; 1738 case 4: return MVT::v4i1; 1739 case 8: return MVT::v8i1; 1740 } 1741 } 1742 1743 return VT.changeVectorElementTypeToInteger(); 1744 } 1745 1746 /// Helper for getByValTypeAlignment to determine 1747 /// the desired ByVal argument alignment. 1748 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1749 if (MaxAlign == 16) 1750 return; 1751 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1752 if (VTy->getBitWidth() == 128) 1753 MaxAlign = 16; 1754 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1755 unsigned EltAlign = 0; 1756 getMaxByValAlign(ATy->getElementType(), EltAlign); 1757 if (EltAlign > MaxAlign) 1758 MaxAlign = EltAlign; 1759 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1760 for (auto *EltTy : STy->elements()) { 1761 unsigned EltAlign = 0; 1762 getMaxByValAlign(EltTy, EltAlign); 1763 if (EltAlign > MaxAlign) 1764 MaxAlign = EltAlign; 1765 if (MaxAlign == 16) 1766 break; 1767 } 1768 } 1769 } 1770 1771 /// Return the desired alignment for ByVal aggregate 1772 /// function arguments in the caller parameter area. For X86, aggregates 1773 /// that contain SSE vectors are placed at 16-byte boundaries while the rest 1774 /// are at 4-byte boundaries. 1775 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, 1776 const DataLayout &DL) const { 1777 if (Subtarget.is64Bit()) { 1778 // Max of 8 and alignment of type. 
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// Returns the target specific optimal type for load and store operations as
/// a result of memset, memcpy, and memmove lowering. If DstAlign is zero that
/// means it's safe because the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero it means there is no need to
/// check it against an alignment requirement, probably because the source
/// does not need to be loaded. If 'IsMemset' is true, that means it's
/// expanding a memset. If 'ZeroMemset' is true, that means it's a memset of
/// zero. 'MemcpyStrSrc' indicates whether the memcpy source is constant so it
/// does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if the source is a string constant.
      // It's better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
1864 *Fast = true; 1865 break; 1866 case 128: 1867 *Fast = !Subtarget.isUnalignedMem16Slow(); 1868 break; 1869 case 256: 1870 *Fast = !Subtarget.isUnalignedMem32Slow(); 1871 break; 1872 // TODO: What about AVX-512 (512-bit) accesses? 1873 } 1874 } 1875 // Misaligned accesses of any size are always allowed. 1876 return true; 1877 } 1878 1879 /// Return the entry encoding for a jump table in the 1880 /// current function. The returned value is a member of the 1881 /// MachineJumpTableInfo::JTEntryKind enum. 1882 unsigned X86TargetLowering::getJumpTableEncoding() const { 1883 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1884 // symbol. 1885 if (isPositionIndependent() && Subtarget.isPICStyleGOT()) 1886 return MachineJumpTableInfo::EK_Custom32; 1887 1888 // Otherwise, use the normal jump table encoding heuristics. 1889 return TargetLowering::getJumpTableEncoding(); 1890 } 1891 1892 bool X86TargetLowering::useSoftFloat() const { 1893 return Subtarget.useSoftFloat(); 1894 } 1895 1896 const MCExpr * 1897 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1898 const MachineBasicBlock *MBB, 1899 unsigned uid,MCContext &Ctx) const{ 1900 assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); 1901 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1902 // entries. 1903 return MCSymbolRefExpr::create(MBB->getSymbol(), 1904 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1905 } 1906 1907 /// Returns relocation base for the given PIC jumptable. 1908 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1909 SelectionDAG &DAG) const { 1910 if (!Subtarget.is64Bit()) 1911 // This doesn't have SDLoc associated with it, but is not really the 1912 // same as a Register. 1913 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), 1914 getPointerTy(DAG.getDataLayout())); 1915 return Table; 1916 } 1917 1918 /// This returns the relocation base for the given PIC jumptable, 1919 /// the same as getPICJumpTableRelocBase, but as an MCExpr. 1920 const MCExpr *X86TargetLowering:: 1921 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1922 MCContext &Ctx) const { 1923 // X86-64 uses RIP relative addressing based on the jump table label. 1924 if (Subtarget.isPICStyleRIPRel()) 1925 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1926 1927 // Otherwise, the reference is relative to the PIC base. 1928 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); 1929 } 1930 1931 std::pair<const TargetRegisterClass *, uint8_t> 1932 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1933 MVT VT) const { 1934 const TargetRegisterClass *RRC = nullptr; 1935 uint8_t Cost = 1; 1936 switch (VT.SimpleTy) { 1937 default: 1938 return TargetLowering::findRepresentativeClass(TRI, VT); 1939 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1940 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; 1941 break; 1942 case MVT::x86mmx: 1943 RRC = &X86::VR64RegClass; 1944 break; 1945 case MVT::f32: case MVT::f64: 1946 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1947 case MVT::v4f32: case MVT::v2f64: 1948 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1949 case MVT::v4f64: 1950 RRC = &X86::VR128RegClass; 1951 break; 1952 } 1953 return std::make_pair(RRC, Cost); 1954 } 1955 1956 unsigned X86TargetLowering::getAddressSpace() const { 1957 if (Subtarget.is64Bit()) 1958 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 
256 : 257; 1959 return 256; 1960 } 1961 1962 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { 1963 // glibc has a special slot for the stack guard in tcbhead_t, use it instead 1964 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h) 1965 if (!Subtarget.isTargetGlibc()) 1966 return TargetLowering::getIRStackGuard(IRB); 1967 1968 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1969 // %gs:0x14 on i386 1970 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; 1971 unsigned AddressSpace = getAddressSpace(); 1972 return ConstantExpr::getIntToPtr( 1973 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), 1974 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); 1975 } 1976 1977 void X86TargetLowering::insertSSPDeclarations(Module &M) const { 1978 // MSVC CRT provides functionalities for stack protection. 1979 if (Subtarget.getTargetTriple().isOSMSVCRT()) { 1980 // MSVC CRT has a global variable holding security cookie. 1981 M.getOrInsertGlobal("__security_cookie", 1982 Type::getInt8PtrTy(M.getContext())); 1983 1984 // MSVC CRT has a function to validate security cookie. 1985 auto *SecurityCheckCookie = cast<Function>( 1986 M.getOrInsertFunction("__security_check_cookie", 1987 Type::getVoidTy(M.getContext()), 1988 Type::getInt8PtrTy(M.getContext()), nullptr)); 1989 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); 1990 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); 1991 return; 1992 } 1993 // glibc has a special slot for the stack guard. 1994 if (Subtarget.isTargetGlibc()) 1995 return; 1996 TargetLowering::insertSSPDeclarations(M); 1997 } 1998 1999 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { 2000 // MSVC CRT has a global variable holding security cookie. 2001 if (Subtarget.getTargetTriple().isOSMSVCRT()) 2002 return M.getGlobalVariable("__security_cookie"); 2003 return TargetLowering::getSDagStackGuard(M); 2004 } 2005 2006 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { 2007 // MSVC CRT has a function to validate security cookie. 2008 if (Subtarget.getTargetTriple().isOSMSVCRT()) 2009 return M.getFunction("__security_check_cookie"); 2010 return TargetLowering::getSSPStackGuardCheck(M); 2011 } 2012 2013 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { 2014 if (!Subtarget.isTargetAndroid()) 2015 return TargetLowering::getSafeStackPointerLocation(IRB); 2016 2017 // Android provides a fixed TLS slot for the SafeStack pointer. See the 2018 // definition of TLS_SLOT_SAFESTACK in 2019 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 2020 unsigned AddressSpace, Offset; 2021 2022 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: 2023 // %gs:0x24 on i386 2024 Offset = (Subtarget.is64Bit()) ? 
0x48 : 0x24; 2025 AddressSpace = getAddressSpace(); 2026 return ConstantExpr::getIntToPtr( 2027 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), 2028 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); 2029 } 2030 2031 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 2032 unsigned DestAS) const { 2033 assert(SrcAS != DestAS && "Expected different address spaces!"); 2034 2035 return SrcAS < 256 && DestAS < 256; 2036 } 2037 2038 //===----------------------------------------------------------------------===// 2039 // Return Value Calling Convention Implementation 2040 //===----------------------------------------------------------------------===// 2041 2042 #include "X86GenCallingConv.inc" 2043 2044 bool X86TargetLowering::CanLowerReturn( 2045 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 2046 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 2047 SmallVector<CCValAssign, 16> RVLocs; 2048 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2049 return CCInfo.CheckReturn(Outs, RetCC_X86); 2050 } 2051 2052 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 2053 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; 2054 return ScratchRegs; 2055 } 2056 2057 SDValue 2058 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2059 bool isVarArg, 2060 const SmallVectorImpl<ISD::OutputArg> &Outs, 2061 const SmallVectorImpl<SDValue> &OutVals, 2062 const SDLoc &dl, SelectionDAG &DAG) const { 2063 MachineFunction &MF = DAG.getMachineFunction(); 2064 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2065 2066 if (CallConv == CallingConv::X86_INTR && !Outs.empty()) 2067 report_fatal_error("X86 interrupts may not return any value"); 2068 2069 SmallVector<CCValAssign, 16> RVLocs; 2070 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); 2071 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 2072 2073 SDValue Flag; 2074 SmallVector<SDValue, 6> RetOps; 2075 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2076 // Operand #1 = Bytes To Pop 2077 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, 2078 MVT::i32)); 2079 2080 // Copy the result values into the output registers. 2081 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2082 CCValAssign &VA = RVLocs[i]; 2083 assert(VA.isRegLoc() && "Can only return in registers!"); 2084 SDValue ValToCopy = OutVals[i]; 2085 EVT ValVT = ValToCopy.getValueType(); 2086 2087 // Promote values to the appropriate types. 2088 if (VA.getLocInfo() == CCValAssign::SExt) 2089 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 2090 else if (VA.getLocInfo() == CCValAssign::ZExt) 2091 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 2092 else if (VA.getLocInfo() == CCValAssign::AExt) { 2093 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) 2094 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 2095 else 2096 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 2097 } 2098 else if (VA.getLocInfo() == CCValAssign::BCvt) 2099 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); 2100 2101 assert(VA.getLocInfo() != CCValAssign::FPExt && 2102 "Unexpected FP-extend for return value."); 2103 2104 // If this is x86-64, and we disabled SSE, we can't return FP values, 2105 // or SSE or MMX vectors. 
2106 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 2107 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 2108 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { 2109 report_fatal_error("SSE register return with SSE disabled"); 2110 } 2111 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 2112 // llvm-gcc has never done it right and no one has noticed, so this 2113 // should be OK for now. 2114 if (ValVT == MVT::f64 && 2115 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) 2116 report_fatal_error("SSE2 register return with SSE2 disabled"); 2117 2118 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 2119 // the RET instruction and handled by the FP Stackifier. 2120 if (VA.getLocReg() == X86::FP0 || 2121 VA.getLocReg() == X86::FP1) { 2122 // If this is a copy from an xmm register to ST(0), use an FPExtend to 2123 // change the value to the FP stack register class. 2124 if (isScalarFPTypeInSSEReg(VA.getValVT())) 2125 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 2126 RetOps.push_back(ValToCopy); 2127 // Don't emit a copytoreg. 2128 continue; 2129 } 2130 2131 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 2132 // which is returned in RAX / RDX. 2133 if (Subtarget.is64Bit()) { 2134 if (ValVT == MVT::x86mmx) { 2135 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 2136 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); 2137 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 2138 ValToCopy); 2139 // If we don't have SSE2 available, convert to v4f32 so the generated 2140 // register is legal. 2141 if (!Subtarget.hasSSE2()) 2142 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); 2143 } 2144 } 2145 } 2146 2147 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 2148 Flag = Chain.getValue(1); 2149 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2150 } 2151 2152 // Swift calling convention does not require we copy the sret argument 2153 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. 2154 2155 // All x86 ABIs require that for returning structs by value we copy 2156 // the sret argument into %rax/%eax (depending on ABI) for the return. 2157 // We saved the argument into a virtual register in the entry block, 2158 // so now we copy the value out and into %rax/%eax. 2159 // 2160 // Checking Function.hasStructRetAttr() here is insufficient because the IR 2161 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is 2162 // false, then an sret argument may be implicitly inserted in the SelDAG. In 2163 // either case FuncInfo->setSRetReturnReg() will have been called. 2164 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { 2165 // When we have both sret and another return value, we should use the 2166 // original Chain stored in RetOps[0], instead of the current Chain updated 2167 // in the above loop. If we only have sret, RetOps[0] equals to Chain. 
2168 2169 // For the case of sret and another return value, we have 2170 // Chain_0 at the function entry 2171 // Chain_1 = getCopyToReg(Chain_0) in the above loop 2172 // If we use Chain_1 in getCopyFromReg, we will have 2173 // Val = getCopyFromReg(Chain_1) 2174 // Chain_2 = getCopyToReg(Chain_1, Val) from below 2175 2176 // getCopyToReg(Chain_0) will be glued together with 2177 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be 2178 // in Unit B, and we will have cyclic dependency between Unit A and Unit B: 2179 // Data dependency from Unit B to Unit A due to usage of Val in 2180 // getCopyToReg(Chain_1, Val) 2181 // Chain dependency from Unit A to Unit B 2182 2183 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. 2184 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, 2185 getPointerTy(MF.getDataLayout())); 2186 2187 unsigned RetValReg 2188 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? 2189 X86::RAX : X86::EAX; 2190 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); 2191 Flag = Chain.getValue(1); 2192 2193 // RAX/EAX now acts like a return value. 2194 RetOps.push_back( 2195 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); 2196 } 2197 2198 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 2199 const MCPhysReg *I = 2200 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2201 if (I) { 2202 for (; *I; ++I) { 2203 if (X86::GR64RegClass.contains(*I)) 2204 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 2205 else 2206 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2207 } 2208 } 2209 2210 RetOps[0] = Chain; // Update chain. 2211 2212 // Add the flag if we have it. 2213 if (Flag.getNode()) 2214 RetOps.push_back(Flag); 2215 2216 X86ISD::NodeType opcode = X86ISD::RET_FLAG; 2217 if (CallConv == CallingConv::X86_INTR) 2218 opcode = X86ISD::IRET; 2219 return DAG.getNode(opcode, dl, MVT::Other, RetOps); 2220 } 2221 2222 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2223 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) 2224 return false; 2225 2226 SDValue TCChain = Chain; 2227 SDNode *Copy = *N->use_begin(); 2228 if (Copy->getOpcode() == ISD::CopyToReg) { 2229 // If the copy has a glue operand, we conservatively assume it isn't safe to 2230 // perform a tail call. 2231 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2232 return false; 2233 TCChain = Copy->getOperand(0); 2234 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 2235 return false; 2236 2237 bool HasRet = false; 2238 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2239 UI != UE; ++UI) { 2240 if (UI->getOpcode() != X86ISD::RET_FLAG) 2241 return false; 2242 // If we are returning more than one value, we can definitely 2243 // not make a tail call see PR19530 2244 if (UI->getNumOperands() > 4) 2245 return false; 2246 if (UI->getNumOperands() == 4 && 2247 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) 2248 return false; 2249 HasRet = true; 2250 } 2251 2252 if (!HasRet) 2253 return false; 2254 2255 Chain = TCChain; 2256 return true; 2257 } 2258 2259 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, 2260 ISD::NodeType ExtendKind) const { 2261 MVT ReturnMVT = MVT::i32; 2262 2263 bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); 2264 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { 2265 // The ABI does not require i1, i8 or i16 to be extended. 
2266 // 2267 // On Darwin, there is code in the wild relying on Clang's old behaviour of 2268 // always extending i8/i16 return values, so keep doing that for now. 2269 // (PR26665). 2270 ReturnMVT = MVT::i8; 2271 } 2272 2273 EVT MinVT = getRegisterType(Context, ReturnMVT); 2274 return VT.bitsLT(MinVT) ? MinVT : VT; 2275 } 2276 2277 /// Lower the result values of a call into the 2278 /// appropriate copies out of appropriate physical registers. 2279 /// 2280 SDValue X86TargetLowering::LowerCallResult( 2281 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2282 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2283 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2284 2285 // Assign locations to each value returned by this call. 2286 SmallVector<CCValAssign, 16> RVLocs; 2287 bool Is64Bit = Subtarget.is64Bit(); 2288 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2289 *DAG.getContext()); 2290 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2291 2292 // Copy all of the result registers out of their specified physreg. 2293 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2294 CCValAssign &VA = RVLocs[i]; 2295 EVT CopyVT = VA.getLocVT(); 2296 2297 // If this is x86-64, and we disabled SSE, we can't return FP values 2298 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && 2299 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) { 2300 report_fatal_error("SSE register return with SSE disabled"); 2301 } 2302 2303 // If we prefer to use the value in xmm registers, copy it out as f80 and 2304 // use a truncate to move it from fp stack reg to xmm reg. 2305 bool RoundAfterCopy = false; 2306 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && 2307 isScalarFPTypeInSSEReg(VA.getValVT())) { 2308 if (!Subtarget.hasX87()) 2309 report_fatal_error("X87 register return with X87 disabled"); 2310 CopyVT = MVT::f80; 2311 RoundAfterCopy = (CopyVT != VA.getLocVT()); 2312 } 2313 2314 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 2315 CopyVT, InFlag).getValue(1); 2316 SDValue Val = Chain.getValue(0); 2317 2318 if (RoundAfterCopy) 2319 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 2320 // This truncation won't change the value. 2321 DAG.getIntPtrConstant(1, dl)); 2322 2323 if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) 2324 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 2325 2326 InFlag = Chain.getValue(2); 2327 InVals.push_back(Val); 2328 } 2329 2330 return Chain; 2331 } 2332 2333 //===----------------------------------------------------------------------===// 2334 // C & StdCall & Fast Calling Convention implementation 2335 //===----------------------------------------------------------------------===// 2336 // StdCall calling convention seems to be standard for many Windows' API 2337 // routines and around. It differs from C calling convention just a little: 2338 // callee should clean up the stack, not caller. Symbols should be also 2339 // decorated in some fancy way :) It doesn't support any vector arguments. 2340 // For info on fast calling convention see Fast Calling Convention (tail call) 2341 // implementation LowerX86_32FastCCCallTo. 2342 2343 /// CallIsStructReturn - Determines whether a call uses struct return 2344 /// semantics. 
2345 enum StructReturnType { 2346 NotStructReturn, 2347 RegStructReturn, 2348 StackStructReturn 2349 }; 2350 static StructReturnType 2351 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { 2352 if (Outs.empty()) 2353 return NotStructReturn; 2354 2355 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 2356 if (!Flags.isSRet()) 2357 return NotStructReturn; 2358 if (Flags.isInReg() || IsMCU) 2359 return RegStructReturn; 2360 return StackStructReturn; 2361 } 2362 2363 /// Determines whether a function uses struct return semantics. 2364 static StructReturnType 2365 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { 2366 if (Ins.empty()) 2367 return NotStructReturn; 2368 2369 const ISD::ArgFlagsTy &Flags = Ins[0].Flags; 2370 if (!Flags.isSRet()) 2371 return NotStructReturn; 2372 if (Flags.isInReg() || IsMCU) 2373 return RegStructReturn; 2374 return StackStructReturn; 2375 } 2376 2377 /// Make a copy of an aggregate at address specified by "Src" to address 2378 /// "Dst" with size and alignment information specified by the specific 2379 /// parameter attribute. The copy will be passed as a byval function parameter. 2380 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, 2381 SDValue Chain, ISD::ArgFlagsTy Flags, 2382 SelectionDAG &DAG, const SDLoc &dl) { 2383 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); 2384 2385 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 2386 /*isVolatile*/false, /*AlwaysInline=*/true, 2387 /*isTailCall*/false, 2388 MachinePointerInfo(), MachinePointerInfo()); 2389 } 2390 2391 /// Return true if the calling convention is one that we can guarantee TCO for. 2392 static bool canGuaranteeTCO(CallingConv::ID CC) { 2393 return (CC == CallingConv::Fast || CC == CallingConv::GHC || 2394 CC == CallingConv::HiPE || CC == CallingConv::HHVM); 2395 } 2396 2397 /// Return true if we might ever do TCO for calls with this calling convention. 2398 static bool mayTailCallThisCC(CallingConv::ID CC) { 2399 switch (CC) { 2400 // C calling conventions: 2401 case CallingConv::C: 2402 case CallingConv::X86_64_Win64: 2403 case CallingConv::X86_64_SysV: 2404 // Callee pop conventions: 2405 case CallingConv::X86_ThisCall: 2406 case CallingConv::X86_StdCall: 2407 case CallingConv::X86_VectorCall: 2408 case CallingConv::X86_FastCall: 2409 return true; 2410 default: 2411 return canGuaranteeTCO(CC); 2412 } 2413 } 2414 2415 /// Return true if the function is being made into a tailcall target by 2416 /// changing its ABI. 2417 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { 2418 return GuaranteedTailCallOpt && canGuaranteeTCO(CC); 2419 } 2420 2421 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 2422 auto Attr = 2423 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); 2424 if (!CI->isTailCall() || Attr.getValueAsString() == "true") 2425 return false; 2426 2427 CallSite CS(CI); 2428 CallingConv::ID CalleeCC = CS.getCallingConv(); 2429 if (!mayTailCallThisCC(CalleeCC)) 2430 return false; 2431 2432 return true; 2433 } 2434 2435 SDValue 2436 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, 2437 const SmallVectorImpl<ISD::InputArg> &Ins, 2438 const SDLoc &dl, SelectionDAG &DAG, 2439 const CCValAssign &VA, 2440 MachineFrameInfo *MFI, unsigned i) const { 2441 // Create the nodes corresponding to a load from this parameter slot. 
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If the value is passed by pointer, we have the address passed instead of
  // the value itself.
  bool ExtendedInMem = VA.isExtInLoc() &&
    VA.getValVT().getScalarType() == MVT::i1;

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate the SP offset of an interrupt parameter, re-arranging the slot
  // normally taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
    const X86Subtarget& Subtarget =
        static_cast<const X86Subtarget&>(DAG.getSubtarget());
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in a regular call.
    // The offset of the last argument needs to be set to -4/-8 bytes, while
    // the offset of the first argument (when there are two) is set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
  }

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    // Adjust the SP offset of an interrupt parameter.
    if (CallConv == CallingConv::X86_INTR) {
      MFI->setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
  } else {
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable);

    // Set SExt or ZExt flag.
    if (VA.getLocInfo() == CCValAssign::ZExt) {
      MFI->setObjectZExt(FI, true);
    } else if (VA.getLocInfo() == CCValAssign::SExt) {
      MFI->setObjectSExt(FI, true);
    }

    // Adjust the SP offset of an interrupt parameter.
    if (CallConv == CallingConv::X86_INTR) {
      MFI->setObjectOffset(FI, Offset);
    }

    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    SDValue Val = DAG.getLoad(
        ValVT, dl, Chain, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
        false, false, 0);
    return ExtendedInMem ?
      DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
  }
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
                        std::end(GPR64ArgRegsWin64));
  }

  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
                      std::end(GPR64ArgRegs64Bit));
}

// FIXME: Get this from tablegen.
2529 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, 2530 CallingConv::ID CallConv, 2531 const X86Subtarget &Subtarget) { 2532 assert(Subtarget.is64Bit()); 2533 if (Subtarget.isCallingConvWin64(CallConv)) { 2534 // The XMM registers which might contain var arg parameters are shadowed 2535 // in their paired GPR. So we only need to save the GPR to their home 2536 // slots. 2537 // TODO: __vectorcall will change this. 2538 return None; 2539 } 2540 2541 const Function *Fn = MF.getFunction(); 2542 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); 2543 bool isSoftFloat = Subtarget.useSoftFloat(); 2544 assert(!(isSoftFloat && NoImplicitFloatOps) && 2545 "SSE register cannot be used when SSE is disabled!"); 2546 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) 2547 // Kernel mode asks for SSE to be disabled, so there are no XMM argument 2548 // registers. 2549 return None; 2550 2551 static const MCPhysReg XMMArgRegs64Bit[] = { 2552 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2553 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2554 }; 2555 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); 2556 } 2557 2558 SDValue X86TargetLowering::LowerFormalArguments( 2559 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2560 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2561 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2562 MachineFunction &MF = DAG.getMachineFunction(); 2563 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2564 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); 2565 2566 const Function *Fn = MF.getFunction(); 2567 if (Fn->hasExternalLinkage() && 2568 Subtarget.isTargetCygMing() && 2569 Fn->getName() == "main") 2570 FuncInfo->setForceFramePointer(true); 2571 2572 MachineFrameInfo *MFI = MF.getFrameInfo(); 2573 bool Is64Bit = Subtarget.is64Bit(); 2574 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); 2575 2576 assert(!(isVarArg && canGuaranteeTCO(CallConv)) && 2577 "Var args not supported with calling convention fastcc, ghc or hipe"); 2578 2579 if (CallConv == CallingConv::X86_INTR) { 2580 bool isLegal = Ins.size() == 1 || 2581 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || 2582 (!Is64Bit && Ins[1].VT == MVT::i32))); 2583 if (!isLegal) 2584 report_fatal_error("X86 interrupts may take one or two arguments"); 2585 } 2586 2587 // Assign locations to all of the incoming arguments. 2588 SmallVector<CCValAssign, 16> ArgLocs; 2589 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 2590 2591 // Allocate shadow area for Win64 2592 if (IsWin64) 2593 CCInfo.AllocateStack(32, 8); 2594 2595 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 2596 2597 unsigned LastVal = ~0U; 2598 SDValue ArgValue; 2599 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2600 CCValAssign &VA = ArgLocs[i]; 2601 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 2602 // places. 
2603 assert(VA.getValNo() != LastVal && 2604 "Don't support value assigned to multiple locs yet"); 2605 (void)LastVal; 2606 LastVal = VA.getValNo(); 2607 2608 if (VA.isRegLoc()) { 2609 EVT RegVT = VA.getLocVT(); 2610 const TargetRegisterClass *RC; 2611 if (RegVT == MVT::i32) 2612 RC = &X86::GR32RegClass; 2613 else if (Is64Bit && RegVT == MVT::i64) 2614 RC = &X86::GR64RegClass; 2615 else if (RegVT == MVT::f32) 2616 RC = &X86::FR32RegClass; 2617 else if (RegVT == MVT::f64) 2618 RC = &X86::FR64RegClass; 2619 else if (RegVT == MVT::f128) 2620 RC = &X86::FR128RegClass; 2621 else if (RegVT.is512BitVector()) 2622 RC = &X86::VR512RegClass; 2623 else if (RegVT.is256BitVector()) 2624 RC = &X86::VR256RegClass; 2625 else if (RegVT.is128BitVector()) 2626 RC = &X86::VR128RegClass; 2627 else if (RegVT == MVT::x86mmx) 2628 RC = &X86::VR64RegClass; 2629 else if (RegVT == MVT::i1) 2630 RC = &X86::VK1RegClass; 2631 else if (RegVT == MVT::v8i1) 2632 RC = &X86::VK8RegClass; 2633 else if (RegVT == MVT::v16i1) 2634 RC = &X86::VK16RegClass; 2635 else if (RegVT == MVT::v32i1) 2636 RC = &X86::VK32RegClass; 2637 else if (RegVT == MVT::v64i1) 2638 RC = &X86::VK64RegClass; 2639 else 2640 llvm_unreachable("Unknown argument type!"); 2641 2642 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2643 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 2644 2645 // If this is an 8 or 16-bit value, it is really passed promoted to 32 2646 // bits. Insert an assert[sz]ext to capture this, then truncate to the 2647 // right size. 2648 if (VA.getLocInfo() == CCValAssign::SExt) 2649 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 2650 DAG.getValueType(VA.getValVT())); 2651 else if (VA.getLocInfo() == CCValAssign::ZExt) 2652 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 2653 DAG.getValueType(VA.getValVT())); 2654 else if (VA.getLocInfo() == CCValAssign::BCvt) 2655 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); 2656 2657 if (VA.isExtInLoc()) { 2658 // Handle MMX values passed in XMM regs. 2659 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) 2660 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 2661 else 2662 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 2663 } 2664 } else { 2665 assert(VA.isMemLoc()); 2666 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 2667 } 2668 2669 // If value is passed via pointer - do a load. 2670 if (VA.getLocInfo() == CCValAssign::Indirect) 2671 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2672 MachinePointerInfo(), false, false, false, 0); 2673 2674 InVals.push_back(ArgValue); 2675 } 2676 2677 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2678 // Swift calling convention does not require we copy the sret argument 2679 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. 2680 if (CallConv == CallingConv::Swift) 2681 continue; 2682 2683 // All x86 ABIs require that for returning structs by value we copy the 2684 // sret argument into %rax/%eax (depending on ABI) for the return. Save 2685 // the argument into a virtual register so that we can access it from the 2686 // return points. 
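    // Hedged illustration of the case handled below: for IR such as
    //   define void @f(%struct.S* sret %out, i32 %x) { ... }
    // the incoming %out pointer is saved in SRetReturnReg here so that
    // LowerReturn can copy it back into %eax/%rax as the ABI requires.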
2687 if (Ins[i].Flags.isSRet()) { 2688 unsigned Reg = FuncInfo->getSRetReturnReg(); 2689 if (!Reg) { 2690 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 2691 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 2692 FuncInfo->setSRetReturnReg(Reg); 2693 } 2694 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); 2695 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2696 break; 2697 } 2698 } 2699 2700 unsigned StackSize = CCInfo.getNextStackOffset(); 2701 // Align stack specially for tail calls. 2702 if (shouldGuaranteeTCO(CallConv, 2703 MF.getTarget().Options.GuaranteedTailCallOpt)) 2704 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2705 2706 // If the function takes variable number of arguments, make a frame index for 2707 // the start of the first vararg value... for expansion of llvm.va_start. We 2708 // can skip this if there are no va_start calls. 2709 if (MFI->hasVAStart() && 2710 (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2711 CallConv != CallingConv::X86_ThisCall))) { 2712 FuncInfo->setVarArgsFrameIndex( 2713 MFI->CreateFixedObject(1, StackSize, true)); 2714 } 2715 2716 // Figure out if XMM registers are in use. 2717 assert(!(Subtarget.useSoftFloat() && 2718 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && 2719 "SSE register cannot be used when SSE is disabled!"); 2720 2721 // 64-bit calling conventions support varargs and register parameters, so we 2722 // have to do extra work to spill them in the prologue. 2723 if (Is64Bit && isVarArg && MFI->hasVAStart()) { 2724 // Find the first unallocated argument registers. 2725 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); 2726 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); 2727 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); 2728 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); 2729 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && 2730 "SSE register cannot be used when SSE is disabled!"); 2731 2732 // Gather all the live in physical registers. 2733 SmallVector<SDValue, 6> LiveGPRs; 2734 SmallVector<SDValue, 8> LiveXMMRegs; 2735 SDValue ALVal; 2736 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { 2737 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); 2738 LiveGPRs.push_back( 2739 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); 2740 } 2741 if (!ArgXMMs.empty()) { 2742 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2743 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); 2744 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { 2745 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); 2746 LiveXMMRegs.push_back( 2747 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); 2748 } 2749 } 2750 2751 if (IsWin64) { 2752 // Get to the caller-allocated home save location. Add 8 to account 2753 // for the return address. 2754 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 2755 FuncInfo->setRegSaveFrameIndex( 2756 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 2757 // Fixup to set vararg frame on shadow area (4 x i64). 2758 if (NumIntRegs < 4) 2759 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 2760 } else { 2761 // For X86-64, if there are vararg parameters that are passed via 2762 // registers, then we must store them to their spots on the stack so 2763 // they may be loaded by dereferencing the result of va_next. 
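      // Sketch of the register save area being set up below (offsets follow
      // the SysV x86-64 psABI and are shown for illustration):
      //   6 GPRs * 8 bytes  -> offsets   0..47   (indexed by gp_offset)
      //   8 XMMs * 16 bytes -> offsets  48..175  (indexed by fp_offset)
      // va_arg consumes gp_offset/fp_offset from the va_list to index into
      // this 176-byte object before falling back to overflow_arg_area.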
2764 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 2765 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); 2766 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( 2767 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); 2768 } 2769 2770 // Store the integer parameter registers. 2771 SmallVector<SDValue, 8> MemOps; 2772 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 2773 getPointerTy(DAG.getDataLayout())); 2774 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 2775 for (SDValue Val : LiveGPRs) { 2776 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2777 RSFIN, DAG.getIntPtrConstant(Offset, dl)); 2778 SDValue Store = 2779 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2780 MachinePointerInfo::getFixedStack( 2781 DAG.getMachineFunction(), 2782 FuncInfo->getRegSaveFrameIndex(), Offset), 2783 false, false, 0); 2784 MemOps.push_back(Store); 2785 Offset += 8; 2786 } 2787 2788 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { 2789 // Now store the XMM (fp + vector) parameter registers. 2790 SmallVector<SDValue, 12> SaveXMMOps; 2791 SaveXMMOps.push_back(Chain); 2792 SaveXMMOps.push_back(ALVal); 2793 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2794 FuncInfo->getRegSaveFrameIndex(), dl)); 2795 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2796 FuncInfo->getVarArgsFPOffset(), dl)); 2797 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), 2798 LiveXMMRegs.end()); 2799 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2800 MVT::Other, SaveXMMOps)); 2801 } 2802 2803 if (!MemOps.empty()) 2804 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 2805 } 2806 2807 if (isVarArg && MFI->hasMustTailInVarArgFunc()) { 2808 // Find the largest legal vector type. 2809 MVT VecVT = MVT::Other; 2810 // FIXME: Only some x86_32 calling conventions support AVX512. 2811 if (Subtarget.hasAVX512() && 2812 (Is64Bit || (CallConv == CallingConv::X86_VectorCall || 2813 CallConv == CallingConv::Intel_OCL_BI))) 2814 VecVT = MVT::v16f32; 2815 else if (Subtarget.hasAVX()) 2816 VecVT = MVT::v8f32; 2817 else if (Subtarget.hasSSE2()) 2818 VecVT = MVT::v4f32; 2819 2820 // We forward some GPRs and some vector types. 2821 SmallVector<MVT, 2> RegParmTypes; 2822 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; 2823 RegParmTypes.push_back(IntVT); 2824 if (VecVT != MVT::Other) 2825 RegParmTypes.push_back(VecVT); 2826 2827 // Compute the set of forwarded registers. The rest are scratch. 2828 SmallVectorImpl<ForwardedRegister> &Forwards = 2829 FuncInfo->getForwardedMustTailRegParms(); 2830 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); 2831 2832 // Conservatively forward AL on x86_64, since it might be used for varargs. 2833 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { 2834 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2835 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); 2836 } 2837 2838 // Copy all forwards from physical to virtual registers. 2839 for (ForwardedRegister &F : Forwards) { 2840 // FIXME: Can we use a less constrained schedule? 2841 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 2842 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); 2843 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); 2844 } 2845 } 2846 2847 // Some CCs need callee pop. 2848 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2849 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2850 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 
2851 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { 2852 // X86 interrupts must pop the error code if present 2853 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); 2854 } else { 2855 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2856 // If this is an sret function, the return should pop the hidden pointer. 2857 if (!Is64Bit && !canGuaranteeTCO(CallConv) && 2858 !Subtarget.getTargetTriple().isOSMSVCRT() && 2859 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) 2860 FuncInfo->setBytesToPopOnReturn(4); 2861 } 2862 2863 if (!Is64Bit) { 2864 // RegSaveFrameIndex is X86-64 only. 2865 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2866 if (CallConv == CallingConv::X86_FastCall || 2867 CallConv == CallingConv::X86_ThisCall) 2868 // fastcc functions can't have varargs. 2869 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2870 } 2871 2872 FuncInfo->setArgumentStackSize(StackSize); 2873 2874 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { 2875 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); 2876 if (Personality == EHPersonality::CoreCLR) { 2877 assert(Is64Bit); 2878 // TODO: Add a mechanism to frame lowering that will allow us to indicate 2879 // that we'd prefer this slot be allocated towards the bottom of the frame 2880 // (i.e. near the stack pointer after allocating the frame). Every 2881 // funclet needs a copy of this slot in its (mostly empty) frame, and the 2882 // offset from the bottom of this and each funclet's frame must be the 2883 // same, so the size of funclets' (mostly empty) frames is dictated by 2884 // how far this slot is from the bottom (since they allocate just enough 2885 // space to accommodate holding this slot at the correct offset). 2886 int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); 2887 EHInfo->PSPSymFrameIdx = PSPSymFI; 2888 } 2889 } 2890 2891 return Chain; 2892 } 2893 2894 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 2895 SDValue Arg, const SDLoc &dl, 2896 SelectionDAG &DAG, 2897 const CCValAssign &VA, 2898 ISD::ArgFlagsTy Flags) const { 2899 unsigned LocMemOffset = VA.getLocMemOffset(); 2900 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2901 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2902 StackPtr, PtrOff); 2903 if (Flags.isByVal()) 2904 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2905 2906 return DAG.getStore( 2907 Chain, dl, Arg, PtrOff, 2908 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), 2909 false, false, 0); 2910 } 2911 2912 /// Emit a load of return address if tail call 2913 /// optimization is performed and it is required. 2914 SDValue X86TargetLowering::EmitTailCallLoadRetAddr( 2915 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, 2916 bool Is64Bit, int FPDiff, const SDLoc &dl) const { 2917 // Adjust the Return address stack slot. 2918 EVT VT = getPointerTy(DAG.getDataLayout()); 2919 OutRetAddr = getReturnAddressFrameIndex(DAG); 2920 2921 // Load the "old" Return address. 2922 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2923 false, false, false, 0); 2924 return SDValue(OutRetAddr.getNode(), 1); 2925 } 2926 2927 /// Emit a store of the return address if tail call 2928 /// optimization is performed and it is required (FPDiff!=0). 
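/// Illustrative example: under -tailcallopt, if the caller was entered with 8
/// bytes of stack arguments but tail-calls a fastcc function needing 24 bytes,
/// FPDiff is 8 - 24 = -16 and the return address is re-stored 16 bytes lower
/// (its fixed-object offset becomes FPDiff - SlotSize instead of -SlotSize)
/// so the larger outgoing argument area does not overwrite it.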
2929 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, 2930 SDValue Chain, SDValue RetAddrFrIdx, 2931 EVT PtrVT, unsigned SlotSize, 2932 int FPDiff, const SDLoc &dl) { 2933 // Store the return address to the appropriate stack slot. 2934 if (!FPDiff) return Chain; 2935 // Calculate the new stack slot for the return address. 2936 int NewReturnAddrFI = 2937 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 2938 false); 2939 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 2940 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2941 MachinePointerInfo::getFixedStack( 2942 DAG.getMachineFunction(), NewReturnAddrFI), 2943 false, false, 0); 2944 return Chain; 2945 } 2946 2947 /// Returns a vector_shuffle mask for an movs{s|d}, movd 2948 /// operation of specified width. 2949 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, 2950 SDValue V2) { 2951 unsigned NumElems = VT.getVectorNumElements(); 2952 SmallVector<int, 8> Mask; 2953 Mask.push_back(NumElems); 2954 for (unsigned i = 1; i != NumElems; ++i) 2955 Mask.push_back(i); 2956 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); 2957 } 2958 2959 SDValue 2960 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2961 SmallVectorImpl<SDValue> &InVals) const { 2962 SelectionDAG &DAG = CLI.DAG; 2963 SDLoc &dl = CLI.DL; 2964 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2965 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2966 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2967 SDValue Chain = CLI.Chain; 2968 SDValue Callee = CLI.Callee; 2969 CallingConv::ID CallConv = CLI.CallConv; 2970 bool &isTailCall = CLI.IsTailCall; 2971 bool isVarArg = CLI.IsVarArg; 2972 2973 MachineFunction &MF = DAG.getMachineFunction(); 2974 bool Is64Bit = Subtarget.is64Bit(); 2975 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); 2976 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); 2977 bool IsSibcall = false; 2978 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2979 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); 2980 2981 if (CallConv == CallingConv::X86_INTR) 2982 report_fatal_error("X86 interrupts may not be called directly"); 2983 2984 if (Attr.getValueAsString() == "true") 2985 isTailCall = false; 2986 2987 if (Subtarget.isPICStyleGOT() && 2988 !MF.getTarget().Options.GuaranteedTailCallOpt) { 2989 // If we are using a GOT, disable tail calls to external symbols with 2990 // default visibility. Tail calling such a symbol requires using a GOT 2991 // relocation, which forces early binding of the symbol. This breaks code 2992 // that require lazy function symbol resolution. Using musttail or 2993 // GuaranteedTailCallOpt will override this. 2994 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2995 if (!G || (!G->getGlobal()->hasLocalLinkage() && 2996 G->getGlobal()->hasDefaultVisibility())) 2997 isTailCall = false; 2998 } 2999 3000 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); 3001 if (IsMustTail) { 3002 // Force this to be a tail call. The verifier rules are enough to ensure 3003 // that we can lower this successfully without moving the return address 3004 // around. 3005 isTailCall = true; 3006 } else if (isTailCall) { 3007 // Check if it's really possible to do a tail call. 
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeCallOperands(Outs, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are already available in the
    // caller's incoming argument space, so no new stack space is needed.
    NumBytes = 0;
  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
           canGuaranteeTCO(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Record how far the return address stack slot moves, but only if this
    // delta is smaller (i.e. the slot moves further) than the previously
    // recorded one.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // If we have an inalloca argument, all stack space has already been
  // allocated for us and is right at the top of the stack. We don't support
  // multiple arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(
        Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    // Skip inalloca arguments, they have already been written.
3093 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3094 if (Flags.isInAlloca()) 3095 continue; 3096 3097 CCValAssign &VA = ArgLocs[i]; 3098 EVT RegVT = VA.getLocVT(); 3099 SDValue Arg = OutVals[i]; 3100 bool isByVal = Flags.isByVal(); 3101 3102 // Promote the value if needed. 3103 switch (VA.getLocInfo()) { 3104 default: llvm_unreachable("Unknown loc info!"); 3105 case CCValAssign::Full: break; 3106 case CCValAssign::SExt: 3107 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 3108 break; 3109 case CCValAssign::ZExt: 3110 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 3111 break; 3112 case CCValAssign::AExt: 3113 if (Arg.getValueType().isVector() && 3114 Arg.getValueType().getVectorElementType() == MVT::i1) 3115 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 3116 else if (RegVT.is128BitVector()) { 3117 // Special case: passing MMX values in XMM registers. 3118 Arg = DAG.getBitcast(MVT::i64, Arg); 3119 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 3120 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 3121 } else 3122 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 3123 break; 3124 case CCValAssign::BCvt: 3125 Arg = DAG.getBitcast(RegVT, Arg); 3126 break; 3127 case CCValAssign::Indirect: { 3128 // Store the argument. 3129 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 3130 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 3131 Chain = DAG.getStore( 3132 Chain, dl, Arg, SpillSlot, 3133 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3134 false, false, 0); 3135 Arg = SpillSlot; 3136 break; 3137 } 3138 } 3139 3140 if (VA.isRegLoc()) { 3141 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 3142 if (isVarArg && IsWin64) { 3143 // Win64 ABI requires argument XMM reg to be copied to the corresponding 3144 // shadow reg if callee is a varargs function. 3145 unsigned ShadowReg = 0; 3146 switch (VA.getLocReg()) { 3147 case X86::XMM0: ShadowReg = X86::RCX; break; 3148 case X86::XMM1: ShadowReg = X86::RDX; break; 3149 case X86::XMM2: ShadowReg = X86::R8; break; 3150 case X86::XMM3: ShadowReg = X86::R9; break; 3151 } 3152 if (ShadowReg) 3153 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 3154 } 3155 } else if (!IsSibcall && (!isTailCall || isByVal)) { 3156 assert(VA.isMemLoc()); 3157 if (!StackPtr.getNode()) 3158 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 3159 getPointerTy(DAG.getDataLayout())); 3160 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 3161 dl, DAG, VA, Flags)); 3162 } 3163 } 3164 3165 if (!MemOpChains.empty()) 3166 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 3167 3168 if (Subtarget.isPICStyleGOT()) { 3169 // ELF / PIC requires GOT in the EBX register before function calls via PLT 3170 // GOT pointer. 3171 if (!isTailCall) { 3172 RegsToPass.push_back(std::make_pair( 3173 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), 3174 getPointerTy(DAG.getDataLayout())))); 3175 } else { 3176 // If we are tail calling and generating PIC/GOT style code load the 3177 // address of the callee into ECX. The value in ecx is used as target of 3178 // the tail jump. This is done to circumvent the ebx/callee-saved problem 3179 // for tail calls on PIC/GOT architectures. Normally we would just put the 3180 // address of GOT into ebx and then call target@PLT. But for tail calls 3181 // ebx would be restored (since ebx is callee saved) before jumping to the 3182 // target@PLT. 
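      // Rough illustration (not literal output of this code): a normal PIC
      // call sequence
      //   movl  $_GLOBAL_OFFSET_TABLE_, %ebx
      //   calll callee@PLT
      // cannot simply become "jmpl callee@PLT", because the epilogue restores
      // the callee-saved %ebx before the jump; instead the callee's address is
      // materialized and the tail jump goes through a scratch register such as
      // %ecx.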
3183 3184 // Note: The actual moving to ECX is done further down. 3185 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 3186 if (G && !G->getGlobal()->hasLocalLinkage() && 3187 G->getGlobal()->hasDefaultVisibility()) 3188 Callee = LowerGlobalAddress(Callee, DAG); 3189 else if (isa<ExternalSymbolSDNode>(Callee)) 3190 Callee = LowerExternalSymbol(Callee, DAG); 3191 } 3192 } 3193 3194 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { 3195 // From AMD64 ABI document: 3196 // For calls that may call functions that use varargs or stdargs 3197 // (prototype-less calls or calls to functions containing ellipsis (...) in 3198 // the declaration) %al is used as hidden argument to specify the number 3199 // of SSE registers used. The contents of %al do not need to match exactly 3200 // the number of registers, but must be an ubound on the number of SSE 3201 // registers used and is in the range 0 - 8 inclusive. 3202 3203 // Count the number of XMM registers allocated. 3204 static const MCPhysReg XMMArgRegs[] = { 3205 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 3206 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 3207 }; 3208 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); 3209 assert((Subtarget.hasSSE1() || !NumXMMRegs) 3210 && "SSE registers cannot be used when SSE is disabled"); 3211 3212 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 3213 DAG.getConstant(NumXMMRegs, dl, 3214 MVT::i8))); 3215 } 3216 3217 if (isVarArg && IsMustTail) { 3218 const auto &Forwards = X86Info->getForwardedMustTailRegParms(); 3219 for (const auto &F : Forwards) { 3220 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 3221 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); 3222 } 3223 } 3224 3225 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls 3226 // don't need this because the eligibility check rejects calls that require 3227 // shuffling arguments passed in memory. 3228 if (!IsSibcall && isTailCall) { 3229 // Force all the incoming stack arguments to be loaded from the stack 3230 // before any new outgoing arguments are stored to the stack, because the 3231 // outgoing stack slots may alias the incoming argument stack slots, and 3232 // the alias isn't otherwise explicit. This is slightly more conservative 3233 // than necessary, because it means that each store effectively depends 3234 // on every argument instead of just those arguments it would clobber. 3235 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 3236 3237 SmallVector<SDValue, 8> MemOpChains2; 3238 SDValue FIN; 3239 int FI = 0; 3240 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3241 CCValAssign &VA = ArgLocs[i]; 3242 if (VA.isRegLoc()) 3243 continue; 3244 assert(VA.isMemLoc()); 3245 SDValue Arg = OutVals[i]; 3246 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3247 // Skip inalloca arguments. They don't require any work. 3248 if (Flags.isInAlloca()) 3249 continue; 3250 // Create frame index. 3251 int32_t Offset = VA.getLocMemOffset()+FPDiff; 3252 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 3253 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 3254 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3255 3256 if (Flags.isByVal()) { 3257 // Copy relative to framepointer. 
3258 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); 3259 if (!StackPtr.getNode()) 3260 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 3261 getPointerTy(DAG.getDataLayout())); 3262 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 3263 StackPtr, Source); 3264 3265 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 3266 ArgChain, 3267 Flags, DAG, dl)); 3268 } else { 3269 // Store relative to framepointer. 3270 MemOpChains2.push_back(DAG.getStore( 3271 ArgChain, dl, Arg, FIN, 3272 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3273 false, false, 0)); 3274 } 3275 } 3276 3277 if (!MemOpChains2.empty()) 3278 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 3279 3280 // Store the return address to the appropriate stack slot. 3281 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 3282 getPointerTy(DAG.getDataLayout()), 3283 RegInfo->getSlotSize(), FPDiff, dl); 3284 } 3285 3286 // Build a sequence of copy-to-reg nodes chained together with token chain 3287 // and flag operands which copy the outgoing args into registers. 3288 SDValue InFlag; 3289 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 3290 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 3291 RegsToPass[i].second, InFlag); 3292 InFlag = Chain.getValue(1); 3293 } 3294 3295 if (DAG.getTarget().getCodeModel() == CodeModel::Large) { 3296 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 3297 // In the 64-bit large code model, we have to make all calls 3298 // through a register, since the call instruction's 32-bit 3299 // pc-relative offset may not be large enough to hold the whole 3300 // address. 3301 } else if (Callee->getOpcode() == ISD::GlobalAddress) { 3302 // If the callee is a GlobalAddress node (quite common, every direct call 3303 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 3304 // it. 3305 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); 3306 3307 // We should use extra load for direct calls to dllimported functions in 3308 // non-JIT mode. 3309 const GlobalValue *GV = G->getGlobal(); 3310 if (!GV->hasDLLImportStorageClass()) { 3311 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); 3312 3313 Callee = DAG.getTargetGlobalAddress( 3314 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); 3315 3316 if (OpFlags == X86II::MO_GOTPCREL) { 3317 // Add a wrapper. 3318 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, 3319 getPointerTy(DAG.getDataLayout()), Callee); 3320 // Add extra indirection 3321 Callee = DAG.getLoad( 3322 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, 3323 MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, 3324 false, 0); 3325 } 3326 } 3327 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 3328 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); 3329 unsigned char OpFlags = 3330 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); 3331 3332 Callee = DAG.getTargetExternalSymbol( 3333 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); 3334 } else if (Subtarget.isTarget64BitILP32() && 3335 Callee->getValueType(0) == MVT::i32) { 3336 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI 3337 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); 3338 } 3339 3340 // Returns a chain & a flag for retval copy to use. 
3341 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3342 SmallVector<SDValue, 8> Ops; 3343 3344 if (!IsSibcall && isTailCall) { 3345 Chain = DAG.getCALLSEQ_END(Chain, 3346 DAG.getIntPtrConstant(NumBytesToPop, dl, true), 3347 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 3348 InFlag = Chain.getValue(1); 3349 } 3350 3351 Ops.push_back(Chain); 3352 Ops.push_back(Callee); 3353 3354 if (isTailCall) 3355 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); 3356 3357 // Add argument registers to the end of the list so that they are known live 3358 // into the call. 3359 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 3360 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 3361 RegsToPass[i].second.getValueType())); 3362 3363 // Add a register mask operand representing the call-preserved registers. 3364 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); 3365 assert(Mask && "Missing call preserved mask for calling convention"); 3366 3367 // If this is an invoke in a 32-bit function using a funclet-based 3368 // personality, assume the function clobbers all registers. If an exception 3369 // is thrown, the runtime will not restore CSRs. 3370 // FIXME: Model this more precisely so that we can register allocate across 3371 // the normal edge and spill and fill across the exceptional edge. 3372 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { 3373 const Function *CallerFn = MF.getFunction(); 3374 EHPersonality Pers = 3375 CallerFn->hasPersonalityFn() 3376 ? classifyEHPersonality(CallerFn->getPersonalityFn()) 3377 : EHPersonality::Unknown; 3378 if (isFuncletEHPersonality(Pers)) 3379 Mask = RegInfo->getNoPreservedMask(); 3380 } 3381 3382 Ops.push_back(DAG.getRegisterMask(Mask)); 3383 3384 if (InFlag.getNode()) 3385 Ops.push_back(InFlag); 3386 3387 if (isTailCall) { 3388 // We used to do: 3389 //// If this is the first return lowered for this function, add the regs 3390 //// to the liveout set for the function. 3391 // This isn't right, although it's probably harmless on x86; liveouts 3392 // should be computed from returns not tail calls. Consider a void 3393 // function making a tail call to a function returning int. 3394 MF.getFrameInfo()->setHasTailCall(); 3395 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); 3396 } 3397 3398 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); 3399 InFlag = Chain.getValue(1); 3400 3401 // Create the CALLSEQ_END node. 3402 unsigned NumBytesForCalleeToPop; 3403 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 3404 DAG.getTarget().Options.GuaranteedTailCallOpt)) 3405 NumBytesForCalleeToPop = NumBytes; // Callee pops everything 3406 else if (!Is64Bit && !canGuaranteeTCO(CallConv) && 3407 !Subtarget.getTargetTriple().isOSMSVCRT() && 3408 SR == StackStructReturn) 3409 // If this is a call to a struct-return function, the callee 3410 // pops the hidden struct pointer, so we have to push it back. 3411 // This is common for Darwin/X86, Linux & Mingw32 targets. 3412 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 3413 NumBytesForCalleeToPop = 4; 3414 else 3415 NumBytesForCalleeToPop = 0; // Callee pops nothing. 3416 3417 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { 3418 // No need to reset the stack after the call if the call doesn't return. To 3419 // make the MI verify, we'll pretend the callee does it for us. 3420 NumBytesForCalleeToPop = NumBytes; 3421 } 3422 3423 // Returns a flag for retval copy to use. 
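  // Illustrative examples of the callee-pop computation above: a 32-bit
  // stdcall callee taking two i32 stack arguments pops 8 bytes on return; a
  // 32-bit cdecl call returning a struct through a hidden sret pointer on a
  // non-MSVCRT target pops just the 4-byte pointer; otherwise the callee pops
  // nothing and the caller cleans up.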
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}

//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

// Like the stdcall convention, the callee cleans up the arguments, except
// that ECX is reserved for storing the address of the tail-called function.
// Only two registers are therefore free for argument passing (inreg).
// Tail call optimization is performed provided:
//  * tailcallopt is enabled
//  * caller/callee are fastcc
// On the X86_64 architecture with GOT-style position independent code, only
// local (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's
// dyld, for example.)
// If a tail-called callee has more arguments than the caller, the caller
// needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after
// the original RETADDR, but before the saved frame pointer or the spilled
// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
// stack layout:
//   arg1
//   arg2
//   RETADDR
//   [ new RETADDR
//     move area ]
//   (possible EBP)
//   ESI
//   EDI
//   local1 ..

/// Round up the argument stack size so that, together with the return-address
/// slot, it is a multiple of the stack alignment, e.g. 16n + 12 bytes for a
/// 16-byte alignment requirement with 4-byte slots.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  unsigned SlotSize = RegInfo->getSlotSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // The remainder is at most StackAlignment - SlotSize (e.g. 12), so just
    // add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out the low bits, then add the stack alignment once plus the
    // StackAlignment - SlotSize (e.g. 12) bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}

/// Return true if the given stack call argument is already available in the
/// same relative position of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
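    // For example, an incoming i32 stack argument that was zero-extended to
    // i64 or bitcast to f32 still occupies the same caller stack slot, so the
    // extension/bitcast is peeled off before searching for the underlying
    // load or frame index.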
3502 unsigned Op = Arg.getOpcode(); 3503 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { 3504 Arg = Arg.getOperand(0); 3505 continue; 3506 } 3507 if (Op == ISD::TRUNCATE) { 3508 const SDValue &TruncInput = Arg.getOperand(0); 3509 if (TruncInput.getOpcode() == ISD::AssertZext && 3510 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == 3511 Arg.getValueType()) { 3512 Arg = TruncInput.getOperand(0); 3513 continue; 3514 } 3515 } 3516 break; 3517 } 3518 3519 int FI = INT_MAX; 3520 if (Arg.getOpcode() == ISD::CopyFromReg) { 3521 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 3522 if (!TargetRegisterInfo::isVirtualRegister(VR)) 3523 return false; 3524 MachineInstr *Def = MRI->getVRegDef(VR); 3525 if (!Def) 3526 return false; 3527 if (!Flags.isByVal()) { 3528 if (!TII->isLoadFromStackSlot(*Def, FI)) 3529 return false; 3530 } else { 3531 unsigned Opcode = Def->getOpcode(); 3532 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || 3533 Opcode == X86::LEA64_32r) && 3534 Def->getOperand(1).isFI()) { 3535 FI = Def->getOperand(1).getIndex(); 3536 Bytes = Flags.getByValSize(); 3537 } else 3538 return false; 3539 } 3540 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 3541 if (Flags.isByVal()) 3542 // ByVal argument is passed in as a pointer but it's now being 3543 // dereferenced. e.g. 3544 // define @foo(%struct.X* %A) { 3545 // tail call @bar(%struct.X* byval %A) 3546 // } 3547 return false; 3548 SDValue Ptr = Ld->getBasePtr(); 3549 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 3550 if (!FINode) 3551 return false; 3552 FI = FINode->getIndex(); 3553 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 3554 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 3555 FI = FINode->getIndex(); 3556 Bytes = Flags.getByValSize(); 3557 } else 3558 return false; 3559 3560 assert(FI != INT_MAX); 3561 if (!MFI->isFixedObjectIndex(FI)) 3562 return false; 3563 3564 if (Offset != MFI->getObjectOffset(FI)) 3565 return false; 3566 3567 if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) { 3568 // If the argument location is wider than the argument type, check that any 3569 // extension flags match. 3570 if (Flags.isZExt() != MFI->isObjectZExt(FI) || 3571 Flags.isSExt() != MFI->isObjectSExt(FI)) { 3572 return false; 3573 } 3574 } 3575 3576 return Bytes == MFI->getObjectSize(FI); 3577 } 3578 3579 /// Check whether the call is eligible for tail call optimization. Targets 3580 /// that want to do tail call optimization should implement this function. 3581 bool X86TargetLowering::IsEligibleForTailCallOptimization( 3582 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 3583 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, 3584 const SmallVectorImpl<ISD::OutputArg> &Outs, 3585 const SmallVectorImpl<SDValue> &OutVals, 3586 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 3587 if (!mayTailCallThisCC(CalleeCC)) 3588 return false; 3589 3590 // If -tailcallopt is specified, make fastcc functions tail-callable. 3591 MachineFunction &MF = DAG.getMachineFunction(); 3592 const Function *CallerF = MF.getFunction(); 3593 3594 // If the function return type is x86_fp80 and the callee return type is not, 3595 // then the FP_EXTEND of the call result is not a nop. It's not safe to 3596 // perform a tailcall optimization here. 
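  // Illustrative IR (not from this file):
  //   define x86_fp80 @caller() {
  //     %r = tail call double @callee()
  //     %e = fpext double %r to x86_fp80
  //     ret x86_fp80 %e
  //   }
  // The fpext of the call result needs a real x87 conversion, so this call is
  // rejected below.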
3597 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 3598 return false; 3599 3600 CallingConv::ID CallerCC = CallerF->getCallingConv(); 3601 bool CCMatch = CallerCC == CalleeCC; 3602 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); 3603 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); 3604 3605 // Win64 functions have extra shadow space for argument homing. Don't do the 3606 // sibcall if the caller and callee have mismatched expectations for this 3607 // space. 3608 if (IsCalleeWin64 != IsCallerWin64) 3609 return false; 3610 3611 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 3612 if (canGuaranteeTCO(CalleeCC) && CCMatch) 3613 return true; 3614 return false; 3615 } 3616 3617 // Look for obvious safe cases to perform tail call optimization that do not 3618 // require ABI changes. This is what gcc calls sibcall. 3619 3620 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 3621 // emit a special epilogue. 3622 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 3623 if (RegInfo->needsStackRealignment(MF)) 3624 return false; 3625 3626 // Also avoid sibcall optimization if either caller or callee uses struct 3627 // return semantics. 3628 if (isCalleeStructRet || isCallerStructRet) 3629 return false; 3630 3631 // Do not sibcall optimize vararg calls unless all arguments are passed via 3632 // registers. 3633 LLVMContext &C = *DAG.getContext(); 3634 if (isVarArg && !Outs.empty()) { 3635 // Optimizing for varargs on Win64 is unlikely to be safe without 3636 // additional testing. 3637 if (IsCalleeWin64 || IsCallerWin64) 3638 return false; 3639 3640 SmallVector<CCValAssign, 16> ArgLocs; 3641 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 3642 3643 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3644 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 3645 if (!ArgLocs[i].isRegLoc()) 3646 return false; 3647 } 3648 3649 // If the call result is in ST0 / ST1, it needs to be popped off the x87 3650 // stack. Therefore, if it's not used by the call it is not safe to optimize 3651 // this into a sibcall. 3652 bool Unused = false; 3653 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3654 if (!Ins[i].Used) { 3655 Unused = true; 3656 break; 3657 } 3658 } 3659 if (Unused) { 3660 SmallVector<CCValAssign, 16> RVLocs; 3661 CCState CCInfo(CalleeCC, false, MF, RVLocs, C); 3662 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 3663 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 3664 CCValAssign &VA = RVLocs[i]; 3665 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) 3666 return false; 3667 } 3668 } 3669 3670 // Check that the call results are passed in the same way. 3671 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 3672 RetCC_X86, RetCC_X86)) 3673 return false; 3674 // The callee has to preserve all registers the caller needs to preserve. 3675 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 3676 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3677 if (!CCMatch) { 3678 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3679 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3680 return false; 3681 } 3682 3683 unsigned StackArgsSize = 0; 3684 3685 // If the callee takes no arguments then go on to check the results of the 3686 // call. 3687 if (!Outs.empty()) { 3688 // Check if stack adjustment is needed. For now, do not do this if any 3689 // argument is passed on the stack. 
3690 SmallVector<CCValAssign, 16> ArgLocs; 3691 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 3692 3693 // Allocate shadow area for Win64 3694 if (IsCalleeWin64) 3695 CCInfo.AllocateStack(32, 8); 3696 3697 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3698 StackArgsSize = CCInfo.getNextStackOffset(); 3699 3700 if (CCInfo.getNextStackOffset()) { 3701 // Check if the arguments are already laid out in the right way as 3702 // the caller's fixed stack objects. 3703 MachineFrameInfo *MFI = MF.getFrameInfo(); 3704 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3705 const X86InstrInfo *TII = Subtarget.getInstrInfo(); 3706 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3707 CCValAssign &VA = ArgLocs[i]; 3708 SDValue Arg = OutVals[i]; 3709 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3710 if (VA.getLocInfo() == CCValAssign::Indirect) 3711 return false; 3712 if (!VA.isRegLoc()) { 3713 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3714 MFI, MRI, TII, VA)) 3715 return false; 3716 } 3717 } 3718 } 3719 3720 bool PositionIndependent = isPositionIndependent(); 3721 // If the tailcall address may be in a register, then make sure it's 3722 // possible to register allocate for it. In 32-bit, the call address can 3723 // only target EAX, EDX, or ECX since the tail call must be scheduled after 3724 // callee-saved registers are restored. These happen to be the same 3725 // registers used to pass 'inreg' arguments so watch out for those. 3726 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && 3727 !isa<ExternalSymbolSDNode>(Callee)) || 3728 PositionIndependent)) { 3729 unsigned NumInRegs = 0; 3730 // In PIC we need an extra register to formulate the address computation 3731 // for the callee. 3732 unsigned MaxInRegs = PositionIndependent ? 2 : 3; 3733 3734 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3735 CCValAssign &VA = ArgLocs[i]; 3736 if (!VA.isRegLoc()) 3737 continue; 3738 unsigned Reg = VA.getLocReg(); 3739 switch (Reg) { 3740 default: break; 3741 case X86::EAX: case X86::EDX: case X86::ECX: 3742 if (++NumInRegs == MaxInRegs) 3743 return false; 3744 break; 3745 } 3746 } 3747 } 3748 3749 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3750 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 3751 return false; 3752 } 3753 3754 bool CalleeWillPop = 3755 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, 3756 MF.getTarget().Options.GuaranteedTailCallOpt); 3757 3758 if (unsigned BytesToPop = 3759 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { 3760 // If we have bytes to pop, the callee must pop them. 3761 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; 3762 if (!CalleePopMatches) 3763 return false; 3764 } else if (CalleeWillPop && StackArgsSize > 0) { 3765 // If we don't have bytes to pop, make sure the callee doesn't pop any. 
3766 return false; 3767 } 3768 3769 return true; 3770 } 3771 3772 FastISel * 3773 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3774 const TargetLibraryInfo *libInfo) const { 3775 return X86::createFastISel(funcInfo, libInfo); 3776 } 3777 3778 //===----------------------------------------------------------------------===// 3779 // Other Lowering Hooks 3780 //===----------------------------------------------------------------------===// 3781 3782 static bool MayFoldLoad(SDValue Op) { 3783 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3784 } 3785 3786 static bool MayFoldIntoStore(SDValue Op) { 3787 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3788 } 3789 3790 static bool isTargetShuffle(unsigned Opcode) { 3791 switch(Opcode) { 3792 default: return false; 3793 case X86ISD::BLENDI: 3794 case X86ISD::PSHUFB: 3795 case X86ISD::PSHUFD: 3796 case X86ISD::PSHUFHW: 3797 case X86ISD::PSHUFLW: 3798 case X86ISD::SHUFP: 3799 case X86ISD::INSERTPS: 3800 case X86ISD::PALIGNR: 3801 case X86ISD::VSHLDQ: 3802 case X86ISD::VSRLDQ: 3803 case X86ISD::MOVLHPS: 3804 case X86ISD::MOVLHPD: 3805 case X86ISD::MOVHLPS: 3806 case X86ISD::MOVLPS: 3807 case X86ISD::MOVLPD: 3808 case X86ISD::MOVSHDUP: 3809 case X86ISD::MOVSLDUP: 3810 case X86ISD::MOVDDUP: 3811 case X86ISD::MOVSS: 3812 case X86ISD::MOVSD: 3813 case X86ISD::UNPCKL: 3814 case X86ISD::UNPCKH: 3815 case X86ISD::VPERMILPI: 3816 case X86ISD::VPERMILPV: 3817 case X86ISD::VPERM2X128: 3818 case X86ISD::VPERMIL2: 3819 case X86ISD::VPERMI: 3820 case X86ISD::VPPERM: 3821 case X86ISD::VPERMV: 3822 case X86ISD::VPERMV3: 3823 case X86ISD::VZEXT_MOVL: 3824 return true; 3825 } 3826 } 3827 3828 static bool isTargetShuffleVariableMask(unsigned Opcode) { 3829 switch (Opcode) { 3830 default: return false; 3831 case X86ISD::PSHUFB: 3832 case X86ISD::VPERMILPV: 3833 return true; 3834 } 3835 } 3836 3837 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT, 3838 SDValue V1, unsigned TargetMask, 3839 SelectionDAG &DAG) { 3840 switch(Opc) { 3841 default: llvm_unreachable("Unknown x86 shuffle node"); 3842 case X86ISD::PSHUFD: 3843 case X86ISD::PSHUFHW: 3844 case X86ISD::PSHUFLW: 3845 case X86ISD::VPERMILPI: 3846 case X86ISD::VPERMI: 3847 return DAG.getNode(Opc, dl, VT, V1, 3848 DAG.getConstant(TargetMask, dl, MVT::i8)); 3849 } 3850 } 3851 3852 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT, 3853 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3854 switch(Opc) { 3855 default: llvm_unreachable("Unknown x86 shuffle node"); 3856 case X86ISD::MOVLHPS: 3857 case X86ISD::MOVLHPD: 3858 case X86ISD::MOVHLPS: 3859 case X86ISD::MOVLPS: 3860 case X86ISD::MOVLPD: 3861 case X86ISD::MOVSS: 3862 case X86ISD::MOVSD: 3863 case X86ISD::UNPCKL: 3864 case X86ISD::UNPCKH: 3865 return DAG.getNode(Opc, dl, VT, V1, V2); 3866 } 3867 } 3868 3869 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3870 MachineFunction &MF = DAG.getMachineFunction(); 3871 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 3872 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 3873 int ReturnAddrIndex = FuncInfo->getRAIndex(); 3874 3875 if (ReturnAddrIndex == 0) { 3876 // Set up a frame object for the return address. 
3877 unsigned SlotSize = RegInfo->getSlotSize(); 3878 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, 3879 -(int64_t)SlotSize, 3880 false); 3881 FuncInfo->setRAIndex(ReturnAddrIndex); 3882 } 3883 3884 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); 3885 } 3886 3887 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 3888 bool hasSymbolicDisplacement) { 3889 // Offset should fit into 32 bit immediate field. 3890 if (!isInt<32>(Offset)) 3891 return false; 3892 3893 // If we don't have a symbolic displacement - we don't have any extra 3894 // restrictions. 3895 if (!hasSymbolicDisplacement) 3896 return true; 3897 3898 // FIXME: Some tweaks might be needed for medium code model. 3899 if (M != CodeModel::Small && M != CodeModel::Kernel) 3900 return false; 3901 3902 // For small code model we assume that latest object is 16MB before end of 31 3903 // bits boundary. We may also accept pretty large negative constants knowing 3904 // that all objects are in the positive half of address space. 3905 if (M == CodeModel::Small && Offset < 16*1024*1024) 3906 return true; 3907 3908 // For kernel code model we know that all object resist in the negative half 3909 // of 32bits address space. We may not accept negative offsets, since they may 3910 // be just off and we may accept pretty large positive ones. 3911 if (M == CodeModel::Kernel && Offset >= 0) 3912 return true; 3913 3914 return false; 3915 } 3916 3917 /// Determines whether the callee is required to pop its own arguments. 3918 /// Callee pop is necessary to support tail calls. 3919 bool X86::isCalleePop(CallingConv::ID CallingConv, 3920 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { 3921 // If GuaranteeTCO is true, we force some calls to be callee pop so that we 3922 // can guarantee TCO. 3923 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) 3924 return true; 3925 3926 switch (CallingConv) { 3927 default: 3928 return false; 3929 case CallingConv::X86_StdCall: 3930 case CallingConv::X86_FastCall: 3931 case CallingConv::X86_ThisCall: 3932 case CallingConv::X86_VectorCall: 3933 return !is64Bit; 3934 } 3935 } 3936 3937 /// \brief Return true if the condition is an unsigned comparison operation. 3938 static bool isX86CCUnsigned(unsigned X86CC) { 3939 switch (X86CC) { 3940 default: 3941 llvm_unreachable("Invalid integer condition!"); 3942 case X86::COND_E: 3943 case X86::COND_NE: 3944 case X86::COND_B: 3945 case X86::COND_A: 3946 case X86::COND_BE: 3947 case X86::COND_AE: 3948 return true; 3949 case X86::COND_G: 3950 case X86::COND_GE: 3951 case X86::COND_L: 3952 case X86::COND_LE: 3953 return false; 3954 } 3955 } 3956 3957 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { 3958 switch (SetCCOpcode) { 3959 default: llvm_unreachable("Invalid integer condition!"); 3960 case ISD::SETEQ: return X86::COND_E; 3961 case ISD::SETGT: return X86::COND_G; 3962 case ISD::SETGE: return X86::COND_GE; 3963 case ISD::SETLT: return X86::COND_L; 3964 case ISD::SETLE: return X86::COND_LE; 3965 case ISD::SETNE: return X86::COND_NE; 3966 case ISD::SETULT: return X86::COND_B; 3967 case ISD::SETUGT: return X86::COND_A; 3968 case ISD::SETULE: return X86::COND_BE; 3969 case ISD::SETUGE: return X86::COND_AE; 3970 } 3971 } 3972 3973 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific 3974 /// condition code, returning the condition code and the LHS/RHS of the 3975 /// comparison to make. 
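/// For example, an integer (setcc sgt %x, -1) is rewritten below to compare
/// against 0 and yields X86::COND_NS, (setcc slt %x, 0) yields X86::COND_S,
/// and ordinary cases defer to TranslateIntegerX86CC (e.g. SETULT becomes
/// X86::COND_B).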
3976 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, 3977 bool isFP, SDValue &LHS, SDValue &RHS, 3978 SelectionDAG &DAG) { 3979 if (!isFP) { 3980 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3981 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3982 // X > -1 -> X == 0, jump !sign. 3983 RHS = DAG.getConstant(0, DL, RHS.getValueType()); 3984 return X86::COND_NS; 3985 } 3986 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3987 // X < 0 -> X == 0, jump on sign. 3988 return X86::COND_S; 3989 } 3990 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3991 // X < 1 -> X <= 0 3992 RHS = DAG.getConstant(0, DL, RHS.getValueType()); 3993 return X86::COND_LE; 3994 } 3995 } 3996 3997 return TranslateIntegerX86CC(SetCCOpcode); 3998 } 3999 4000 // First determine if it is required or is profitable to flip the operands. 4001 4002 // If LHS is a foldable load, but RHS is not, flip the condition. 4003 if (ISD::isNON_EXTLoad(LHS.getNode()) && 4004 !ISD::isNON_EXTLoad(RHS.getNode())) { 4005 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 4006 std::swap(LHS, RHS); 4007 } 4008 4009 switch (SetCCOpcode) { 4010 default: break; 4011 case ISD::SETOLT: 4012 case ISD::SETOLE: 4013 case ISD::SETUGT: 4014 case ISD::SETUGE: 4015 std::swap(LHS, RHS); 4016 break; 4017 } 4018 4019 // On a floating point condition, the flags are set as follows: 4020 // ZF PF CF op 4021 // 0 | 0 | 0 | X > Y 4022 // 0 | 0 | 1 | X < Y 4023 // 1 | 0 | 0 | X == Y 4024 // 1 | 1 | 1 | unordered 4025 switch (SetCCOpcode) { 4026 default: llvm_unreachable("Condcode should be pre-legalized away"); 4027 case ISD::SETUEQ: 4028 case ISD::SETEQ: return X86::COND_E; 4029 case ISD::SETOLT: // flipped 4030 case ISD::SETOGT: 4031 case ISD::SETGT: return X86::COND_A; 4032 case ISD::SETOLE: // flipped 4033 case ISD::SETOGE: 4034 case ISD::SETGE: return X86::COND_AE; 4035 case ISD::SETUGT: // flipped 4036 case ISD::SETULT: 4037 case ISD::SETLT: return X86::COND_B; 4038 case ISD::SETUGE: // flipped 4039 case ISD::SETULE: 4040 case ISD::SETLE: return X86::COND_BE; 4041 case ISD::SETONE: 4042 case ISD::SETNE: return X86::COND_NE; 4043 case ISD::SETUO: return X86::COND_P; 4044 case ISD::SETO: return X86::COND_NP; 4045 case ISD::SETOEQ: 4046 case ISD::SETUNE: return X86::COND_INVALID; 4047 } 4048 } 4049 4050 /// Is there a floating point cmov for the specific X86 condition code? 4051 /// Current x86 isa includes the following FP cmov instructions: 4052 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 
4053 static bool hasFPCMov(unsigned X86CC) { 4054 switch (X86CC) { 4055 default: 4056 return false; 4057 case X86::COND_B: 4058 case X86::COND_BE: 4059 case X86::COND_E: 4060 case X86::COND_P: 4061 case X86::COND_A: 4062 case X86::COND_AE: 4063 case X86::COND_NE: 4064 case X86::COND_NP: 4065 return true; 4066 } 4067 } 4068 4069 4070 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 4071 const CallInst &I, 4072 unsigned Intrinsic) const { 4073 4074 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); 4075 if (!IntrData) 4076 return false; 4077 4078 Info.opc = ISD::INTRINSIC_W_CHAIN; 4079 Info.readMem = false; 4080 Info.writeMem = false; 4081 Info.vol = false; 4082 Info.offset = 0; 4083 4084 switch (IntrData->Type) { 4085 case EXPAND_FROM_MEM: { 4086 Info.ptrVal = I.getArgOperand(0); 4087 Info.memVT = MVT::getVT(I.getType()); 4088 Info.align = 1; 4089 Info.readMem = true; 4090 break; 4091 } 4092 case COMPRESS_TO_MEM: { 4093 Info.ptrVal = I.getArgOperand(0); 4094 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType()); 4095 Info.align = 1; 4096 Info.writeMem = true; 4097 break; 4098 } 4099 case TRUNCATE_TO_MEM_VI8: 4100 case TRUNCATE_TO_MEM_VI16: 4101 case TRUNCATE_TO_MEM_VI32: { 4102 Info.ptrVal = I.getArgOperand(0); 4103 MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); 4104 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; 4105 if (IntrData->Type == TRUNCATE_TO_MEM_VI8) 4106 ScalarVT = MVT::i8; 4107 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) 4108 ScalarVT = MVT::i16; 4109 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) 4110 ScalarVT = MVT::i32; 4111 4112 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); 4113 Info.align = 1; 4114 Info.writeMem = true; 4115 break; 4116 } 4117 default: 4118 return false; 4119 } 4120 4121 return true; 4122 } 4123 4124 /// Returns true if the target can instruction select the 4125 /// specified FP immediate natively. If false, the legalizer will 4126 /// materialize the FP immediate as a load from a constant pool. 4127 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 4128 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 4129 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 4130 return true; 4131 } 4132 return false; 4133 } 4134 4135 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, 4136 ISD::LoadExtType ExtTy, 4137 EVT NewVT) const { 4138 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF 4139 // relocation target a movq or addq instruction: don't let the load shrink. 4140 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); 4141 if (BasePtr.getOpcode() == X86ISD::WrapperRIP) 4142 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) 4143 return GA->getTargetFlags() != X86II::MO_GOTTPOFF; 4144 return true; 4145 } 4146 4147 /// \brief Returns true if it is beneficial to convert a load of a constant 4148 /// to just the constant itself. 
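/// For example, a 64-bit constant is generally cheaper to materialize with a
/// movabsq immediate than to load from the constant pool, so any integer
/// constant of 64 bits or fewer is accepted below.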
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}

/// Return true if every element in Mask, beginning at position Pos and ending
/// at Pos+Size, is undef.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (0 <= Mask[i])
      return false;
  return true;
}

/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Val is either less than zero (undef) or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return (Val < 0 || Val == CmpVal);
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or zero.
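/// For example, with Pos = 0, Size = 4 and Low = 4, the mask
/// {4, 5, SM_SentinelUndef, SM_SentinelZero} qualifies, while {4, 6, 6, 7}
/// does not.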
4238 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, 4239 unsigned Size, int Low) { 4240 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) 4241 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) 4242 return false; 4243 return true; 4244 } 4245 4246 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector 4247 /// extract that is suitable for instruction that extract 128 or 256 bit vectors 4248 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { 4249 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 4250 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4251 return false; 4252 4253 // The index should be aligned on a vecWidth-bit boundary. 4254 uint64_t Index = 4255 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4256 4257 MVT VT = N->getSimpleValueType(0); 4258 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4259 bool Result = (Index * ElSize) % vecWidth == 0; 4260 4261 return Result; 4262 } 4263 4264 /// Return true if the specified INSERT_SUBVECTOR 4265 /// operand specifies a subvector insert that is suitable for input to 4266 /// insertion of 128 or 256-bit subvectors 4267 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { 4268 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 4269 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4270 return false; 4271 // The index should be aligned on a vecWidth-bit boundary. 4272 uint64_t Index = 4273 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4274 4275 MVT VT = N->getSimpleValueType(0); 4276 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4277 bool Result = (Index * ElSize) % vecWidth == 0; 4278 4279 return Result; 4280 } 4281 4282 bool X86::isVINSERT128Index(SDNode *N) { 4283 return isVINSERTIndex(N, 128); 4284 } 4285 4286 bool X86::isVINSERT256Index(SDNode *N) { 4287 return isVINSERTIndex(N, 256); 4288 } 4289 4290 bool X86::isVEXTRACT128Index(SDNode *N) { 4291 return isVEXTRACTIndex(N, 128); 4292 } 4293 4294 bool X86::isVEXTRACT256Index(SDNode *N) { 4295 return isVEXTRACTIndex(N, 256); 4296 } 4297 4298 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { 4299 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 4300 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && 4301 "Illegal extract subvector for VEXTRACT"); 4302 4303 uint64_t Index = 4304 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4305 4306 MVT VecVT = N->getOperand(0).getSimpleValueType(); 4307 MVT ElVT = VecVT.getVectorElementType(); 4308 4309 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 4310 return Index / NumElemsPerChunk; 4311 } 4312 4313 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { 4314 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 4315 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && 4316 "Illegal insert subvector for VINSERT"); 4317 4318 uint64_t Index = 4319 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4320 4321 MVT VecVT = N->getSimpleValueType(0); 4322 MVT ElVT = VecVT.getVectorElementType(); 4323 4324 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 4325 return Index / NumElemsPerChunk; 4326 } 4327 4328 /// Return the appropriate immediate to extract the specified 4329 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. 
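/// For example, extracting elements [4, 8) of a v8i32 as a v4i32 corresponds
/// to immediate 1: with 32-bit elements there are 4 elements per 128-bit
/// chunk, and 4 / 4 == 1.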
4330 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { 4331 return getExtractVEXTRACTImmediate(N, 128); 4332 } 4333 4334 /// Return the appropriate immediate to extract the specified 4335 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions. 4336 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { 4337 return getExtractVEXTRACTImmediate(N, 256); 4338 } 4339 4340 /// Return the appropriate immediate to insert at the specified 4341 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. 4342 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { 4343 return getInsertVINSERTImmediate(N, 128); 4344 } 4345 4346 /// Return the appropriate immediate to insert at the specified 4347 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions. 4348 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { 4349 return getInsertVINSERTImmediate(N, 256); 4350 } 4351 4352 /// Returns true if Elt is a constant zero or a floating point constant +0.0. 4353 bool X86::isZeroNode(SDValue Elt) { 4354 return isNullConstant(Elt) || isNullFPConstant(Elt); 4355 } 4356 4357 // Build a vector of constants. 4358 // Use an UNDEF node if MaskElt == -1. 4359 // Split 64-bit constants in 32-bit mode. 4360 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, 4361 const SDLoc &dl, bool IsMask = false) { 4362 4363 SmallVector<SDValue, 32> Ops; 4364 bool Split = false; 4365 4366 MVT ConstVecVT = VT; 4367 unsigned NumElts = VT.getVectorNumElements(); 4368 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); 4369 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { 4370 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); 4371 Split = true; 4372 } 4373 4374 MVT EltVT = ConstVecVT.getVectorElementType(); 4375 for (unsigned i = 0; i < NumElts; ++i) { 4376 bool IsUndef = Values[i] < 0 && IsMask; 4377 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : 4378 DAG.getConstant(Values[i], dl, EltVT); 4379 Ops.push_back(OpNode); 4380 if (Split) 4381 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : 4382 DAG.getConstant(0, dl, EltVT)); 4383 } 4384 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); 4385 if (Split) 4386 ConstsNode = DAG.getBitcast(VT, ConstsNode); 4387 return ConstsNode; 4388 } 4389 4390 /// Returns a vector of specified type with all zero elements. 4391 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, 4392 SelectionDAG &DAG, const SDLoc &dl) { 4393 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || 4394 VT.getVectorElementType() == MVT::i1) && 4395 "Unexpected vector type"); 4396 4397 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest 4398 // type. This ensures they get CSE'd. But if the integer type is not 4399 // available, use a floating-point +0.0 instead.
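  // For example, a v4i64 zero is built as (v4i64 (bitcast (v8i32 zero))) so
  // that all wide zero vectors share the same <N x i32> constant node and get
  // CSE'd; the special cases below (no SSE2, or vXi1 mask types) cannot take
  // that path.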
4400 SDValue Vec; 4401 if (!Subtarget.hasSSE2() && VT.is128BitVector()) { 4402 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); 4403 } else if (VT.getVectorElementType() == MVT::i1) { 4404 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && 4405 "Unexpected vector type"); 4406 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) && 4407 "Unexpected vector type"); 4408 Vec = DAG.getConstant(0, dl, VT); 4409 } else { 4410 unsigned Num32BitElts = VT.getSizeInBits() / 32; 4411 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); 4412 } 4413 return DAG.getBitcast(VT, Vec); 4414 } 4415 4416 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, 4417 const SDLoc &dl, unsigned vectorWidth) { 4418 assert((vectorWidth == 128 || vectorWidth == 256) && 4419 "Unsupported vector width"); 4420 EVT VT = Vec.getValueType(); 4421 EVT ElVT = VT.getVectorElementType(); 4422 unsigned Factor = VT.getSizeInBits()/vectorWidth; 4423 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, 4424 VT.getVectorNumElements()/Factor); 4425 4426 // Extract from UNDEF is UNDEF. 4427 if (Vec.isUndef()) 4428 return DAG.getUNDEF(ResultVT); 4429 4430 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR 4431 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); 4432 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); 4433 4434 // This is the index of the first element of the vectorWidth-bit chunk 4435 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. 4436 IdxVal &= ~(ElemsPerChunk - 1); 4437 4438 // If the input is a buildvector just emit a smaller one. 4439 if (Vec.getOpcode() == ISD::BUILD_VECTOR) 4440 return DAG.getNode(ISD::BUILD_VECTOR, 4441 dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); 4442 4443 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); 4444 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); 4445 } 4446 4447 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This 4448 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 4449 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 4450 /// instructions or a simple subregister reference. Idx is an index in the 4451 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes 4452 /// lowering EXTRACT_VECTOR_ELT operations easier. 4453 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, 4454 SelectionDAG &DAG, const SDLoc &dl) { 4455 assert((Vec.getValueType().is256BitVector() || 4456 Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); 4457 return extractSubVector(Vec, IdxVal, DAG, dl, 128); 4458 } 4459 4460 /// Generate a DAG to grab 256-bits from a 512-bit vector. 4461 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, 4462 SelectionDAG &DAG, const SDLoc &dl) { 4463 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); 4464 return extractSubVector(Vec, IdxVal, DAG, dl, 256); 4465 } 4466 4467 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, 4468 SelectionDAG &DAG, const SDLoc &dl, 4469 unsigned vectorWidth) { 4470 assert((vectorWidth == 128 || vectorWidth == 256) && 4471 "Unsupported vector width"); 4472 // Inserting UNDEF is Result 4473 if (Vec.isUndef()) 4474 return Result; 4475 EVT VT = Vec.getValueType(); 4476 EVT ElVT = VT.getVectorElementType(); 4477 EVT ResultVT = Result.getValueType(); 4478 4479 // Insert the relevant vectorWidth bits. 
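  // For example, inserting a 128-bit chunk of i32 elements (ElemsPerChunk ==
  // 4) at IdxVal 6 of a v8i32 result clears the low index bits: 6 & ~3 == 4,
  // i.e. the upper 128-bit half.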
4480 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); 4481 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); 4482 4483 // This is the index of the first element of the vectorWidth-bit chunk 4484 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. 4485 IdxVal &= ~(ElemsPerChunk - 1); 4486 4487 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); 4488 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); 4489 } 4490 4491 /// Generate a DAG to put 128-bits into a vector > 128 bits. This 4492 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or 4493 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a 4494 /// simple superregister reference. Idx is an index in the 128 bits 4495 /// we want. It need not be aligned to a 128-bit boundary. That makes 4496 /// lowering INSERT_VECTOR_ELT operations easier. 4497 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, 4498 SelectionDAG &DAG, const SDLoc &dl) { 4499 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); 4500 4501 // For insertion into the zero index (low half) of a 256-bit vector, it is 4502 // more efficient to generate a blend with immediate instead of an insert*128. 4503 // We are still creating an INSERT_SUBVECTOR below with an undef node to 4504 // extend the subvector to the size of the result vector. Make sure that 4505 // we are not recursing on that node by checking for undef here. 4506 if (IdxVal == 0 && Result.getValueType().is256BitVector() && 4507 !Result.isUndef()) { 4508 EVT ResultVT = Result.getValueType(); 4509 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); 4510 SDValue Undef = DAG.getUNDEF(ResultVT); 4511 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, 4512 Vec, ZeroIndex); 4513 4514 // The blend instruction, and therefore its mask, depend on the data type. 4515 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); 4516 if (ScalarType.isFloatingPoint()) { 4517 // Choose either vblendps (float) or vblendpd (double). 4518 unsigned ScalarSize = ScalarType.getSizeInBits(); 4519 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); 4520 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; 4521 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); 4522 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); 4523 } 4524 4525 const X86Subtarget &Subtarget = 4526 static_cast<const X86Subtarget &>(DAG.getSubtarget()); 4527 4528 // AVX2 is needed for 256-bit integer blend support. 4529 // Integers must be cast to 32-bit because there is only vpblendd; 4530 // vpblendw can't be used for this because it has a handicapped mask. 4531 4532 // If we don't have AVX2, then cast to float. Using a wrong domain blend 4533 // is still more efficient than using the wrong domain vinsertf128 that 4534 // will be created by InsertSubVector(). 4535 MVT CastVT = Subtarget.hasAVX2() ? 
MVT::v8i32 : MVT::v8f32; 4536 4537 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); 4538 Result = DAG.getBitcast(CastVT, Result); 4539 Vec256 = DAG.getBitcast(CastVT, Vec256); 4540 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); 4541 return DAG.getBitcast(ResultVT, Vec256); 4542 } 4543 4544 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); 4545 } 4546 4547 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, 4548 SelectionDAG &DAG, const SDLoc &dl) { 4549 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); 4550 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); 4551 } 4552 4553 /// Insert i1-subvector to i1-vector. 4554 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, 4555 const X86Subtarget &Subtarget) { 4556 4557 SDLoc dl(Op); 4558 SDValue Vec = Op.getOperand(0); 4559 SDValue SubVec = Op.getOperand(1); 4560 SDValue Idx = Op.getOperand(2); 4561 4562 if (!isa<ConstantSDNode>(Idx)) 4563 return SDValue(); 4564 4565 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 4566 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal 4567 return Op; 4568 4569 MVT OpVT = Op.getSimpleValueType(); 4570 MVT SubVecVT = SubVec.getSimpleValueType(); 4571 unsigned NumElems = OpVT.getVectorNumElements(); 4572 unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); 4573 4574 assert(IdxVal + SubVecNumElems <= NumElems && 4575 IdxVal % SubVecVT.getSizeInBits() == 0 && 4576 "Unexpected index value in INSERT_SUBVECTOR"); 4577 4578 // There are 3 possible cases: 4579 // 1. Subvector should be inserted in the lower part (IdxVal == 0) 4580 // 2. Subvector should be inserted in the upper part 4581 // (IdxVal + SubVecNumElems == NumElems) 4582 // 3. Subvector should be inserted in the middle (for example v2i1 4583 // to v16i1, index 2) 4584 4585 // extend to natively supported kshift 4586 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; 4587 MVT WideOpVT = OpVT; 4588 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits()) 4589 WideOpVT = MinVT; 4590 4591 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); 4592 SDValue Undef = DAG.getUNDEF(WideOpVT); 4593 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, 4594 Undef, SubVec, ZeroIdx); 4595 4596 // Extract sub-vector if require. 4597 auto ExtractSubVec = [&](SDValue V) { 4598 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, 4599 OpVT, V, ZeroIdx); 4600 }; 4601 4602 if (Vec.isUndef()) { 4603 if (IdxVal != 0) { 4604 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); 4605 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits); 4606 } 4607 return ExtractSubVec(WideSubVec); 4608 } 4609 4610 if (ISD::isBuildVectorAllZeros(Vec.getNode())) { 4611 NumElems = WideOpVT.getVectorNumElements(); 4612 unsigned ShiftLeft = NumElems - SubVecNumElems; 4613 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; 4614 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, 4615 DAG.getConstant(ShiftLeft, dl, MVT::i8)); 4616 Vec = ShiftRight ? 
DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, 4617 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; 4618 return ExtractSubVec(Vec); 4619 } 4620 4621 if (IdxVal == 0) { 4622 // Zero lower bits of the Vec 4623 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); 4624 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); 4625 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); 4626 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); 4627 // Merge them together, SubVec should be zero extended. 4628 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, 4629 getZeroVector(WideOpVT, Subtarget, DAG, dl), 4630 SubVec, ZeroIdx); 4631 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); 4632 return ExtractSubVec(Vec); 4633 } 4634 4635 // Simple case when we put subvector in the upper part 4636 if (IdxVal + SubVecNumElems == NumElems) { 4637 // Zero upper bits of the Vec 4638 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, 4639 DAG.getConstant(IdxVal, dl, MVT::i8)); 4640 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); 4641 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); 4642 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); 4643 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); 4644 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); 4645 return ExtractSubVec(Vec); 4646 } 4647 // Subvector should be inserted in the middle - use shuffle 4648 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, 4649 SubVec, ZeroIdx); 4650 SmallVector<int, 64> Mask; 4651 for (unsigned i = 0; i < NumElems; ++i) 4652 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? 4653 i : i + NumElems); 4654 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); 4655 } 4656 4657 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 4658 /// instructions. This is used because creating CONCAT_VECTOR nodes of 4659 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower 4660 /// large BUILD_VECTORS. 4661 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT, 4662 unsigned NumElems, SelectionDAG &DAG, 4663 const SDLoc &dl) { 4664 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); 4665 return insert128BitVector(V, V2, NumElems / 2, DAG, dl); 4666 } 4667 4668 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, 4669 unsigned NumElems, SelectionDAG &DAG, 4670 const SDLoc &dl) { 4671 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); 4672 return insert256BitVector(V, V2, NumElems / 2, DAG, dl); 4673 } 4674 4675 /// Returns a vector of specified type with all bits set. 4676 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with 4677 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. 4678 /// Then bitcast to their original type, ensuring they get CSE'd. 
4679 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget, 4680 SelectionDAG &DAG, const SDLoc &dl) { 4681 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 4682 "Expected a 128/256/512-bit vector type"); 4683 4684 APInt Ones = APInt::getAllOnesValue(32); 4685 unsigned NumElts = VT.getSizeInBits() / 32; 4686 SDValue Vec; 4687 if (!Subtarget.hasInt256() && NumElts == 8) { 4688 Vec = DAG.getConstant(Ones, dl, MVT::v4i32); 4689 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); 4690 } else { 4691 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); 4692 } 4693 return DAG.getBitcast(VT, Vec); 4694 } 4695 4696 /// Returns a vector_shuffle node for an unpackl operation. 4697 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, 4698 SDValue V1, SDValue V2) { 4699 assert(VT.is128BitVector() && "Expected a 128-bit vector type"); 4700 unsigned NumElems = VT.getVectorNumElements(); 4701 SmallVector<int, 8> Mask(NumElems); 4702 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4703 Mask[i * 2] = i; 4704 Mask[i * 2 + 1] = i + NumElems; 4705 } 4706 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); 4707 } 4708 4709 /// Returns a vector_shuffle node for an unpackh operation. 4710 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, 4711 SDValue V1, SDValue V2) { 4712 assert(VT.is128BitVector() && "Expected a 128-bit vector type"); 4713 unsigned NumElems = VT.getVectorNumElements(); 4714 SmallVector<int, 8> Mask(NumElems); 4715 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { 4716 Mask[i * 2] = i + Half; 4717 Mask[i * 2 + 1] = i + NumElems + Half; 4718 } 4719 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); 4720 } 4721 4722 /// Return a vector_shuffle of the specified vector of zero or undef vector. 4723 /// This produces a shuffle where the low element of V2 is swizzled into the 4724 /// zero/undef vector, landing at element Idx. 4725 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4726 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, 4727 bool IsZero, 4728 const X86Subtarget &Subtarget, 4729 SelectionDAG &DAG) { 4730 MVT VT = V2.getSimpleValueType(); 4731 SDValue V1 = IsZero 4732 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); 4733 int NumElems = VT.getVectorNumElements(); 4734 SmallVector<int, 16> MaskVec(NumElems); 4735 for (int i = 0; i != NumElems; ++i) 4736 // If this is the insertion idx, put the low elt of V2 here. 4737 MaskVec[i] = (i == Idx) ? NumElems : i; 4738 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); 4739 } 4740 4741 static SDValue peekThroughBitcasts(SDValue V) { 4742 while (V.getNode() && V.getOpcode() == ISD::BITCAST) 4743 V = V.getOperand(0); 4744 return V; 4745 } 4746 4747 static bool getTargetShuffleMaskIndices(SDValue MaskNode, 4748 unsigned MaskEltSizeInBits, 4749 SmallVectorImpl<uint64_t> &RawMask) { 4750 MaskNode = peekThroughBitcasts(MaskNode); 4751 4752 MVT VT = MaskNode.getSimpleValueType(); 4753 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); 4754 4755 // Split an APInt element into MaskEltSizeInBits sized pieces and 4756 // insert into the shuffle mask. 4757 auto SplitElementToMask = [&](APInt Element) { 4758 // Note that this is x86 and so always little endian: the low byte is 4759 // the first byte of the mask. 
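    // For example, splitting a 16-bit element 0x0102 into 8-bit mask elements
    // yields {0x02, 0x01}; the low bits are pushed first.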
4760 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits; 4761 for (int i = 0; i < Split; ++i) { 4762 APInt RawElt = Element.getLoBits(MaskEltSizeInBits); 4763 Element = Element.lshr(MaskEltSizeInBits); 4764 RawMask.push_back(RawElt.getZExtValue()); 4765 } 4766 }; 4767 4768 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) { 4769 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 4770 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0 4771 if (VT.getScalarSizeInBits() != MaskEltSizeInBits) 4772 return false; 4773 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) { 4774 const APInt &MaskElement = CN->getAPIntValue(); 4775 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { 4776 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits); 4777 RawMask.push_back(RawElt.getZExtValue()); 4778 } 4779 } 4780 return false; 4781 } 4782 4783 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL && 4784 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) { 4785 4786 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 4787 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) 4788 return false; 4789 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; 4790 4791 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0); 4792 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) { 4793 SplitElementToMask(CN->getAPIntValue()); 4794 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); 4795 return true; 4796 } 4797 return false; 4798 } 4799 4800 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR) 4801 return false; 4802 4803 // We can always decode if the buildvector is all zero constants, 4804 // but can't use isBuildVectorAllZeros as it might contain UNDEFs. 4805 if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) { 4806 RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0); 4807 return true; 4808 } 4809 4810 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 4811 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) 4812 return false; 4813 4814 for (SDValue Op : MaskNode->ops()) { 4815 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode())) 4816 SplitElementToMask(CN->getAPIntValue()); 4817 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode())) 4818 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt()); 4819 else 4820 return false; 4821 } 4822 4823 return true; 4824 } 4825 4826 static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) { 4827 MaskNode = peekThroughBitcasts(MaskNode); 4828 4829 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); 4830 if (!MaskLoad) 4831 return nullptr; 4832 4833 SDValue Ptr = MaskLoad->getBasePtr(); 4834 if (Ptr->getOpcode() == X86ISD::Wrapper || 4835 Ptr->getOpcode() == X86ISD::WrapperRIP) 4836 Ptr = Ptr->getOperand(0); 4837 4838 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); 4839 if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) 4840 return nullptr; 4841 4842 return dyn_cast<Constant>(MaskCP->getConstVal()); 4843 } 4844 4845 /// Calculates the shuffle mask corresponding to the target-specific opcode. 4846 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle 4847 /// operands in \p Ops, and returns true. 4848 /// Sets \p IsUnary to true if only one source is used. Note that this will set 4849 /// IsUnary for shuffles which use a single input multiple times, and in those 4850 /// cases it will adjust the mask to only have indices within that single input. 
4851 /// It is an error to call this with non-empty Mask/Ops vectors. 4852 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, 4853 SmallVectorImpl<SDValue> &Ops, 4854 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4855 unsigned NumElems = VT.getVectorNumElements(); 4856 SDValue ImmN; 4857 4858 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); 4859 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); 4860 4861 IsUnary = false; 4862 bool IsFakeUnary = false; 4863 switch(N->getOpcode()) { 4864 case X86ISD::BLENDI: 4865 ImmN = N->getOperand(N->getNumOperands()-1); 4866 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4867 break; 4868 case X86ISD::SHUFP: 4869 ImmN = N->getOperand(N->getNumOperands()-1); 4870 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4871 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4872 break; 4873 case X86ISD::INSERTPS: 4874 ImmN = N->getOperand(N->getNumOperands()-1); 4875 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4876 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4877 break; 4878 case X86ISD::UNPCKH: 4879 DecodeUNPCKHMask(VT, Mask); 4880 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4881 break; 4882 case X86ISD::UNPCKL: 4883 DecodeUNPCKLMask(VT, Mask); 4884 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4885 break; 4886 case X86ISD::MOVHLPS: 4887 DecodeMOVHLPSMask(NumElems, Mask); 4888 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4889 break; 4890 case X86ISD::MOVLHPS: 4891 DecodeMOVLHPSMask(NumElems, Mask); 4892 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4893 break; 4894 case X86ISD::PALIGNR: 4895 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); 4896 ImmN = N->getOperand(N->getNumOperands()-1); 4897 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4898 break; 4899 case X86ISD::VSHLDQ: 4900 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); 4901 ImmN = N->getOperand(N->getNumOperands() - 1); 4902 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4903 IsUnary = true; 4904 break; 4905 case X86ISD::VSRLDQ: 4906 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); 4907 ImmN = N->getOperand(N->getNumOperands() - 1); 4908 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4909 IsUnary = true; 4910 break; 4911 case X86ISD::PSHUFD: 4912 case X86ISD::VPERMILPI: 4913 ImmN = N->getOperand(N->getNumOperands()-1); 4914 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4915 IsUnary = true; 4916 break; 4917 case X86ISD::PSHUFHW: 4918 ImmN = N->getOperand(N->getNumOperands()-1); 4919 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4920 IsUnary = true; 4921 break; 4922 case X86ISD::PSHUFLW: 4923 ImmN = N->getOperand(N->getNumOperands()-1); 4924 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4925 IsUnary = true; 4926 break; 4927 case X86ISD::VZEXT_MOVL: 4928 DecodeZeroMoveLowMask(VT, Mask); 4929 IsUnary = true; 4930 break; 4931 case X86ISD::VPERMILPV: { 4932 IsUnary = true; 4933 SDValue MaskNode = N->getOperand(1); 4934 unsigned MaskEltSize = VT.getScalarSizeInBits(); 4935 SmallVector<uint64_t, 32> RawMask; 4936 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { 4937 DecodeVPERMILPMask(VT, RawMask, Mask); 4938 break; 4939 } 4940 if (auto *C = 
getTargetShuffleMaskConstant(MaskNode)) { 4941 DecodeVPERMILPMask(C, MaskEltSize, Mask); 4942 break; 4943 } 4944 return false; 4945 } 4946 case X86ISD::PSHUFB: { 4947 IsUnary = true; 4948 SDValue MaskNode = N->getOperand(1); 4949 SmallVector<uint64_t, 32> RawMask; 4950 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { 4951 DecodePSHUFBMask(RawMask, Mask); 4952 break; 4953 } 4954 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { 4955 DecodePSHUFBMask(C, Mask); 4956 break; 4957 } 4958 return false; 4959 } 4960 case X86ISD::VPERMI: 4961 ImmN = N->getOperand(N->getNumOperands()-1); 4962 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4963 IsUnary = true; 4964 break; 4965 case X86ISD::MOVSS: 4966 case X86ISD::MOVSD: 4967 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); 4968 break; 4969 case X86ISD::VPERM2X128: 4970 ImmN = N->getOperand(N->getNumOperands()-1); 4971 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4972 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4973 break; 4974 case X86ISD::MOVSLDUP: 4975 DecodeMOVSLDUPMask(VT, Mask); 4976 IsUnary = true; 4977 break; 4978 case X86ISD::MOVSHDUP: 4979 DecodeMOVSHDUPMask(VT, Mask); 4980 IsUnary = true; 4981 break; 4982 case X86ISD::MOVDDUP: 4983 DecodeMOVDDUPMask(VT, Mask); 4984 IsUnary = true; 4985 break; 4986 case X86ISD::MOVLHPD: 4987 case X86ISD::MOVLPD: 4988 case X86ISD::MOVLPS: 4989 // Not yet implemented 4990 return false; 4991 case X86ISD::VPERMIL2: { 4992 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 4993 unsigned MaskEltSize = VT.getScalarSizeInBits(); 4994 SDValue MaskNode = N->getOperand(2); 4995 SDValue CtrlNode = N->getOperand(3); 4996 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) { 4997 unsigned CtrlImm = CtrlOp->getZExtValue(); 4998 SmallVector<uint64_t, 32> RawMask; 4999 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { 5000 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask); 5001 break; 5002 } 5003 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { 5004 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask); 5005 break; 5006 } 5007 } 5008 return false; 5009 } 5010 case X86ISD::VPPERM: { 5011 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 5012 SDValue MaskNode = N->getOperand(2); 5013 SmallVector<uint64_t, 32> RawMask; 5014 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { 5015 DecodeVPPERMMask(RawMask, Mask); 5016 break; 5017 } 5018 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { 5019 DecodeVPPERMMask(C, Mask); 5020 break; 5021 } 5022 return false; 5023 } 5024 case X86ISD::VPERMV: { 5025 IsUnary = true; 5026 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. 5027 Ops.push_back(N->getOperand(1)); 5028 SDValue MaskNode = N->getOperand(0); 5029 SmallVector<uint64_t, 32> RawMask; 5030 unsigned MaskEltSize = VT.getScalarSizeInBits(); 5031 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { 5032 DecodeVPERMVMask(RawMask, Mask); 5033 break; 5034 } 5035 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { 5036 DecodeVPERMVMask(C, VT, Mask); 5037 break; 5038 } 5039 return false; 5040 } 5041 case X86ISD::VPERMV3: { 5042 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); 5043 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. 
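    // That is, the node is (VPERMV3 src0, mask, src1): operands 0 and 2 are
    // recorded as the shuffle inputs and the mask is decoded from operand 1.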
5044 Ops.push_back(N->getOperand(0)); 5045 Ops.push_back(N->getOperand(2)); 5046 SDValue MaskNode = N->getOperand(1); 5047 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { 5048 DecodeVPERMV3Mask(C, VT, Mask); 5049 break; 5050 } 5051 return false; 5052 } 5053 default: llvm_unreachable("unknown target shuffle node"); 5054 } 5055 5056 // Empty mask indicates the decode failed. 5057 if (Mask.empty()) 5058 return false; 5059 5060 // Check if we're getting a shuffle mask with zero'd elements. 5061 if (!AllowSentinelZero) 5062 if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; })) 5063 return false; 5064 5065 // If we have a fake unary shuffle, the shuffle mask is spread across two 5066 // inputs that are actually the same node. Re-map the mask to always point 5067 // into the first input. 5068 if (IsFakeUnary) 5069 for (int &M : Mask) 5070 if (M >= (int)Mask.size()) 5071 M -= Mask.size(); 5072 5073 // If we didn't already add operands in the opcode-specific code, default to 5074 // adding 1 or 2 operands starting at 0. 5075 if (Ops.empty()) { 5076 Ops.push_back(N->getOperand(0)); 5077 if (!IsUnary || IsFakeUnary) 5078 Ops.push_back(N->getOperand(1)); 5079 } 5080 5081 return true; 5082 } 5083 5084 /// Check a target shuffle mask's inputs to see if we can set any values to 5085 /// SM_SentinelZero - this is for elements that are known to be zero 5086 /// (not just zeroable) from their inputs. 5087 /// Returns true if the target shuffle mask was decoded. 5088 static bool setTargetShuffleZeroElements(SDValue N, 5089 SmallVectorImpl<int> &Mask, 5090 SmallVectorImpl<SDValue> &Ops) { 5091 bool IsUnary; 5092 if (!isTargetShuffle(N.getOpcode())) 5093 return false; 5094 if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops, 5095 Mask, IsUnary)) 5096 return false; 5097 5098 SDValue V1 = Ops[0]; 5099 SDValue V2 = IsUnary ? V1 : Ops[1]; 5100 5101 V1 = peekThroughBitcasts(V1); 5102 V2 = peekThroughBitcasts(V2); 5103 5104 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 5105 int M = Mask[i]; 5106 5107 // Already decoded as SM_SentinelZero / SM_SentinelUndef. 5108 if (M < 0) 5109 continue; 5110 5111 // Determine shuffle input and normalize the mask. 5112 SDValue V = M < Size ? V1 : V2; 5113 M %= Size; 5114 5115 // We are referencing an UNDEF input. 5116 if (V.isUndef()) { 5117 Mask[i] = SM_SentinelUndef; 5118 continue; 5119 } 5120 5121 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. 5122 if (V.getOpcode() != ISD::BUILD_VECTOR) 5123 continue; 5124 5125 // If the BUILD_VECTOR has fewer elements then the (larger) source 5126 // element must be UNDEF/ZERO. 5127 // TODO: Is it worth testing the individual bits of a constant? 5128 if ((Size % V.getNumOperands()) == 0) { 5129 int Scale = Size / V->getNumOperands(); 5130 SDValue Op = V.getOperand(M / Scale); 5131 if (Op.isUndef()) 5132 Mask[i] = SM_SentinelUndef; 5133 else if (X86::isZeroNode(Op)) 5134 Mask[i] = SM_SentinelZero; 5135 continue; 5136 } 5137 5138 // If the BUILD_VECTOR has more elements then all the (smaller) source 5139 // elements must be all UNDEF or all ZERO. 
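    // For example, a v4i32 mask element that maps onto a v16i8 BUILD_VECTOR
    // input covers Scale == 4 adjacent byte operands; only if all four are
    // undef (or all are zero) can the mask element become SM_SentinelUndef
    // (or SM_SentinelZero).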
5140 if ((V.getNumOperands() % Size) == 0) { 5141 int Scale = V->getNumOperands() / Size; 5142 bool AllUndef = true; 5143 bool AllZero = true; 5144 for (int j = 0; j < Scale; ++j) { 5145 SDValue Op = V.getOperand((M * Scale) + j); 5146 AllUndef &= Op.isUndef(); 5147 AllZero &= X86::isZeroNode(Op); 5148 } 5149 if (AllUndef) 5150 Mask[i] = SM_SentinelUndef; 5151 else if (AllZero) 5152 Mask[i] = SM_SentinelZero; 5153 continue; 5154 } 5155 } 5156 5157 return true; 5158 } 5159 5160 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs 5161 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the 5162 /// remaining input indices in case we now have a unary shuffle and adjust the 5163 /// Op0/Op1 inputs accordingly. 5164 /// Returns true if the target shuffle mask was decoded. 5165 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, 5166 SmallVectorImpl<int> &Mask) { 5167 SmallVector<SDValue, 2> Ops; 5168 if (!setTargetShuffleZeroElements(Op, Mask, Ops)) 5169 return false; 5170 5171 int NumElts = Mask.size(); 5172 bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) { 5173 return 0 <= Idx && Idx < NumElts; 5174 }); 5175 bool Op1InUse = std::any_of(Mask.begin(), Mask.end(), 5176 [NumElts](int Idx) { return NumElts <= Idx; }); 5177 5178 Op0 = Op0InUse ? Ops[0] : SDValue(); 5179 Op1 = Op1InUse ? Ops[1] : SDValue(); 5180 5181 // We're only using Op1 - commute the mask and inputs. 5182 if (!Op0InUse && Op1InUse) { 5183 for (int &M : Mask) 5184 if (NumElts <= M) 5185 M -= NumElts; 5186 Op0 = Op1; 5187 Op1 = SDValue(); 5188 } 5189 5190 return true; 5191 } 5192 5193 /// Returns the scalar element that will make up the ith 5194 /// element of the result of the vector shuffle. 5195 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 5196 unsigned Depth) { 5197 if (Depth == 6) 5198 return SDValue(); // Limit search depth. 5199 5200 SDValue V = SDValue(N, 0); 5201 EVT VT = V.getValueType(); 5202 unsigned Opcode = V.getOpcode(); 5203 5204 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 5205 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 5206 int Elt = SV->getMaskElt(Index); 5207 5208 if (Elt < 0) 5209 return DAG.getUNDEF(VT.getVectorElementType()); 5210 5211 unsigned NumElems = VT.getVectorNumElements(); 5212 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 5213 : SV->getOperand(1); 5214 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 5215 } 5216 5217 // Recurse into target specific vector shuffles to find scalars. 5218 if (isTargetShuffle(Opcode)) { 5219 MVT ShufVT = V.getSimpleValueType(); 5220 MVT ShufSVT = ShufVT.getVectorElementType(); 5221 int NumElems = (int)ShufVT.getVectorNumElements(); 5222 SmallVector<int, 16> ShuffleMask; 5223 SmallVector<SDValue, 16> ShuffleOps; 5224 bool IsUnary; 5225 5226 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) 5227 return SDValue(); 5228 5229 int Elt = ShuffleMask[Index]; 5230 if (Elt == SM_SentinelZero) 5231 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) 5232 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); 5233 if (Elt == SM_SentinelUndef) 5234 return DAG.getUNDEF(ShufSVT); 5235 5236 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); 5237 SDValue NewV = (Elt < NumElems) ? 
ShuffleOps[0] : ShuffleOps[1]; 5238 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 5239 Depth+1); 5240 } 5241 5242 // Actual nodes that may contain scalar elements 5243 if (Opcode == ISD::BITCAST) { 5244 V = V.getOperand(0); 5245 EVT SrcVT = V.getValueType(); 5246 unsigned NumElems = VT.getVectorNumElements(); 5247 5248 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 5249 return SDValue(); 5250 } 5251 5252 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5253 return (Index == 0) ? V.getOperand(0) 5254 : DAG.getUNDEF(VT.getVectorElementType()); 5255 5256 if (V.getOpcode() == ISD::BUILD_VECTOR) 5257 return V.getOperand(Index); 5258 5259 return SDValue(); 5260 } 5261 5262 /// Custom lower build_vector of v16i8. 5263 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 5264 unsigned NumNonZero, unsigned NumZero, 5265 SelectionDAG &DAG, 5266 const X86Subtarget &Subtarget, 5267 const TargetLowering &TLI) { 5268 if (NumNonZero > 8) 5269 return SDValue(); 5270 5271 SDLoc dl(Op); 5272 SDValue V; 5273 bool First = true; 5274 5275 // SSE4.1 - use PINSRB to insert each byte directly. 5276 if (Subtarget.hasSSE41()) { 5277 for (unsigned i = 0; i < 16; ++i) { 5278 bool isNonZero = (NonZeros & (1 << i)) != 0; 5279 if (isNonZero) { 5280 if (First) { 5281 if (NumZero) 5282 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); 5283 else 5284 V = DAG.getUNDEF(MVT::v16i8); 5285 First = false; 5286 } 5287 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 5288 MVT::v16i8, V, Op.getOperand(i), 5289 DAG.getIntPtrConstant(i, dl)); 5290 } 5291 } 5292 5293 return V; 5294 } 5295 5296 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 5297 for (unsigned i = 0; i < 16; ++i) { 5298 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 5299 if (ThisIsNonZero && First) { 5300 if (NumZero) 5301 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5302 else 5303 V = DAG.getUNDEF(MVT::v8i16); 5304 First = false; 5305 } 5306 5307 if ((i & 1) != 0) { 5308 SDValue ThisElt, LastElt; 5309 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 5310 if (LastIsNonZero) { 5311 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 5312 MVT::i16, Op.getOperand(i-1)); 5313 } 5314 if (ThisIsNonZero) { 5315 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 5316 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 5317 ThisElt, DAG.getConstant(8, dl, MVT::i8)); 5318 if (LastIsNonZero) 5319 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 5320 } else 5321 ThisElt = LastElt; 5322 5323 if (ThisElt.getNode()) 5324 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 5325 DAG.getIntPtrConstant(i/2, dl)); 5326 } 5327 } 5328 5329 return DAG.getBitcast(MVT::v16i8, V); 5330 } 5331 5332 /// Custom lower build_vector of v8i16. 
5333 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 5334 unsigned NumNonZero, unsigned NumZero, 5335 SelectionDAG &DAG, 5336 const X86Subtarget &Subtarget, 5337 const TargetLowering &TLI) { 5338 if (NumNonZero > 4) 5339 return SDValue(); 5340 5341 SDLoc dl(Op); 5342 SDValue V; 5343 bool First = true; 5344 for (unsigned i = 0; i < 8; ++i) { 5345 bool isNonZero = (NonZeros & (1 << i)) != 0; 5346 if (isNonZero) { 5347 if (First) { 5348 if (NumZero) 5349 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5350 else 5351 V = DAG.getUNDEF(MVT::v8i16); 5352 First = false; 5353 } 5354 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 5355 MVT::v8i16, V, Op.getOperand(i), 5356 DAG.getIntPtrConstant(i, dl)); 5357 } 5358 } 5359 5360 return V; 5361 } 5362 5363 /// Custom lower build_vector of v4i32 or v4f32. 5364 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, 5365 const X86Subtarget &Subtarget, 5366 const TargetLowering &TLI) { 5367 // Find all zeroable elements. 5368 std::bitset<4> Zeroable; 5369 for (int i=0; i < 4; ++i) { 5370 SDValue Elt = Op->getOperand(i); 5371 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); 5372 } 5373 assert(Zeroable.size() - Zeroable.count() > 1 && 5374 "We expect at least two non-zero elements!"); 5375 5376 // We only know how to deal with build_vector nodes where elements are either 5377 // zeroable or extract_vector_elt with constant index. 5378 SDValue FirstNonZero; 5379 unsigned FirstNonZeroIdx; 5380 for (unsigned i=0; i < 4; ++i) { 5381 if (Zeroable[i]) 5382 continue; 5383 SDValue Elt = Op->getOperand(i); 5384 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5385 !isa<ConstantSDNode>(Elt.getOperand(1))) 5386 return SDValue(); 5387 // Make sure that this node is extracting from a 128-bit vector. 5388 MVT VT = Elt.getOperand(0).getSimpleValueType(); 5389 if (!VT.is128BitVector()) 5390 return SDValue(); 5391 if (!FirstNonZero.getNode()) { 5392 FirstNonZero = Elt; 5393 FirstNonZeroIdx = i; 5394 } 5395 } 5396 5397 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); 5398 SDValue V1 = FirstNonZero.getOperand(0); 5399 MVT VT = V1.getSimpleValueType(); 5400 5401 // See if this build_vector can be lowered as a blend with zero. 5402 SDValue Elt; 5403 unsigned EltMaskIdx, EltIdx; 5404 int Mask[4]; 5405 for (EltIdx = 0; EltIdx < 4; ++EltIdx) { 5406 if (Zeroable[EltIdx]) { 5407 // The zero vector will be on the right hand side. 5408 Mask[EltIdx] = EltIdx+4; 5409 continue; 5410 } 5411 5412 Elt = Op->getOperand(EltIdx); 5413 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. 5414 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); 5415 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) 5416 break; 5417 Mask[EltIdx] = EltIdx; 5418 } 5419 5420 if (EltIdx == 4) { 5421 // Let the shuffle legalizer deal with blend operations. 5422 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); 5423 if (V1.getSimpleValueType() != VT) 5424 V1 = DAG.getBitcast(VT, V1); 5425 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); 5426 } 5427 5428 // See if we can lower this build_vector to a INSERTPS. 
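  // INSERTPS is an SSE4.1 instruction; its immediate encodes the source
  // element in bits [7:6], the destination lane in bits [5:4] and a zeroing
  // mask in bits [3:0], which is how InsertPSMask is assembled below.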
5429 if (!Subtarget.hasSSE41()) 5430 return SDValue(); 5431 5432 SDValue V2 = Elt.getOperand(0); 5433 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) 5434 V1 = SDValue(); 5435 5436 bool CanFold = true; 5437 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { 5438 if (Zeroable[i]) 5439 continue; 5440 5441 SDValue Current = Op->getOperand(i); 5442 SDValue SrcVector = Current->getOperand(0); 5443 if (!V1.getNode()) 5444 V1 = SrcVector; 5445 CanFold = SrcVector == V1 && 5446 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; 5447 } 5448 5449 if (!CanFold) 5450 return SDValue(); 5451 5452 assert(V1.getNode() && "Expected at least two non-zero elements!"); 5453 if (V1.getSimpleValueType() != MVT::v4f32) 5454 V1 = DAG.getBitcast(MVT::v4f32, V1); 5455 if (V2.getSimpleValueType() != MVT::v4f32) 5456 V2 = DAG.getBitcast(MVT::v4f32, V2); 5457 5458 // Ok, we can emit an INSERTPS instruction. 5459 unsigned ZMask = Zeroable.to_ulong(); 5460 5461 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; 5462 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 5463 SDLoc DL(Op); 5464 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, 5465 DAG.getIntPtrConstant(InsertPSMask, DL)); 5466 return DAG.getBitcast(VT, Result); 5467 } 5468 5469 /// Return a vector logical shift node. 5470 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, 5471 SelectionDAG &DAG, const TargetLowering &TLI, 5472 const SDLoc &dl) { 5473 assert(VT.is128BitVector() && "Unknown type for VShift"); 5474 MVT ShVT = MVT::v16i8; 5475 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 5476 SrcOp = DAG.getBitcast(ShVT, SrcOp); 5477 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); 5478 assert(NumBits % 8 == 0 && "Only support byte sized shifts"); 5479 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); 5480 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); 5481 } 5482 5483 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, 5484 SelectionDAG &DAG) { 5485 5486 // Check if the scalar load can be widened into a vector load. And if 5487 // the address is "base + cst" see if the cst can be "absorbed" into 5488 // the shuffle mask. 5489 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 5490 SDValue Ptr = LD->getBasePtr(); 5491 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 5492 return SDValue(); 5493 EVT PVT = LD->getValueType(0); 5494 if (PVT != MVT::i32 && PVT != MVT::f32) 5495 return SDValue(); 5496 5497 int FI = -1; 5498 int64_t Offset = 0; 5499 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 5500 FI = FINode->getIndex(); 5501 Offset = 0; 5502 } else if (DAG.isBaseWithConstantOffset(Ptr) && 5503 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 5504 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 5505 Offset = Ptr.getConstantOperandVal(1); 5506 Ptr = Ptr.getOperand(0); 5507 } else { 5508 return SDValue(); 5509 } 5510 5511 // FIXME: 256-bit vector instructions don't require a strict alignment, 5512 // improve this code to support it better. 5513 unsigned RequiredAlign = VT.getSizeInBits()/8; 5514 SDValue Chain = LD->getChain(); 5515 // Make sure the stack object alignment is at least 16 or 32. 5516 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5517 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 5518 if (MFI->isFixedObjectIndex(FI)) { 5519 // Can't change the alignment. 
FIXME: It's possible to compute 5520 // the exact stack offset and reference FI + adjust offset instead. 5521 // If someone *really* cares about this. That's the way to implement it. 5522 return SDValue(); 5523 } else { 5524 MFI->setObjectAlignment(FI, RequiredAlign); 5525 } 5526 } 5527 5528 // (Offset % 16 or 32) must be multiple of 4. Then address is then 5529 // Ptr + (Offset & ~15). 5530 if (Offset < 0) 5531 return SDValue(); 5532 if ((Offset % RequiredAlign) & 3) 5533 return SDValue(); 5534 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); 5535 if (StartOffset) { 5536 SDLoc DL(Ptr); 5537 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 5538 DAG.getConstant(StartOffset, DL, Ptr.getValueType())); 5539 } 5540 5541 int EltNo = (Offset - StartOffset) >> 2; 5542 unsigned NumElems = VT.getVectorNumElements(); 5543 5544 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 5545 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 5546 LD->getPointerInfo().getWithOffset(StartOffset), 5547 false, false, false, 0); 5548 5549 SmallVector<int, 8> Mask(NumElems, EltNo); 5550 5551 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); 5552 } 5553 5554 return SDValue(); 5555 } 5556 5557 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the 5558 /// elements can be replaced by a single large load which has the same value as 5559 /// a build_vector or insert_subvector whose loaded operands are 'Elts'. 5560 /// 5561 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a 5562 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, 5563 SDLoc &DL, SelectionDAG &DAG, 5564 bool isAfterLegalize) { 5565 unsigned NumElems = Elts.size(); 5566 5567 int LastLoadedElt = -1; 5568 SmallBitVector LoadMask(NumElems, false); 5569 SmallBitVector ZeroMask(NumElems, false); 5570 SmallBitVector UndefMask(NumElems, false); 5571 5572 // For each element in the initializer, see if we've found a load, zero or an 5573 // undef. 5574 for (unsigned i = 0; i < NumElems; ++i) { 5575 SDValue Elt = peekThroughBitcasts(Elts[i]); 5576 if (!Elt.getNode()) 5577 return SDValue(); 5578 5579 if (Elt.isUndef()) 5580 UndefMask[i] = true; 5581 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) 5582 ZeroMask[i] = true; 5583 else if (ISD::isNON_EXTLoad(Elt.getNode())) { 5584 LoadMask[i] = true; 5585 LastLoadedElt = i; 5586 // Each loaded element must be the correct fractional portion of the 5587 // requested vector load. 5588 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) 5589 return SDValue(); 5590 } else 5591 return SDValue(); 5592 } 5593 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && 5594 "Incomplete element masks"); 5595 5596 // Handle Special Cases - all undef or undef/zero. 5597 if (UndefMask.count() == NumElems) 5598 return DAG.getUNDEF(VT); 5599 5600 // FIXME: Should we return this as a BUILD_VECTOR instead? 5601 if ((ZeroMask | UndefMask).count() == NumElems) 5602 return VT.isInteger() ? DAG.getConstant(0, DL, VT) 5603 : DAG.getConstantFP(0.0, DL, VT); 5604 5605 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5606 int FirstLoadedElt = LoadMask.find_first(); 5607 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); 5608 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase); 5609 EVT LDBaseVT = EltBase.getValueType(); 5610 5611 // Consecutive loads can contain UNDEFS but not ZERO elements. 
5612 // Consecutive loads with UNDEFs and ZEROs elements require a 5613 // an additional shuffle stage to clear the ZERO elements. 5614 bool IsConsecutiveLoad = true; 5615 bool IsConsecutiveLoadWithZeros = true; 5616 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { 5617 if (LoadMask[i]) { 5618 SDValue Elt = peekThroughBitcasts(Elts[i]); 5619 LoadSDNode *LD = cast<LoadSDNode>(Elt); 5620 if (!DAG.areNonVolatileConsecutiveLoads( 5621 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, 5622 i - FirstLoadedElt)) { 5623 IsConsecutiveLoad = false; 5624 IsConsecutiveLoadWithZeros = false; 5625 break; 5626 } 5627 } else if (ZeroMask[i]) { 5628 IsConsecutiveLoad = false; 5629 } 5630 } 5631 5632 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) { 5633 SDValue NewLd = DAG.getLoad( 5634 VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5635 LDBase->getPointerInfo(), false /*LDBase->isVolatile()*/, 5636 LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); 5637 5638 if (LDBase->hasAnyUseOfValue(1)) { 5639 SDValue NewChain = 5640 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), 5641 SDValue(NewLd.getNode(), 1)); 5642 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5643 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5644 SDValue(NewLd.getNode(), 1)); 5645 } 5646 5647 return NewLd; 5648 }; 5649 5650 // LOAD - all consecutive load/undefs (must start/end with a load). 5651 // If we have found an entire vector of loads and undefs, then return a large 5652 // load of the entire vector width starting at the base pointer. 5653 // If the vector contains zeros, then attempt to shuffle those elements. 5654 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && 5655 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { 5656 assert(LDBase && "Did not find base load for merging consecutive loads"); 5657 EVT EltVT = LDBase->getValueType(0); 5658 // Ensure that the input vector size for the merged loads matches the 5659 // cumulative size of the input elements. 5660 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) 5661 return SDValue(); 5662 5663 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) 5664 return SDValue(); 5665 5666 if (IsConsecutiveLoad) 5667 return CreateLoad(VT, LDBase); 5668 5669 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded 5670 // vector and a zero vector to clear out the zero elements. 5671 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { 5672 SmallVector<int, 4> ClearMask(NumElems, -1); 5673 for (unsigned i = 0; i < NumElems; ++i) { 5674 if (ZeroMask[i]) 5675 ClearMask[i] = i + NumElems; 5676 else if (LoadMask[i]) 5677 ClearMask[i] = i; 5678 } 5679 SDValue V = CreateLoad(VT, LDBase); 5680 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) 5681 : DAG.getConstantFP(0.0, DL, VT); 5682 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); 5683 } 5684 } 5685 5686 int LoadSize = 5687 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); 5688 5689 // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs. 5690 if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 && 5691 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { 5692 MVT VecSVT = VT.isFloatingPoint() ? 
MVT::f64 : MVT::i64; 5693 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64); 5694 if (TLI.isTypeLegal(VecVT)) { 5695 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); 5696 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 5697 SDValue ResNode = 5698 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, 5699 LDBase->getPointerInfo(), 5700 LDBase->getAlignment(), 5701 false/*isVolatile*/, true/*ReadMem*/, 5702 false/*WriteMem*/); 5703 5704 // Make sure the newly-created LOAD is in the same position as LDBase in 5705 // terms of dependency. We create a TokenFactor for LDBase and ResNode, 5706 // and update uses of LDBase's output chain to use the TokenFactor. 5707 if (LDBase->hasAnyUseOfValue(1)) { 5708 SDValue NewChain = 5709 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), 5710 SDValue(ResNode.getNode(), 1)); 5711 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5712 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5713 SDValue(ResNode.getNode(), 1)); 5714 } 5715 5716 return DAG.getBitcast(VT, ResNode); 5717 } 5718 } 5719 5720 // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs. 5721 if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 && 5722 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { 5723 MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32; 5724 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32); 5725 if (TLI.isTypeLegal(VecVT)) { 5726 SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase) 5727 : DAG.getBitcast(VecSVT, EltBase); 5728 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V); 5729 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V); 5730 return DAG.getBitcast(VT, V); 5731 } 5732 } 5733 5734 return SDValue(); 5735 } 5736 5737 /// Attempt to use the vbroadcast instruction to generate a splat value for the 5738 /// following cases: 5739 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 5740 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from 5741 /// a scalar load, or a constant. 5742 /// The VBROADCAST node is returned when a pattern is found, 5743 /// or SDValue() otherwise. 5744 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget, 5745 SelectionDAG &DAG) { 5746 // VBROADCAST requires AVX. 5747 // TODO: Splats could be generated for non-AVX CPUs using SSE 5748 // instructions, but there's less potential gain for only 128-bit vectors. 5749 if (!Subtarget.hasAVX()) 5750 return SDValue(); 5751 5752 MVT VT = Op.getSimpleValueType(); 5753 SDLoc dl(Op); 5754 5755 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 5756 "Unsupported vector type for broadcast."); 5757 5758 SDValue Ld; 5759 bool ConstSplatVal; 5760 5761 switch (Op.getOpcode()) { 5762 default: 5763 // Unknown pattern found. 5764 return SDValue(); 5765 5766 case ISD::BUILD_VECTOR: { 5767 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); 5768 BitVector UndefElements; 5769 SDValue Splat = BVOp->getSplatValue(&UndefElements); 5770 5771 // We need a splat of a single value to use broadcast, and it doesn't 5772 // make any sense if the value is only in one element of the vector. 
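    // For example, <42, undef, undef, undef> is a splat of 42 but defines
    // only a single lane, so broadcasting it would gain nothing.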
5773 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) 5774 return SDValue(); 5775 5776 Ld = Splat; 5777 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5778 Ld.getOpcode() == ISD::ConstantFP); 5779 5780 // Make sure that all of the users of a non-constant load are from the 5781 // BUILD_VECTOR node. 5782 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) 5783 return SDValue(); 5784 break; 5785 } 5786 5787 case ISD::VECTOR_SHUFFLE: { 5788 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5789 5790 // Shuffles must have a splat mask where the first element is 5791 // broadcasted. 5792 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5793 return SDValue(); 5794 5795 SDValue Sc = Op.getOperand(0); 5796 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5797 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5798 5799 if (!Subtarget.hasInt256()) 5800 return SDValue(); 5801 5802 // Use the register form of the broadcast instruction available on AVX2. 5803 if (VT.getSizeInBits() >= 256) 5804 Sc = extract128BitVector(Sc, 0, DAG, dl); 5805 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5806 } 5807 5808 Ld = Sc.getOperand(0); 5809 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5810 Ld.getOpcode() == ISD::ConstantFP); 5811 5812 // The scalar_to_vector node and the suspected 5813 // load node must have exactly one user. 5814 // Constants may have multiple users. 5815 5816 // AVX-512 has register version of the broadcast 5817 bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() && 5818 Ld.getValueType().getSizeInBits() >= 32; 5819 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 5820 !hasRegVer)) 5821 return SDValue(); 5822 break; 5823 } 5824 } 5825 5826 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5827 bool IsGE256 = (VT.getSizeInBits() >= 256); 5828 5829 // When optimizing for size, generate up to 5 extra bytes for a broadcast 5830 // instruction to save 8 or more bytes of constant pool data. 5831 // TODO: If multiple splats are generated to load the same constant, 5832 // it may be detrimental to overall size. There needs to be a way to detect 5833 // that condition to know if this is truly a size win. 5834 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); 5835 5836 // Handle broadcasting a single constant scalar from the constant pool 5837 // into a vector. 5838 // On Sandybridge (no AVX2), it is still better to load a constant vector 5839 // from the constant pool and not to broadcast it from a scalar. 5840 // But override that restriction when optimizing for size. 5841 // TODO: Check if splatting is recommended for other AVX-capable CPUs. 5842 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { 5843 EVT CVT = Ld.getValueType(); 5844 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5845 5846 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. 5847 // For size optimization, also splat v2f64 and v2i64, and for size opt 5848 // with AVX2, also splat i8 and i16. 5849 // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
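    // For example, splatting a constant 1.0 into a v4f64 via a broadcast
    // loads an 8-byte constant-pool entry rather than a full 32-byte vector
    // constant, trading a slightly longer encoding for 24 bytes of pool data.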
5850 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || 5851 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { 5852 const Constant *C = nullptr; 5853 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5854 C = CI->getConstantIntValue(); 5855 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5856 C = CF->getConstantFPValue(); 5857 5858 assert(C && "Invalid constant type"); 5859 5860 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5861 SDValue CP = 5862 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); 5863 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5864 Ld = DAG.getLoad( 5865 CVT, dl, DAG.getEntryNode(), CP, 5866 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 5867 false, false, Alignment); 5868 5869 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5870 } 5871 } 5872 5873 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5874 5875 // Handle AVX2 in-register broadcasts. 5876 if (!IsLoad && Subtarget.hasInt256() && 5877 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) 5878 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5879 5880 // The scalar source must be a normal load. 5881 if (!IsLoad) 5882 return SDValue(); 5883 5884 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || 5885 (Subtarget.hasVLX() && ScalarSize == 64)) 5886 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5887 5888 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 5889 // double since there is no vbroadcastsd xmm 5890 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { 5891 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5892 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5893 } 5894 5895 // Unsupported broadcast. 5896 return SDValue(); 5897 } 5898 5899 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real 5900 /// underlying vector and index. 5901 /// 5902 /// Modifies \p ExtractedFromVec to the real vector and returns the real 5903 /// index. 5904 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, 5905 SDValue ExtIdx) { 5906 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); 5907 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) 5908 return Idx; 5909 5910 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already 5911 // lowered this: 5912 // (extract_vector_elt (v8f32 %vreg1), Constant<6>) 5913 // to: 5914 // (extract_vector_elt (vector_shuffle<2,u,u,u> 5915 // (extract_subvector (v8f32 %vreg0), Constant<4>), 5916 // undef) 5917 // Constant<0>) 5918 // In this case the vector is the extract_subvector expression and the index 5919 // is 2, as specified by the shuffle. 5920 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); 5921 SDValue ShuffleVec = SVOp->getOperand(0); 5922 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); 5923 assert(ShuffleVecVT.getVectorElementType() == 5924 ExtractedFromVec.getSimpleValueType().getVectorElementType()); 5925 5926 int ShuffleIdx = SVOp->getMaskElt(Idx); 5927 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { 5928 ExtractedFromVec = ShuffleVec; 5929 return ShuffleIdx; 5930 } 5931 return Idx; 5932 } 5933 5934 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { 5935 MVT VT = Op.getSimpleValueType(); 5936 5937 // Skip if insert_vec_elt is not supported. 
5938 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5939 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 5940 return SDValue(); 5941 5942 SDLoc DL(Op); 5943 unsigned NumElems = Op.getNumOperands(); 5944 5945 SDValue VecIn1; 5946 SDValue VecIn2; 5947 SmallVector<unsigned, 4> InsertIndices; 5948 SmallVector<int, 8> Mask(NumElems, -1); 5949 5950 for (unsigned i = 0; i != NumElems; ++i) { 5951 unsigned Opc = Op.getOperand(i).getOpcode(); 5952 5953 if (Opc == ISD::UNDEF) 5954 continue; 5955 5956 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 5957 // Quit if more than 1 elements need inserting. 5958 if (InsertIndices.size() > 1) 5959 return SDValue(); 5960 5961 InsertIndices.push_back(i); 5962 continue; 5963 } 5964 5965 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 5966 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 5967 // Quit if non-constant index. 5968 if (!isa<ConstantSDNode>(ExtIdx)) 5969 return SDValue(); 5970 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); 5971 5972 // Quit if extracted from vector of different type. 5973 if (ExtractedFromVec.getValueType() != VT) 5974 return SDValue(); 5975 5976 if (!VecIn1.getNode()) 5977 VecIn1 = ExtractedFromVec; 5978 else if (VecIn1 != ExtractedFromVec) { 5979 if (!VecIn2.getNode()) 5980 VecIn2 = ExtractedFromVec; 5981 else if (VecIn2 != ExtractedFromVec) 5982 // Quit if more than 2 vectors to shuffle 5983 return SDValue(); 5984 } 5985 5986 if (ExtractedFromVec == VecIn1) 5987 Mask[i] = Idx; 5988 else if (ExtractedFromVec == VecIn2) 5989 Mask[i] = Idx + NumElems; 5990 } 5991 5992 if (!VecIn1.getNode()) 5993 return SDValue(); 5994 5995 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); 5996 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); 5997 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { 5998 unsigned Idx = InsertIndices[i]; 5999 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), 6000 DAG.getIntPtrConstant(Idx, DL)); 6001 } 6002 6003 return NV; 6004 } 6005 6006 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { 6007 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && 6008 Op.getScalarValueSizeInBits() == 1 && 6009 "Can not convert non-constant vector"); 6010 uint64_t Immediate = 0; 6011 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { 6012 SDValue In = Op.getOperand(idx); 6013 if (!In.isUndef()) 6014 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; 6015 } 6016 SDLoc dl(Op); 6017 MVT VT = 6018 MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); 6019 return DAG.getConstant(Immediate, dl, VT); 6020 } 6021 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
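// For example, the constant lanes of a v8i1 build_vector <1,0,1,1,0,0,0,1>
// are packed by ConvertI1VectorToInteger into the i8 immediate 0x8D and then
// bitcast back to the v8i1 mask type.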
6022 SDValue 6023 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { 6024 6025 MVT VT = Op.getSimpleValueType(); 6026 assert((VT.getVectorElementType() == MVT::i1) && 6027 "Unexpected type in LowerBUILD_VECTORvXi1!"); 6028 6029 SDLoc dl(Op); 6030 if (ISD::isBuildVectorAllZeros(Op.getNode())) 6031 return DAG.getTargetConstant(0, dl, VT); 6032 6033 if (ISD::isBuildVectorAllOnes(Op.getNode())) 6034 return DAG.getTargetConstant(1, dl, VT); 6035 6036 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { 6037 SDValue Imm = ConvertI1VectorToInteger(Op, DAG); 6038 if (Imm.getValueSizeInBits() == VT.getSizeInBits()) 6039 return DAG.getBitcast(VT, Imm); 6040 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); 6041 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, 6042 DAG.getIntPtrConstant(0, dl)); 6043 } 6044 6045 // Vector has one or more non-const elements 6046 uint64_t Immediate = 0; 6047 SmallVector<unsigned, 16> NonConstIdx; 6048 bool IsSplat = true; 6049 bool HasConstElts = false; 6050 int SplatIdx = -1; 6051 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { 6052 SDValue In = Op.getOperand(idx); 6053 if (In.isUndef()) 6054 continue; 6055 if (!isa<ConstantSDNode>(In)) 6056 NonConstIdx.push_back(idx); 6057 else { 6058 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; 6059 HasConstElts = true; 6060 } 6061 if (SplatIdx < 0) 6062 SplatIdx = idx; 6063 else if (In != Op.getOperand(SplatIdx)) 6064 IsSplat = false; 6065 } 6066 6067 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" 6068 if (IsSplat) 6069 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), 6070 DAG.getConstant(1, dl, VT), 6071 DAG.getConstant(0, dl, VT)); 6072 6073 // insert elements one by one 6074 SDValue DstVec; 6075 SDValue Imm; 6076 if (Immediate) { 6077 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); 6078 Imm = DAG.getConstant(Immediate, dl, ImmVT); 6079 } 6080 else if (HasConstElts) 6081 Imm = DAG.getConstant(0, dl, VT); 6082 else 6083 Imm = DAG.getUNDEF(VT); 6084 if (Imm.getValueSizeInBits() == VT.getSizeInBits()) 6085 DstVec = DAG.getBitcast(VT, Imm); 6086 else { 6087 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); 6088 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, 6089 DAG.getIntPtrConstant(0, dl)); 6090 } 6091 6092 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { 6093 unsigned InsertIdx = NonConstIdx[i]; 6094 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 6095 Op.getOperand(InsertIdx), 6096 DAG.getIntPtrConstant(InsertIdx, dl)); 6097 } 6098 return DstVec; 6099 } 6100 6101 /// \brief Return true if \p N implements a horizontal binop and return the 6102 /// operands for the horizontal binop into V0 and V1. 6103 /// 6104 /// This is a helper function of LowerToHorizontalOp(). 6105 /// This function checks that the build_vector \p N in input implements a 6106 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal 6107 /// operation to match. 6108 /// For example, if \p Opcode is equal to ISD::ADD, then this function 6109 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode 6110 /// is equal to ISD::SUB, then this function checks if this is a horizontal 6111 /// arithmetic sub. 6112 /// 6113 /// This function only analyzes elements of \p N whose indices are 6114 /// in range [BaseIdx, LastIdx). 
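/// For example, with \p Opcode equal to ISD::FADD, a v4f32 build_vector of
/// (fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])
/// matches with V0 = A and V1 = B, which is exactly the HADDPS lane pattern.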
6115 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, 6116 SelectionDAG &DAG, 6117 unsigned BaseIdx, unsigned LastIdx, 6118 SDValue &V0, SDValue &V1) { 6119 EVT VT = N->getValueType(0); 6120 6121 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); 6122 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && 6123 "Invalid Vector in input!"); 6124 6125 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); 6126 bool CanFold = true; 6127 unsigned ExpectedVExtractIdx = BaseIdx; 6128 unsigned NumElts = LastIdx - BaseIdx; 6129 V0 = DAG.getUNDEF(VT); 6130 V1 = DAG.getUNDEF(VT); 6131 6132 // Check if N implements a horizontal binop. 6133 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { 6134 SDValue Op = N->getOperand(i + BaseIdx); 6135 6136 // Skip UNDEFs. 6137 if (Op->isUndef()) { 6138 // Update the expected vector extract index. 6139 if (i * 2 == NumElts) 6140 ExpectedVExtractIdx = BaseIdx; 6141 ExpectedVExtractIdx += 2; 6142 continue; 6143 } 6144 6145 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); 6146 6147 if (!CanFold) 6148 break; 6149 6150 SDValue Op0 = Op.getOperand(0); 6151 SDValue Op1 = Op.getOperand(1); 6152 6153 // Try to match the following pattern: 6154 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) 6155 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6156 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6157 Op0.getOperand(0) == Op1.getOperand(0) && 6158 isa<ConstantSDNode>(Op0.getOperand(1)) && 6159 isa<ConstantSDNode>(Op1.getOperand(1))); 6160 if (!CanFold) 6161 break; 6162 6163 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); 6164 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); 6165 6166 if (i * 2 < NumElts) { 6167 if (V0.isUndef()) { 6168 V0 = Op0.getOperand(0); 6169 if (V0.getValueType() != VT) 6170 return false; 6171 } 6172 } else { 6173 if (V1.isUndef()) { 6174 V1 = Op0.getOperand(0); 6175 if (V1.getValueType() != VT) 6176 return false; 6177 } 6178 if (i * 2 == NumElts) 6179 ExpectedVExtractIdx = BaseIdx; 6180 } 6181 6182 SDValue Expected = (i * 2 < NumElts) ? V0 : V1; 6183 if (I0 == ExpectedVExtractIdx) 6184 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; 6185 else if (IsCommutable && I1 == ExpectedVExtractIdx) { 6186 // Try to match the following dag sequence: 6187 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) 6188 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; 6189 } else 6190 CanFold = false; 6191 6192 ExpectedVExtractIdx += 2; 6193 } 6194 6195 return CanFold; 6196 } 6197 6198 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by 6199 /// a concat_vector. 6200 /// 6201 /// This is a helper function of LowerToHorizontalOp(). 6202 /// This function expects two 256-bit vectors called V0 and V1. 6203 /// At first, each vector is split into two separate 128-bit vectors. 6204 /// Then, the resulting 128-bit vectors are used to implement two 6205 /// horizontal binary operations. 6206 /// 6207 /// The kind of horizontal binary operation is defined by \p X86Opcode. 6208 /// 6209 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to 6210 /// the two new horizontal binop. 6211 /// When Mode is set, the first horizontal binop dag node would take as input 6212 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second 6213 /// horizontal binop dag node would take as input the lower 128-bit of V1 6214 /// and the upper 128-bit of V1. 
6215 /// Example: 6216 /// HADD V0_LO, V0_HI 6217 /// HADD V1_LO, V1_HI 6218 /// 6219 /// Otherwise, the first horizontal binop dag node takes as input the lower 6220 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop 6221 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. 6222 /// Example: 6223 /// HADD V0_LO, V1_LO 6224 /// HADD V0_HI, V1_HI 6225 /// 6226 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower 6227 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to 6228 /// the upper 128-bits of the result. 6229 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, 6230 const SDLoc &DL, SelectionDAG &DAG, 6231 unsigned X86Opcode, bool Mode, 6232 bool isUndefLO, bool isUndefHI) { 6233 MVT VT = V0.getSimpleValueType(); 6234 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && 6235 "Invalid nodes in input!"); 6236 6237 unsigned NumElts = VT.getVectorNumElements(); 6238 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); 6239 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); 6240 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); 6241 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); 6242 MVT NewVT = V0_LO.getSimpleValueType(); 6243 6244 SDValue LO = DAG.getUNDEF(NewVT); 6245 SDValue HI = DAG.getUNDEF(NewVT); 6246 6247 if (Mode) { 6248 // Don't emit a horizontal binop if the result is expected to be UNDEF. 6249 if (!isUndefLO && !V0->isUndef()) 6250 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); 6251 if (!isUndefHI && !V1->isUndef()) 6252 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); 6253 } else { 6254 // Don't emit a horizontal binop if the result is expected to be UNDEF. 6255 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) 6256 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); 6257 6258 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) 6259 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); 6260 } 6261 6262 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); 6263 } 6264 6265 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB 6266 /// node. 6267 static SDValue LowerToAddSub(const BuildVectorSDNode *BV, 6268 const X86Subtarget &Subtarget, SelectionDAG &DAG) { 6269 MVT VT = BV->getSimpleValueType(0); 6270 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && 6271 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) 6272 return SDValue(); 6273 6274 SDLoc DL(BV); 6275 unsigned NumElts = VT.getVectorNumElements(); 6276 SDValue InVec0 = DAG.getUNDEF(VT); 6277 SDValue InVec1 = DAG.getUNDEF(VT); 6278 6279 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || 6280 VT == MVT::v2f64) && "build_vector with an invalid type found!"); 6281 6282 // Odd-numbered elements in the input build vector are obtained from 6283 // adding two integer/float elements. 6284 // Even-numbered elements in the input build vector are obtained from 6285 // subtracting two integer/float elements. 6286 unsigned ExpectedOpcode = ISD::FSUB; 6287 unsigned NextExpectedOpcode = ISD::FADD; 6288 bool AddFound = false; 6289 bool SubFound = false; 6290 6291 for (unsigned i = 0, e = NumElts; i != e; ++i) { 6292 SDValue Op = BV->getOperand(i); 6293 6294 // Skip 'undef' values. 
6295 unsigned Opcode = Op.getOpcode(); 6296 if (Opcode == ISD::UNDEF) { 6297 std::swap(ExpectedOpcode, NextExpectedOpcode); 6298 continue; 6299 } 6300 6301 // Early exit if we found an unexpected opcode. 6302 if (Opcode != ExpectedOpcode) 6303 return SDValue(); 6304 6305 SDValue Op0 = Op.getOperand(0); 6306 SDValue Op1 = Op.getOperand(1); 6307 6308 // Try to match the following pattern: 6309 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) 6310 // Early exit if we cannot match that sequence. 6311 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6312 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6313 !isa<ConstantSDNode>(Op0.getOperand(1)) || 6314 !isa<ConstantSDNode>(Op1.getOperand(1)) || 6315 Op0.getOperand(1) != Op1.getOperand(1)) 6316 return SDValue(); 6317 6318 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); 6319 if (I0 != i) 6320 return SDValue(); 6321 6322 // We found a valid add/sub node. Update the information accordingly. 6323 if (i & 1) 6324 AddFound = true; 6325 else 6326 SubFound = true; 6327 6328 // Update InVec0 and InVec1. 6329 if (InVec0.isUndef()) { 6330 InVec0 = Op0.getOperand(0); 6331 if (InVec0.getSimpleValueType() != VT) 6332 return SDValue(); 6333 } 6334 if (InVec1.isUndef()) { 6335 InVec1 = Op1.getOperand(0); 6336 if (InVec1.getSimpleValueType() != VT) 6337 return SDValue(); 6338 } 6339 6340 // Make sure that operands in input to each add/sub node always 6341 // come from a same pair of vectors. 6342 if (InVec0 != Op0.getOperand(0)) { 6343 if (ExpectedOpcode == ISD::FSUB) 6344 return SDValue(); 6345 6346 // FADD is commutable. Try to commute the operands 6347 // and then test again. 6348 std::swap(Op0, Op1); 6349 if (InVec0 != Op0.getOperand(0)) 6350 return SDValue(); 6351 } 6352 6353 if (InVec1 != Op1.getOperand(0)) 6354 return SDValue(); 6355 6356 // Update the pair of expected opcodes. 6357 std::swap(ExpectedOpcode, NextExpectedOpcode); 6358 } 6359 6360 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. 6361 if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef()) 6362 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); 6363 6364 return SDValue(); 6365 } 6366 6367 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. 6368 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, 6369 const X86Subtarget &Subtarget, 6370 SelectionDAG &DAG) { 6371 MVT VT = BV->getSimpleValueType(0); 6372 unsigned NumElts = VT.getVectorNumElements(); 6373 unsigned NumUndefsLO = 0; 6374 unsigned NumUndefsHI = 0; 6375 unsigned Half = NumElts/2; 6376 6377 // Count the number of UNDEF operands in the build_vector in input. 6378 for (unsigned i = 0, e = Half; i != e; ++i) 6379 if (BV->getOperand(i)->isUndef()) 6380 NumUndefsLO++; 6381 6382 for (unsigned i = Half, e = NumElts; i != e; ++i) 6383 if (BV->getOperand(i)->isUndef()) 6384 NumUndefsHI++; 6385 6386 // Early exit if this is either a build_vector of all UNDEFs or all the 6387 // operands but one are UNDEF. 6388 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) 6389 return SDValue(); 6390 6391 SDLoc DL(BV); 6392 SDValue InVec0, InVec1; 6393 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) { 6394 // Try to match an SSE3 float HADD/HSUB. 
6395 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) 6396 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); 6397 6398 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) 6399 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); 6400 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) { 6401 // Try to match an SSSE3 integer HADD/HSUB. 6402 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) 6403 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); 6404 6405 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) 6406 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); 6407 } 6408 6409 if (!Subtarget.hasAVX()) 6410 return SDValue(); 6411 6412 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { 6413 // Try to match an AVX horizontal add/sub of packed single/double 6414 // precision floating point values from 256-bit vectors. 6415 SDValue InVec2, InVec3; 6416 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && 6417 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && 6418 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && 6419 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) 6420 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); 6421 6422 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && 6423 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && 6424 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && 6425 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) 6426 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); 6427 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { 6428 // Try to match an AVX2 horizontal add/sub of signed integers. 6429 SDValue InVec2, InVec3; 6430 unsigned X86Opcode; 6431 bool CanFold = true; 6432 6433 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && 6434 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && 6435 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && 6436 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) 6437 X86Opcode = X86ISD::HADD; 6438 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && 6439 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && 6440 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && 6441 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) 6442 X86Opcode = X86ISD::HSUB; 6443 else 6444 CanFold = false; 6445 6446 if (CanFold) { 6447 // Fold this build_vector into a single horizontal add/sub. 6448 // Do this only if the target has AVX2. 6449 if (Subtarget.hasAVX2()) 6450 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); 6451 6452 // Do not try to expand this build_vector into a pair of horizontal 6453 // add/sub if we can emit a pair of scalar add/sub. 6454 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) 6455 return SDValue(); 6456 6457 // Convert this build_vector into a pair of horizontal binop followed by 6458 // a concat vector. 
6459 bool isUndefLO = NumUndefsLO == Half; 6460 bool isUndefHI = NumUndefsHI == Half; 6461 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, 6462 isUndefLO, isUndefHI); 6463 } 6464 } 6465 6466 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || 6467 VT == MVT::v16i16) && Subtarget.hasAVX()) { 6468 unsigned X86Opcode; 6469 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) 6470 X86Opcode = X86ISD::HADD; 6471 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) 6472 X86Opcode = X86ISD::HSUB; 6473 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) 6474 X86Opcode = X86ISD::FHADD; 6475 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) 6476 X86Opcode = X86ISD::FHSUB; 6477 else 6478 return SDValue(); 6479 6480 // Don't try to expand this build_vector into a pair of horizontal add/sub 6481 // if we can simply emit a pair of scalar add/sub. 6482 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) 6483 return SDValue(); 6484 6485 // Convert this build_vector into two horizontal add/sub followed by 6486 // a concat vector. 6487 bool isUndefLO = NumUndefsLO == Half; 6488 bool isUndefHI = NumUndefsHI == Half; 6489 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, 6490 isUndefLO, isUndefHI); 6491 } 6492 6493 return SDValue(); 6494 } 6495 6496 /// If a BUILD_VECTOR's source elements all apply the same bit operation and 6497 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and 6498 /// just apply the bit to the vectors. 6499 /// NOTE: Its not in our interest to start make a general purpose vectorizer 6500 /// from this, but enough scalar bit operations are created from the later 6501 /// legalization + scalarization stages to need basic support. 6502 static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) { 6503 SDLoc DL(Op); 6504 MVT VT = Op.getSimpleValueType(); 6505 unsigned NumElems = VT.getVectorNumElements(); 6506 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6507 6508 // Check that all elements have the same opcode. 6509 // TODO: Should we allow UNDEFS and if so how many? 6510 unsigned Opcode = Op.getOperand(0).getOpcode(); 6511 for (unsigned i = 1; i < NumElems; ++i) 6512 if (Opcode != Op.getOperand(i).getOpcode()) 6513 return SDValue(); 6514 6515 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). 6516 switch (Opcode) { 6517 default: 6518 return SDValue(); 6519 case ISD::AND: 6520 case ISD::XOR: 6521 case ISD::OR: 6522 if (!TLI.isOperationLegalOrPromote(Opcode, VT)) 6523 return SDValue(); 6524 break; 6525 } 6526 6527 SmallVector<SDValue, 4> LHSElts, RHSElts; 6528 for (SDValue Elt : Op->ops()) { 6529 SDValue LHS = Elt.getOperand(0); 6530 SDValue RHS = Elt.getOperand(1); 6531 6532 // We expect the canonicalized RHS operand to be the constant. 6533 if (!isa<ConstantSDNode>(RHS)) 6534 return SDValue(); 6535 LHSElts.push_back(LHS); 6536 RHSElts.push_back(RHS); 6537 } 6538 6539 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); 6540 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); 6541 return DAG.getNode(Opcode, DL, VT, LHS, RHS); 6542 } 6543 6544 /// Create a vector constant without a load. SSE/AVX provide the bare minimum 6545 /// functionality to do this, so it's all zeros, all ones, or some derivation 6546 /// that is cheap to calculate. 
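/// For example, an all-zeros vector can be materialized with PXOR/XORPS of a
/// register with itself, and an all-ones integer vector with PCMPEQD, so
/// neither needs a load from the constant pool.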
6547 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, 6548 const X86Subtarget &Subtarget) { 6549 SDLoc DL(Op); 6550 MVT VT = Op.getSimpleValueType(); 6551 6552 // Vectors containing all zeros can be matched by pxor and xorps. 6553 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 6554 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 6555 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 6556 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) 6557 return Op; 6558 6559 return getZeroVector(VT, Subtarget, DAG, DL); 6560 } 6561 6562 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 6563 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 6564 // vpcmpeqd on 256-bit vectors. 6565 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 6566 if (VT == MVT::v4i32 || VT == MVT::v16i32 || 6567 (VT == MVT::v8i32 && Subtarget.hasInt256())) 6568 return Op; 6569 6570 return getOnesVector(VT, Subtarget, DAG, DL); 6571 } 6572 6573 return SDValue(); 6574 } 6575 6576 SDValue 6577 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 6578 SDLoc dl(Op); 6579 6580 MVT VT = Op.getSimpleValueType(); 6581 MVT ExtVT = VT.getVectorElementType(); 6582 unsigned NumElems = Op.getNumOperands(); 6583 6584 // Generate vectors for predicate vectors. 6585 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) 6586 return LowerBUILD_VECTORvXi1(Op, DAG); 6587 6588 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) 6589 return VectorConstant; 6590 6591 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); 6592 if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) 6593 return AddSub; 6594 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) 6595 return HorizontalOp; 6596 if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) 6597 return Broadcast; 6598 if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG)) 6599 return BitOp; 6600 6601 unsigned EVTBits = ExtVT.getSizeInBits(); 6602 6603 unsigned NumZero = 0; 6604 unsigned NumNonZero = 0; 6605 uint64_t NonZeros = 0; 6606 bool IsAllConstants = true; 6607 SmallSet<SDValue, 8> Values; 6608 for (unsigned i = 0; i < NumElems; ++i) { 6609 SDValue Elt = Op.getOperand(i); 6610 if (Elt.isUndef()) 6611 continue; 6612 Values.insert(Elt); 6613 if (Elt.getOpcode() != ISD::Constant && 6614 Elt.getOpcode() != ISD::ConstantFP) 6615 IsAllConstants = false; 6616 if (X86::isZeroNode(Elt)) 6617 NumZero++; 6618 else { 6619 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. 6620 NonZeros |= ((uint64_t)1 << i); 6621 NumNonZero++; 6622 } 6623 } 6624 6625 // All undef vector. Return an UNDEF. All zero vectors were handled above. 6626 if (NumNonZero == 0) 6627 return DAG.getUNDEF(VT); 6628 6629 // Special case for single non-zero, non-undef, element. 6630 if (NumNonZero == 1) { 6631 unsigned Idx = countTrailingZeros(NonZeros); 6632 SDValue Item = Op.getOperand(Idx); 6633 6634 // If this is an insertion of an i64 value on x86-32, and if the top bits of 6635 // the value are obviously zero, truncate the value to i32 and do the 6636 // insertion that way. Only do this if the value is non-constant or if the 6637 // value is a constant being inserted into element 0. It is cheaper to do 6638 // a constant pool load than it is to do a movd + shuffle. 
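// For example, on x86-32 a v2i64 whose single non-zero element has its upper
// 32 bits known to be zero is truncated to i32, moved into a v4i32 with MOVD,
// and then zero-extended into place with a shuffle.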
6639 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && 6640 (!IsAllConstants || Idx == 0)) { 6641 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 6642 // Handle SSE only. 6643 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 6644 MVT VecVT = MVT::v4i32; 6645 6646 // Truncate the value (which may itself be a constant) to i32, and 6647 // convert it to a vector with movd (S2V+shuffle to zero extend). 6648 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 6649 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 6650 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( 6651 Item, Idx * 2, true, Subtarget, DAG)); 6652 } 6653 } 6654 6655 // If we have a constant or non-constant insertion into the low element of 6656 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 6657 // the rest of the elements. This will be matched as movd/movq/movss/movsd 6658 // depending on what the source datatype is. 6659 if (Idx == 0) { 6660 if (NumZero == 0) 6661 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 6662 6663 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 6664 (ExtVT == MVT::i64 && Subtarget.is64Bit())) { 6665 if (VT.is512BitVector()) { 6666 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 6667 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 6668 Item, DAG.getIntPtrConstant(0, dl)); 6669 } 6670 assert((VT.is128BitVector() || VT.is256BitVector()) && 6671 "Expected an SSE value type!"); 6672 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 6673 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 6674 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 6675 } 6676 6677 // We can't directly insert an i8 or i16 into a vector, so zero extend 6678 // it to i32 first. 6679 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 6680 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 6681 if (VT.getSizeInBits() >= 256) { 6682 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); 6683 if (Subtarget.hasAVX()) { 6684 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); 6685 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 6686 } else { 6687 // Without AVX, we need to extend to a 128-bit vector and then 6688 // insert into the 256-bit vector. 6689 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 6690 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); 6691 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); 6692 } 6693 } else { 6694 assert(VT.is128BitVector() && "Expected an SSE value type!"); 6695 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 6696 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 6697 } 6698 return DAG.getBitcast(VT, Item); 6699 } 6700 } 6701 6702 // Is it a vector logical left shift? 6703 if (NumElems == 2 && Idx == 1 && 6704 X86::isZeroNode(Op.getOperand(0)) && 6705 !X86::isZeroNode(Op.getOperand(1))) { 6706 unsigned NumBits = VT.getSizeInBits(); 6707 return getVShift(true, VT, 6708 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6709 VT, Op.getOperand(1)), 6710 NumBits/2, DAG, *this, dl); 6711 } 6712 6713 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 6714 return SDValue(); 6715 6716 // Otherwise, if this is a vector with i32 or f32 elements, and the element 6717 // is a non-constant being inserted into an element other than the low one, 6718 // we can't use a constant pool load. 
Instead, use SCALAR_TO_VECTOR (aka 6719 // movd/movss) to move this into the low element, then shuffle it into 6720 // place. 6721 if (EVTBits == 32) { 6722 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 6723 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); 6724 } 6725 } 6726 6727 // Splat is obviously ok. Let legalizer expand it to a shuffle. 6728 if (Values.size() == 1) { 6729 if (EVTBits == 32) { 6730 // Instead of a shuffle like this: 6731 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 6732 // Check if it's possible to issue this instead. 6733 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 6734 unsigned Idx = countTrailingZeros(NonZeros); 6735 SDValue Item = Op.getOperand(Idx); 6736 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 6737 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 6738 } 6739 return SDValue(); 6740 } 6741 6742 // A vector full of immediates; various special cases are already 6743 // handled, so this is best done with a single constant-pool load. 6744 if (IsAllConstants) 6745 return SDValue(); 6746 6747 // See if we can use a vector load to get all of the elements. 6748 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { 6749 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); 6750 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false)) 6751 return LD; 6752 } 6753 6754 // For AVX-length vectors, build the individual 128-bit pieces and use 6755 // shuffles to put them in place. 6756 if (VT.is256BitVector() || VT.is512BitVector()) { 6757 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); 6758 6759 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 6760 6761 // Build both the lower and upper subvector. 6762 SDValue Lower = 6763 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2)); 6764 SDValue Upper = DAG.getBuildVector( 6765 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2)); 6766 6767 // Recreate the wider vector with the lower and upper part. 6768 if (VT.is256BitVector()) 6769 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 6770 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 6771 } 6772 6773 // Let legalizer expand 2-wide build_vectors. 6774 if (EVTBits == 64) { 6775 if (NumNonZero == 1) { 6776 // One half is zero or undef. 6777 unsigned Idx = countTrailingZeros(NonZeros); 6778 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 6779 Op.getOperand(Idx)); 6780 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 6781 } 6782 return SDValue(); 6783 } 6784 6785 // If element VT is < 32 bits, convert it to inserts into a zero vector. 6786 if (EVTBits == 8 && NumElems == 16) 6787 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, 6788 DAG, Subtarget, *this)) 6789 return V; 6790 6791 if (EVTBits == 16 && NumElems == 8) 6792 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, 6793 DAG, Subtarget, *this)) 6794 return V; 6795 6796 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS 6797 if (EVTBits == 32 && NumElems == 4) 6798 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) 6799 return V; 6800 6801 // If element VT is == 32 bits, turn it into a number of shuffles. 
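// For example, <x, 0, y, 0> is first built as the two MOVL-style merges
// <x,0,0,0> and <y,0,0,0>, which the final <0,1,4,5> shuffle combines back
// into <x,0,y,0>.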
6802 if (NumElems == 4 && NumZero > 0) { 6803 SmallVector<SDValue, 8> Ops(NumElems); 6804 for (unsigned i = 0; i < 4; ++i) { 6805 bool isZero = !(NonZeros & (1ULL << i)); 6806 if (isZero) 6807 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); 6808 else 6809 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 6810 } 6811 6812 for (unsigned i = 0; i < 2; ++i) { 6813 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 6814 default: break; 6815 case 0: 6816 Ops[i] = Ops[i*2]; // Must be a zero vector. 6817 break; 6818 case 1: 6819 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); 6820 break; 6821 case 2: 6822 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); 6823 break; 6824 case 3: 6825 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); 6826 break; 6827 } 6828 } 6829 6830 bool Reverse1 = (NonZeros & 0x3) == 2; 6831 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 6832 int MaskVec[] = { 6833 Reverse1 ? 1 : 0, 6834 Reverse1 ? 0 : 1, 6835 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 6836 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 6837 }; 6838 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); 6839 } 6840 6841 if (Values.size() > 1 && VT.is128BitVector()) { 6842 // Check for a build vector from mostly shuffle plus few inserting. 6843 if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) 6844 return Sh; 6845 6846 // For SSE 4.1, use insertps to put the high elements into the low element. 6847 if (Subtarget.hasSSE41()) { 6848 SDValue Result; 6849 if (!Op.getOperand(0).isUndef()) 6850 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 6851 else 6852 Result = DAG.getUNDEF(VT); 6853 6854 for (unsigned i = 1; i < NumElems; ++i) { 6855 if (Op.getOperand(i).isUndef()) continue; 6856 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 6857 Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); 6858 } 6859 return Result; 6860 } 6861 6862 // Otherwise, expand into a number of unpckl*, start by extending each of 6863 // our (non-undef) elements to the full vector width with the element in the 6864 // bottom slot of the vector (which generates no code for SSE). 6865 SmallVector<SDValue, 8> Ops(NumElems); 6866 for (unsigned i = 0; i < NumElems; ++i) { 6867 if (!Op.getOperand(i).isUndef()) 6868 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 6869 else 6870 Ops[i] = DAG.getUNDEF(VT); 6871 } 6872 6873 // Next, we iteratively mix elements, e.g. for v4f32: 6874 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 6875 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 6876 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 6877 unsigned EltStride = NumElems >> 1; 6878 while (EltStride != 0) { 6879 for (unsigned i = 0; i < EltStride; ++i) { 6880 // If Ops[i+EltStride] is undef and this is the first round of mixing, 6881 // then it is safe to just drop this shuffle: V[i] is already in the 6882 // right place, the one element (since it's the first round) being 6883 // inserted as undef can be dropped. This isn't safe for successive 6884 // rounds because they will permute elements within both vectors. 6885 if (Ops[i+EltStride].isUndef() && 6886 EltStride == NumElems/2) 6887 continue; 6888 6889 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]); 6890 } 6891 EltStride >>= 1; 6892 } 6893 return Ops[0]; 6894 } 6895 return SDValue(); 6896 } 6897 6898 // 256-bit AVX can use the vinsertf128 instruction 6899 // to create 256-bit vectors from two other 128-bit ones. 
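// For example, a v8f32 concat_vectors of two v4f32 operands typically becomes
// a vinsertf128 of the second operand into the upper half of the first;
// 512-bit results are assembled the same way from 256-bit halves.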
6900 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6901 SDLoc dl(Op); 6902 MVT ResVT = Op.getSimpleValueType(); 6903 6904 assert((ResVT.is256BitVector() || 6905 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); 6906 6907 SDValue V1 = Op.getOperand(0); 6908 SDValue V2 = Op.getOperand(1); 6909 unsigned NumElems = ResVT.getVectorNumElements(); 6910 if (ResVT.is256BitVector()) 6911 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 6912 6913 if (Op.getNumOperands() == 4) { 6914 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), 6915 ResVT.getVectorNumElements()/2); 6916 SDValue V3 = Op.getOperand(2); 6917 SDValue V4 = Op.getOperand(3); 6918 return concat256BitVectors( 6919 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl), 6920 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT, 6921 NumElems, DAG, dl); 6922 } 6923 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 6924 } 6925 6926 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, 6927 const X86Subtarget &Subtarget, 6928 SelectionDAG & DAG) { 6929 SDLoc dl(Op); 6930 MVT ResVT = Op.getSimpleValueType(); 6931 unsigned NumOfOperands = Op.getNumOperands(); 6932 6933 assert(isPowerOf2_32(NumOfOperands) && 6934 "Unexpected number of operands in CONCAT_VECTORS"); 6935 6936 SDValue Undef = DAG.getUNDEF(ResVT); 6937 if (NumOfOperands > 2) { 6938 // Specialize the cases when all, or all but one, of the operands are undef. 6939 unsigned NumOfDefinedOps = 0; 6940 unsigned OpIdx = 0; 6941 for (unsigned i = 0; i < NumOfOperands; i++) 6942 if (!Op.getOperand(i).isUndef()) { 6943 NumOfDefinedOps++; 6944 OpIdx = i; 6945 } 6946 if (NumOfDefinedOps == 0) 6947 return Undef; 6948 if (NumOfDefinedOps == 1) { 6949 unsigned SubVecNumElts = 6950 Op.getOperand(OpIdx).getValueType().getVectorNumElements(); 6951 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); 6952 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, 6953 Op.getOperand(OpIdx), IdxVal); 6954 } 6955 6956 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), 6957 ResVT.getVectorNumElements()/2); 6958 SmallVector<SDValue, 2> Ops; 6959 for (unsigned i = 0; i < NumOfOperands/2; i++) 6960 Ops.push_back(Op.getOperand(i)); 6961 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); 6962 Ops.clear(); 6963 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) 6964 Ops.push_back(Op.getOperand(i)); 6965 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); 6966 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); 6967 } 6968 6969 // 2 operands 6970 SDValue V1 = Op.getOperand(0); 6971 SDValue V2 = Op.getOperand(1); 6972 unsigned NumElems = ResVT.getVectorNumElements(); 6973 assert(V1.getValueType() == V2.getValueType() && 6974 V1.getValueType().getVectorNumElements() == NumElems/2 && 6975 "Unexpected operands in CONCAT_VECTORS"); 6976 6977 if (ResVT.getSizeInBits() >= 16) 6978 return Op; // The operation is legal with KUNPCK 6979 6980 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); 6981 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); 6982 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); 6983 if (IsZeroV1 && IsZeroV2) 6984 return ZeroVec; 6985 6986 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); 6987 if (V2.isUndef()) 6988 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); 6989 if (IsZeroV2) 6990 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); 6991 6992 SDValue IdxVal = 
DAG.getIntPtrConstant(NumElems/2, dl); 6993 if (V1.isUndef()) 6994 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); 6995 6996 if (IsZeroV1) 6997 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); 6998 6999 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); 7000 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); 7001 } 7002 7003 static SDValue LowerCONCAT_VECTORS(SDValue Op, 7004 const X86Subtarget &Subtarget, 7005 SelectionDAG &DAG) { 7006 MVT VT = Op.getSimpleValueType(); 7007 if (VT.getVectorElementType() == MVT::i1) 7008 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); 7009 7010 assert((VT.is256BitVector() && Op.getNumOperands() == 2) || 7011 (VT.is512BitVector() && (Op.getNumOperands() == 2 || 7012 Op.getNumOperands() == 4))); 7013 7014 // AVX can use the vinsertf128 instruction to create 256-bit vectors 7015 // from two other 128-bit ones. 7016 7017 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors 7018 return LowerAVXCONCAT_VECTORS(Op, DAG); 7019 } 7020 7021 //===----------------------------------------------------------------------===// 7022 // Vector shuffle lowering 7023 // 7024 // This is an experimental code path for lowering vector shuffles on x86. It is 7025 // designed to handle arbitrary vector shuffles and blends, gracefully 7026 // degrading performance as necessary. It works hard to recognize idiomatic 7027 // shuffles and lower them to optimal instruction patterns without leaving 7028 // a framework that allows reasonably efficient handling of all vector shuffle 7029 // patterns. 7030 //===----------------------------------------------------------------------===// 7031 7032 /// \brief Tiny helper function to identify a no-op mask. 7033 /// 7034 /// This is a somewhat boring predicate function. It checks whether the mask 7035 /// array input, which is assumed to be a single-input shuffle mask of the kind 7036 /// used by the X86 shuffle instructions (not a fully general 7037 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an 7038 /// in-place shuffle are 'no-op's. 7039 static bool isNoopShuffleMask(ArrayRef<int> Mask) { 7040 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 7041 assert(Mask[i] >= -1 && "Out of bound mask element!"); 7042 if (Mask[i] >= 0 && Mask[i] != i) 7043 return false; 7044 } 7045 return true; 7046 } 7047 7048 /// \brief Test whether there are elements crossing 128-bit lanes in this 7049 /// shuffle mask. 7050 /// 7051 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations 7052 /// and we routinely test for these. 7053 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { 7054 int LaneSize = 128 / VT.getScalarSizeInBits(); 7055 int Size = Mask.size(); 7056 for (int i = 0; i < Size; ++i) 7057 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) 7058 return true; 7059 return false; 7060 } 7061 7062 /// \brief Test whether a shuffle mask is equivalent within each sub-lane. 7063 /// 7064 /// This checks a shuffle mask to see if it is performing the same 7065 /// lane-relative shuffle in each sub-lane. This trivially implies 7066 /// that it is also not lane-crossing. It may however involve a blend from the 7067 /// same lane of a second vector. 7068 /// 7069 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is 7070 /// non-trivial to compute in the face of undef lanes. 
The representation is 7071 /// suitable for use with existing 128-bit shuffles as entries from the second 7072 /// vector have been remapped to [LaneSize, 2*LaneSize). 7073 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, 7074 ArrayRef<int> Mask, 7075 SmallVectorImpl<int> &RepeatedMask) { 7076 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); 7077 RepeatedMask.assign(LaneSize, -1); 7078 int Size = Mask.size(); 7079 for (int i = 0; i < Size; ++i) { 7080 if (Mask[i] < 0) 7081 continue; 7082 if ((Mask[i] % Size) / LaneSize != i / LaneSize) 7083 // This entry crosses lanes, so there is no way to model this shuffle. 7084 return false; 7085 7086 // Ok, handle the in-lane shuffles by detecting if and when they repeat. 7087 // Adjust second vector indices to start at LaneSize instead of Size. 7088 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize 7089 : Mask[i] % LaneSize + LaneSize; 7090 if (RepeatedMask[i % LaneSize] < 0) 7091 // This is the first non-undef entry in this slot of a 128-bit lane. 7092 RepeatedMask[i % LaneSize] = LocalM; 7093 else if (RepeatedMask[i % LaneSize] != LocalM) 7094 // Found a mismatch with the repeated mask. 7095 return false; 7096 } 7097 return true; 7098 } 7099 7100 /// Test whether a shuffle mask is equivalent within each 128-bit lane. 7101 static bool 7102 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, 7103 SmallVectorImpl<int> &RepeatedMask) { 7104 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); 7105 } 7106 7107 /// Test whether a shuffle mask is equivalent within each 256-bit lane. 7108 static bool 7109 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, 7110 SmallVectorImpl<int> &RepeatedMask) { 7111 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); 7112 } 7113 7114 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, 7115 SmallVectorImpl<int> &ScaledMask) { 7116 assert(0 < Scale && "Unexpected scaling factor"); 7117 int NumElts = Mask.size(); 7118 ScaledMask.assign(NumElts * Scale, -1); 7119 7120 for (int i = 0; i != NumElts; ++i) { 7121 int M = Mask[i]; 7122 7123 // Repeat sentinel values in every mask element. 7124 if (M < 0) { 7125 for (int s = 0; s != Scale; ++s) 7126 ScaledMask[(Scale * i) + s] = M; 7127 continue; 7128 } 7129 7130 // Scale mask element and increment across each mask element. 7131 for (int s = 0; s != Scale; ++s) 7132 ScaledMask[(Scale * i) + s] = (Scale * M) + s; 7133 } 7134 } 7135 7136 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of 7137 /// arguments. 7138 /// 7139 /// This is a fast way to test a shuffle mask against a fixed pattern: 7140 /// 7141 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } 7142 /// 7143 /// It returns true if the mask is exactly as wide as the argument list, and 7144 /// each element of the mask is either -1 (signifying undef) or the value given 7145 /// in the argument. 7146 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, 7147 ArrayRef<int> ExpectedMask) { 7148 if (Mask.size() != ExpectedMask.size()) 7149 return false; 7150 7151 int Size = Mask.size(); 7152 7153 // If the values are build vectors, we can look through them to find 7154 // equivalent inputs that make the shuffles equivalent. 
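// For example, with Size == 4, mask element 5 is accepted where the expected
// mask says 1, provided V2's lane 1 and V1's lane 1 are built from the same
// scalar operand.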
7155 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); 7156 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); 7157 7158 for (int i = 0; i < Size; ++i) { 7159 assert(Mask[i] >= -1 && "Out of bound mask element!"); 7160 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { 7161 auto *MaskBV = Mask[i] < Size ? BV1 : BV2; 7162 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; 7163 if (!MaskBV || !ExpectedBV || 7164 MaskBV->getOperand(Mask[i] % Size) != 7165 ExpectedBV->getOperand(ExpectedMask[i] % Size)) 7166 return false; 7167 } 7168 } 7169 7170 return true; 7171 } 7172 7173 /// Checks whether a target shuffle mask is equivalent to an explicit pattern. 7174 /// 7175 /// The masks must be exactly the same width. 7176 /// 7177 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding 7178 /// value in ExpectedMask is always accepted. Otherwise the indices must match. 7179 /// 7180 /// SM_SentinelZero is accepted as a valid negative index but must match in both. 7181 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, 7182 ArrayRef<int> ExpectedMask) { 7183 int Size = Mask.size(); 7184 if (Size != (int)ExpectedMask.size()) 7185 return false; 7186 7187 for (int i = 0; i < Size; ++i) 7188 if (Mask[i] == SM_SentinelUndef) 7189 continue; 7190 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) 7191 return false; 7192 else if (Mask[i] != ExpectedMask[i]) 7193 return false; 7194 7195 return true; 7196 } 7197 7198 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. 7199 /// 7200 /// This helper function produces an 8-bit shuffle immediate corresponding to 7201 /// the ubiquitous shuffle encoding scheme used in x86 instructions for 7202 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for 7203 /// example. 7204 /// 7205 /// NB: We rely heavily on "undef" masks preserving the input lane. 7206 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { 7207 assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); 7208 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); 7209 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); 7210 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); 7211 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); 7212 7213 unsigned Imm = 0; 7214 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; 7215 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; 7216 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; 7217 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; 7218 return Imm; 7219 } 7220 7221 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, 7222 SelectionDAG &DAG) { 7223 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); 7224 } 7225 7226 /// \brief Compute whether each element of a shuffle is zeroable. 7227 /// 7228 /// A "zeroable" vector shuffle element is one which can be lowered to zero. 7229 /// Either it is an undef element in the shuffle mask, the element of the input 7230 /// referenced is undef, or the element of the input referenced is known to be 7231 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle 7232 /// as many lanes with this technique as possible to simplify the remaining 7233 /// shuffle. 
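/// For example, if V2 is a build_vector of all zeros, every mask element that
/// refers to V2 (index >= Size) is zeroable, and a constant lane of V1 is
/// zeroable when all of the bits it references are zero.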
7234 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, 7235 SDValue V1, SDValue V2) { 7236 SmallBitVector Zeroable(Mask.size(), false); 7237 V1 = peekThroughBitcasts(V1); 7238 V2 = peekThroughBitcasts(V2); 7239 7240 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); 7241 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); 7242 7243 int VectorSizeInBits = V1.getValueType().getSizeInBits(); 7244 int ScalarSizeInBits = VectorSizeInBits / Mask.size(); 7245 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); 7246 7247 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 7248 int M = Mask[i]; 7249 // Handle the easy cases. 7250 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { 7251 Zeroable[i] = true; 7252 continue; 7253 } 7254 7255 // Determine shuffle input and normalize the mask. 7256 SDValue V = M < Size ? V1 : V2; 7257 M %= Size; 7258 7259 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. 7260 if (V.getOpcode() != ISD::BUILD_VECTOR) 7261 continue; 7262 7263 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of 7264 // the (larger) source element must be UNDEF/ZERO. 7265 if ((Size % V.getNumOperands()) == 0) { 7266 int Scale = Size / V->getNumOperands(); 7267 SDValue Op = V.getOperand(M / Scale); 7268 if (Op.isUndef() || X86::isZeroNode(Op)) 7269 Zeroable[i] = true; 7270 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 7271 APInt Val = Cst->getAPIntValue(); 7272 Val = Val.lshr((M % Scale) * ScalarSizeInBits); 7273 Val = Val.getLoBits(ScalarSizeInBits); 7274 Zeroable[i] = (Val == 0); 7275 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { 7276 APInt Val = Cst->getValueAPF().bitcastToAPInt(); 7277 Val = Val.lshr((M % Scale) * ScalarSizeInBits); 7278 Val = Val.getLoBits(ScalarSizeInBits); 7279 Zeroable[i] = (Val == 0); 7280 } 7281 continue; 7282 } 7283 7284 // If the BUILD_VECTOR has more elements then all the (smaller) source 7285 // elements must be UNDEF or ZERO. 7286 if ((V.getNumOperands() % Size) == 0) { 7287 int Scale = V->getNumOperands() / Size; 7288 bool AllZeroable = true; 7289 for (int j = 0; j < Scale; ++j) { 7290 SDValue Op = V.getOperand((M * Scale) + j); 7291 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); 7292 } 7293 Zeroable[i] = AllZeroable; 7294 continue; 7295 } 7296 } 7297 7298 return Zeroable; 7299 } 7300 7301 /// Try to lower a shuffle with a single PSHUFB of V1. 7302 /// This is only possible if V2 is unused (at all, or only for zero elements). 7303 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, 7304 ArrayRef<int> Mask, SDValue V1, 7305 SDValue V2, 7306 const X86Subtarget &Subtarget, 7307 SelectionDAG &DAG) { 7308 int Size = Mask.size(); 7309 int LaneSize = 128 / VT.getScalarSizeInBits(); 7310 const int NumBytes = VT.getSizeInBits() / 8; 7311 const int NumEltBytes = VT.getScalarSizeInBits() / 8; 7312 7313 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || 7314 (Subtarget.hasAVX2() && VT.is256BitVector()) || 7315 (Subtarget.hasBWI() && VT.is512BitVector())); 7316 7317 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7318 7319 SmallVector<SDValue, 64> PSHUFBMask(NumBytes); 7320 // Sign bit set in i8 mask means zero element. 
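// For example, a v4i32 mask <1, 0, zeroable, 3> expands to the byte mask
// <4,5,6,7, 0,1,2,3, 0x80,0x80,0x80,0x80, 12,13,14,15>.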
7321 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); 7322 7323 for (int i = 0; i < NumBytes; ++i) { 7324 int M = Mask[i / NumEltBytes]; 7325 if (M < 0) { 7326 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); 7327 continue; 7328 } 7329 if (Zeroable[i / NumEltBytes]) { 7330 PSHUFBMask[i] = ZeroMask; 7331 continue; 7332 } 7333 // Only allow V1. 7334 if (M >= Size) 7335 return SDValue(); 7336 7337 // PSHUFB can't cross lanes, ensure this doesn't happen. 7338 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) 7339 return SDValue(); 7340 7341 M = M % LaneSize; 7342 M = M * NumEltBytes + (i % NumEltBytes); 7343 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); 7344 } 7345 7346 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); 7347 return DAG.getBitcast( 7348 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1), 7349 DAG.getBuildVector(I8VT, DL, PSHUFBMask))); 7350 } 7351 7352 // X86 has dedicated unpack instructions that can handle specific blend 7353 // operations: UNPCKH and UNPCKL. 7354 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, 7355 ArrayRef<int> Mask, SDValue V1, 7356 SDValue V2, SelectionDAG &DAG) { 7357 int NumElts = VT.getVectorNumElements(); 7358 int NumEltsInLane = 128 / VT.getScalarSizeInBits(); 7359 SmallVector<int, 8> Unpckl(NumElts); 7360 SmallVector<int, 8> Unpckh(NumElts); 7361 7362 for (int i = 0; i < NumElts; ++i) { 7363 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; 7364 int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); 7365 int HiPos = LoPos + NumEltsInLane / 2; 7366 Unpckl[i] = LoPos; 7367 Unpckh[i] = HiPos; 7368 } 7369 7370 if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) 7371 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); 7372 if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) 7373 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); 7374 7375 // Commute and try again. 7376 ShuffleVectorSDNode::commuteMask(Unpckl); 7377 if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) 7378 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); 7379 7380 ShuffleVectorSDNode::commuteMask(Unpckh); 7381 if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) 7382 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); 7383 7384 return SDValue(); 7385 } 7386 7387 /// \brief Try to emit a bitmask instruction for a shuffle. 7388 /// 7389 /// This handles cases where we can model a blend exactly as a bitmask due to 7390 /// one of the inputs being zeroable. 7391 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, 7392 SDValue V2, ArrayRef<int> Mask, 7393 SelectionDAG &DAG) { 7394 MVT EltVT = VT.getVectorElementType(); 7395 int NumEltBits = EltVT.getSizeInBits(); 7396 MVT IntEltVT = MVT::getIntegerVT(NumEltBits); 7397 SDValue Zero = DAG.getConstant(0, DL, IntEltVT); 7398 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, 7399 IntEltVT); 7400 if (EltVT.isFloatingPoint()) { 7401 Zero = DAG.getBitcast(EltVT, Zero); 7402 AllOnes = DAG.getBitcast(EltVT, AllOnes); 7403 } 7404 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); 7405 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7406 SDValue V; 7407 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 7408 if (Zeroable[i]) 7409 continue; 7410 if (Mask[i] % Size != i) 7411 return SDValue(); // Not a blend. 7412 if (!V) 7413 V = Mask[i] < Size ? V1 : V2; 7414 else if (V != (Mask[i] < Size ? V1 : V2)) 7415 return SDValue(); // Can only let one input through the mask. 
7416 7417 VMaskOps[i] = AllOnes; 7418 } 7419 if (!V) 7420 return SDValue(); // No non-zeroable elements! 7421 7422 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); 7423 V = DAG.getNode(VT.isFloatingPoint() 7424 ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, 7425 DL, VT, V, VMask); 7426 return V; 7427 } 7428 7429 /// \brief Try to emit a blend instruction for a shuffle using bit math. 7430 /// 7431 /// This is used as a fallback approach when first class blend instructions are 7432 /// unavailable. Currently it is only suitable for integer vectors, but could 7433 /// be generalized for floating point vectors if desirable. 7434 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, 7435 SDValue V2, ArrayRef<int> Mask, 7436 SelectionDAG &DAG) { 7437 assert(VT.isInteger() && "Only supports integer vector types!"); 7438 MVT EltVT = VT.getVectorElementType(); 7439 int NumEltBits = EltVT.getSizeInBits(); 7440 SDValue Zero = DAG.getConstant(0, DL, EltVT); 7441 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, 7442 EltVT); 7443 SmallVector<SDValue, 16> MaskOps; 7444 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 7445 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) 7446 return SDValue(); // Shuffled input! 7447 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); 7448 } 7449 7450 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); 7451 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); 7452 // We have to cast V2 around. 7453 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); 7454 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, 7455 DAG.getBitcast(MaskVT, V1Mask), 7456 DAG.getBitcast(MaskVT, V2))); 7457 return DAG.getNode(ISD::OR, DL, VT, V1, V2); 7458 } 7459 7460 /// \brief Try to emit a blend instruction for a shuffle. 7461 /// 7462 /// This doesn't do any checks for the availability of instructions for blending 7463 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to 7464 /// be matched in the backend with the type given. What it does check for is 7465 /// that the shuffle mask is a blend, or convertible into a blend with zero. 7466 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, 7467 SDValue V2, ArrayRef<int> Original, 7468 const X86Subtarget &Subtarget, 7469 SelectionDAG &DAG) { 7470 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); 7471 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); 7472 SmallVector<int, 8> Mask(Original.begin(), Original.end()); 7473 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7474 bool ForceV1Zero = false, ForceV2Zero = false; 7475 7476 // Attempt to generate the binary blend mask. If an input is zero then 7477 // we can use any lane. 7478 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. 7479 unsigned BlendMask = 0; 7480 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 7481 int M = Mask[i]; 7482 if (M < 0) 7483 continue; 7484 if (M == i) 7485 continue; 7486 if (M == i + Size) { 7487 BlendMask |= 1u << i; 7488 continue; 7489 } 7490 if (Zeroable[i]) { 7491 if (V1IsZero) { 7492 ForceV1Zero = true; 7493 Mask[i] = i; 7494 continue; 7495 } 7496 if (V2IsZero) { 7497 ForceV2Zero = true; 7498 BlendMask |= 1u << i; 7499 Mask[i] = i + Size; 7500 continue; 7501 } 7502 } 7503 return SDValue(); // Shuffled input! 7504 } 7505 7506 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. 
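  // (The matched build vector may therefore contain undef lanes, so
  // materialize a true zero before emitting the blend.)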
7507 if (ForceV1Zero) 7508 V1 = getZeroVector(VT, Subtarget, DAG, DL); 7509 if (ForceV2Zero) 7510 V2 = getZeroVector(VT, Subtarget, DAG, DL); 7511 7512 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { 7513 unsigned ScaledMask = 0; 7514 for (int i = 0; i != Size; ++i) 7515 if (BlendMask & (1u << i)) 7516 for (int j = 0; j != Scale; ++j) 7517 ScaledMask |= 1u << (i * Scale + j); 7518 return ScaledMask; 7519 }; 7520 7521 switch (VT.SimpleTy) { 7522 case MVT::v2f64: 7523 case MVT::v4f32: 7524 case MVT::v4f64: 7525 case MVT::v8f32: 7526 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, 7527 DAG.getConstant(BlendMask, DL, MVT::i8)); 7528 7529 case MVT::v4i64: 7530 case MVT::v8i32: 7531 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); 7532 // FALLTHROUGH 7533 case MVT::v2i64: 7534 case MVT::v4i32: 7535 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into 7536 // that instruction. 7537 if (Subtarget.hasAVX2()) { 7538 // Scale the blend by the number of 32-bit dwords per element. 7539 int Scale = VT.getScalarSizeInBits() / 32; 7540 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); 7541 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; 7542 V1 = DAG.getBitcast(BlendVT, V1); 7543 V2 = DAG.getBitcast(BlendVT, V2); 7544 return DAG.getBitcast( 7545 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, 7546 DAG.getConstant(BlendMask, DL, MVT::i8))); 7547 } 7548 // FALLTHROUGH 7549 case MVT::v8i16: { 7550 // For integer shuffles we need to expand the mask and cast the inputs to 7551 // v8i16s prior to blending. 7552 int Scale = 8 / VT.getVectorNumElements(); 7553 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); 7554 V1 = DAG.getBitcast(MVT::v8i16, V1); 7555 V2 = DAG.getBitcast(MVT::v8i16, V2); 7556 return DAG.getBitcast(VT, 7557 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, 7558 DAG.getConstant(BlendMask, DL, MVT::i8))); 7559 } 7560 7561 case MVT::v16i16: { 7562 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); 7563 SmallVector<int, 8> RepeatedMask; 7564 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { 7565 // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 7566 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); 7567 BlendMask = 0; 7568 for (int i = 0; i < 8; ++i) 7569 if (RepeatedMask[i] >= 8) 7570 BlendMask |= 1u << i; 7571 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, 7572 DAG.getConstant(BlendMask, DL, MVT::i8)); 7573 } 7574 } 7575 // FALLTHROUGH 7576 case MVT::v16i8: 7577 case MVT::v32i8: { 7578 assert((VT.is128BitVector() || Subtarget.hasAVX2()) && 7579 "256-bit byte-blends require AVX2 support!"); 7580 7581 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. 7582 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) 7583 return Masked; 7584 7585 // Scale the blend by the number of bytes per element. 7586 int Scale = VT.getScalarSizeInBits() / 8; 7587 7588 // This form of blend is always done on bytes. Compute the byte vector 7589 // type. 7590 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); 7591 7592 // Compute the VSELECT mask. Note that VSELECT is really confusing in the 7593 // mix of LLVM's code generator and the x86 backend. We tell the code 7594 // generator that boolean values in the elements of an x86 vector register 7595 // are -1 for true and 0 for false. 
We then use the LLVM semantics of 'true'
7596     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7597     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7598     // of the element (the remaining are ignored) and 0 in that high bit would
7599     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7600     // the LLVM model for boolean values in vector elements gets the relevant
7601     // bit set, it is set backwards and over-constrained relative to x86's
7602     // actual model.
7603     SmallVector<SDValue, 32> VSELECTMask;
7604     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7605       for (int j = 0; j < Scale; ++j)
7606         VSELECTMask.push_back(
7607             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7608                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7609                                           MVT::i8));
7610
7611     V1 = DAG.getBitcast(BlendVT, V1);
7612     V2 = DAG.getBitcast(BlendVT, V2);
7613     return DAG.getBitcast(
7614         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7615                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
7616   }
7617
7618   default:
7619     llvm_unreachable("Not a supported integer vector type!");
7620   }
7621 }
7622
7623 /// \brief Try to lower as a blend of elements from two inputs followed by
7624 /// a single-input permutation.
7625 ///
7626 /// This matches the pattern where we can blend elements from two inputs and
7627 /// then reduce the shuffle to a single-input permutation.
7628 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
7629                                                    SDValue V1, SDValue V2,
7630                                                    ArrayRef<int> Mask,
7631                                                    SelectionDAG &DAG) {
7632   // We build up the blend mask while checking whether a blend is a viable way
7633   // to reduce the shuffle.
7634   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7635   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7636
7637   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7638     if (Mask[i] < 0)
7639       continue;
7640
7641     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7642
7643     if (BlendMask[Mask[i] % Size] < 0)
7644       BlendMask[Mask[i] % Size] = Mask[i];
7645     else if (BlendMask[Mask[i] % Size] != Mask[i])
7646       return SDValue(); // Can't blend in the needed input!
7647
7648     PermuteMask[i] = Mask[i] % Size;
7649   }
7650
7651   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7652   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7653 }
7654
7655 /// \brief Generic routine to decompose a shuffle and blend into independent
7656 /// blends and permutes.
7657 ///
7658 /// This matches the extremely common pattern for handling combined
7659 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7660 /// operations. It will try to pick the best arrangement of shuffles and
7661 /// blends.
7662 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
7663                                                           MVT VT, SDValue V1,
7664                                                           SDValue V2,
7665                                                           ArrayRef<int> Mask,
7666                                                           SelectionDAG &DAG) {
7667   // Shuffle the input elements into the desired positions in V1 and V2 and
7668   // blend them together.
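  // For example, a v4i32 shuffle with mask <0, 5, 2, 7> becomes
  //   V1Mask    = <0, -1, 2, -1>
  //   V2Mask    = <-1, 1, -1, 3>
  //   BlendMask = <0, 5, 2, 7>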
7669   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7670   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7671   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7672   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7673     if (Mask[i] >= 0 && Mask[i] < Size) {
7674       V1Mask[i] = Mask[i];
7675       BlendMask[i] = i;
7676     } else if (Mask[i] >= Size) {
7677       V2Mask[i] = Mask[i] - Size;
7678       BlendMask[i] = i + Size;
7679     }
7680
7681   // Try to lower with the simpler initial blend strategy unless one of the
7682   // input shuffles would be a no-op. We prefer to shuffle inputs as the
7683   // shuffle may be able to fold with a load or other benefit. However, when
7684   // we'll have to do 2x as many shuffles in order to achieve this, blending
7685   // first is a better strategy.
7686   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7687     if (SDValue BlendPerm =
7688             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7689       return BlendPerm;
7690
7691   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7692   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7693   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7694 }
7695
7696 /// \brief Try to lower a vector shuffle as a byte rotation.
7697 ///
7698 /// SSSE3 provides a generic PALIGNR instruction that will do an arbitrary
7699 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7700 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7701 /// try to generically lower a vector shuffle through such a pattern. It
7702 /// does not check for the profitability of lowering either as PALIGNR or
7703 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7704 /// This matches shuffle vectors that look like:
7705 ///
7706 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7707 ///
7708 /// Essentially it concatenates V1 and V2, shifts right by some number of
7709 /// elements, and takes the low elements as the result. Note that while this is
7710 /// specified as a *right shift* because x86 is little-endian, it is a *left
7711 /// rotate* of the vector lanes.
7712 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
7713                                               SDValue V1, SDValue V2,
7714                                               ArrayRef<int> Mask,
7715                                               const X86Subtarget &Subtarget,
7716                                               SelectionDAG &DAG) {
7717   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7718
7719   int NumElts = Mask.size();
7720   int NumLanes = VT.getSizeInBits() / 128;
7721   int NumLaneElts = NumElts / NumLanes;
7722
7723   // We need to detect various ways of spelling a rotation:
7724   // [11, 12, 13, 14, 15, 0, 1, 2]
7725   // [-1, 12, 13, 14, -1, -1, 1, -1]
7726   // [-1, -1, -1, -1, -1, -1, 1, 2]
7727   // [ 3, 4, 5, 6, 7, 8, 9, 10]
7728   // [-1, 4, 5, 6, -1, -1, 9, -1]
7729   // [-1, 4, 5, 6, -1, -1, -1, -1]
7730   int Rotation = 0;
7731   SDValue Lo, Hi;
7732   for (int l = 0; l < NumElts; l += NumLaneElts) {
7733     for (int i = 0; i < NumLaneElts; ++i) {
7734       if (Mask[l + i] < 0)
7735         continue;
7736
7737       // Get the mod-Size index and lane correct it.
7738       int LaneIdx = (Mask[l + i] % NumElts) - l;
7739       // Make sure it was in this lane.
7740       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7741         return SDValue();
7742
7743       // Determine where a rotated vector would have started.
7744       int StartIdx = i - LaneIdx;
7745       if (StartIdx == 0)
7746         // The identity rotation isn't interesting, stop.
7747         return SDValue();
7748
7749       // If we found the tail of a vector, the rotation must be the missing
7750       // front.
If we found the head of a vector, it must be how much of the 7751 // head. 7752 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; 7753 7754 if (Rotation == 0) 7755 Rotation = CandidateRotation; 7756 else if (Rotation != CandidateRotation) 7757 // The rotations don't match, so we can't match this mask. 7758 return SDValue(); 7759 7760 // Compute which value this mask is pointing at. 7761 SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; 7762 7763 // Compute which of the two target values this index should be assigned 7764 // to. This reflects whether the high elements are remaining or the low 7765 // elements are remaining. 7766 SDValue &TargetV = StartIdx < 0 ? Hi : Lo; 7767 7768 // Either set up this value if we've not encountered it before, or check 7769 // that it remains consistent. 7770 if (!TargetV) 7771 TargetV = MaskV; 7772 else if (TargetV != MaskV) 7773 // This may be a rotation, but it pulls from the inputs in some 7774 // unsupported interleaving. 7775 return SDValue(); 7776 } 7777 } 7778 7779 // Check that we successfully analyzed the mask, and normalize the results. 7780 assert(Rotation != 0 && "Failed to locate a viable rotation!"); 7781 assert((Lo || Hi) && "Failed to find a rotated input vector!"); 7782 if (!Lo) 7783 Lo = Hi; 7784 else if (!Hi) 7785 Hi = Lo; 7786 7787 // Cast the inputs to i8 vector of correct length to match PALIGNR or 7788 // PSLLDQ/PSRLDQ. 7789 MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); 7790 Lo = DAG.getBitcast(ByteVT, Lo); 7791 Hi = DAG.getBitcast(ByteVT, Hi); 7792 7793 // The actual rotate instruction rotates bytes, so we need to scale the 7794 // rotation based on how many bytes are in the vector lane. 7795 int Scale = 16 / NumLaneElts; 7796 7797 // SSSE3 targets can use the palignr instruction. 7798 if (Subtarget.hasSSSE3()) { 7799 assert((!VT.is512BitVector() || Subtarget.hasBWI()) && 7800 "512-bit PALIGNR requires BWI instructions"); 7801 return DAG.getBitcast( 7802 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, 7803 DAG.getConstant(Rotation * Scale, DL, MVT::i8))); 7804 } 7805 7806 assert(VT.is128BitVector() && 7807 "Rotate-based lowering only supports 128-bit lowering!"); 7808 assert(Mask.size() <= 16 && 7809 "Can shuffle at most 16 bytes in a 128-bit vector!"); 7810 assert(ByteVT == MVT::v16i8 && 7811 "SSE2 rotate lowering only needed for v16i8!"); 7812 7813 // Default SSE2 implementation 7814 int LoByteShift = 16 - Rotation * Scale; 7815 int HiByteShift = Rotation * Scale; 7816 7817 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, 7818 DAG.getConstant(LoByteShift, DL, MVT::i8)); 7819 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, 7820 DAG.getConstant(HiByteShift, DL, MVT::i8)); 7821 return DAG.getBitcast(VT, 7822 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); 7823 } 7824 7825 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). 7826 /// 7827 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and 7828 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function 7829 /// matches elements from one of the input vectors shuffled to the left or 7830 /// right with zeroable elements 'shifted in'. It handles both the strictly 7831 /// bit-wise element shifts and the byte shift across an entire 128-bit double 7832 /// quad word lane. 7833 /// 7834 /// PSHL : (little-endian) left bit shift. 7835 /// [ zz, 0, zz, 2 ] 7836 /// [ -1, 4, zz, -1 ] 7837 /// PSRL : (little-endian) right bit shift. 
7838 /// [ 1, zz, 3, zz] 7839 /// [ -1, -1, 7, zz] 7840 /// PSLLDQ : (little-endian) left byte shift 7841 /// [ zz, 0, 1, 2, 3, 4, 5, 6] 7842 /// [ zz, zz, -1, -1, 2, 3, 4, -1] 7843 /// [ zz, zz, zz, zz, zz, zz, -1, 1] 7844 /// PSRLDQ : (little-endian) right byte shift 7845 /// [ 5, 6, 7, zz, zz, zz, zz, zz] 7846 /// [ -1, 5, 6, 7, zz, zz, zz, zz] 7847 /// [ 1, 2, -1, -1, -1, -1, zz, zz] 7848 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, 7849 SDValue V2, ArrayRef<int> Mask, 7850 const X86Subtarget &Subtarget, 7851 SelectionDAG &DAG) { 7852 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7853 7854 int Size = Mask.size(); 7855 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); 7856 7857 auto CheckZeros = [&](int Shift, int Scale, bool Left) { 7858 for (int i = 0; i < Size; i += Scale) 7859 for (int j = 0; j < Shift; ++j) 7860 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) 7861 return false; 7862 7863 return true; 7864 }; 7865 7866 auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { 7867 for (int i = 0; i != Size; i += Scale) { 7868 unsigned Pos = Left ? i + Shift : i; 7869 unsigned Low = Left ? i : i + Shift; 7870 unsigned Len = Scale - Shift; 7871 if (!isSequentialOrUndefInRange(Mask, Pos, Len, 7872 Low + (V == V1 ? 0 : Size))) 7873 return SDValue(); 7874 } 7875 7876 int ShiftEltBits = VT.getScalarSizeInBits() * Scale; 7877 bool ByteShift = ShiftEltBits > 64; 7878 unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) 7879 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); 7880 int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); 7881 7882 // Normalize the scale for byte shifts to still produce an i64 element 7883 // type. 7884 Scale = ByteShift ? Scale / 2 : Scale; 7885 7886 // We need to round trip through the appropriate type for the shift. 7887 MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); 7888 MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8) 7889 : MVT::getVectorVT(ShiftSVT, Size / Scale); 7890 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && 7891 "Illegal integer vector type"); 7892 V = DAG.getBitcast(ShiftVT, V); 7893 7894 V = DAG.getNode(OpCode, DL, ShiftVT, V, 7895 DAG.getConstant(ShiftAmt, DL, MVT::i8)); 7896 return DAG.getBitcast(VT, V); 7897 }; 7898 7899 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just 7900 // keep doubling the size of the integer elements up to that. We can 7901 // then shift the elements of the integer vector by whole multiples of 7902 // their width within the elements of the larger integer vector. Test each 7903 // multiple to see if we can find a match with the moved element indices 7904 // and that the shifted in elements are all zeroable. 7905 unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128); 7906 for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2) 7907 for (int Shift = 1; Shift != Scale; ++Shift) 7908 for (bool Left : {true, false}) 7909 if (CheckZeros(Shift, Scale, Left)) 7910 for (SDValue V : {V1, V2}) 7911 if (SDValue Match = MatchShift(Shift, Scale, Left, V)) 7912 return Match; 7913 7914 // no match 7915 return SDValue(); 7916 } 7917 7918 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. 
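///
/// EXTRQI extracts a bit field from the low 64 bits of the source and zeroes
/// the rest of that quadword; INSERTQI overwrites a bit field of the first
/// source with the low bits of the second. Both leave the upper half undef,
/// so only masks whose upper half is undef can be matched. For example, the
/// v8i16 mask <0, 1, Z, Z, u, u, u, u> (Z = zeroable, u = undef) is matched
/// as EXTRQI with BitLen = 32 and BitIdx = 0.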
7919 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, 7920 SDValue V2, ArrayRef<int> Mask, 7921 SelectionDAG &DAG) { 7922 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 7923 assert(!Zeroable.all() && "Fully zeroable shuffle mask"); 7924 7925 int Size = Mask.size(); 7926 int HalfSize = Size / 2; 7927 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); 7928 7929 // Upper half must be undefined. 7930 if (!isUndefInRange(Mask, HalfSize, HalfSize)) 7931 return SDValue(); 7932 7933 // EXTRQ: Extract Len elements from lower half of source, starting at Idx. 7934 // Remainder of lower half result is zero and upper half is all undef. 7935 auto LowerAsEXTRQ = [&]() { 7936 // Determine the extraction length from the part of the 7937 // lower half that isn't zeroable. 7938 int Len = HalfSize; 7939 for (; Len > 0; --Len) 7940 if (!Zeroable[Len - 1]) 7941 break; 7942 assert(Len > 0 && "Zeroable shuffle mask"); 7943 7944 // Attempt to match first Len sequential elements from the lower half. 7945 SDValue Src; 7946 int Idx = -1; 7947 for (int i = 0; i != Len; ++i) { 7948 int M = Mask[i]; 7949 if (M < 0) 7950 continue; 7951 SDValue &V = (M < Size ? V1 : V2); 7952 M = M % Size; 7953 7954 // The extracted elements must start at a valid index and all mask 7955 // elements must be in the lower half. 7956 if (i > M || M >= HalfSize) 7957 return SDValue(); 7958 7959 if (Idx < 0 || (Src == V && Idx == (M - i))) { 7960 Src = V; 7961 Idx = M - i; 7962 continue; 7963 } 7964 return SDValue(); 7965 } 7966 7967 if (Idx < 0) 7968 return SDValue(); 7969 7970 assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); 7971 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; 7972 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; 7973 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, 7974 DAG.getConstant(BitLen, DL, MVT::i8), 7975 DAG.getConstant(BitIdx, DL, MVT::i8)); 7976 }; 7977 7978 if (SDValue ExtrQ = LowerAsEXTRQ()) 7979 return ExtrQ; 7980 7981 // INSERTQ: Extract lowest Len elements from lower half of second source and 7982 // insert over first source, starting at Idx. 7983 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } 7984 auto LowerAsInsertQ = [&]() { 7985 for (int Idx = 0; Idx != HalfSize; ++Idx) { 7986 SDValue Base; 7987 7988 // Attempt to match first source from mask before insertion point. 7989 if (isUndefInRange(Mask, 0, Idx)) { 7990 /* EMPTY */ 7991 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { 7992 Base = V1; 7993 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { 7994 Base = V2; 7995 } else { 7996 continue; 7997 } 7998 7999 // Extend the extraction length looking to match both the insertion of 8000 // the second source and the remaining elements of the first. 8001 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { 8002 SDValue Insert; 8003 int Len = Hi - Idx; 8004 8005 // Match insertion. 8006 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { 8007 Insert = V1; 8008 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { 8009 Insert = V2; 8010 } else { 8011 continue; 8012 } 8013 8014 // Match the remaining elements of the lower half. 
8015         if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8016           /* EMPTY */
8017         } else if ((!Base || (Base == V1)) &&
8018                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8019           Base = V1;
8020         } else if ((!Base || (Base == V2)) &&
8021                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8022                                               Size + Hi)) {
8023           Base = V2;
8024         } else {
8025           continue;
8026         }
8027
8028         // We may not have a base (first source) - this can safely be undefined.
8029         if (!Base)
8030           Base = DAG.getUNDEF(VT);
8031
8032         int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8033         int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8034         return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8035                            DAG.getConstant(BitLen, DL, MVT::i8),
8036                            DAG.getConstant(BitIdx, DL, MVT::i8));
8037       }
8038     }
8039
8040     return SDValue();
8041   };
8042
8043   if (SDValue InsertQ = LowerAsInsertQ())
8044     return InsertQ;
8045
8046   return SDValue();
8047 }
8048
8049 /// \brief Lower a vector shuffle as a zero or any extension.
8050 ///
8051 /// Given a specific number of elements, element bit width, and extension
8052 /// stride, produce either a zero or any extension based on the available
8053 /// features of the subtarget. The extended elements are consecutive and
8054 /// can start from an offset element index in the input; to avoid excess
8055 /// shuffling, the offset must either be in the bottom lane or at the
8056 /// start of a higher lane. All extended elements must come from
8057 /// the same lane.
8058 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8059     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8060     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8061   assert(Scale > 1 && "Need a scale to extend.");
8062   int EltBits = VT.getScalarSizeInBits();
8063   int NumElements = VT.getVectorNumElements();
8064   int NumEltsPerLane = 128 / EltBits;
8065   int OffsetLane = Offset / NumEltsPerLane;
8066   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8067          "Only 8, 16, and 32 bit elements can be extended.");
8068   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8069   assert(0 <= Offset && "Extension offset must be positive.");
8070   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8071          "Extension offset must be in the first lane or start an upper lane.");
8072
8073   // Check that an index is in the same lane as the base offset.
8074   auto SafeOffset = [&](int Idx) {
8075     return OffsetLane == (Idx / NumEltsPerLane);
8076   };
8077
8078   // Shift along an input so that the offset base moves to the first element.
8079   auto ShuffleOffset = [&](SDValue V) {
8080     if (!Offset)
8081       return V;
8082
8083     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8084     for (int i = 0; i * Scale < NumElements; ++i) {
8085       int SrcIdx = i + Offset;
8086       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8087     }
8088     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8089   };
8090
8091   // Found a valid zext mask! Try various lowering strategies based on the
8092   // input type and available ISA extensions.
8093   if (Subtarget.hasSSE41()) {
8094     // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
8095     // PUNPCK will catch this in a later shuffle match.
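    // (Interleaving with a zero vector via PUNPCKL/PUNPCKH is exactly a 2x
    // zero extension.)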
8096 if (Offset && Scale == 2 && VT.is128BitVector()) 8097 return SDValue(); 8098 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), 8099 NumElements / Scale); 8100 InputV = ShuffleOffset(InputV); 8101 8102 // For 256-bit vectors, we only need the lower (128-bit) input half. 8103 if (VT.is256BitVector()) 8104 InputV = extract128BitVector(InputV, 0, DAG, DL); 8105 8106 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); 8107 return DAG.getBitcast(VT, InputV); 8108 } 8109 8110 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); 8111 8112 // For any extends we can cheat for larger element sizes and use shuffle 8113 // instructions that can fold with a load and/or copy. 8114 if (AnyExt && EltBits == 32) { 8115 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, 8116 -1}; 8117 return DAG.getBitcast( 8118 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 8119 DAG.getBitcast(MVT::v4i32, InputV), 8120 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); 8121 } 8122 if (AnyExt && EltBits == 16 && Scale > 2) { 8123 int PSHUFDMask[4] = {Offset / 2, -1, 8124 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; 8125 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 8126 DAG.getBitcast(MVT::v4i32, InputV), 8127 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); 8128 int PSHUFWMask[4] = {1, -1, -1, -1}; 8129 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); 8130 return DAG.getBitcast( 8131 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, 8132 DAG.getBitcast(MVT::v8i16, InputV), 8133 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); 8134 } 8135 8136 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes 8137 // to 64-bits. 8138 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { 8139 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); 8140 assert(VT.is128BitVector() && "Unexpected vector width!"); 8141 8142 int LoIdx = Offset * EltBits; 8143 SDValue Lo = DAG.getBitcast( 8144 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, 8145 DAG.getConstant(EltBits, DL, MVT::i8), 8146 DAG.getConstant(LoIdx, DL, MVT::i8))); 8147 8148 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || 8149 !SafeOffset(Offset + 1)) 8150 return DAG.getBitcast(VT, Lo); 8151 8152 int HiIdx = (Offset + 1) * EltBits; 8153 SDValue Hi = DAG.getBitcast( 8154 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, 8155 DAG.getConstant(EltBits, DL, MVT::i8), 8156 DAG.getConstant(HiIdx, DL, MVT::i8))); 8157 return DAG.getBitcast(VT, 8158 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); 8159 } 8160 8161 // If this would require more than 2 unpack instructions to expand, use 8162 // pshufb when available. We can only use more than 2 unpack instructions 8163 // when zero extending i8 elements which also makes it easier to use pshufb. 8164 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { 8165 assert(NumElements == 16 && "Unexpected byte vector width!"); 8166 SDValue PSHUFBMask[16]; 8167 for (int i = 0; i < 16; ++i) { 8168 int Idx = Offset + (i / Scale); 8169 PSHUFBMask[i] = DAG.getConstant( 8170 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); 8171 } 8172 InputV = DAG.getBitcast(MVT::v16i8, InputV); 8173 return DAG.getBitcast( 8174 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, 8175 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); 8176 } 8177 8178 // If we are extending from an offset, ensure we start on a boundary that 8179 // we can unpack from. 
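  // For example, with 16 i8 elements, Scale == 4 and Offset == 6, the run of
  // source elements starts 6 % 4 == 2 elements past an unpack boundary, so we
  // first shift the vector down by 2 and continue with Offset == 4.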
8180 int AlignToUnpack = Offset % (NumElements / Scale); 8181 if (AlignToUnpack) { 8182 SmallVector<int, 8> ShMask((unsigned)NumElements, -1); 8183 for (int i = AlignToUnpack; i < NumElements; ++i) 8184 ShMask[i - AlignToUnpack] = i; 8185 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); 8186 Offset -= AlignToUnpack; 8187 } 8188 8189 // Otherwise emit a sequence of unpacks. 8190 do { 8191 unsigned UnpackLoHi = X86ISD::UNPCKL; 8192 if (Offset >= (NumElements / 2)) { 8193 UnpackLoHi = X86ISD::UNPCKH; 8194 Offset -= (NumElements / 2); 8195 } 8196 8197 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); 8198 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) 8199 : getZeroVector(InputVT, Subtarget, DAG, DL); 8200 InputV = DAG.getBitcast(InputVT, InputV); 8201 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); 8202 Scale /= 2; 8203 EltBits *= 2; 8204 NumElements /= 2; 8205 } while (Scale > 1); 8206 return DAG.getBitcast(VT, InputV); 8207 } 8208 8209 /// \brief Try to lower a vector shuffle as a zero extension on any microarch. 8210 /// 8211 /// This routine will try to do everything in its power to cleverly lower 8212 /// a shuffle which happens to match the pattern of a zero extend. It doesn't 8213 /// check for the profitability of this lowering, it tries to aggressively 8214 /// match this pattern. It will use all of the micro-architectural details it 8215 /// can to emit an efficient lowering. It handles both blends with all-zero 8216 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to 8217 /// masking out later). 8218 /// 8219 /// The reason we have dedicated lowering for zext-style shuffles is that they 8220 /// are both incredibly common and often quite performance sensitive. 8221 static SDValue lowerVectorShuffleAsZeroOrAnyExtend( 8222 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 8223 const X86Subtarget &Subtarget, SelectionDAG &DAG) { 8224 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 8225 8226 int Bits = VT.getSizeInBits(); 8227 int NumLanes = Bits / 128; 8228 int NumElements = VT.getVectorNumElements(); 8229 int NumEltsPerLane = NumElements / NumLanes; 8230 assert(VT.getScalarSizeInBits() <= 32 && 8231 "Exceeds 32-bit integer zero extension limit"); 8232 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); 8233 8234 // Define a helper function to check a particular ext-scale and lower to it if 8235 // valid. 8236 auto Lower = [&](int Scale) -> SDValue { 8237 SDValue InputV; 8238 bool AnyExt = true; 8239 int Offset = 0; 8240 int Matches = 0; 8241 for (int i = 0; i < NumElements; ++i) { 8242 int M = Mask[i]; 8243 if (M < 0) 8244 continue; // Valid anywhere but doesn't tell us anything. 8245 if (i % Scale != 0) { 8246 // Each of the extended elements need to be zeroable. 8247 if (!Zeroable[i]) 8248 return SDValue(); 8249 8250 // We no longer are in the anyext case. 8251 AnyExt = false; 8252 continue; 8253 } 8254 8255 // Each of the base elements needs to be consecutive indices into the 8256 // same input vector. 8257 SDValue V = M < NumElements ? V1 : V2; 8258 M = M % NumElements; 8259 if (!InputV) { 8260 InputV = V; 8261 Offset = M - (i / Scale); 8262 } else if (InputV != V) 8263 return SDValue(); // Flip-flopping inputs. 8264 8265 // Offset must start in the lowest 128-bit lane or at the start of an 8266 // upper lane. 8267 // FIXME: Is it ever worth allowing a negative base offset? 
8268 if (!((0 <= Offset && Offset < NumEltsPerLane) || 8269 (Offset % NumEltsPerLane) == 0)) 8270 return SDValue(); 8271 8272 // If we are offsetting, all referenced entries must come from the same 8273 // lane. 8274 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) 8275 return SDValue(); 8276 8277 if ((M % NumElements) != (Offset + (i / Scale))) 8278 return SDValue(); // Non-consecutive strided elements. 8279 Matches++; 8280 } 8281 8282 // If we fail to find an input, we have a zero-shuffle which should always 8283 // have already been handled. 8284 // FIXME: Maybe handle this here in case during blending we end up with one? 8285 if (!InputV) 8286 return SDValue(); 8287 8288 // If we are offsetting, don't extend if we only match a single input, we 8289 // can always do better by using a basic PSHUF or PUNPCK. 8290 if (Offset != 0 && Matches < 2) 8291 return SDValue(); 8292 8293 return lowerVectorShuffleAsSpecificZeroOrAnyExtend( 8294 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); 8295 }; 8296 8297 // The widest scale possible for extending is to a 64-bit integer. 8298 assert(Bits % 64 == 0 && 8299 "The number of bits in a vector must be divisible by 64 on x86!"); 8300 int NumExtElements = Bits / 64; 8301 8302 // Each iteration, try extending the elements half as much, but into twice as 8303 // many elements. 8304 for (; NumExtElements < NumElements; NumExtElements *= 2) { 8305 assert(NumElements % NumExtElements == 0 && 8306 "The input vector size must be divisible by the extended size."); 8307 if (SDValue V = Lower(NumElements / NumExtElements)) 8308 return V; 8309 } 8310 8311 // General extends failed, but 128-bit vectors may be able to use MOVQ. 8312 if (Bits != 128) 8313 return SDValue(); 8314 8315 // Returns one of the source operands if the shuffle can be reduced to a 8316 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. 8317 auto CanZExtLowHalf = [&]() { 8318 for (int i = NumElements / 2; i != NumElements; ++i) 8319 if (!Zeroable[i]) 8320 return SDValue(); 8321 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) 8322 return V1; 8323 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) 8324 return V2; 8325 return SDValue(); 8326 }; 8327 8328 if (SDValue V = CanZExtLowHalf()) { 8329 V = DAG.getBitcast(MVT::v2i64, V); 8330 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); 8331 return DAG.getBitcast(VT, V); 8332 } 8333 8334 // No viable ext lowering found. 8335 return SDValue(); 8336 } 8337 8338 /// \brief Try to get a scalar value for a specific element of a vector. 8339 /// 8340 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. 8341 static SDValue getScalarValueForVectorElement(SDValue V, int Idx, 8342 SelectionDAG &DAG) { 8343 MVT VT = V.getSimpleValueType(); 8344 MVT EltVT = VT.getVectorElementType(); 8345 V = peekThroughBitcasts(V); 8346 8347 // If the bitcasts shift the element size, we can't extract an equivalent 8348 // element from it. 8349 MVT NewVT = V.getSimpleValueType(); 8350 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) 8351 return SDValue(); 8352 8353 if (V.getOpcode() == ISD::BUILD_VECTOR || 8354 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { 8355 // Ensure the scalar operand is the same size as the destination. 8356 // FIXME: Add support for scalar truncation where possible. 
8357 SDValue S = V.getOperand(Idx); 8358 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) 8359 return DAG.getBitcast(EltVT, S); 8360 } 8361 8362 return SDValue(); 8363 } 8364 8365 /// \brief Helper to test for a load that can be folded with x86 shuffles. 8366 /// 8367 /// This is particularly important because the set of instructions varies 8368 /// significantly based on whether the operand is a load or not. 8369 static bool isShuffleFoldableLoad(SDValue V) { 8370 V = peekThroughBitcasts(V); 8371 return ISD::isNON_EXTLoad(V.getNode()); 8372 } 8373 8374 /// \brief Try to lower insertion of a single element into a zero vector. 8375 /// 8376 /// This is a common pattern that we have especially efficient patterns to lower 8377 /// across all subtarget feature sets. 8378 static SDValue lowerVectorShuffleAsElementInsertion( 8379 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 8380 const X86Subtarget &Subtarget, SelectionDAG &DAG) { 8381 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 8382 MVT ExtVT = VT; 8383 MVT EltVT = VT.getVectorElementType(); 8384 8385 int V2Index = std::find_if(Mask.begin(), Mask.end(), 8386 [&Mask](int M) { return M >= (int)Mask.size(); }) - 8387 Mask.begin(); 8388 bool IsV1Zeroable = true; 8389 for (int i = 0, Size = Mask.size(); i < Size; ++i) 8390 if (i != V2Index && !Zeroable[i]) { 8391 IsV1Zeroable = false; 8392 break; 8393 } 8394 8395 // Check for a single input from a SCALAR_TO_VECTOR node. 8396 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and 8397 // all the smarts here sunk into that routine. However, the current 8398 // lowering of BUILD_VECTOR makes that nearly impossible until the old 8399 // vector shuffle lowering is dead. 8400 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), 8401 DAG); 8402 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { 8403 // We need to zext the scalar if it is smaller than an i32. 8404 V2S = DAG.getBitcast(EltVT, V2S); 8405 if (EltVT == MVT::i8 || EltVT == MVT::i16) { 8406 // Using zext to expand a narrow element won't work for non-zero 8407 // insertions. 8408 if (!IsV1Zeroable) 8409 return SDValue(); 8410 8411 // Zero-extend directly to i32. 8412 ExtVT = MVT::v4i32; 8413 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); 8414 } 8415 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); 8416 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || 8417 EltVT == MVT::i16) { 8418 // Either not inserting from the low element of the input or the input 8419 // element size is too small to use VZEXT_MOVL to clear the high bits. 8420 return SDValue(); 8421 } 8422 8423 if (!IsV1Zeroable) { 8424 // If V1 can't be treated as a zero vector we have fewer options to lower 8425 // this. We can't support integer vectors or non-zero targets cheaply, and 8426 // the V1 elements can't be permuted in any way. 8427 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); 8428 if (!VT.isFloatingPoint() || V2Index != 0) 8429 return SDValue(); 8430 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); 8431 V1Mask[V2Index] = -1; 8432 if (!isNoopShuffleMask(V1Mask)) 8433 return SDValue(); 8434 // This is essentially a special case blend operation, but if we have 8435 // general purpose blend operations, they are always faster. Bail and let 8436 // the rest of the lowering handle these as blends. 
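    // (With SSE4.1 the generic blend lowering will typically select BLENDPS
    // or BLENDPD for these patterns instead.)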
8437 if (Subtarget.hasSSE41()) 8438 return SDValue(); 8439 8440 // Otherwise, use MOVSD or MOVSS. 8441 assert((EltVT == MVT::f32 || EltVT == MVT::f64) && 8442 "Only two types of floating point element types to handle!"); 8443 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, 8444 ExtVT, V1, V2); 8445 } 8446 8447 // This lowering only works for the low element with floating point vectors. 8448 if (VT.isFloatingPoint() && V2Index != 0) 8449 return SDValue(); 8450 8451 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); 8452 if (ExtVT != VT) 8453 V2 = DAG.getBitcast(VT, V2); 8454 8455 if (V2Index != 0) { 8456 // If we have 4 or fewer lanes we can cheaply shuffle the element into 8457 // the desired position. Otherwise it is more efficient to do a vector 8458 // shift left. We know that we can do a vector shift left because all 8459 // the inputs are zero. 8460 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { 8461 SmallVector<int, 4> V2Shuffle(Mask.size(), 1); 8462 V2Shuffle[V2Index] = 0; 8463 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); 8464 } else { 8465 V2 = DAG.getBitcast(MVT::v16i8, V2); 8466 V2 = DAG.getNode( 8467 X86ISD::VSHLDQ, DL, MVT::v16i8, V2, 8468 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, 8469 DAG.getTargetLoweringInfo().getScalarShiftAmountTy( 8470 DAG.getDataLayout(), VT))); 8471 V2 = DAG.getBitcast(VT, V2); 8472 } 8473 } 8474 return V2; 8475 } 8476 8477 /// Try to lower broadcast of a single - truncated - integer element, 8478 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. 8479 /// 8480 /// This assumes we have AVX2. 8481 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, 8482 SDValue V0, int BroadcastIdx, 8483 const X86Subtarget &Subtarget, 8484 SelectionDAG &DAG) { 8485 assert(Subtarget.hasAVX2() && 8486 "We can only lower integer broadcasts with AVX2!"); 8487 8488 EVT EltVT = VT.getVectorElementType(); 8489 EVT V0VT = V0.getValueType(); 8490 8491 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); 8492 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); 8493 8494 EVT V0EltVT = V0VT.getVectorElementType(); 8495 if (!V0EltVT.isInteger()) 8496 return SDValue(); 8497 8498 const unsigned EltSize = EltVT.getSizeInBits(); 8499 const unsigned V0EltSize = V0EltVT.getSizeInBits(); 8500 8501 // This is only a truncation if the original element type is larger. 8502 if (V0EltSize <= EltSize) 8503 return SDValue(); 8504 8505 assert(((V0EltSize % EltSize) == 0) && 8506 "Scalar type sizes must all be powers of 2 on x86!"); 8507 8508 const unsigned V0Opc = V0.getOpcode(); 8509 const unsigned Scale = V0EltSize / EltSize; 8510 const unsigned V0BroadcastIdx = BroadcastIdx / Scale; 8511 8512 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && 8513 V0Opc != ISD::BUILD_VECTOR) 8514 return SDValue(); 8515 8516 SDValue Scalar = V0.getOperand(V0BroadcastIdx); 8517 8518 // If we're extracting non-least-significant bits, shift so we can truncate. 8519 // Hopefully, we can fold away the trunc/srl/load into the broadcast. 8520 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer 8521 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. 
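  // For example, broadcasting i16 element 3 of a v4i32 build_vector has
  // Scale == 2: take operand 1 of the build_vector, shift it right by 16
  // bits, truncate to i16 and broadcast the resulting scalar.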
8522 if (const int OffsetIdx = BroadcastIdx % Scale) 8523 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, 8524 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); 8525 8526 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, 8527 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); 8528 } 8529 8530 /// \brief Try to lower broadcast of a single element. 8531 /// 8532 /// For convenience, this code also bundles all of the subtarget feature set 8533 /// filtering. While a little annoying to re-dispatch on type here, there isn't 8534 /// a convenient way to factor it out. 8535 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? 8536 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, 8537 SDValue V1, SDValue V2, 8538 ArrayRef<int> Mask, 8539 const X86Subtarget &Subtarget, 8540 SelectionDAG &DAG) { 8541 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || 8542 (Subtarget.hasAVX() && VT.isFloatingPoint()) || 8543 (Subtarget.hasAVX2() && VT.isInteger()))) 8544 return SDValue(); 8545 8546 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise 8547 // we can only broadcast from a register with AVX2. 8548 unsigned NumElts = Mask.size(); 8549 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; 8550 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); 8551 8552 // Check that the mask is a broadcast. 8553 int BroadcastIdx = -1; 8554 for (int i = 0; i != (int)NumElts; ++i) { 8555 SmallVector<int, 8> BroadcastMask(NumElts, i); 8556 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { 8557 BroadcastIdx = i; 8558 break; 8559 } 8560 } 8561 8562 if (BroadcastIdx < 0) 8563 return SDValue(); 8564 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " 8565 "a sorted mask where the broadcast " 8566 "comes from V1."); 8567 8568 // Go up the chain of (vector) values to find a scalar load that we can 8569 // combine with the broadcast. 8570 SDValue V = V1; 8571 for (;;) { 8572 switch (V.getOpcode()) { 8573 case ISD::BITCAST: { 8574 SDValue VSrc = V.getOperand(0); 8575 MVT SrcVT = VSrc.getSimpleValueType(); 8576 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits()) 8577 break; 8578 V = VSrc; 8579 continue; 8580 } 8581 case ISD::CONCAT_VECTORS: { 8582 int OperandSize = Mask.size() / V.getNumOperands(); 8583 V = V.getOperand(BroadcastIdx / OperandSize); 8584 BroadcastIdx %= OperandSize; 8585 continue; 8586 } 8587 case ISD::INSERT_SUBVECTOR: { 8588 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); 8589 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); 8590 if (!ConstantIdx) 8591 break; 8592 8593 int BeginIdx = (int)ConstantIdx->getZExtValue(); 8594 int EndIdx = 8595 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); 8596 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { 8597 BroadcastIdx -= BeginIdx; 8598 V = VInner; 8599 } else { 8600 V = VOuter; 8601 } 8602 continue; 8603 } 8604 } 8605 break; 8606 } 8607 8608 // Check if this is a broadcast of a scalar. We special case lowering 8609 // for scalars so that we can more effectively fold with loads. 8610 // First, look through bitcast: if the original value has a larger element 8611 // type than the shuffle, the broadcast element is in essence truncated. 8612 // Make that explicit to ease folding. 
8613 if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) 8614 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( 8615 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) 8616 return TruncBroadcast; 8617 8618 MVT BroadcastVT = VT; 8619 8620 // Peek through any bitcast (only useful for loads). 8621 SDValue BC = peekThroughBitcasts(V); 8622 8623 // Also check the simpler case, where we can directly reuse the scalar. 8624 if (V.getOpcode() == ISD::BUILD_VECTOR || 8625 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { 8626 V = V.getOperand(BroadcastIdx); 8627 8628 // If we can't broadcast from a register, check that the input is a load. 8629 if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) 8630 return SDValue(); 8631 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { 8632 // 32-bit targets need to load i64 as a f64 and then bitcast the result. 8633 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { 8634 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); 8635 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode); 8636 } 8637 8638 // If we are broadcasting a load that is only used by the shuffle 8639 // then we can reduce the vector load to the broadcasted scalar load. 8640 LoadSDNode *Ld = cast<LoadSDNode>(BC); 8641 SDValue BaseAddr = Ld->getOperand(1); 8642 EVT SVT = BroadcastVT.getScalarType(); 8643 unsigned Offset = BroadcastIdx * SVT.getStoreSize(); 8644 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); 8645 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, 8646 DAG.getMachineFunction().getMachineMemOperand( 8647 Ld->getMemOperand(), Offset, SVT.getStoreSize())); 8648 } else if (!BroadcastFromReg) { 8649 // We can't broadcast from a vector register. 8650 return SDValue(); 8651 } else if (BroadcastIdx != 0) { 8652 // We can only broadcast from the zero-element of a vector register, 8653 // but it can be advantageous to broadcast from the zero-element of a 8654 // subvector. 8655 if (!VT.is256BitVector() && !VT.is512BitVector()) 8656 return SDValue(); 8657 8658 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. 8659 if (VT == MVT::v4f64 || VT == MVT::v4i64) 8660 return SDValue(); 8661 8662 // Only broadcast the zero-element of a 128-bit subvector. 8663 unsigned EltSize = VT.getScalarSizeInBits(); 8664 if (((BroadcastIdx * EltSize) % 128) != 0) 8665 return SDValue(); 8666 8667 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize); 8668 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, 8669 DAG.getIntPtrConstant(BroadcastIdx, DL)); 8670 } 8671 8672 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) 8673 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, 8674 DAG.getBitcast(MVT::f64, V)); 8675 8676 // Bitcast back to the same scalar type as BroadcastVT. 8677 MVT SrcVT = V.getSimpleValueType(); 8678 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { 8679 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && 8680 "Unexpected vector element size"); 8681 if (SrcVT.isVector()) { 8682 unsigned NumSrcElts = SrcVT.getVectorNumElements(); 8683 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); 8684 } else { 8685 SrcVT = BroadcastVT.getScalarType(); 8686 } 8687 V = DAG.getBitcast(SrcVT, V); 8688 } 8689 8690 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); 8691 } 8692 8693 // Check for whether we can use INSERTPS to perform the shuffle. 
We only use 8694 // INSERTPS when the V1 elements are already in the correct locations 8695 // because otherwise we can just always use two SHUFPS instructions which 8696 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also 8697 // perform INSERTPS if a single V1 element is out of place and all V2 8698 // elements are zeroable. 8699 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, 8700 unsigned &InsertPSMask, 8701 const SmallBitVector &Zeroable, 8702 ArrayRef<int> Mask, 8703 SelectionDAG &DAG) { 8704 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); 8705 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); 8706 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 8707 unsigned ZMask = 0; 8708 int V1DstIndex = -1; 8709 int V2DstIndex = -1; 8710 bool V1UsedInPlace = false; 8711 8712 for (int i = 0; i < 4; ++i) { 8713 // Synthesize a zero mask from the zeroable elements (includes undefs). 8714 if (Zeroable[i]) { 8715 ZMask |= 1 << i; 8716 continue; 8717 } 8718 8719 // Flag if we use any V1 inputs in place. 8720 if (i == Mask[i]) { 8721 V1UsedInPlace = true; 8722 continue; 8723 } 8724 8725 // We can only insert a single non-zeroable element. 8726 if (V1DstIndex >= 0 || V2DstIndex >= 0) 8727 return false; 8728 8729 if (Mask[i] < 4) { 8730 // V1 input out of place for insertion. 8731 V1DstIndex = i; 8732 } else { 8733 // V2 input for insertion. 8734 V2DstIndex = i; 8735 } 8736 } 8737 8738 // Don't bother if we have no (non-zeroable) element for insertion. 8739 if (V1DstIndex < 0 && V2DstIndex < 0) 8740 return false; 8741 8742 // Determine element insertion src/dst indices. The src index is from the 8743 // start of the inserted vector, not the start of the concatenated vector. 8744 unsigned V2SrcIndex = 0; 8745 if (V1DstIndex >= 0) { 8746 // If we have a V1 input out of place, we use V1 as the V2 element insertion 8747 // and don't use the original V2 at all. 8748 V2SrcIndex = Mask[V1DstIndex]; 8749 V2DstIndex = V1DstIndex; 8750 V2 = V1; 8751 } else { 8752 V2SrcIndex = Mask[V2DstIndex] - 4; 8753 } 8754 8755 // If no V1 inputs are used in place, then the result is created only from 8756 // the zero mask and the V2 insertion - so remove V1 dependency. 8757 if (!V1UsedInPlace) 8758 V1 = DAG.getUNDEF(MVT::v4f32); 8759 8760 // Insert the V2 element into the desired position. 8761 InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; 8762 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 8763 return true; 8764 } 8765 8766 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, 8767 SDValue V2, ArrayRef<int> Mask, 8768 SelectionDAG &DAG) { 8769 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 8770 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 8771 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 8772 8773 // Attempt to match the insertps pattern. 8774 unsigned InsertPSMask; 8775 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) 8776 return SDValue(); 8777 8778 // Insert the V2 element into the desired position. 8779 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, 8780 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 8781 } 8782 8783 /// \brief Try to lower a shuffle as a permute of the inputs followed by an 8784 /// UNPCK instruction. 
8785 /// 8786 /// This specifically targets cases where we end up with alternating between 8787 /// the two inputs, and so can permute them into something that feeds a single 8788 /// UNPCK instruction. Note that this routine only targets integer vectors 8789 /// because for floating point vectors we have a generalized SHUFPS lowering 8790 /// strategy that handles everything that doesn't *exactly* match an unpack, 8791 /// making this clever lowering unnecessary. 8792 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, 8793 SDValue V1, SDValue V2, 8794 ArrayRef<int> Mask, 8795 SelectionDAG &DAG) { 8796 assert(!VT.isFloatingPoint() && 8797 "This routine only supports integer vectors."); 8798 assert(VT.is128BitVector() && 8799 "This routine only works on 128-bit vectors."); 8800 assert(!V2.isUndef() && 8801 "This routine should only be used when blending two inputs."); 8802 assert(Mask.size() >= 2 && "Single element masks are invalid."); 8803 8804 int Size = Mask.size(); 8805 8806 int NumLoInputs = 8807 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); 8808 int NumHiInputs = 8809 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); 8810 8811 bool UnpackLo = NumLoInputs >= NumHiInputs; 8812 8813 auto TryUnpack = [&](MVT UnpackVT, int Scale) { 8814 SmallVector<int, 16> V1Mask(Mask.size(), -1); 8815 SmallVector<int, 16> V2Mask(Mask.size(), -1); 8816 8817 for (int i = 0; i < Size; ++i) { 8818 if (Mask[i] < 0) 8819 continue; 8820 8821 // Each element of the unpack contains Scale elements from this mask. 8822 int UnpackIdx = i / Scale; 8823 8824 // We only handle the case where V1 feeds the first slots of the unpack. 8825 // We rely on canonicalization to ensure this is the case. 8826 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) 8827 return SDValue(); 8828 8829 // Setup the mask for this input. The indexing is tricky as we have to 8830 // handle the unpack stride. 8831 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; 8832 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = 8833 Mask[i] % Size; 8834 } 8835 8836 // If we will have to shuffle both inputs to use the unpack, check whether 8837 // we can just unpack first and shuffle the result. If so, skip this unpack. 8838 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && 8839 !isNoopShuffleMask(V2Mask)) 8840 return SDValue(); 8841 8842 // Shuffle the inputs into place. 8843 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); 8844 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); 8845 8846 // Cast the inputs to the type we will use to unpack them. 8847 V1 = DAG.getBitcast(UnpackVT, V1); 8848 V2 = DAG.getBitcast(UnpackVT, V2); 8849 8850 // Unpack the inputs and cast the result back to the desired type. 8851 return DAG.getBitcast( 8852 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, 8853 UnpackVT, V1, V2)); 8854 }; 8855 8856 // We try each unpack from the largest to the smallest to try and find one 8857 // that fits this mask. 
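  // (For a v8i16 shuffle this tries v2i64, then v4i32, then v8i16 unpacks.)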
8858 int OrigNumElements = VT.getVectorNumElements(); 8859 int OrigScalarSize = VT.getScalarSizeInBits(); 8860 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { 8861 int Scale = ScalarSize / OrigScalarSize; 8862 int NumElements = OrigNumElements / Scale; 8863 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); 8864 if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) 8865 return Unpack; 8866 } 8867 8868 // If none of the unpack-rooted lowerings worked (or were profitable) try an 8869 // initial unpack. 8870 if (NumLoInputs == 0 || NumHiInputs == 0) { 8871 assert((NumLoInputs > 0 || NumHiInputs > 0) && 8872 "We have to have *some* inputs!"); 8873 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; 8874 8875 // FIXME: We could consider the total complexity of the permute of each 8876 // possible unpacking. Or at the least we should consider how many 8877 // half-crossings are created. 8878 // FIXME: We could consider commuting the unpacks. 8879 8880 SmallVector<int, 32> PermMask((unsigned)Size, -1); 8881 for (int i = 0; i < Size; ++i) { 8882 if (Mask[i] < 0) 8883 continue; 8884 8885 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); 8886 8887 PermMask[i] = 8888 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); 8889 } 8890 return DAG.getVectorShuffle( 8891 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, 8892 DL, VT, V1, V2), 8893 DAG.getUNDEF(VT), PermMask); 8894 } 8895 8896 return SDValue(); 8897 } 8898 8899 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. 8900 /// 8901 /// This is the basis function for the 2-lane 64-bit shuffles as we have full 8902 /// support for floating point shuffles but not integer shuffles. These 8903 /// instructions will incur a domain crossing penalty on some chips though so 8904 /// it is better to avoid lowering through this for integer vectors where 8905 /// possible. 8906 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 8907 SDValue V1, SDValue V2, 8908 const X86Subtarget &Subtarget, 8909 SelectionDAG &DAG) { 8910 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); 8911 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); 8912 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); 8913 8914 if (V2.isUndef()) { 8915 // Check for being able to broadcast a single element. 8916 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 8917 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) 8918 return Broadcast; 8919 8920 // Straight shuffle of a single input vector. Simulate this by using the 8921 // single input as both of the "inputs" to this instruction.. 8922 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); 8923 8924 if (Subtarget.hasAVX()) { 8925 // If we have AVX, we can use VPERMILPS which will allow folding a load 8926 // into the shuffle. 8927 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, 8928 DAG.getConstant(SHUFPDMask, DL, MVT::i8)); 8929 } 8930 8931 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, 8932 DAG.getConstant(SHUFPDMask, DL, MVT::i8)); 8933 } 8934 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); 8935 assert(Mask[1] >= 2 && "Non-canonicalized blend!"); 8936 8937 // If we have a single input, insert that into V1 if we can do so cheaply. 
8938 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { 8939 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 8940 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) 8941 return Insertion; 8942 // Try inverting the insertion since for v2 masks it is easy to do and we 8943 // can't reliably sort the mask one way or the other. 8944 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), 8945 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; 8946 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 8947 DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) 8948 return Insertion; 8949 } 8950 8951 // Try to use one of the special instruction patterns to handle two common 8952 // blend patterns if a zero-blend above didn't work. 8953 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || 8954 isShuffleEquivalent(V1, V2, Mask, {1, 3})) 8955 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) 8956 // We can either use a special instruction to load over the low double or 8957 // to move just the low double. 8958 return DAG.getNode( 8959 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, 8960 DL, MVT::v2f64, V2, 8961 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); 8962 8963 if (Subtarget.hasSSE41()) 8964 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, 8965 Subtarget, DAG)) 8966 return Blend; 8967 8968 // Use dedicated unpack instructions for masks that match their pattern. 8969 if (SDValue V = 8970 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) 8971 return V; 8972 8973 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); 8974 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, 8975 DAG.getConstant(SHUFPDMask, DL, MVT::i8)); 8976 } 8977 8978 /// \brief Handle lowering of 2-lane 64-bit integer shuffles. 8979 /// 8980 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by 8981 /// the integer unit to minimize domain crossing penalties. However, for blends 8982 /// it falls back to the floating point shuffle operation with appropriate bit 8983 /// casting. 8984 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 8985 SDValue V1, SDValue V2, 8986 const X86Subtarget &Subtarget, 8987 SelectionDAG &DAG) { 8988 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); 8989 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); 8990 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); 8991 8992 if (V2.isUndef()) { 8993 // Check for being able to broadcast a single element. 8994 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 8995 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) 8996 return Broadcast; 8997 8998 // Straight shuffle of a single input vector. For everything from SSE2 8999 // onward this has a single fast instruction with no scary immediates. 9000 // We have to map the mask as it is actually a v4i32 shuffle instruction. 
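// For example, the v2i64 mask <1, 0> becomes the v4i32 PSHUFD mask
// <2, 3, 0, 1>; an undef v2i64 lane is mapped as if it selected lane 0.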
9001 V1 = DAG.getBitcast(MVT::v4i32, V1); 9002 int WidenedMask[4] = { 9003 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, 9004 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; 9005 return DAG.getBitcast( 9006 MVT::v2i64, 9007 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, 9008 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); 9009 } 9010 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); 9011 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); 9012 assert(Mask[0] < 2 && "We sort V1 to be the first input."); 9013 assert(Mask[1] >= 2 && "We sort V2 to be the second input."); 9014 9015 // If we have a blend of two same-type PACKUS operations and the blend aligns 9016 // with the low and high halves, we can just merge the PACKUS operations. 9017 // This is particularly important as it lets us merge shuffles that this 9018 // routine itself creates. 9019 auto GetPackNode = [](SDValue V) { 9020 V = peekThroughBitcasts(V); 9021 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); 9022 }; 9023 if (SDValue V1Pack = GetPackNode(V1)) 9024 if (SDValue V2Pack = GetPackNode(V2)) { 9025 EVT PackVT = V1Pack.getValueType(); 9026 if (PackVT == V2Pack.getValueType()) 9027 return DAG.getBitcast(MVT::v2i64, 9028 DAG.getNode(X86ISD::PACKUS, DL, PackVT, 9029 Mask[0] == 0 ? V1Pack.getOperand(0) 9030 : V1Pack.getOperand(1), 9031 Mask[1] == 2 ? V2Pack.getOperand(0) 9032 : V2Pack.getOperand(1))); 9033 } 9034 9035 // Try to use shift instructions. 9036 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, 9037 Subtarget, DAG)) 9038 return Shift; 9039 9040 // When loading a scalar and then shuffling it into a vector we can often do 9041 // the insertion cheaply. 9042 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 9043 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) 9044 return Insertion; 9045 // Try inverting the insertion since for v2 masks it is easy to do and we 9046 // can't reliably sort the mask one way or the other. 9047 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; 9048 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 9049 DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) 9050 return Insertion; 9051 9052 // We have different paths for blend lowering, but they all must use the 9053 // *exact* same predicate. 9054 bool IsBlendSupported = Subtarget.hasSSE41(); 9055 if (IsBlendSupported) 9056 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, 9057 Subtarget, DAG)) 9058 return Blend; 9059 9060 // Use dedicated unpack instructions for masks that match their pattern. 9061 if (SDValue V = 9062 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) 9063 return V; 9064 9065 // Try to use byte rotation instructions. 9066 // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 9067 if (Subtarget.hasSSSE3()) 9068 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 9069 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) 9070 return Rotate; 9071 9072 // If we have direct support for blends, we should lower by decomposing into 9073 // a permute. That will be faster than the domain cross. 9074 if (IsBlendSupported) 9075 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, 9076 Mask, DAG); 9077 9078 // We implement this with SHUFPD which is pretty lame because it will likely 9079 // incur 2 cycles of stall for integer vectors on Nehalem and older chips. 
9080 // However, all the alternatives are still more cycles and newer chips don't 9081 // have this problem. It would be really nice if x86 had better shuffles here. 9082 V1 = DAG.getBitcast(MVT::v2f64, V1); 9083 V2 = DAG.getBitcast(MVT::v2f64, V2); 9084 return DAG.getBitcast(MVT::v2i64, 9085 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); 9086 } 9087 9088 /// \brief Test whether this can be lowered with a single SHUFPS instruction. 9089 /// 9090 /// This is used to disable more specialized lowerings when the shufps lowering 9091 /// will happen to be efficient. 9092 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { 9093 // This routine only handles 128-bit shufps. 9094 assert(Mask.size() == 4 && "Unsupported mask size!"); 9095 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); 9096 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); 9097 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); 9098 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); 9099 9100 // To lower with a single SHUFPS we need to have the low half and high half 9101 // each requiring a single input. 9102 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) 9103 return false; 9104 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) 9105 return false; 9106 9107 return true; 9108 } 9109 9110 /// \brief Lower a vector shuffle using the SHUFPS instruction. 9111 /// 9112 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. 9113 /// It makes no assumptions about whether this is the *best* lowering, it simply 9114 /// uses it. 9115 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, 9116 ArrayRef<int> Mask, SDValue V1, 9117 SDValue V2, SelectionDAG &DAG) { 9118 SDValue LowV = V1, HighV = V2; 9119 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; 9120 9121 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); 9122 9123 if (NumV2Elements == 1) { 9124 int V2Index = 9125 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - 9126 Mask.begin(); 9127 9128 // Compute the index adjacent to V2Index and in the same half by toggling 9129 // the low bit. 9130 int V2AdjIndex = V2Index ^ 1; 9131 9132 if (Mask[V2AdjIndex] < 0) { 9133 // Handles all the cases where we have a single V2 element and an undef. 9134 // This will only ever happen in the high lanes because we commute the 9135 // vector otherwise. 9136 if (V2Index < 2) 9137 std::swap(LowV, HighV); 9138 NewMask[V2Index] -= 4; 9139 } else { 9140 // Handle the case where the V2 element ends up adjacent to a V1 element. 9141 // To make this work, blend them together as the first step. 9142 int V1Index = V2AdjIndex; 9143 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; 9144 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, 9145 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); 9146 9147 // Now proceed to reconstruct the final blend as we have the necessary 9148 // high or low half formed. 9149 if (V2Index < 2) { 9150 LowV = V2; 9151 HighV = V1; 9152 } else { 9153 HighV = V2; 9154 } 9155 NewMask[V1Index] = 2; // We put the V1 element in V2[2]. 9156 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. 9157 } 9158 } else if (NumV2Elements == 2) { 9159 if (Mask[0] < 4 && Mask[1] < 4) { 9160 // Handle the easy case where we have V1 in the low lanes and V2 in the 9161 // high lanes. 
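// For example, the mask <0, 1, 4, 5> becomes NewMask <0, 1, 0, 1> with
// LowV = V1 and HighV = V2 (an illustrative case; in practice such masks
// are usually caught by the dedicated MOVLHPS/unpack lowerings first).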
9162 NewMask[2] -= 4; 9163 NewMask[3] -= 4; 9164 } else if (Mask[2] < 4 && Mask[3] < 4) { 9165 // We also handle the reversed case because this utility may get called 9166 // when we detect a SHUFPS pattern but can't easily commute the shuffle to 9167 // arrange things in the right direction. 9168 NewMask[0] -= 4; 9169 NewMask[1] -= 4; 9170 HighV = V1; 9171 LowV = V2; 9172 } else { 9173 // We have a mixture of V1 and V2 in both low and high lanes. Rather than 9174 // trying to place elements directly, just blend them and set up the final 9175 // shuffle to place them. 9176 9177 // The first two blend mask elements are for V1, the second two are for 9178 // V2. 9179 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], 9180 Mask[2] < 4 ? Mask[2] : Mask[3], 9181 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, 9182 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; 9183 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, 9184 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); 9185 9186 // Now we do a normal shuffle of V1 by giving V1 as both operands to 9187 // a blend. 9188 LowV = HighV = V1; 9189 NewMask[0] = Mask[0] < 4 ? 0 : 2; 9190 NewMask[1] = Mask[0] < 4 ? 2 : 0; 9191 NewMask[2] = Mask[2] < 4 ? 1 : 3; 9192 NewMask[3] = Mask[2] < 4 ? 3 : 1; 9193 } 9194 } 9195 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, 9196 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); 9197 } 9198 9199 /// \brief Lower 4-lane 32-bit floating point shuffles. 9200 /// 9201 /// Uses instructions exclusively from the floating point unit to minimize 9202 /// domain crossing penalties, as these are sufficient to implement all v4f32 9203 /// shuffles. 9204 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 9205 SDValue V1, SDValue V2, 9206 const X86Subtarget &Subtarget, 9207 SelectionDAG &DAG) { 9208 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 9209 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 9210 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 9211 9212 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); 9213 9214 if (NumV2Elements == 0) { 9215 // Check for being able to broadcast a single element. 9216 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 9217 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) 9218 return Broadcast; 9219 9220 // Use even/odd duplicate instructions for masks that match their pattern. 9221 if (Subtarget.hasSSE3()) { 9222 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) 9223 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); 9224 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) 9225 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); 9226 } 9227 9228 if (Subtarget.hasAVX()) { 9229 // If we have AVX, we can use VPERMILPS which will allow folding a load 9230 // into the shuffle. 9231 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, 9232 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 9233 } 9234 9235 // Otherwise, use a straight shuffle of a single input vector. We pass the 9236 // input vector to both operands to simulate this with a SHUFPS. 9237 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, 9238 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 9239 } 9240 9241 // There are special ways we can lower some single-element blends. 
However, we 9242 // have custom ways we can lower more complex single-element blends below that 9243 // we defer to if both this and BLENDPS fail to match, so restrict this to 9244 // when the V2 input is targeting element 0 of the mask -- that is the fast 9245 // case here. 9246 if (NumV2Elements == 1 && Mask[0] >= 4) 9247 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, 9248 Mask, Subtarget, DAG)) 9249 return V; 9250 9251 if (Subtarget.hasSSE41()) { 9252 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, 9253 Subtarget, DAG)) 9254 return Blend; 9255 9256 // Use INSERTPS if we can complete the shuffle efficiently. 9257 if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG)) 9258 return V; 9259 9260 if (!isSingleSHUFPSMask(Mask)) 9261 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( 9262 DL, MVT::v4f32, V1, V2, Mask, DAG)) 9263 return BlendPerm; 9264 } 9265 9266 // Use low/high mov instructions. 9267 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) 9268 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); 9269 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) 9270 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); 9271 9272 // Use dedicated unpack instructions for masks that match their pattern. 9273 if (SDValue V = 9274 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) 9275 return V; 9276 9277 // Otherwise fall back to a SHUFPS lowering strategy. 9278 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); 9279 } 9280 9281 /// \brief Lower 4-lane i32 vector shuffles. 9282 /// 9283 /// We try to handle these with integer-domain shuffles where we can, but for 9284 /// blends we use the floating point domain blend instructions. 9285 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 9286 SDValue V1, SDValue V2, 9287 const X86Subtarget &Subtarget, 9288 SelectionDAG &DAG) { 9289 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); 9290 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); 9291 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 9292 9293 // Whenever we can lower this as a zext, that instruction is strictly faster 9294 // than any alternative. It also allows us to fold memory operands into the 9295 // shuffle in many cases. 9296 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, 9297 Mask, Subtarget, DAG)) 9298 return ZExt; 9299 9300 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); 9301 9302 if (NumV2Elements == 0) { 9303 // Check for being able to broadcast a single element. 9304 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 9305 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) 9306 return Broadcast; 9307 9308 // Straight shuffle of a single input vector. For everything from SSE2 9309 // onward this has a single fast instruction with no scary immediates. 9310 // We coerce the shuffle pattern to be compatible with UNPCK instructions 9311 // but we aren't actually going to use the UNPCK instruction because doing 9312 // so prevents folding a load into this instruction or making a copy. 
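// For example, the mask <0, 0, 1, 1> is emitted as a PSHUFD with the
// immediate 0x50 rather than as an UNPCKLDQ of V1 with itself.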
9313 const int UnpackLoMask[] = {0, 0, 1, 1}; 9314 const int UnpackHiMask[] = {2, 2, 3, 3}; 9315 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) 9316 Mask = UnpackLoMask; 9317 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) 9318 Mask = UnpackHiMask; 9319 9320 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, 9321 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 9322 } 9323 9324 // Try to use shift instructions. 9325 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, 9326 Subtarget, DAG)) 9327 return Shift; 9328 9329 // There are special ways we can lower some single-element blends. 9330 if (NumV2Elements == 1) 9331 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, 9332 Mask, Subtarget, DAG)) 9333 return V; 9334 9335 // We have different paths for blend lowering, but they all must use the 9336 // *exact* same predicate. 9337 bool IsBlendSupported = Subtarget.hasSSE41(); 9338 if (IsBlendSupported) 9339 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, 9340 Subtarget, DAG)) 9341 return Blend; 9342 9343 if (SDValue Masked = 9344 lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) 9345 return Masked; 9346 9347 // Use dedicated unpack instructions for masks that match their pattern. 9348 if (SDValue V = 9349 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) 9350 return V; 9351 9352 // Try to use byte rotation instructions. 9353 // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 9354 if (Subtarget.hasSSSE3()) 9355 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 9356 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) 9357 return Rotate; 9358 9359 // If we have direct support for blends, we should lower by decomposing into 9360 // a permute. That will be faster than the domain cross. 9361 if (IsBlendSupported) 9362 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, 9363 Mask, DAG); 9364 9365 // Try to lower by permuting the inputs into an unpack instruction. 9366 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, 9367 V2, Mask, DAG)) 9368 return Unpack; 9369 9370 // We implement this with SHUFPS because it can blend from two vectors. 9371 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build 9372 // up the inputs, bypassing domain shift penalties that we would encur if we 9373 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't 9374 // relevant. 9375 return DAG.getBitcast( 9376 MVT::v4i32, 9377 DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), 9378 DAG.getBitcast(MVT::v4f32, V2), Mask)); 9379 } 9380 9381 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 9382 /// shuffle lowering, and the most complex part. 9383 /// 9384 /// The lowering strategy is to try to form pairs of input lanes which are 9385 /// targeted at the same half of the final vector, and then use a dword shuffle 9386 /// to place them onto the right half, and finally unpack the paired lanes into 9387 /// their final position. 9388 /// 9389 /// The exact breakdown of how to form these dword pairs and align them on the 9390 /// correct sides is really tricky. See the comments within the function for 9391 /// more of the details. 9392 /// 9393 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each 9394 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to 9395 /// this routine for it to work correctly. 
To shuffle a 256-bit or 512-bit i16
9396 /// vector, form the analogous 128-bit 8-element Mask.
9397 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
9398 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
9399 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9400 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
9401 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
9402
9403 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
9404 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
9405 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
9406
9407 SmallVector<int, 4> LoInputs;
9408 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
9409 [](int M) { return M >= 0; });
9410 std::sort(LoInputs.begin(), LoInputs.end());
9411 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
9412 SmallVector<int, 4> HiInputs;
9413 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
9414 [](int M) { return M >= 0; });
9415 std::sort(HiInputs.begin(), HiInputs.end());
9416 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
9417 int NumLToL =
9418 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
9419 int NumHToL = LoInputs.size() - NumLToL;
9420 int NumLToH =
9421 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
9422 int NumHToH = HiInputs.size() - NumLToH;
9423 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
9424 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
9425 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
9426 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
9427
9428 // If we are splatting two values from one half - one to each half, then
9429 // we can shuffle that half so each is splatted to a dword, then splat those
9430 // to their respective halves.
9431 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
9432 int DOffset) {
9433 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
9434 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
9435 V = DAG.getNode(ShufWOp, DL, VT, V,
9436 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9437 V = DAG.getBitcast(PSHUFDVT, V);
9438 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
9439 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9440 return DAG.getBitcast(VT, V);
9441 };
9442
9443 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
9444 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
9445 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
9446 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
9447
9448 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
9449 // such inputs we can swap two of the dwords across the half mark and end up
9450 // with <=2 inputs to each half in each half. Once there, we can fall through
9451 // to the generic code below. For example:
9452 //
9453 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9454 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
9455 //
9456 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
9457 // and an existing 2-into-2 on the other half.
In this case we may have to 9458 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or 9459 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. 9460 // Fortunately, we don't have to handle anything but a 2-into-2 pattern 9461 // because any other situation (including a 3-into-1 or 1-into-3 in the other 9462 // half than the one we target for fixing) will be fixed when we re-enter this 9463 // path. We will also combine away any sequence of PSHUFD instructions that 9464 // result into a single instruction. Here is an example of the tricky case: 9465 // 9466 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] 9467 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] 9468 // 9469 // This now has a 1-into-3 in the high half! Instead, we do two shuffles: 9470 // 9471 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] 9472 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] 9473 // 9474 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] 9475 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] 9476 // 9477 // The result is fine to be handled by the generic logic. 9478 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, 9479 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, 9480 int AOffset, int BOffset) { 9481 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && 9482 "Must call this with A having 3 or 1 inputs from the A half."); 9483 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && 9484 "Must call this with B having 1 or 3 inputs from the B half."); 9485 assert(AToAInputs.size() + BToAInputs.size() == 4 && 9486 "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); 9487 9488 bool ThreeAInputs = AToAInputs.size() == 3; 9489 9490 // Compute the index of dword with only one word among the three inputs in 9491 // a half by taking the sum of the half with three inputs and subtracting 9492 // the sum of the actual three inputs. The difference is the remaining 9493 // slot. 9494 int ADWord, BDWord; 9495 int &TripleDWord = ThreeAInputs ? ADWord : BDWord; 9496 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; 9497 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; 9498 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; 9499 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; 9500 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); 9501 int TripleNonInputIdx = 9502 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); 9503 TripleDWord = TripleNonInputIdx / 2; 9504 9505 // We use xor with one to compute the adjacent DWord to whichever one the 9506 // OneInput is in. 9507 OneInputDWord = (OneInput / 2) ^ 1; 9508 9509 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA 9510 // and BToA inputs. If there is also such a problem with the BToB and AToB 9511 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in 9512 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it 9513 // is essential that we don't *create* a 3<-1 as then we might oscillate. 9514 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { 9515 // Compute how many inputs will be flipped by swapping these DWords. We 9516 // need 9517 // to balance this to ensure we don't form a 3-1 shuffle in the other 9518 // half. 
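// For example, if ADWord ends up as dword 1, the A-to-B inputs that the
// dword swap would flip are exactly those at word indices 2 and 3.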
9519 int NumFlippedAToBInputs = 9520 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + 9521 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); 9522 int NumFlippedBToBInputs = 9523 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + 9524 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); 9525 if ((NumFlippedAToBInputs == 1 && 9526 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || 9527 (NumFlippedBToBInputs == 1 && 9528 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { 9529 // We choose whether to fix the A half or B half based on whether that 9530 // half has zero flipped inputs. At zero, we may not be able to fix it 9531 // with that half. We also bias towards fixing the B half because that 9532 // will more commonly be the high half, and we have to bias one way. 9533 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, 9534 ArrayRef<int> Inputs) { 9535 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. 9536 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), 9537 PinnedIdx ^ 1) != Inputs.end(); 9538 // Determine whether the free index is in the flipped dword or the 9539 // unflipped dword based on where the pinned index is. We use this bit 9540 // in an xor to conditionally select the adjacent dword. 9541 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); 9542 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), 9543 FixFreeIdx) != Inputs.end(); 9544 if (IsFixIdxInput == IsFixFreeIdxInput) 9545 FixFreeIdx += 1; 9546 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), 9547 FixFreeIdx) != Inputs.end(); 9548 assert(IsFixIdxInput != IsFixFreeIdxInput && 9549 "We need to be changing the number of flipped inputs!"); 9550 int PSHUFHalfMask[] = {0, 1, 2, 3}; 9551 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); 9552 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, 9553 MVT::v8i16, V, 9554 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); 9555 9556 for (int &M : Mask) 9557 if (M >= 0 && M == FixIdx) 9558 M = FixFreeIdx; 9559 else if (M >= 0 && M == FixFreeIdx) 9560 M = FixIdx; 9561 }; 9562 if (NumFlippedBToBInputs != 0) { 9563 int BPinnedIdx = 9564 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; 9565 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); 9566 } else { 9567 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); 9568 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; 9569 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); 9570 } 9571 } 9572 } 9573 9574 int PSHUFDMask[] = {0, 1, 2, 3}; 9575 PSHUFDMask[ADWord] = BDWord; 9576 PSHUFDMask[BDWord] = ADWord; 9577 V = DAG.getBitcast( 9578 VT, 9579 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), 9580 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); 9581 9582 // Adjust the mask to match the new locations of A and B. 9583 for (int &M : Mask) 9584 if (M >= 0 && M/2 == ADWord) 9585 M = 2 * BDWord + M % 2; 9586 else if (M >= 0 && M/2 == BDWord) 9587 M = 2 * ADWord + M % 2; 9588 9589 // Recurse back into this routine to re-compute state now that this isn't 9590 // a 3 and 1 problem. 
9591 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, 9592 DAG); 9593 }; 9594 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) 9595 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); 9596 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) 9597 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); 9598 9599 // At this point there are at most two inputs to the low and high halves from 9600 // each half. That means the inputs can always be grouped into dwords and 9601 // those dwords can then be moved to the correct half with a dword shuffle. 9602 // We use at most one low and one high word shuffle to collect these paired 9603 // inputs into dwords, and finally a dword shuffle to place them. 9604 int PSHUFLMask[4] = {-1, -1, -1, -1}; 9605 int PSHUFHMask[4] = {-1, -1, -1, -1}; 9606 int PSHUFDMask[4] = {-1, -1, -1, -1}; 9607 9608 // First fix the masks for all the inputs that are staying in their 9609 // original halves. This will then dictate the targets of the cross-half 9610 // shuffles. 9611 auto fixInPlaceInputs = 9612 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, 9613 MutableArrayRef<int> SourceHalfMask, 9614 MutableArrayRef<int> HalfMask, int HalfOffset) { 9615 if (InPlaceInputs.empty()) 9616 return; 9617 if (InPlaceInputs.size() == 1) { 9618 SourceHalfMask[InPlaceInputs[0] - HalfOffset] = 9619 InPlaceInputs[0] - HalfOffset; 9620 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; 9621 return; 9622 } 9623 if (IncomingInputs.empty()) { 9624 // Just fix all of the in place inputs. 9625 for (int Input : InPlaceInputs) { 9626 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; 9627 PSHUFDMask[Input / 2] = Input / 2; 9628 } 9629 return; 9630 } 9631 9632 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); 9633 SourceHalfMask[InPlaceInputs[0] - HalfOffset] = 9634 InPlaceInputs[0] - HalfOffset; 9635 // Put the second input next to the first so that they are packed into 9636 // a dword. We find the adjacent index by toggling the low bit. 9637 int AdjIndex = InPlaceInputs[0] ^ 1; 9638 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; 9639 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); 9640 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; 9641 }; 9642 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); 9643 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); 9644 9645 // Now gather the cross-half inputs and place them into a free dword of 9646 // their target half. 9647 // FIXME: This operation could almost certainly be simplified dramatically to 9648 // look more like the 3-1 fixing operation. 
9649 auto moveInputsToRightHalf = [&PSHUFDMask]( 9650 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, 9651 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, 9652 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, 9653 int DestOffset) { 9654 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { 9655 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; 9656 }; 9657 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, 9658 int Word) { 9659 int LowWord = Word & ~1; 9660 int HighWord = Word | 1; 9661 return isWordClobbered(SourceHalfMask, LowWord) || 9662 isWordClobbered(SourceHalfMask, HighWord); 9663 }; 9664 9665 if (IncomingInputs.empty()) 9666 return; 9667 9668 if (ExistingInputs.empty()) { 9669 // Map any dwords with inputs from them into the right half. 9670 for (int Input : IncomingInputs) { 9671 // If the source half mask maps over the inputs, turn those into 9672 // swaps and use the swapped lane. 9673 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { 9674 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { 9675 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = 9676 Input - SourceOffset; 9677 // We have to swap the uses in our half mask in one sweep. 9678 for (int &M : HalfMask) 9679 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) 9680 M = Input; 9681 else if (M == Input) 9682 M = SourceHalfMask[Input - SourceOffset] + SourceOffset; 9683 } else { 9684 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == 9685 Input - SourceOffset && 9686 "Previous placement doesn't match!"); 9687 } 9688 // Note that this correctly re-maps both when we do a swap and when 9689 // we observe the other side of the swap above. We rely on that to 9690 // avoid swapping the members of the input list directly. 9691 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; 9692 } 9693 9694 // Map the input's dword into the correct half. 9695 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) 9696 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; 9697 else 9698 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == 9699 Input / 2 && 9700 "Previous placement doesn't match!"); 9701 } 9702 9703 // And just directly shift any other-half mask elements to be same-half 9704 // as we will have mirrored the dword containing the element into the 9705 // same position within that half. 9706 for (int &M : HalfMask) 9707 if (M >= SourceOffset && M < SourceOffset + 4) { 9708 M = M - SourceOffset + DestOffset; 9709 assert(M >= 0 && "This should never wrap below zero!"); 9710 } 9711 return; 9712 } 9713 9714 // Ensure we have the input in a viable dword of its current half. This 9715 // is particularly tricky because the original position may be clobbered 9716 // by inputs being moved and *staying* in that half. 
9717 if (IncomingInputs.size() == 1) { 9718 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { 9719 int InputFixed = std::find(std::begin(SourceHalfMask), 9720 std::end(SourceHalfMask), -1) - 9721 std::begin(SourceHalfMask) + SourceOffset; 9722 SourceHalfMask[InputFixed - SourceOffset] = 9723 IncomingInputs[0] - SourceOffset; 9724 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], 9725 InputFixed); 9726 IncomingInputs[0] = InputFixed; 9727 } 9728 } else if (IncomingInputs.size() == 2) { 9729 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || 9730 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { 9731 // We have two non-adjacent or clobbered inputs we need to extract from 9732 // the source half. To do this, we need to map them into some adjacent 9733 // dword slot in the source mask. 9734 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, 9735 IncomingInputs[1] - SourceOffset}; 9736 9737 // If there is a free slot in the source half mask adjacent to one of 9738 // the inputs, place the other input in it. We use (Index XOR 1) to 9739 // compute an adjacent index. 9740 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && 9741 SourceHalfMask[InputsFixed[0] ^ 1] < 0) { 9742 SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; 9743 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; 9744 InputsFixed[1] = InputsFixed[0] ^ 1; 9745 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && 9746 SourceHalfMask[InputsFixed[1] ^ 1] < 0) { 9747 SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; 9748 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; 9749 InputsFixed[0] = InputsFixed[1] ^ 1; 9750 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && 9751 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { 9752 // The two inputs are in the same DWord but it is clobbered and the 9753 // adjacent DWord isn't used at all. Move both inputs to the free 9754 // slot. 9755 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; 9756 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; 9757 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); 9758 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; 9759 } else { 9760 // The only way we hit this point is if there is no clobbering 9761 // (because there are no off-half inputs to this half) and there is no 9762 // free slot adjacent to one of the inputs. In this case, we have to 9763 // swap an input with a non-input. 9764 for (int i = 0; i < 4; ++i) 9765 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && 9766 "We can't handle any clobbers here!"); 9767 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && 9768 "Cannot have adjacent inputs here!"); 9769 9770 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; 9771 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; 9772 9773 // We also have to update the final source mask in this case because 9774 // it may need to undo the above swap. 9775 for (int &M : FinalSourceHalfMask) 9776 if (M == (InputsFixed[0] ^ 1) + SourceOffset) 9777 M = InputsFixed[1] + SourceOffset; 9778 else if (M == InputsFixed[1] + SourceOffset) 9779 M = (InputsFixed[0] ^ 1) + SourceOffset; 9780 9781 InputsFixed[1] = InputsFixed[0] ^ 1; 9782 } 9783 9784 // Point everything at the fixed inputs. 
9785 for (int &M : HalfMask) 9786 if (M == IncomingInputs[0]) 9787 M = InputsFixed[0] + SourceOffset; 9788 else if (M == IncomingInputs[1]) 9789 M = InputsFixed[1] + SourceOffset; 9790 9791 IncomingInputs[0] = InputsFixed[0] + SourceOffset; 9792 IncomingInputs[1] = InputsFixed[1] + SourceOffset; 9793 } 9794 } else { 9795 llvm_unreachable("Unhandled input size!"); 9796 } 9797 9798 // Now hoist the DWord down to the right half. 9799 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; 9800 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); 9801 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; 9802 for (int &M : HalfMask) 9803 for (int Input : IncomingInputs) 9804 if (M == Input) 9805 M = FreeDWord * 2 + Input % 2; 9806 }; 9807 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, 9808 /*SourceOffset*/ 4, /*DestOffset*/ 0); 9809 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, 9810 /*SourceOffset*/ 0, /*DestOffset*/ 4); 9811 9812 // Now enact all the shuffles we've computed to move the inputs into their 9813 // target half. 9814 if (!isNoopShuffleMask(PSHUFLMask)) 9815 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, 9816 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); 9817 if (!isNoopShuffleMask(PSHUFHMask)) 9818 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, 9819 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); 9820 if (!isNoopShuffleMask(PSHUFDMask)) 9821 V = DAG.getBitcast( 9822 VT, 9823 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), 9824 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); 9825 9826 // At this point, each half should contain all its inputs, and we can then 9827 // just shuffle them into their final position. 9828 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && 9829 "Failed to lift all the high half inputs to the low mask!"); 9830 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && 9831 "Failed to lift all the low half inputs to the high mask!"); 9832 9833 // Do a half shuffle for the low mask. 9834 if (!isNoopShuffleMask(LoMask)) 9835 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, 9836 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); 9837 9838 // Do a half shuffle with the high mask after shifting its values down. 9839 for (int &M : HiMask) 9840 if (M >= 0) 9841 M -= 4; 9842 if (!isNoopShuffleMask(HiMask)) 9843 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, 9844 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); 9845 9846 return V; 9847 } 9848 9849 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the 9850 /// blend if only one input is used. 9851 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( 9852 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 9853 SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { 9854 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 9855 SDValue V1Mask[16]; 9856 SDValue V2Mask[16]; 9857 V1InUse = false; 9858 V2InUse = false; 9859 9860 int Size = Mask.size(); 9861 int Scale = 16 / Size; 9862 for (int i = 0; i < 16; ++i) { 9863 if (Mask[i / Scale] < 0) { 9864 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); 9865 } else { 9866 const int ZeroMask = 0x80; 9867 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale 9868 : ZeroMask; 9869 int V2Idx = Mask[i / Scale] < Size 9870 ? 
ZeroMask 9871 : (Mask[i / Scale] - Size) * Scale + i % Scale; 9872 if (Zeroable[i / Scale]) 9873 V1Idx = V2Idx = ZeroMask; 9874 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); 9875 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); 9876 V1InUse |= (ZeroMask != V1Idx); 9877 V2InUse |= (ZeroMask != V2Idx); 9878 } 9879 } 9880 9881 if (V1InUse) 9882 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, 9883 DAG.getBitcast(MVT::v16i8, V1), 9884 DAG.getBuildVector(MVT::v16i8, DL, V1Mask)); 9885 if (V2InUse) 9886 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, 9887 DAG.getBitcast(MVT::v16i8, V2), 9888 DAG.getBuildVector(MVT::v16i8, DL, V2Mask)); 9889 9890 // If we need shuffled inputs from both, blend the two. 9891 SDValue V; 9892 if (V1InUse && V2InUse) 9893 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); 9894 else 9895 V = V1InUse ? V1 : V2; 9896 9897 // Cast the result back to the correct type. 9898 return DAG.getBitcast(VT, V); 9899 } 9900 9901 /// \brief Generic lowering of 8-lane i16 shuffles. 9902 /// 9903 /// This handles both single-input shuffles and combined shuffle/blends with 9904 /// two inputs. The single input shuffles are immediately delegated to 9905 /// a dedicated lowering routine. 9906 /// 9907 /// The blends are lowered in one of three fundamental ways. If there are few 9908 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle 9909 /// of the input is significantly cheaper when lowered as an interleaving of 9910 /// the two inputs, try to interleave them. Otherwise, blend the low and high 9911 /// halves of the inputs separately (making them have relatively few inputs) 9912 /// and then concatenate them. 9913 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 9914 SDValue V1, SDValue V2, 9915 const X86Subtarget &Subtarget, 9916 SelectionDAG &DAG) { 9917 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); 9918 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); 9919 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 9920 9921 // Whenever we can lower this as a zext, that instruction is strictly faster 9922 // than any alternative. 9923 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( 9924 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) 9925 return ZExt; 9926 9927 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); 9928 9929 if (NumV2Inputs == 0) { 9930 // Check for being able to broadcast a single element. 9931 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 9932 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) 9933 return Broadcast; 9934 9935 // Try to use shift instructions. 9936 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, 9937 Subtarget, DAG)) 9938 return Shift; 9939 9940 // Use dedicated unpack instructions for masks that match their pattern. 9941 if (SDValue V = 9942 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) 9943 return V; 9944 9945 // Try to use byte rotation instructions. 9946 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, 9947 Mask, Subtarget, DAG)) 9948 return Rotate; 9949 9950 // Make a copy of the mask so it can be modified. 
9951 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end()); 9952 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, 9953 MutableMask, Subtarget, 9954 DAG); 9955 } 9956 9957 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && 9958 "All single-input shuffles should be canonicalized to be V1-input " 9959 "shuffles."); 9960 9961 // Try to use shift instructions. 9962 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, 9963 Subtarget, DAG)) 9964 return Shift; 9965 9966 // See if we can use SSE4A Extraction / Insertion. 9967 if (Subtarget.hasSSE4A()) 9968 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) 9969 return V; 9970 9971 // There are special ways we can lower some single-element blends. 9972 if (NumV2Inputs == 1) 9973 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, 9974 Mask, Subtarget, DAG)) 9975 return V; 9976 9977 // We have different paths for blend lowering, but they all must use the 9978 // *exact* same predicate. 9979 bool IsBlendSupported = Subtarget.hasSSE41(); 9980 if (IsBlendSupported) 9981 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, 9982 Subtarget, DAG)) 9983 return Blend; 9984 9985 if (SDValue Masked = 9986 lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) 9987 return Masked; 9988 9989 // Use dedicated unpack instructions for masks that match their pattern. 9990 if (SDValue V = 9991 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) 9992 return V; 9993 9994 // Try to use byte rotation instructions. 9995 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 9996 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) 9997 return Rotate; 9998 9999 if (SDValue BitBlend = 10000 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) 10001 return BitBlend; 10002 10003 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, 10004 V2, Mask, DAG)) 10005 return Unpack; 10006 10007 // If we can't directly blend but can use PSHUFB, that will be better as it 10008 // can both shuffle and set up the inefficient blend. 10009 if (!IsBlendSupported && Subtarget.hasSSSE3()) { 10010 bool V1InUse, V2InUse; 10011 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG, 10012 V1InUse, V2InUse); 10013 } 10014 10015 // We can always bit-blend if we have to so the fallback strategy is to 10016 // decompose into single-input permutes and blends. 10017 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, 10018 Mask, DAG); 10019 } 10020 10021 /// \brief Check whether a compaction lowering can be done by dropping even 10022 /// elements and compute how many times even elements must be dropped. 10023 /// 10024 /// This handles shuffles which take every Nth element where N is a power of 10025 /// two. Example shuffle masks: 10026 /// 10027 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 10028 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 10029 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 10030 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 10031 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 10032 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 10033 /// 10034 /// Any of these lanes can of course be undef. 10035 /// 10036 /// This routine only supports N <= 3. 
10037 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here 10038 /// for larger N. 10039 /// 10040 /// \returns N above, or the number of times even elements must be dropped if 10041 /// there is such a number. Otherwise returns zero. 10042 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, 10043 bool IsSingleInput) { 10044 // The modulus for the shuffle vector entries is based on whether this is 10045 // a single input or not. 10046 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); 10047 assert(isPowerOf2_32((uint32_t)ShuffleModulus) && 10048 "We should only be called with masks with a power-of-2 size!"); 10049 10050 uint64_t ModMask = (uint64_t)ShuffleModulus - 1; 10051 10052 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, 10053 // and 2^3 simultaneously. This is because we may have ambiguity with 10054 // partially undef inputs. 10055 bool ViableForN[3] = {true, true, true}; 10056 10057 for (int i = 0, e = Mask.size(); i < e; ++i) { 10058 // Ignore undef lanes, we'll optimistically collapse them to the pattern we 10059 // want. 10060 if (Mask[i] < 0) 10061 continue; 10062 10063 bool IsAnyViable = false; 10064 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) 10065 if (ViableForN[j]) { 10066 uint64_t N = j + 1; 10067 10068 // The shuffle mask must be equal to (i * 2^N) % M. 10069 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) 10070 IsAnyViable = true; 10071 else 10072 ViableForN[j] = false; 10073 } 10074 // Early exit if we exhaust the possible powers of two. 10075 if (!IsAnyViable) 10076 break; 10077 } 10078 10079 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) 10080 if (ViableForN[j]) 10081 return j + 1; 10082 10083 // Return 0 as there is no viable power of two. 10084 return 0; 10085 } 10086 10087 /// \brief Generic lowering of v16i8 shuffles. 10088 /// 10089 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to 10090 /// detect any complexity reducing interleaving. If that doesn't help, it uses 10091 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses 10092 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them 10093 /// back together. 10094 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 10095 SDValue V1, SDValue V2, 10096 const X86Subtarget &Subtarget, 10097 SelectionDAG &DAG) { 10098 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); 10099 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); 10100 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 10101 10102 // Try to use shift instructions. 10103 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, 10104 Subtarget, DAG)) 10105 return Shift; 10106 10107 // Try to use byte rotation instructions. 10108 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 10109 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) 10110 return Rotate; 10111 10112 // Try to use a zext lowering. 10113 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( 10114 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) 10115 return ZExt; 10116 10117 // See if we can use SSE4A Extraction / Insertion. 
10118 if (Subtarget.hasSSE4A()) 10119 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) 10120 return V; 10121 10122 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); 10123 10124 // For single-input shuffles, there are some nicer lowering tricks we can use. 10125 if (NumV2Elements == 0) { 10126 // Check for being able to broadcast a single element. 10127 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 10128 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) 10129 return Broadcast; 10130 10131 // Check whether we can widen this to an i16 shuffle by duplicating bytes. 10132 // Notably, this handles splat and partial-splat shuffles more efficiently. 10133 // However, it only makes sense if the pre-duplication shuffle simplifies 10134 // things significantly. Currently, this means we need to be able to 10135 // express the pre-duplication shuffle as an i16 shuffle. 10136 // 10137 // FIXME: We should check for other patterns which can be widened into an 10138 // i16 shuffle as well. 10139 auto canWidenViaDuplication = [](ArrayRef<int> Mask) { 10140 for (int i = 0; i < 16; i += 2) 10141 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) 10142 return false; 10143 10144 return true; 10145 }; 10146 auto tryToWidenViaDuplication = [&]() -> SDValue { 10147 if (!canWidenViaDuplication(Mask)) 10148 return SDValue(); 10149 SmallVector<int, 4> LoInputs; 10150 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), 10151 [](int M) { return M >= 0 && M < 8; }); 10152 std::sort(LoInputs.begin(), LoInputs.end()); 10153 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), 10154 LoInputs.end()); 10155 SmallVector<int, 4> HiInputs; 10156 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), 10157 [](int M) { return M >= 8; }); 10158 std::sort(HiInputs.begin(), HiInputs.end()); 10159 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), 10160 HiInputs.end()); 10161 10162 bool TargetLo = LoInputs.size() >= HiInputs.size(); 10163 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; 10164 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; 10165 10166 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; 10167 SmallDenseMap<int, int, 8> LaneMap; 10168 for (int I : InPlaceInputs) { 10169 PreDupI16Shuffle[I/2] = I/2; 10170 LaneMap[I] = I; 10171 } 10172 int j = TargetLo ? 0 : 4, je = j + 4; 10173 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { 10174 // Check if j is already a shuffle of this input. This happens when 10175 // there are two adjacent bytes after we move the low one. 10176 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { 10177 // If we haven't yet mapped the input, search for a slot into which 10178 // we can map it. 10179 while (j < je && PreDupI16Shuffle[j] >= 0) 10180 ++j; 10181 10182 if (j == je) 10183 // We can't place the inputs into a single half with a simple i16 shuffle, so bail. 10184 return SDValue(); 10185 10186 // Map this input with the i16 shuffle. 10187 PreDupI16Shuffle[j] = MovingInputs[i] / 2; 10188 } 10189 10190 // Update the lane map based on the mapping we ended up with. 10191 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; 10192 } 10193 V1 = DAG.getBitcast( 10194 MVT::v16i8, 10195 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), 10196 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); 10197 10198 // Unpack the bytes to form the i16s that will be shuffled into place. 10199 V1 = DAG.getNode(TargetLo ? 
X86ISD::UNPCKL : X86ISD::UNPCKH, DL, 10200 MVT::v16i8, V1, V1); 10201 10202 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 10203 for (int i = 0; i < 16; ++i) 10204 if (Mask[i] >= 0) { 10205 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); 10206 assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); 10207 if (PostDupI16Shuffle[i / 2] < 0) 10208 PostDupI16Shuffle[i / 2] = MappedMask; 10209 else 10210 assert(PostDupI16Shuffle[i / 2] == MappedMask && 10211 "Conflicting entrties in the original shuffle!"); 10212 } 10213 return DAG.getBitcast( 10214 MVT::v16i8, 10215 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), 10216 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); 10217 }; 10218 if (SDValue V = tryToWidenViaDuplication()) 10219 return V; 10220 } 10221 10222 if (SDValue Masked = 10223 lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) 10224 return Masked; 10225 10226 // Use dedicated unpack instructions for masks that match their pattern. 10227 if (SDValue V = 10228 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) 10229 return V; 10230 10231 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly 10232 // with PSHUFB. It is important to do this before we attempt to generate any 10233 // blends but after all of the single-input lowerings. If the single input 10234 // lowerings can find an instruction sequence that is faster than a PSHUFB, we 10235 // want to preserve that and we can DAG combine any longer sequences into 10236 // a PSHUFB in the end. But once we start blending from multiple inputs, 10237 // the complexity of DAG combining bad patterns back into PSHUFB is too high, 10238 // and there are *very* few patterns that would actually be faster than the 10239 // PSHUFB approach because of its ability to zero lanes. 10240 // 10241 // FIXME: The only exceptions to the above are blends which are exact 10242 // interleavings with direct instructions supporting them. We currently don't 10243 // handle those well here. 10244 if (Subtarget.hasSSSE3()) { 10245 bool V1InUse = false; 10246 bool V2InUse = false; 10247 10248 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( 10249 DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse); 10250 10251 // If both V1 and V2 are in use and we can use a direct blend or an unpack, 10252 // do so. This avoids using them to handle blends-with-zero which is 10253 // important as a single pshufb is significantly faster for that. 10254 if (V1InUse && V2InUse) { 10255 if (Subtarget.hasSSE41()) 10256 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, 10257 Mask, Subtarget, DAG)) 10258 return Blend; 10259 10260 // We can use an unpack to do the blending rather than an or in some 10261 // cases. Even though the or may be (very minorly) more efficient, we 10262 // preference this lowering because there are common cases where part of 10263 // the complexity of the shuffles goes away when we do the final blend as 10264 // an unpack. 10265 // FIXME: It might be worth trying to detect if the unpack-feeding 10266 // shuffles will both be pshufb, in which case we shouldn't bother with 10267 // this. 10268 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( 10269 DL, MVT::v16i8, V1, V2, Mask, DAG)) 10270 return Unpack; 10271 } 10272 10273 return PSHUFB; 10274 } 10275 10276 // There are special ways we can lower some single-element blends. 
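// e.g. a mask that takes fifteen bytes from V1 and exactly one byte from V2
// is really an insertion of a single element rather than a full two-input
// shuffle, and can be lowered as such.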
10277 if (NumV2Elements == 1) 10278 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, 10279 Mask, Subtarget, DAG)) 10280 return V; 10281 10282 if (SDValue BitBlend = 10283 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) 10284 return BitBlend; 10285 10286 // Check whether a compaction lowering can be done. This handles shuffles 10287 // which take every Nth element for some even N. See the helper function for 10288 // details. 10289 // 10290 // We special case these as they can be particularly efficiently handled with 10291 // the PACKUSB instruction on x86 and they show up in common patterns of 10292 // rearranging bytes to truncate wide elements. 10293 bool IsSingleInput = V2.isUndef(); 10294 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { 10295 // NumEvenDrops is the power of two stride of the elements. Another way of 10296 // thinking about it is that we need to drop the even elements this many 10297 // times to get the original input. 10298 10299 // First we need to zero all the dropped bytes. 10300 assert(NumEvenDrops <= 3 && 10301 "No support for dropping even elements more than 3 times."); 10302 // We use the mask type to pick which bytes are preserved based on how many 10303 // elements are dropped. 10304 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; 10305 SDValue ByteClearMask = DAG.getBitcast( 10306 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); 10307 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); 10308 if (!IsSingleInput) 10309 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); 10310 10311 // Now pack things back together. 10312 V1 = DAG.getBitcast(MVT::v8i16, V1); 10313 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); 10314 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); 10315 for (int i = 1; i < NumEvenDrops; ++i) { 10316 Result = DAG.getBitcast(MVT::v8i16, Result); 10317 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); 10318 } 10319 10320 return Result; 10321 } 10322 10323 // Handle multi-input cases by blending single-input shuffles. 10324 if (NumV2Elements > 0) 10325 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, 10326 Mask, DAG); 10327 10328 // The fallback path for single-input shuffles widens this into two v8i16 10329 // vectors with unpacks, shuffles those, and then pulls them back together 10330 // with a pack. 10331 SDValue V = V1; 10332 10333 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 10334 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 10335 for (int i = 0; i < 16; ++i) 10336 if (Mask[i] >= 0) 10337 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; 10338 10339 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); 10340 10341 SDValue VLoHalf, VHiHalf; 10342 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask 10343 // them out and avoid using UNPCK{L,H} to extract the elements of V as 10344 // i16s. 10345 if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), 10346 [](int M) { return M >= 0 && M % 2 == 1; }) && 10347 std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), 10348 [](int M) { return M >= 0 && M % 2 == 1; })) { 10349 // Use a mask to drop the high bytes. 
10350 VLoHalf = DAG.getBitcast(MVT::v8i16, V); 10351 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, 10352 DAG.getConstant(0x00FF, DL, MVT::v8i16)); 10353 10354 // This will be a single vector shuffle instead of a blend so nuke VHiHalf. 10355 VHiHalf = DAG.getUNDEF(MVT::v8i16); 10356 10357 // Squash the masks to point directly into VLoHalf. 10358 for (int &M : LoBlendMask) 10359 if (M >= 0) 10360 M /= 2; 10361 for (int &M : HiBlendMask) 10362 if (M >= 0) 10363 M /= 2; 10364 } else { 10365 // Otherwise just unpack the low half of V into VLoHalf and the high half into 10366 // VHiHalf so that we can blend them as i16s. 10367 VLoHalf = DAG.getBitcast( 10368 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); 10369 VHiHalf = DAG.getBitcast( 10370 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); 10371 } 10372 10373 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); 10374 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); 10375 10376 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); 10377 } 10378 10379 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. 10380 /// 10381 /// This routine breaks down the specific type of 128-bit shuffle and 10382 /// dispatches to the lowering routines accordingly. 10383 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 10384 MVT VT, SDValue V1, SDValue V2, 10385 const X86Subtarget &Subtarget, 10386 SelectionDAG &DAG) { 10387 switch (VT.SimpleTy) { 10388 case MVT::v2i64: 10389 return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 10390 case MVT::v2f64: 10391 return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 10392 case MVT::v4i32: 10393 return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 10394 case MVT::v4f32: 10395 return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 10396 case MVT::v8i16: 10397 return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 10398 case MVT::v16i8: 10399 return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 10400 10401 default: 10402 llvm_unreachable("Unimplemented!"); 10403 } 10404 } 10405 10406 /// \brief Helper function to test whether a shuffle mask could be 10407 /// simplified by widening the elements being shuffled. 10408 /// 10409 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise 10410 /// leaves it in an unspecified state. 10411 /// 10412 /// NOTE: This must handle normal vector shuffle masks and *target* vector 10413 /// shuffle masks. The latter have the special property of a '-2' representing 10414 /// a zero-ed lane of a vector. 10415 static bool canWidenShuffleElements(ArrayRef<int> Mask, 10416 SmallVectorImpl<int> &WidenedMask) { 10417 WidenedMask.assign(Mask.size() / 2, 0); 10418 for (int i = 0, Size = Mask.size(); i < Size; i += 2) { 10419 // If both elements are undef, its trivial. 10420 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { 10421 WidenedMask[i/2] = SM_SentinelUndef; 10422 continue; 10423 } 10424 10425 // Check for an undef mask and a mask value properly aligned to fit with 10426 // a pair of values. If we find such a case, use the non-undef mask's value. 
10427 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { 10428 WidenedMask[i/2] = Mask[i + 1] / 2; 10429 continue; 10430 } 10431 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { 10432 WidenedMask[i/2] = Mask[i] / 2; 10433 continue; 10434 } 10435 10436 // When zeroing, we need to spread the zeroing across both lanes to widen. 10437 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { 10438 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && 10439 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { 10440 WidenedMask[i/2] = SM_SentinelZero; 10441 continue; 10442 } 10443 return false; 10444 } 10445 10446 // Finally check if the two mask values are adjacent and aligned with 10447 // a pair. 10448 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { 10449 WidenedMask[i/2] = Mask[i] / 2; 10450 continue; 10451 } 10452 10453 // Otherwise we can't safely widen the elements used in this shuffle. 10454 return false; 10455 } 10456 assert(WidenedMask.size() == Mask.size() / 2 && 10457 "Incorrect size of mask after widening the elements!"); 10458 10459 return true; 10460 } 10461 10462 /// \brief Generic routine to split vector shuffle into half-sized shuffles. 10463 /// 10464 /// This routine just extracts two subvectors, shuffles them independently, and 10465 /// then concatenates them back together. This should work effectively with all 10466 /// AVX vector shuffle types. 10467 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, 10468 SDValue V2, ArrayRef<int> Mask, 10469 SelectionDAG &DAG) { 10470 assert(VT.getSizeInBits() >= 256 && 10471 "Only for 256-bit or wider vector shuffles!"); 10472 assert(V1.getSimpleValueType() == VT && "Bad operand type!"); 10473 assert(V2.getSimpleValueType() == VT && "Bad operand type!"); 10474 10475 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); 10476 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); 10477 10478 int NumElements = VT.getVectorNumElements(); 10479 int SplitNumElements = NumElements / 2; 10480 MVT ScalarVT = VT.getVectorElementType(); 10481 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); 10482 10483 // Rather than splitting build-vectors, just build two narrower build 10484 // vectors. This helps shuffling with splats and zeros. 
10485 auto SplitVector = [&](SDValue V) { 10486 V = peekThroughBitcasts(V); 10487 10488 MVT OrigVT = V.getSimpleValueType(); 10489 int OrigNumElements = OrigVT.getVectorNumElements(); 10490 int OrigSplitNumElements = OrigNumElements / 2; 10491 MVT OrigScalarVT = OrigVT.getVectorElementType(); 10492 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); 10493 10494 SDValue LoV, HiV; 10495 10496 auto *BV = dyn_cast<BuildVectorSDNode>(V); 10497 if (!BV) { 10498 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, 10499 DAG.getIntPtrConstant(0, DL)); 10500 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, 10501 DAG.getIntPtrConstant(OrigSplitNumElements, DL)); 10502 } else { 10503 10504 SmallVector<SDValue, 16> LoOps, HiOps; 10505 for (int i = 0; i < OrigSplitNumElements; ++i) { 10506 LoOps.push_back(BV->getOperand(i)); 10507 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); 10508 } 10509 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); 10510 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); 10511 } 10512 return std::make_pair(DAG.getBitcast(SplitVT, LoV), 10513 DAG.getBitcast(SplitVT, HiV)); 10514 }; 10515 10516 SDValue LoV1, HiV1, LoV2, HiV2; 10517 std::tie(LoV1, HiV1) = SplitVector(V1); 10518 std::tie(LoV2, HiV2) = SplitVector(V2); 10519 10520 // Now create two 4-way blends of these half-width vectors. 10521 auto HalfBlend = [&](ArrayRef<int> HalfMask) { 10522 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; 10523 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1); 10524 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1); 10525 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1); 10526 for (int i = 0; i < SplitNumElements; ++i) { 10527 int M = HalfMask[i]; 10528 if (M >= NumElements) { 10529 if (M >= NumElements + SplitNumElements) 10530 UseHiV2 = true; 10531 else 10532 UseLoV2 = true; 10533 V2BlendMask[i] = M - NumElements; 10534 BlendMask[i] = SplitNumElements + i; 10535 } else if (M >= 0) { 10536 if (M >= SplitNumElements) 10537 UseHiV1 = true; 10538 else 10539 UseLoV1 = true; 10540 V1BlendMask[i] = M; 10541 BlendMask[i] = i; 10542 } 10543 } 10544 10545 // Because the lowering happens after all combining takes place, we need to 10546 // manually combine these blend masks as much as possible so that we create 10547 // a minimal number of high-level vector shuffle nodes. 10548 10549 // First try just blending the halves of V1 or V2. 10550 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) 10551 return DAG.getUNDEF(SplitVT); 10552 if (!UseLoV2 && !UseHiV2) 10553 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); 10554 if (!UseLoV1 && !UseHiV1) 10555 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); 10556 10557 SDValue V1Blend, V2Blend; 10558 if (UseLoV1 && UseHiV1) { 10559 V1Blend = 10560 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); 10561 } else { 10562 // We only use half of V1 so map the usage down into the final blend mask. 10563 V1Blend = UseLoV1 ? LoV1 : HiV1; 10564 for (int i = 0; i < SplitNumElements; ++i) 10565 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) 10566 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); 10567 } 10568 if (UseLoV2 && UseHiV2) { 10569 V2Blend = 10570 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); 10571 } else { 10572 // We only use half of V2 so map the usage down into the final blend mask. 10573 V2Blend = UseLoV2 ? 
LoV2 : HiV2; 10574 for (int i = 0; i < SplitNumElements; ++i) 10575 if (BlendMask[i] >= SplitNumElements) 10576 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); 10577 } 10578 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); 10579 }; 10580 SDValue Lo = HalfBlend(LoMask); 10581 SDValue Hi = HalfBlend(HiMask); 10582 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); 10583 } 10584 10585 /// \brief Either split a vector in halves or decompose the shuffles and the 10586 /// blend. 10587 /// 10588 /// This is provided as a good fallback for many lowerings of non-single-input 10589 /// shuffles with more than one 128-bit lane. In those cases, we want to select 10590 /// between splitting the shuffle into 128-bit components and stitching those 10591 /// back together vs. extracting the single-input shuffles and blending those 10592 /// results. 10593 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, 10594 SDValue V1, SDValue V2, 10595 ArrayRef<int> Mask, 10596 SelectionDAG &DAG) { 10597 assert(!V2.isUndef() && "This routine must not be used to lower single-input " 10598 "shuffles as it could then recurse on itself."); 10599 int Size = Mask.size(); 10600 10601 // If this can be modeled as a broadcast of two elements followed by a blend, 10602 // prefer that lowering. This is especially important because broadcasts can 10603 // often fold with memory operands. 10604 auto DoBothBroadcast = [&] { 10605 int V1BroadcastIdx = -1, V2BroadcastIdx = -1; 10606 for (int M : Mask) 10607 if (M >= Size) { 10608 if (V2BroadcastIdx < 0) 10609 V2BroadcastIdx = M - Size; 10610 else if (M - Size != V2BroadcastIdx) 10611 return false; 10612 } else if (M >= 0) { 10613 if (V1BroadcastIdx < 0) 10614 V1BroadcastIdx = M; 10615 else if (M != V1BroadcastIdx) 10616 return false; 10617 } 10618 return true; 10619 }; 10620 if (DoBothBroadcast()) 10621 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, 10622 DAG); 10623 10624 // If the inputs all stem from a single 128-bit lane of each input, then we 10625 // split them rather than blending because the split will decompose to 10626 // unusually few instructions. 10627 int LaneCount = VT.getSizeInBits() / 128; 10628 int LaneSize = Size / LaneCount; 10629 SmallBitVector LaneInputs[2]; 10630 LaneInputs[0].resize(LaneCount, false); 10631 LaneInputs[1].resize(LaneCount, false); 10632 for (int i = 0; i < Size; ++i) 10633 if (Mask[i] >= 0) 10634 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; 10635 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) 10636 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); 10637 10638 // Otherwise, just fall back to decomposed shuffles and a blend. This requires 10639 // that the decomposed single-input shuffles don't end up here. 10640 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); 10641 } 10642 10643 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as 10644 /// a permutation and blend of those lanes. 10645 /// 10646 /// This essentially blends the out-of-lane inputs to each lane into the lane 10647 /// from a permuted copy of the vector. This lowering strategy results in four 10648 /// instructions in the worst case for a single-input cross lane shuffle which 10649 /// is lower than any other fully general cross-lane shuffle strategy I'm aware 10650 /// of. Special cases for each particular shuffle pattern should be handled 10651 /// prior to trying this lowering. 
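///
/// For example, the single-input v4f64 shuffle <2, 1, 3, 0> is lowered by
/// swapping the two 128-bit halves of the input with VPERM2X128 and then
/// blending the original and swapped vectors with a purely in-lane shuffle.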
10652 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, 10653 SDValue V1, SDValue V2, 10654 ArrayRef<int> Mask, 10655 SelectionDAG &DAG) { 10656 // FIXME: This should probably be generalized for 512-bit vectors as well. 10657 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); 10658 int Size = Mask.size(); 10659 int LaneSize = Size / 2; 10660 10661 // If there are only inputs from one 128-bit lane, splitting will in fact be 10662 // less expensive. The flags track whether the given lane contains an element 10663 // that crosses to another lane. 10664 bool LaneCrossing[2] = {false, false}; 10665 for (int i = 0; i < Size; ++i) 10666 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) 10667 LaneCrossing[(Mask[i] % Size) / LaneSize] = true; 10668 if (!LaneCrossing[0] || !LaneCrossing[1]) 10669 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); 10670 10671 assert(V2.isUndef() && 10672 "This last part of this routine only works on single input shuffles"); 10673 10674 SmallVector<int, 32> FlippedBlendMask(Size); 10675 for (int i = 0; i < Size; ++i) 10676 FlippedBlendMask[i] = 10677 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) 10678 ? Mask[i] 10679 : Mask[i] % LaneSize + 10680 (i / LaneSize) * LaneSize + Size); 10681 10682 // Flip the vector, and blend the results which should now be in-lane. The 10683 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and 10684 // 5 for the high source. The value 3 selects the high half of source 2 and 10685 // the value 2 selects the low half of source 2. We only use source 2 to 10686 // allow folding it into a memory operand. 10687 unsigned PERMMask = 3 | 2 << 4; 10688 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), 10689 V1, DAG.getConstant(PERMMask, DL, MVT::i8)); 10690 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); 10691 } 10692 10693 /// \brief Handle lowering 2-lane 128-bit shuffles. 10694 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, 10695 SDValue V2, ArrayRef<int> Mask, 10696 const X86Subtarget &Subtarget, 10697 SelectionDAG &DAG) { 10698 // TODO: If minimizing size and one of the inputs is a zero vector and the 10699 // the zero vector has only one use, we could use a VPERM2X128 to save the 10700 // instruction bytes needed to explicitly generate the zero vector. 10701 10702 // Blends are faster and handle all the non-lane-crossing cases. 10703 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, 10704 Subtarget, DAG)) 10705 return Blend; 10706 10707 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); 10708 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); 10709 10710 // If either input operand is a zero vector, use VPERM2X128 because its mask 10711 // allows us to replace the zero input with an implicit zero. 10712 if (!IsV1Zero && !IsV2Zero) { 10713 // Check for patterns which can be matched with a single insert of a 128-bit 10714 // subvector. 10715 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); 10716 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { 10717 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding. 
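// (Bailing out here lets the single-input AVX2 path select VPERMQ/VPERMPD,
// which can fold a load of V1 directly.)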
10718 if (Subtarget.hasAVX2() && V2.isUndef()) 10719 return SDValue(); 10720 10721 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 10722 VT.getVectorNumElements() / 2); 10723 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, 10724 DAG.getIntPtrConstant(0, DL)); 10725 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, 10726 OnlyUsesV1 ? V1 : V2, 10727 DAG.getIntPtrConstant(0, DL)); 10728 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); 10729 } 10730 } 10731 10732 // Otherwise form a 128-bit permutation. After accounting for undefs, 10733 // convert the 64-bit shuffle mask selection values into 128-bit 10734 // selection bits by dividing the indexes by 2 and shifting into positions 10735 // defined by a vperm2*128 instruction's immediate control byte. 10736 10737 // The immediate permute control byte looks like this: 10738 // [1:0] - select 128 bits from sources for low half of destination 10739 // [2] - ignore 10740 // [3] - zero low half of destination 10741 // [5:4] - select 128 bits from sources for high half of destination 10742 // [6] - ignore 10743 // [7] - zero high half of destination 10744 10745 int MaskLO = Mask[0]; 10746 if (MaskLO == SM_SentinelUndef) 10747 MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; 10748 10749 int MaskHI = Mask[2]; 10750 if (MaskHI == SM_SentinelUndef) 10751 MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; 10752 10753 unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; 10754 10755 // If either input is a zero vector, replace it with an undef input. 10756 // Shuffle mask values < 4 are selecting elements of V1. 10757 // Shuffle mask values >= 4 are selecting elements of V2. 10758 // Adjust each half of the permute mask by clearing the half that was 10759 // selecting the zero vector and setting the zero mask bit. 10760 if (IsV1Zero) { 10761 V1 = DAG.getUNDEF(VT); 10762 if (MaskLO < 4) 10763 PermMask = (PermMask & 0xf0) | 0x08; 10764 if (MaskHI < 4) 10765 PermMask = (PermMask & 0x0f) | 0x80; 10766 } 10767 if (IsV2Zero) { 10768 V2 = DAG.getUNDEF(VT); 10769 if (MaskLO >= 4) 10770 PermMask = (PermMask & 0xf0) | 0x08; 10771 if (MaskHI >= 4) 10772 PermMask = (PermMask & 0x0f) | 0x80; 10773 } 10774 10775 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, 10776 DAG.getConstant(PermMask, DL, MVT::i8)); 10777 } 10778 10779 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then 10780 /// shuffling each lane. 10781 /// 10782 /// This will only succeed when the result of fixing the 128-bit lanes results 10783 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in 10784 /// each 128-bit lanes. This handles many cases where we can quickly blend away 10785 /// the lane crosses early and then use simpler shuffles within each lane. 10786 /// 10787 /// FIXME: It might be worthwhile at some point to support this without 10788 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently 10789 /// in x86 only floating point has interesting non-repeating shuffles, and even 10790 /// those are still *marginally* more expensive. 
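///
/// For example, the v8i32 shuffle <9, 8, 11, 10, 5, 4, 7, 6> first moves the
/// low lane of V2 and the high lane of V1 into place with a single lane
/// shuffle and then applies the repeating in-lane mask <1, 0, 3, 2>.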
10791 static SDValue lowerVectorShuffleByMerging128BitLanes( 10792 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 10793 const X86Subtarget &Subtarget, SelectionDAG &DAG) { 10794 assert(!V2.isUndef() && "This is only useful with multiple inputs."); 10795 10796 int Size = Mask.size(); 10797 int LaneSize = 128 / VT.getScalarSizeInBits(); 10798 int NumLanes = Size / LaneSize; 10799 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); 10800 10801 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also 10802 // check whether the in-128-bit lane shuffles share a repeating pattern. 10803 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1); 10804 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1); 10805 for (int i = 0; i < Size; ++i) { 10806 if (Mask[i] < 0) 10807 continue; 10808 10809 int j = i / LaneSize; 10810 10811 if (Lanes[j] < 0) { 10812 // First entry we've seen for this lane. 10813 Lanes[j] = Mask[i] / LaneSize; 10814 } else if (Lanes[j] != Mask[i] / LaneSize) { 10815 // This doesn't match the lane selected previously! 10816 return SDValue(); 10817 } 10818 10819 // Check that within each lane we have a consistent shuffle mask. 10820 int k = i % LaneSize; 10821 if (InLaneMask[k] < 0) { 10822 InLaneMask[k] = Mask[i] % LaneSize; 10823 } else if (InLaneMask[k] != Mask[i] % LaneSize) { 10824 // This doesn't fit a repeating in-lane mask. 10825 return SDValue(); 10826 } 10827 } 10828 10829 // First shuffle the lanes into place. 10830 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, 10831 VT.getSizeInBits() / 64); 10832 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1); 10833 for (int i = 0; i < NumLanes; ++i) 10834 if (Lanes[i] >= 0) { 10835 LaneMask[2 * i + 0] = 2*Lanes[i] + 0; 10836 LaneMask[2 * i + 1] = 2*Lanes[i] + 1; 10837 } 10838 10839 V1 = DAG.getBitcast(LaneVT, V1); 10840 V2 = DAG.getBitcast(LaneVT, V2); 10841 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); 10842 10843 // Cast it back to the type we actually want. 10844 LaneShuffle = DAG.getBitcast(VT, LaneShuffle); 10845 10846 // Now do a simple shuffle that isn't lane crossing. 10847 SmallVector<int, 8> NewMask((unsigned)Size, -1); 10848 for (int i = 0; i < Size; ++i) 10849 if (Mask[i] >= 0) 10850 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; 10851 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && 10852 "Must not introduce lane crosses at this point!"); 10853 10854 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); 10855 } 10856 10857 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF. 10858 /// This allows for fast cases such as subvector extraction/insertion 10859 /// or shuffling smaller vector types which can lower more efficiently. 10860 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, 10861 SDValue V1, SDValue V2, 10862 ArrayRef<int> Mask, 10863 const X86Subtarget &Subtarget, 10864 SelectionDAG &DAG) { 10865 assert(VT.is256BitVector() && "Expected 256-bit vector"); 10866 10867 unsigned NumElts = VT.getVectorNumElements(); 10868 unsigned HalfNumElts = NumElts / 2; 10869 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); 10870 10871 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); 10872 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); 10873 if (!UndefLower && !UndefUpper) 10874 return SDValue(); 10875 10876 // Upper half is undef and lower half is whole upper subvector. 10877 // e.g. 
vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 10878 if (UndefUpper && 10879 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { 10880 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, 10881 DAG.getIntPtrConstant(HalfNumElts, DL)); 10882 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, 10883 DAG.getIntPtrConstant(0, DL)); 10884 } 10885 10886 // Lower half is undef and upper half is whole lower subvector. 10887 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 10888 if (UndefLower && 10889 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { 10890 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, 10891 DAG.getIntPtrConstant(0, DL)); 10892 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, 10893 DAG.getIntPtrConstant(HalfNumElts, DL)); 10894 } 10895 10896 // If the shuffle only uses two of the four halves of the input operands, 10897 // then extract them and perform the 'half' shuffle at half width. 10898 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> 10899 int HalfIdx1 = -1, HalfIdx2 = -1; 10900 SmallVector<int, 8> HalfMask(HalfNumElts); 10901 unsigned Offset = UndefLower ? HalfNumElts : 0; 10902 for (unsigned i = 0; i != HalfNumElts; ++i) { 10903 int M = Mask[i + Offset]; 10904 if (M < 0) { 10905 HalfMask[i] = M; 10906 continue; 10907 } 10908 10909 // Determine which of the 4 half vectors this element is from. 10910 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. 10911 int HalfIdx = M / HalfNumElts; 10912 10913 // Determine the element index into its half vector source. 10914 int HalfElt = M % HalfNumElts; 10915 10916 // We can shuffle with up to 2 half vectors, set the new 'half' 10917 // shuffle mask accordingly. 10918 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { 10919 HalfMask[i] = HalfElt; 10920 HalfIdx1 = HalfIdx; 10921 continue; 10922 } 10923 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { 10924 HalfMask[i] = HalfElt + HalfNumElts; 10925 HalfIdx2 = HalfIdx; 10926 continue; 10927 } 10928 10929 // Too many half vectors referenced. 10930 return SDValue(); 10931 } 10932 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); 10933 10934 // Only shuffle the halves of the inputs when useful. 10935 int NumLowerHalves = 10936 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); 10937 int NumUpperHalves = 10938 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); 10939 10940 // uuuuXXXX - don't extract uppers just to insert again. 10941 if (UndefLower && NumUpperHalves != 0) 10942 return SDValue(); 10943 10944 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. 10945 if (UndefUpper && NumUpperHalves == 2) 10946 return SDValue(); 10947 10948 // AVX2 - XXXXuuuu - always extract lowers. 10949 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { 10950 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. 10951 if (VT == MVT::v4f64 || VT == MVT::v4i64) 10952 return SDValue(); 10953 // AVX2 supports variable 32-bit element cross-lane shuffles. 10954 if (VT == MVT::v8f32 || VT == MVT::v8i32) { 10955 // XXXXuuuu - don't extract lowers and uppers. 10956 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) 10957 return SDValue(); 10958 } 10959 } 10960 10961 auto GetHalfVector = [&](int HalfIdx) { 10962 if (HalfIdx < 0) 10963 return DAG.getUNDEF(HalfVT); 10964 SDValue V = (HalfIdx < 2 ? 
V1 : V2); 10965 HalfIdx = (HalfIdx % 2) * HalfNumElts; 10966 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, 10967 DAG.getIntPtrConstant(HalfIdx, DL)); 10968 }; 10969 10970 SDValue Half1 = GetHalfVector(HalfIdx1); 10971 SDValue Half2 = GetHalfVector(HalfIdx2); 10972 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); 10973 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, 10974 DAG.getIntPtrConstant(Offset, DL)); 10975 } 10976 10977 /// \brief Test whether the specified input (0 or 1) is in-place blended by the 10978 /// given mask. 10979 /// 10980 /// This returns true if the elements from a particular input are already in the 10981 /// slot required by the given mask and require no permutation. 10982 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { 10983 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); 10984 int Size = Mask.size(); 10985 for (int i = 0; i < Size; ++i) 10986 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) 10987 return false; 10988 10989 return true; 10990 } 10991 10992 /// Handle case where shuffle sources are coming from the same 128-bit lane and 10993 /// every lane can be represented as the same repeating mask - allowing us to 10994 /// shuffle the sources with the repeating shuffle and then permute the result 10995 /// to the destination lanes. 10996 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( 10997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, 10998 const X86Subtarget &Subtarget, SelectionDAG &DAG) { 10999 int NumElts = VT.getVectorNumElements(); 11000 int NumLanes = VT.getSizeInBits() / 128; 11001 int NumLaneElts = NumElts / NumLanes; 11002 11003 // On AVX2 we may be able to just shuffle the lowest elements and then 11004 // broadcast the result. 11005 if (Subtarget.hasAVX2()) { 11006 for (unsigned BroadcastSize : {16, 32, 64}) { 11007 if (BroadcastSize <= VT.getScalarSizeInBits()) 11008 continue; 11009 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); 11010 11011 // Attempt to match a repeating pattern every NumBroadcastElts, 11012 // accounting for UNDEFs but only references the lowest 128-bit 11013 // lane of the inputs. 11014 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) { 11015 for (int i = 0; i != NumElts; i += NumBroadcastElts) 11016 for (int j = 0; j != NumBroadcastElts; ++j) { 11017 int M = Mask[i + j]; 11018 if (M < 0) 11019 continue; 11020 int &R = RepeatMask[j]; 11021 if (0 != ((M % NumElts) / NumLaneElts)) 11022 return false; 11023 if (0 <= R && R != M) 11024 return false; 11025 R = M; 11026 } 11027 return true; 11028 }; 11029 11030 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1); 11031 if (!FindRepeatingBroadcastMask(RepeatMask)) 11032 continue; 11033 11034 // Shuffle the (lowest) repeated elements in place for broadcast. 11035 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); 11036 11037 // Shuffle the actual broadcast. 11038 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1); 11039 for (int i = 0; i != NumElts; i += NumBroadcastElts) 11040 for (int j = 0; j != NumBroadcastElts; ++j) 11041 BroadcastMask[i + j] = j; 11042 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), 11043 BroadcastMask); 11044 } 11045 } 11046 11047 // Bail if we already have a repeated lane shuffle mask. 
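// (A mask that already repeats per 128-bit lane is matched directly by the
// per-type lowerings, so the sub-lane permute below is unlikely to do better.)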
11048 SmallVector<int, 8> RepeatedShuffleMask; 11049 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) 11050 return SDValue(); 11051 11052 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes 11053 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. 11054 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; 11055 int NumSubLanes = NumLanes * SubLaneScale; 11056 int NumSubLaneElts = NumLaneElts / SubLaneScale; 11057 11058 // Check that all the sources are coming from the same lane and see if we 11059 // can form a repeating shuffle mask (local to each lane). At the same time, 11060 // determine the source sub-lane for each destination sub-lane. 11061 int TopSrcSubLane = -1; 11062 SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1); 11063 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1); 11064 for (int i = 0; i != NumElts; ++i) { 11065 int M = Mask[i]; 11066 if (M < 0) 11067 continue; 11068 assert(0 <= M && M < 2 * NumElts); 11069 11070 // Check that the local mask index is the same for every lane. We always do 11071 // this with 128-bit lanes to match in is128BitLaneRepeatedShuffleMask. 11072 int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts; 11073 int &RepeatM = RepeatedLaneMask[i % NumLaneElts]; 11074 if (0 <= RepeatM && RepeatM != LocalM) 11075 return SDValue(); 11076 RepeatM = LocalM; 11077 11078 // Check that the whole of each destination sub-lane comes from the same 11079 // sub-lane, we need to calculate the source based off where the repeated 11080 // lane mask will have left it. 11081 int SrcLane = (M % NumElts) / NumLaneElts; 11082 int SrcSubLane = (SrcLane * SubLaneScale) + 11083 ((i % NumLaneElts) / NumSubLaneElts); 11084 int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; 11085 if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane) 11086 return SDValue(); 11087 Dst2SrcSubLane = SrcSubLane; 11088 11089 // Track the top most source sub-lane - by setting the remaining to UNDEF 11090 // we can greatly simplify shuffle matching. 11091 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); 11092 } 11093 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && 11094 "Unexpected source lane"); 11095 11096 // Create a repeating shuffle mask for the entire vector. 11097 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1); 11098 for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) { 11099 int M = RepeatedLaneMask[i % NumLaneElts]; 11100 if (M < 0) 11101 continue; 11102 int Lane = i / NumLaneElts; 11103 RepeatedMask[i] = M + (Lane * NumLaneElts); 11104 } 11105 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); 11106 11107 // Shuffle each source sub-lane to its destination. 11108 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1); 11109 for (int i = 0; i != NumElts; i += NumSubLaneElts) { 11110 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; 11111 if (SrcSubLane < 0) 11112 continue; 11113 for (int j = 0; j != NumSubLaneElts; ++j) 11114 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); 11115 } 11116 11117 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), 11118 SubLaneMask); 11119 } 11120 11121 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, 11122 ArrayRef<int> Mask, SDValue V1, 11123 SDValue V2, SelectionDAG &DAG) { 11124 11125 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. 11126 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. 
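// Each mask element must choose between the two candidates listed for its
// slot; bit i of the SHUFPD immediate records whether the odd (higher)
// candidate was taken for element i.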
11127 assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD"); 11128 int NumElts = VT.getVectorNumElements(); 11129 bool ShufpdMask = true; 11130 bool CommutableMask = true; 11131 unsigned Immediate = 0; 11132 for (int i = 0; i < NumElts; ++i) { 11133 if (Mask[i] < 0) 11134 continue; 11135 int Val = (i & 6) + NumElts * (i & 1); 11136 int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1); 11137 if (Mask[i] < Val || Mask[i] > Val + 1) 11138 ShufpdMask = false; 11139 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) 11140 CommutableMask = false; 11141 Immediate |= (Mask[i] % 2) << i; 11142 } 11143 if (ShufpdMask) 11144 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, 11145 DAG.getConstant(Immediate, DL, MVT::i8)); 11146 if (CommutableMask) 11147 return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, 11148 DAG.getConstant(Immediate, DL, MVT::i8)); 11149 return SDValue(); 11150 } 11151 11152 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. 11153 /// 11154 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 11155 /// isn't available. 11156 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11157 SDValue V1, SDValue V2, 11158 const X86Subtarget &Subtarget, 11159 SelectionDAG &DAG) { 11160 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); 11161 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); 11162 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 11163 11164 SmallVector<int, 4> WidenedMask; 11165 if (canWidenShuffleElements(Mask, WidenedMask)) 11166 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, 11167 Subtarget, DAG)) 11168 return V; 11169 11170 if (V2.isUndef()) { 11171 // Check for being able to broadcast a single element. 11172 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( 11173 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) 11174 return Broadcast; 11175 11176 // Use low duplicate instructions for masks that match their pattern. 11177 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) 11178 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); 11179 11180 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { 11181 // Non-half-crossing single input shuffles can be lowered with an 11182 // interleaved permutation. 11183 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | 11184 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); 11185 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, 11186 DAG.getConstant(VPERMILPMask, DL, MVT::i8)); 11187 } 11188 11189 // With AVX2 we have direct support for this permutation. 11190 if (Subtarget.hasAVX2()) 11191 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, 11192 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 11193 11194 // Try to create an in-lane repeating shuffle mask and then shuffle the 11195 // the results into the target lanes. 11196 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( 11197 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) 11198 return V; 11199 11200 // Otherwise, fall back. 11201 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, 11202 DAG); 11203 } 11204 11205 // Use dedicated unpack instructions for masks that match their pattern. 
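// e.g. for v4f64, <0, 4, 2, 6> maps directly onto VUNPCKLPD and <1, 5, 3, 7>
// onto VUNPCKHPD.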
11206 if (SDValue V = 11207 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) 11208 return V; 11209 11210 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, 11211 Subtarget, DAG)) 11212 return Blend; 11213 11214 // Check if the blend happens to exactly fit that of SHUFPD. 11215 if (SDValue Op = 11216 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) 11217 return Op; 11218 11219 // Try to create an in-lane repeating shuffle mask and then shuffle the 11220 // the results into the target lanes. 11221 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( 11222 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) 11223 return V; 11224 11225 // Try to simplify this by merging 128-bit lanes to enable a lane-based 11226 // shuffle. However, if we have AVX2 and either inputs are already in place, 11227 // we will be able to shuffle even across lanes the other input in a single 11228 // instruction so skip this pattern. 11229 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || 11230 isShuffleMaskInputInPlace(1, Mask)))) 11231 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 11232 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) 11233 return Result; 11234 11235 // If we have AVX2 then we always want to lower with a blend because an v4 we 11236 // can fully permute the elements. 11237 if (Subtarget.hasAVX2()) 11238 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, 11239 Mask, DAG); 11240 11241 // Otherwise fall back on generic lowering. 11242 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); 11243 } 11244 11245 /// \brief Handle lowering of 4-lane 64-bit integer shuffles. 11246 /// 11247 /// This routine is only called when we have AVX2 and thus a reasonable 11248 /// instruction set for v4i64 shuffling.. 11249 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11250 SDValue V1, SDValue V2, 11251 const X86Subtarget &Subtarget, 11252 SelectionDAG &DAG) { 11253 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); 11254 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); 11255 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 11256 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); 11257 11258 SmallVector<int, 4> WidenedMask; 11259 if (canWidenShuffleElements(Mask, WidenedMask)) 11260 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, 11261 Subtarget, DAG)) 11262 return V; 11263 11264 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, 11265 Subtarget, DAG)) 11266 return Blend; 11267 11268 // Check for being able to broadcast a single element. 11269 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, 11270 Mask, Subtarget, DAG)) 11271 return Broadcast; 11272 11273 if (V2.isUndef()) { 11274 // When the shuffle is mirrored between the 128-bit lanes of the unit, we 11275 // can use lower latency instructions that will operate on both lanes. 
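// e.g. the v4i64 mask <1, 0, 3, 2> repeats as <1, 0> in each lane and can be
// expressed as a single PSHUFD on v8i32 with the scaled mask <2, 3, 0, 1>.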
11276 SmallVector<int, 2> RepeatedMask; 11277 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { 11278 SmallVector<int, 4> PSHUFDMask; 11279 scaleShuffleMask(2, RepeatedMask, PSHUFDMask); 11280 return DAG.getBitcast( 11281 MVT::v4i64, 11282 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, 11283 DAG.getBitcast(MVT::v8i32, V1), 11284 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); 11285 } 11286 11287 // AVX2 provides a direct instruction for permuting a single input across 11288 // lanes. 11289 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, 11290 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 11291 } 11292 11293 // Try to use shift instructions. 11294 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, 11295 Subtarget, DAG)) 11296 return Shift; 11297 11298 // Use dedicated unpack instructions for masks that match their pattern. 11299 if (SDValue V = 11300 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) 11301 return V; 11302 11303 // Try to simplify this by merging 128-bit lanes to enable a lane-based 11304 // shuffle. However, if we have AVX2 and either inputs are already in place, 11305 // we will be able to shuffle even across lanes the other input in a single 11306 // instruction so skip this pattern. 11307 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || 11308 isShuffleMaskInputInPlace(1, Mask)))) 11309 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 11310 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) 11311 return Result; 11312 11313 // Otherwise fall back on generic blend lowering. 11314 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, 11315 Mask, DAG); 11316 } 11317 11318 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. 11319 /// 11320 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 11321 /// isn't available. 11322 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11323 SDValue V1, SDValue V2, 11324 const X86Subtarget &Subtarget, 11325 SelectionDAG &DAG) { 11326 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); 11327 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); 11328 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 11329 11330 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, 11331 Subtarget, DAG)) 11332 return Blend; 11333 11334 // Check for being able to broadcast a single element. 11335 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, 11336 Mask, Subtarget, DAG)) 11337 return Broadcast; 11338 11339 // If the shuffle mask is repeated in each 128-bit lane, we have many more 11340 // options to efficiently lower the shuffle. 11341 SmallVector<int, 4> RepeatedMask; 11342 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { 11343 assert(RepeatedMask.size() == 4 && 11344 "Repeated masks must be half the mask width!"); 11345 11346 // Use even/odd duplicate instructions for masks that match their pattern. 
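// e.g. a repeated <0, 0, 2, 2> mask is exactly VMOVSLDUP and a repeated
// <1, 1, 3, 3> mask is exactly VMOVSHDUP.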
11347 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) 11348 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); 11349 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) 11350 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); 11351 11352 if (V2.isUndef()) 11353 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, 11354 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); 11355 11356 // Use dedicated unpack instructions for masks that match their pattern. 11357 if (SDValue V = 11358 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) 11359 return V; 11360 11361 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we 11362 // have already handled any direct blends. 11363 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); 11364 } 11365 11366 // Try to create an in-lane repeating shuffle mask and then shuffle the 11367 // the results into the target lanes. 11368 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( 11369 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) 11370 return V; 11371 11372 // If we have a single input shuffle with different shuffle patterns in the 11373 // two 128-bit lanes use the variable mask to VPERMILPS. 11374 if (V2.isUndef()) { 11375 SDValue VPermMask[8]; 11376 for (int i = 0; i < 8; ++i) 11377 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) 11378 : DAG.getConstant(Mask[i], DL, MVT::i32); 11379 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) 11380 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, 11381 DAG.getBuildVector(MVT::v8i32, DL, VPermMask)); 11382 11383 if (Subtarget.hasAVX2()) 11384 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, 11385 DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1); 11386 11387 // Otherwise, fall back. 11388 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, 11389 DAG); 11390 } 11391 11392 // Try to simplify this by merging 128-bit lanes to enable a lane-based 11393 // shuffle. 11394 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 11395 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) 11396 return Result; 11397 11398 // If we have AVX2 then we always want to lower with a blend because at v8 we 11399 // can fully permute the elements. 11400 if (Subtarget.hasAVX2()) 11401 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, 11402 Mask, DAG); 11403 11404 // Otherwise fall back on generic lowering. 11405 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); 11406 } 11407 11408 /// \brief Handle lowering of 8-lane 32-bit integer shuffles. 11409 /// 11410 /// This routine is only called when we have AVX2 and thus a reasonable 11411 /// instruction set for v8i32 shuffling.. 11412 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11413 SDValue V1, SDValue V2, 11414 const X86Subtarget &Subtarget, 11415 SelectionDAG &DAG) { 11416 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); 11417 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); 11418 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 11419 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); 11420 11421 // Whenever we can lower this as a zext, that instruction is strictly faster 11422 // than any alternative. It also allows us to fold memory operands into the 11423 // shuffle in many cases. 
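// e.g. when V2 is all zeros, the v8i32 mask <0, 8, 1, 9, 2, 10, 3, 11> is
// just a zero-extension of the low four i32 elements of V1 (VPMOVZXDQ).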
11424 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, 11425 Mask, Subtarget, DAG)) 11426 return ZExt; 11427 11428 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, 11429 Subtarget, DAG)) 11430 return Blend; 11431 11432 // Check for being able to broadcast a single element. 11433 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, 11434 Mask, Subtarget, DAG)) 11435 return Broadcast; 11436 11437 // If the shuffle mask is repeated in each 128-bit lane we can use more 11438 // efficient instructions that mirror the shuffles across the two 128-bit 11439 // lanes. 11440 SmallVector<int, 4> RepeatedMask; 11441 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { 11442 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); 11443 if (V2.isUndef()) 11444 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, 11445 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); 11446 11447 // Use dedicated unpack instructions for masks that match their pattern. 11448 if (SDValue V = 11449 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) 11450 return V; 11451 } 11452 11453 // Try to use shift instructions. 11454 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, 11455 Subtarget, DAG)) 11456 return Shift; 11457 11458 // Try to use byte rotation instructions. 11459 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 11460 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) 11461 return Rotate; 11462 11463 // Try to create an in-lane repeating shuffle mask and then shuffle the 11464 // the results into the target lanes. 11465 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( 11466 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) 11467 return V; 11468 11469 // If the shuffle patterns aren't repeated but it is a single input, directly 11470 // generate a cross-lane VPERMD instruction. 11471 if (V2.isUndef()) { 11472 SDValue VPermMask[8]; 11473 for (int i = 0; i < 8; ++i) 11474 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) 11475 : DAG.getConstant(Mask[i], DL, MVT::i32); 11476 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, 11477 DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1); 11478 } 11479 11480 // Try to simplify this by merging 128-bit lanes to enable a lane-based 11481 // shuffle. 11482 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 11483 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) 11484 return Result; 11485 11486 // Otherwise fall back on generic blend lowering. 11487 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, 11488 Mask, DAG); 11489 } 11490 11491 /// \brief Handle lowering of 16-lane 16-bit integer shuffles. 11492 /// 11493 /// This routine is only called when we have AVX2 and thus a reasonable 11494 /// instruction set for v16i16 shuffling.. 11495 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11496 SDValue V1, SDValue V2, 11497 const X86Subtarget &Subtarget, 11498 SelectionDAG &DAG) { 11499 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); 11500 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); 11501 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 11502 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); 11503 11504 // Whenever we can lower this as a zext, that instruction is strictly faster 11505 // than any alternative. 
It also allows us to fold memory operands into the 11506 // shuffle in many cases. 11507 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, 11508 Mask, Subtarget, DAG)) 11509 return ZExt; 11510 11511 // Check for being able to broadcast a single element. 11512 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, 11513 Mask, Subtarget, DAG)) 11514 return Broadcast; 11515 11516 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, 11517 Subtarget, DAG)) 11518 return Blend; 11519 11520 // Use dedicated unpack instructions for masks that match their pattern. 11521 if (SDValue V = 11522 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) 11523 return V; 11524 11525 // Try to use shift instructions. 11526 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, 11527 Subtarget, DAG)) 11528 return Shift; 11529 11530 // Try to use byte rotation instructions. 11531 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 11532 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) 11533 return Rotate; 11534 11535 // Try to create an in-lane repeating shuffle mask and then shuffle the 11536 // the results into the target lanes. 11537 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( 11538 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) 11539 return V; 11540 11541 if (V2.isUndef()) { 11542 // There are no generalized cross-lane shuffle operations available on i16 11543 // element types. 11544 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) 11545 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, 11546 Mask, DAG); 11547 11548 SmallVector<int, 8> RepeatedMask; 11549 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { 11550 // As this is a single-input shuffle, the repeated mask should be 11551 // a strictly valid v8i16 mask that we can pass through to the v8i16 11552 // lowering to handle even the v16 case. 11553 return lowerV8I16GeneralSingleInputVectorShuffle( 11554 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); 11555 } 11556 } 11557 11558 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, 11559 V2, Subtarget, DAG)) 11560 return PSHUFB; 11561 11562 // Try to simplify this by merging 128-bit lanes to enable a lane-based 11563 // shuffle. 11564 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 11565 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) 11566 return Result; 11567 11568 // Otherwise fall back on generic lowering. 11569 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); 11570 } 11571 11572 /// \brief Handle lowering of 32-lane 8-bit integer shuffles. 11573 /// 11574 /// This routine is only called when we have AVX2 and thus a reasonable 11575 /// instruction set for v32i8 shuffling.. 11576 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11577 SDValue V1, SDValue V2, 11578 const X86Subtarget &Subtarget, 11579 SelectionDAG &DAG) { 11580 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); 11581 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); 11582 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); 11583 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); 11584 11585 // Whenever we can lower this as a zext, that instruction is strictly faster 11586 // than any alternative. It also allows us to fold memory operands into the 11587 // shuffle in many cases. 
11588 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, 11589 Mask, Subtarget, DAG)) 11590 return ZExt; 11591 11592 // Check for being able to broadcast a single element. 11593 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, 11594 Mask, Subtarget, DAG)) 11595 return Broadcast; 11596 11597 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, 11598 Subtarget, DAG)) 11599 return Blend; 11600 11601 // Use dedicated unpack instructions for masks that match their pattern. 11602 if (SDValue V = 11603 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) 11604 return V; 11605 11606 // Try to use shift instructions. 11607 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, 11608 Subtarget, DAG)) 11609 return Shift; 11610 11611 // Try to use byte rotation instructions. 11612 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 11613 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) 11614 return Rotate; 11615 11616 // Try to create an in-lane repeating shuffle mask and then shuffle the 11617 // the results into the target lanes. 11618 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( 11619 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) 11620 return V; 11621 11622 // There are no generalized cross-lane shuffle operations available on i8 11623 // element types. 11624 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) 11625 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, 11626 DAG); 11627 11628 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, 11629 V2, Subtarget, DAG)) 11630 return PSHUFB; 11631 11632 // Try to simplify this by merging 128-bit lanes to enable a lane-based 11633 // shuffle. 11634 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( 11635 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) 11636 return Result; 11637 11638 // Otherwise fall back on generic lowering. 11639 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); 11640 } 11641 11642 /// \brief High-level routine to lower various 256-bit x86 vector shuffles. 11643 /// 11644 /// This routine either breaks down the specific type of a 256-bit x86 vector 11645 /// shuffle or splits it into two 128-bit shuffles and fuses the results back 11646 /// together based on the available instructions. 11647 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11648 MVT VT, SDValue V1, SDValue V2, 11649 const X86Subtarget &Subtarget, 11650 SelectionDAG &DAG) { 11651 // If we have a single input to the zero element, insert that into V1 if we 11652 // can do so cheaply. 11653 int NumElts = VT.getVectorNumElements(); 11654 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); 11655 11656 if (NumV2Elements == 1 && Mask[0] >= NumElts) 11657 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( 11658 DL, VT, V1, V2, Mask, Subtarget, DAG)) 11659 return Insertion; 11660 11661 // Handle special cases where the lower or upper half is UNDEF. 11662 if (SDValue V = 11663 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) 11664 return V; 11665 11666 // There is a really nice hard cut-over between AVX1 and AVX2 that means we 11667 // can check for those subtargets here and avoid much of the subtarget 11668 // querying in the per-vector-type lowering routines. 
With AVX1 we have 11669 // essentially *zero* ability to manipulate a 256-bit vector with integer 11670 // types. Since we'll use floating point types there eventually, just 11671 // immediately cast everything to a float and operate entirely in that domain. 11672 if (VT.isInteger() && !Subtarget.hasAVX2()) { 11673 int ElementBits = VT.getScalarSizeInBits(); 11674 if (ElementBits < 32) { 11675 // No floating point type available, if we can't use the bit operations 11676 // for masking/blending then decompose into 128-bit vectors. 11677 if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) 11678 return V; 11679 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) 11680 return V; 11681 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); 11682 } 11683 11684 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), 11685 VT.getVectorNumElements()); 11686 V1 = DAG.getBitcast(FpVT, V1); 11687 V2 = DAG.getBitcast(FpVT, V2); 11688 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); 11689 } 11690 11691 switch (VT.SimpleTy) { 11692 case MVT::v4f64: 11693 return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 11694 case MVT::v4i64: 11695 return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 11696 case MVT::v8f32: 11697 return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 11698 case MVT::v8i32: 11699 return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 11700 case MVT::v16i16: 11701 return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 11702 case MVT::v32i8: 11703 return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 11704 11705 default: 11706 llvm_unreachable("Not a valid 256-bit x86 vector type!"); 11707 } 11708 } 11709 11710 /// \brief Try to lower a vector shuffle as a 128-bit shuffles. 11711 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, 11712 ArrayRef<int> Mask, SDValue V1, 11713 SDValue V2, SelectionDAG &DAG) { 11714 assert(VT.getScalarSizeInBits() == 64 && 11715 "Unexpected element type size for 128bit shuffle."); 11716 11717 // To handle 256 bit vector requires VLX and most probably 11718 // function lowerV2X128VectorShuffle() is better solution. 11719 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); 11720 11721 SmallVector<int, 4> WidenedMask; 11722 if (!canWidenShuffleElements(Mask, WidenedMask)) 11723 return SDValue(); 11724 11725 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; 11726 // Insure elements came from the same Op. 11727 int MaxOp1Index = VT.getVectorNumElements()/2 - 1; 11728 for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { 11729 if (WidenedMask[i] == SM_SentinelZero) 11730 return SDValue(); 11731 if (WidenedMask[i] == SM_SentinelUndef) 11732 continue; 11733 11734 SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1; 11735 unsigned OpIndex = (i < Size/2) ? 0 : 1; 11736 if (Ops[OpIndex].isUndef()) 11737 Ops[OpIndex] = Op; 11738 else if (Ops[OpIndex] != Op) 11739 return SDValue(); 11740 } 11741 11742 // Form a 128-bit permutation. 11743 // Convert the 64-bit shuffle mask selection values into 128-bit selection 11744 // bits defined by a vshuf64x2 instruction's immediate control byte. 11745 unsigned PermMask = 0, Imm = 0; 11746 unsigned ControlBitsNum = WidenedMask.size() / 2; 11747 11748 for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { 11749 // Use first element in place of undef mask. 11750 Imm = (WidenedMask[i] == SM_SentinelUndef) ? 
0 : WidenedMask[i]; 11751 PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); 11752 } 11753 11754 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], 11755 DAG.getConstant(PermMask, DL, MVT::i8)); 11756 } 11757 11758 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, 11759 ArrayRef<int> Mask, SDValue V1, 11760 SDValue V2, SelectionDAG &DAG) { 11761 11762 assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); 11763 11764 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); 11765 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); 11766 11767 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); 11768 if (V2.isUndef()) 11769 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); 11770 11771 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); 11772 } 11773 11774 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. 11775 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11776 SDValue V1, SDValue V2, 11777 const X86Subtarget &Subtarget, 11778 SelectionDAG &DAG) { 11779 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); 11780 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); 11781 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 11782 11783 if (V2.isUndef()) { 11784 // Use low duplicate instructions for masks that match their pattern. 11785 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) 11786 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); 11787 11788 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { 11789 // Non-half-crossing single input shuffles can be lowered with an 11790 // interleaved permutation. 11791 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | 11792 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | 11793 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | 11794 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); 11795 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, 11796 DAG.getConstant(VPERMILPMask, DL, MVT::i8)); 11797 } 11798 11799 SmallVector<int, 4> RepeatedMask; 11800 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) 11801 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, 11802 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); 11803 } 11804 11805 if (SDValue Shuf128 = 11806 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) 11807 return Shuf128; 11808 11809 if (SDValue Unpck = 11810 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) 11811 return Unpck; 11812 11813 // Check if the blend happens to exactly fit that of SHUFPD. 11814 if (SDValue Op = 11815 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) 11816 return Op; 11817 11818 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); 11819 } 11820 11821 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. 11822 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, 11823 SDValue V1, SDValue V2, 11824 const X86Subtarget &Subtarget, 11825 SelectionDAG &DAG) { 11826 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); 11827 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); 11828 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 11829 11830 // If the shuffle mask is repeated in each 128-bit lane, we have many more 11831 // options to efficiently lower the shuffle. 
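  // For example, the v16f32 mask <1,0,3,2, 5,4,7,6, 9,8,11,10, 13,12,15,14>
  // repeats <1,0,3,2> in every 128-bit lane, so the single-input case below
  // lowers it to one VPERMILPS built from the 4-element repeated mask.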
11832 SmallVector<int, 4> RepeatedMask; 11833 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { 11834 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); 11835 11836 // Use even/odd duplicate instructions for masks that match their pattern. 11837 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) 11838 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); 11839 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) 11840 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); 11841 11842 if (V2.isUndef()) 11843 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, 11844 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); 11845 11846 // Use dedicated unpack instructions for masks that match their pattern. 11847 if (SDValue Unpck = 11848 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) 11849 return Unpck; 11850 11851 // Otherwise, fall back to a SHUFPS sequence. 11852 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); 11853 } 11854 11855 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); 11856 } 11857 11858 /// \brief Handle lowering of 8-lane 64-bit integer shuffles. 11859 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11860 SDValue V1, SDValue V2, 11861 const X86Subtarget &Subtarget, 11862 SelectionDAG &DAG) { 11863 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); 11864 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); 11865 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 11866 11867 if (SDValue Shuf128 = 11868 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) 11869 return Shuf128; 11870 11871 if (V2.isUndef()) { 11872 // When the shuffle is mirrored between the 128-bit lanes of the unit, we 11873 // can use lower latency instructions that will operate on all four 11874 // 128-bit lanes. 11875 SmallVector<int, 2> Repeated128Mask; 11876 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { 11877 SmallVector<int, 4> PSHUFDMask; 11878 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); 11879 return DAG.getBitcast( 11880 MVT::v8i64, 11881 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, 11882 DAG.getBitcast(MVT::v16i32, V1), 11883 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); 11884 } 11885 11886 SmallVector<int, 4> Repeated256Mask; 11887 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) 11888 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, 11889 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); 11890 } 11891 11892 // Try to use shift instructions. 11893 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, 11894 Subtarget, DAG)) 11895 return Shift; 11896 11897 if (SDValue Unpck = 11898 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) 11899 return Unpck; 11900 11901 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); 11902 } 11903 11904 /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
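///
/// The strategy mirrors the v8i32 lowering: prefer a lane-repeated PSHUFD or
/// a dedicated unpack when the mask repeats across the 128-bit lanes, then try
/// shifts and (with AVX-512-BWI) byte rotations, and finally fall back on
/// lowerVectorShuffleWithPERMV, which emits a fully general variable permute
/// (VPERMV or VPERMV3).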
11905 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11906 SDValue V1, SDValue V2, 11907 const X86Subtarget &Subtarget, 11908 SelectionDAG &DAG) { 11909 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); 11910 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); 11911 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 11912 11913 // If the shuffle mask is repeated in each 128-bit lane we can use more 11914 // efficient instructions that mirror the shuffles across the four 128-bit 11915 // lanes. 11916 SmallVector<int, 4> RepeatedMask; 11917 if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) { 11918 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); 11919 if (V2.isUndef()) 11920 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, 11921 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); 11922 11923 // Use dedicated unpack instructions for masks that match their pattern. 11924 if (SDValue V = 11925 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) 11926 return V; 11927 } 11928 11929 // Try to use shift instructions. 11930 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, 11931 Subtarget, DAG)) 11932 return Shift; 11933 11934 // Try to use byte rotation instructions. 11935 if (Subtarget.hasBWI()) 11936 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 11937 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) 11938 return Rotate; 11939 11940 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); 11941 } 11942 11943 /// \brief Handle lowering of 32-lane 16-bit integer shuffles. 11944 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11945 SDValue V1, SDValue V2, 11946 const X86Subtarget &Subtarget, 11947 SelectionDAG &DAG) { 11948 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); 11949 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); 11950 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); 11951 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); 11952 11953 // Use dedicated unpack instructions for masks that match their pattern. 11954 if (SDValue V = 11955 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) 11956 return V; 11957 11958 // Try to use shift instructions. 11959 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, 11960 Subtarget, DAG)) 11961 return Shift; 11962 11963 // Try to use byte rotation instructions. 11964 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 11965 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) 11966 return Rotate; 11967 11968 if (V2.isUndef()) { 11969 SmallVector<int, 8> RepeatedMask; 11970 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { 11971 // As this is a single-input shuffle, the repeated mask should be 11972 // a strictly valid v8i16 mask that we can pass through to the v8i16 11973 // lowering to handle even the v32 case. 11974 return lowerV8I16GeneralSingleInputVectorShuffle( 11975 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); 11976 } 11977 } 11978 11979 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); 11980 } 11981 11982 /// \brief Handle lowering of 64-lane 8-bit integer shuffles. 
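///
/// There is no direct support for fully general v64i8 shuffles yet, so after
/// trying unpacks, shifts, byte rotations and PSHUFB this routine splits the
/// shuffle into two 256-bit halves and lowers those separately.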
11983 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 11984 SDValue V1, SDValue V2, 11985 const X86Subtarget &Subtarget, 11986 SelectionDAG &DAG) { 11987 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); 11988 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); 11989 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); 11990 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); 11991 11992 // Use dedicated unpack instructions for masks that match their pattern. 11993 if (SDValue V = 11994 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) 11995 return V; 11996 11997 // Try to use shift instructions. 11998 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, 11999 Subtarget, DAG)) 12000 return Shift; 12001 12002 // Try to use byte rotation instructions. 12003 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 12004 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) 12005 return Rotate; 12006 12007 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, 12008 V2, Subtarget, DAG)) 12009 return PSHUFB; 12010 12011 // FIXME: Implement direct support for this type! 12012 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); 12013 } 12014 12015 /// \brief High-level routine to lower various 512-bit x86 vector shuffles. 12016 /// 12017 /// This routine either breaks down the specific type of a 512-bit x86 vector 12018 /// shuffle or splits it into two 256-bit shuffles and fuses the results back 12019 /// together based on the available instructions. 12020 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 12021 MVT VT, SDValue V1, SDValue V2, 12022 const X86Subtarget &Subtarget, 12023 SelectionDAG &DAG) { 12024 assert(Subtarget.hasAVX512() && 12025 "Cannot lower 512-bit vectors w/ basic ISA!"); 12026 12027 // Check for being able to broadcast a single element. 12028 if (SDValue Broadcast = 12029 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) 12030 return Broadcast; 12031 12032 // Dispatch to each element type for lowering. If we don't have support for 12033 // specific element type shuffles at 512 bits, immediately split them and 12034 // lower them. Each lowering routine of a given type is allowed to assume that 12035 // the requisite ISA extensions for that element type are available. 12036 switch (VT.SimpleTy) { 12037 case MVT::v8f64: 12038 return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 12039 case MVT::v16f32: 12040 return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 12041 case MVT::v8i64: 12042 return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 12043 case MVT::v16i32: 12044 return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 12045 case MVT::v32i16: 12046 return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 12047 case MVT::v64i8: 12048 return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); 12049 12050 default: 12051 llvm_unreachable("Not a valid 512-bit x86 vector type!"); 12052 } 12053 } 12054 12055 // Lower vXi1 vector shuffles. 12056 // There is no a dedicated instruction on AVX-512 that shuffles the masks. 12057 // The only way to shuffle bits is to sign-extend the mask vector to SIMD 12058 // vector, shuffle and then truncate it back. 
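// For example, a v8i1 shuffle is performed by sign-extending both inputs to
// v8i64 (a 512-bit type, which has the richest shuffle support on KNL),
// shuffling the wide vectors, and truncating the result back to v8i1.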
12059 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, 12060 MVT VT, SDValue V1, SDValue V2, 12061 const X86Subtarget &Subtarget, 12062 SelectionDAG &DAG) { 12063 assert(Subtarget.hasAVX512() && 12064 "Cannot lower 512-bit vectors w/o basic ISA!"); 12065 MVT ExtVT; 12066 switch (VT.SimpleTy) { 12067 default: 12068 llvm_unreachable("Expected a vector of i1 elements"); 12069 case MVT::v2i1: 12070 ExtVT = MVT::v2i64; 12071 break; 12072 case MVT::v4i1: 12073 ExtVT = MVT::v4i32; 12074 break; 12075 case MVT::v8i1: 12076 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL 12077 break; 12078 case MVT::v16i1: 12079 ExtVT = MVT::v16i32; 12080 break; 12081 case MVT::v32i1: 12082 ExtVT = MVT::v32i16; 12083 break; 12084 case MVT::v64i1: 12085 ExtVT = MVT::v64i8; 12086 break; 12087 } 12088 12089 if (ISD::isBuildVectorAllZeros(V1.getNode())) 12090 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); 12091 else if (ISD::isBuildVectorAllOnes(V1.getNode())) 12092 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); 12093 else 12094 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); 12095 12096 if (V2.isUndef()) 12097 V2 = DAG.getUNDEF(ExtVT); 12098 else if (ISD::isBuildVectorAllZeros(V2.getNode())) 12099 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); 12100 else if (ISD::isBuildVectorAllOnes(V2.getNode())) 12101 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); 12102 else 12103 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); 12104 return DAG.getNode(ISD::TRUNCATE, DL, VT, 12105 DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); 12106 } 12107 /// \brief Top-level lowering for x86 vector shuffles. 12108 /// 12109 /// This handles decomposition, canonicalization, and lowering of all x86 12110 /// vector shuffles. Most of the specific lowering strategies are encapsulated 12111 /// above in helper routines. The canonicalization attempts to widen shuffles 12112 /// to involve fewer lanes of wider elements, consolidate symmetric patterns 12113 /// s.t. only one of the two inputs needs to be tested, etc. 12114 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, 12115 SelectionDAG &DAG) { 12116 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 12117 ArrayRef<int> Mask = SVOp->getMask(); 12118 SDValue V1 = Op.getOperand(0); 12119 SDValue V2 = Op.getOperand(1); 12120 MVT VT = Op.getSimpleValueType(); 12121 int NumElements = VT.getVectorNumElements(); 12122 SDLoc DL(Op); 12123 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); 12124 12125 assert((VT.getSizeInBits() != 64 || Is1BitVector) && 12126 "Can't lower MMX shuffles"); 12127 12128 bool V1IsUndef = V1.isUndef(); 12129 bool V2IsUndef = V2.isUndef(); 12130 if (V1IsUndef && V2IsUndef) 12131 return DAG.getUNDEF(VT); 12132 12133 // When we create a shuffle node we put the UNDEF node to second operand, 12134 // but in some cases the first operand may be transformed to UNDEF. 12135 // In this case we should just commute the node. 12136 if (V1IsUndef) 12137 return DAG.getCommutedVectorShuffle(*SVOp); 12138 12139 // Check for non-undef masks pointing at an undef vector and make the masks 12140 // undef as well. This makes it easier to match the shuffle based solely on 12141 // the mask. 
12142 if (V2IsUndef) 12143 for (int M : Mask) 12144 if (M >= NumElements) { 12145 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); 12146 for (int &M : NewMask) 12147 if (M >= NumElements) 12148 M = -1; 12149 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); 12150 } 12151 12152 // We actually see shuffles that are entirely re-arrangements of a set of 12153 // zero inputs. This mostly happens while decomposing complex shuffles into 12154 // simple ones. Directly lower these as a buildvector of zeros. 12155 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); 12156 if (Zeroable.all()) 12157 return getZeroVector(VT, Subtarget, DAG, DL); 12158 12159 // Try to collapse shuffles into using a vector type with fewer elements but 12160 // wider element types. We cap this to not form integers or floating point 12161 // elements wider than 64 bits, but it might be interesting to form i128 12162 // integers to handle flipping the low and high halves of AVX 256-bit vectors. 12163 SmallVector<int, 16> WidenedMask; 12164 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && 12165 canWidenShuffleElements(Mask, WidenedMask)) { 12166 MVT NewEltVT = VT.isFloatingPoint() 12167 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) 12168 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); 12169 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); 12170 // Make sure that the new vector type is legal. For example, v2f64 isn't 12171 // legal on SSE1. 12172 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { 12173 V1 = DAG.getBitcast(NewVT, V1); 12174 V2 = DAG.getBitcast(NewVT, V2); 12175 return DAG.getBitcast( 12176 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); 12177 } 12178 } 12179 12180 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; 12181 for (int M : Mask) 12182 if (M < 0) 12183 ++NumUndefElements; 12184 else if (M < NumElements) 12185 ++NumV1Elements; 12186 else 12187 ++NumV2Elements; 12188 12189 // Commute the shuffle as needed such that more elements come from V1 than 12190 // V2. This allows us to match the shuffle pattern strictly on how many 12191 // elements come from V1 without handling the symmetric cases. 12192 if (NumV2Elements > NumV1Elements) 12193 return DAG.getCommutedVectorShuffle(*SVOp); 12194 12195 assert(NumV1Elements > 0 && "No V1 indices"); 12196 assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used"); 12197 12198 // When the number of V1 and V2 elements are the same, try to minimize the 12199 // number of uses of V2 in the low half of the vector. When that is tied, 12200 // ensure that the sum of indices for V1 is equal to or lower than the sum 12201 // indices for V2. When those are equal, try to ensure that the number of odd 12202 // indices for V1 is lower than the number of odd indices for V2. 
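  // For example, a v4 shuffle with mask <4, 5, 2, 3> takes both of its low
  // elements from V2, so the operands are commuted and the mask becomes
  // <0, 1, 6, 7>.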
12203 if (NumV1Elements == NumV2Elements) { 12204 int LowV1Elements = 0, LowV2Elements = 0; 12205 for (int M : Mask.slice(0, NumElements / 2)) 12206 if (M >= NumElements) 12207 ++LowV2Elements; 12208 else if (M >= 0) 12209 ++LowV1Elements; 12210 if (LowV2Elements > LowV1Elements) 12211 return DAG.getCommutedVectorShuffle(*SVOp); 12212 if (LowV2Elements == LowV1Elements) { 12213 int SumV1Indices = 0, SumV2Indices = 0; 12214 for (int i = 0, Size = Mask.size(); i < Size; ++i) 12215 if (Mask[i] >= NumElements) 12216 SumV2Indices += i; 12217 else if (Mask[i] >= 0) 12218 SumV1Indices += i; 12219 if (SumV2Indices < SumV1Indices) 12220 return DAG.getCommutedVectorShuffle(*SVOp); 12221 if (SumV2Indices == SumV1Indices) { 12222 int NumV1OddIndices = 0, NumV2OddIndices = 0; 12223 for (int i = 0, Size = Mask.size(); i < Size; ++i) 12224 if (Mask[i] >= NumElements) 12225 NumV2OddIndices += i % 2; 12226 else if (Mask[i] >= 0) 12227 NumV1OddIndices += i % 2; 12228 if (NumV2OddIndices < NumV1OddIndices) 12229 return DAG.getCommutedVectorShuffle(*SVOp); 12230 } 12231 } 12232 } 12233 12234 // For each vector width, delegate to a specialized lowering routine. 12235 if (VT.is128BitVector()) 12236 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); 12237 12238 if (VT.is256BitVector()) 12239 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); 12240 12241 if (VT.is512BitVector()) 12242 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); 12243 12244 if (Is1BitVector) 12245 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); 12246 12247 llvm_unreachable("Unimplemented!"); 12248 } 12249 12250 /// \brief Try to lower a VSELECT instruction to a vector shuffle. 12251 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, 12252 const X86Subtarget &Subtarget, 12253 SelectionDAG &DAG) { 12254 SDValue Cond = Op.getOperand(0); 12255 SDValue LHS = Op.getOperand(1); 12256 SDValue RHS = Op.getOperand(2); 12257 SDLoc dl(Op); 12258 MVT VT = Op.getSimpleValueType(); 12259 12260 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) 12261 return SDValue(); 12262 auto *CondBV = cast<BuildVectorSDNode>(Cond); 12263 12264 // Only non-legal VSELECTs reach this lowering, convert those into generic 12265 // shuffles and re-use the shuffle lowering path for blends. 12266 SmallVector<int, 32> Mask; 12267 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { 12268 SDValue CondElt = CondBV->getOperand(i); 12269 Mask.push_back( 12270 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) 12271 : -1); 12272 } 12273 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); 12274 } 12275 12276 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { 12277 // A vselect where all conditions and data are constants can be optimized into 12278 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 12279 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && 12280 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && 12281 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) 12282 return SDValue(); 12283 12284 // Try to lower this to a blend-style vector shuffle. This can handle all 12285 // constant condition cases. 12286 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) 12287 return BlendOp; 12288 12289 // Variable blends are only legal from SSE4.1 onward. 
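  // (SSE4.1 added the variable blend instructions BLENDVPS, BLENDVPD and
  // PBLENDVB; without them a non-constant VSELECT has to be expanded.)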
12290 if (!Subtarget.hasSSE41()) 12291 return SDValue(); 12292 12293 // Only some types will be legal on some subtargets. If we can emit a legal 12294 // VSELECT-matching blend, return Op, and but if we need to expand, return 12295 // a null value. 12296 switch (Op.getSimpleValueType().SimpleTy) { 12297 default: 12298 // Most of the vector types have blends past SSE4.1. 12299 return Op; 12300 12301 case MVT::v32i8: 12302 // The byte blends for AVX vectors were introduced only in AVX2. 12303 if (Subtarget.hasAVX2()) 12304 return Op; 12305 12306 return SDValue(); 12307 12308 case MVT::v8i16: 12309 case MVT::v16i16: 12310 // AVX-512 BWI and VLX features support VSELECT with i16 elements. 12311 if (Subtarget.hasBWI() && Subtarget.hasVLX()) 12312 return Op; 12313 12314 // FIXME: We should custom lower this by fixing the condition and using i8 12315 // blends. 12316 return SDValue(); 12317 } 12318 } 12319 12320 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { 12321 MVT VT = Op.getSimpleValueType(); 12322 SDLoc dl(Op); 12323 12324 if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) 12325 return SDValue(); 12326 12327 if (VT.getSizeInBits() == 8) { 12328 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 12329 Op.getOperand(0), Op.getOperand(1)); 12330 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 12331 DAG.getValueType(VT)); 12332 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 12333 } 12334 12335 if (VT.getSizeInBits() == 16) { 12336 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 12337 if (isNullConstant(Op.getOperand(1))) 12338 return DAG.getNode( 12339 ISD::TRUNCATE, dl, MVT::i16, 12340 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 12341 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), 12342 Op.getOperand(1))); 12343 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 12344 Op.getOperand(0), Op.getOperand(1)); 12345 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 12346 DAG.getValueType(VT)); 12347 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 12348 } 12349 12350 if (VT == MVT::f32) { 12351 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 12352 // the result back to FR32 register. It's only worth matching if the 12353 // result has a single use which is a store or a bitcast to i32. And in 12354 // the case of a store, it's not worth it if the index is a constant 0, 12355 // because a MOVSSmr can be used instead, which is smaller and faster. 12356 if (!Op.hasOneUse()) 12357 return SDValue(); 12358 SDNode *User = *Op.getNode()->use_begin(); 12359 if ((User->getOpcode() != ISD::STORE || 12360 isNullConstant(Op.getOperand(1))) && 12361 (User->getOpcode() != ISD::BITCAST || 12362 User->getValueType(0) != MVT::i32)) 12363 return SDValue(); 12364 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 12365 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), 12366 Op.getOperand(1)); 12367 return DAG.getBitcast(MVT::f32, Extract); 12368 } 12369 12370 if (VT == MVT::i32 || VT == MVT::i64) { 12371 // ExtractPS/pextrq works with constant index. 12372 if (isa<ConstantSDNode>(Op.getOperand(1))) 12373 return Op; 12374 } 12375 return SDValue(); 12376 } 12377 12378 /// Extract one bit from mask vector, like v16i1 or v8i1. 12379 /// AVX-512 feature. 
12380 SDValue 12381 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { 12382 SDValue Vec = Op.getOperand(0); 12383 SDLoc dl(Vec); 12384 MVT VecVT = Vec.getSimpleValueType(); 12385 SDValue Idx = Op.getOperand(1); 12386 MVT EltVT = Op.getSimpleValueType(); 12387 12388 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); 12389 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && 12390 "Unexpected vector type in ExtractBitFromMaskVector"); 12391 12392 // variable index can't be handled in mask registers, 12393 // extend vector to VR512 12394 if (!isa<ConstantSDNode>(Idx)) { 12395 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); 12396 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); 12397 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 12398 ExtVT.getVectorElementType(), Ext, Idx); 12399 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 12400 } 12401 12402 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 12403 if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) { 12404 // Use kshiftlw/rw instruction. 12405 VecVT = MVT::v16i1; 12406 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, 12407 DAG.getUNDEF(VecVT), 12408 Vec, 12409 DAG.getIntPtrConstant(0, dl)); 12410 } 12411 unsigned MaxSift = VecVT.getVectorNumElements() - 1; 12412 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, 12413 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); 12414 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, 12415 DAG.getConstant(MaxSift, dl, MVT::i8)); 12416 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, 12417 DAG.getIntPtrConstant(0, dl)); 12418 } 12419 12420 SDValue 12421 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 12422 SelectionDAG &DAG) const { 12423 SDLoc dl(Op); 12424 SDValue Vec = Op.getOperand(0); 12425 MVT VecVT = Vec.getSimpleValueType(); 12426 SDValue Idx = Op.getOperand(1); 12427 12428 if (Op.getSimpleValueType() == MVT::i1) 12429 return ExtractBitFromMaskVector(Op, DAG); 12430 12431 if (!isa<ConstantSDNode>(Idx)) { 12432 if (VecVT.is512BitVector() || 12433 (VecVT.is256BitVector() && Subtarget.hasInt256() && 12434 VecVT.getVectorElementType().getSizeInBits() == 32)) { 12435 12436 MVT MaskEltVT = 12437 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); 12438 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / 12439 MaskEltVT.getSizeInBits()); 12440 12441 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); 12442 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12443 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, 12444 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, 12445 DAG.getConstant(0, dl, PtrVT)); 12446 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); 12447 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, 12448 DAG.getConstant(0, dl, PtrVT)); 12449 } 12450 return SDValue(); 12451 } 12452 12453 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 12454 12455 // If this is a 256-bit vector result, first extract the 128-bit vector and 12456 // then extract the element from the 128-bit vector. 12457 if (VecVT.is256BitVector() || VecVT.is512BitVector()) { 12458 // Get the 128-bit vector. 12459 Vec = extract128BitVector(Vec, IdxVal, DAG, dl); 12460 MVT EltVT = VecVT.getVectorElementType(); 12461 12462 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); 12463 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); 12464 12465 // Find IdxVal modulo ElemsPerChunk. 
Since ElemsPerChunk is a power of 2 12466 // this can be done with a mask. 12467 IdxVal &= ElemsPerChunk - 1; 12468 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 12469 DAG.getConstant(IdxVal, dl, MVT::i32)); 12470 } 12471 12472 assert(VecVT.is128BitVector() && "Unexpected vector length"); 12473 12474 if (Subtarget.hasSSE41()) 12475 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) 12476 return Res; 12477 12478 MVT VT = Op.getSimpleValueType(); 12479 // TODO: handle v16i8. 12480 if (VT.getSizeInBits() == 16) { 12481 if (IdxVal == 0) 12482 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 12483 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 12484 DAG.getBitcast(MVT::v4i32, Vec), Idx)); 12485 12486 // Transform it so it match pextrw which produces a 32-bit result. 12487 MVT EltVT = MVT::i32; 12488 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx); 12489 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 12490 DAG.getValueType(VT)); 12491 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 12492 } 12493 12494 if (VT.getSizeInBits() == 32) { 12495 if (IdxVal == 0) 12496 return Op; 12497 12498 // SHUFPS the element to the lowest double word, then movss. 12499 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 }; 12500 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); 12501 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 12502 DAG.getIntPtrConstant(0, dl)); 12503 } 12504 12505 if (VT.getSizeInBits() == 64) { 12506 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 12507 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 12508 // to match extract_elt for f64. 12509 if (IdxVal == 0) 12510 return Op; 12511 12512 // UNPCKHPD the element to the lowest double word, then movsd. 12513 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 12514 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 12515 int Mask[2] = { 1, -1 }; 12516 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); 12517 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 12518 DAG.getIntPtrConstant(0, dl)); 12519 } 12520 12521 return SDValue(); 12522 } 12523 12524 /// Insert one bit to mask vector, like v16i1 or v8i1. 12525 /// AVX-512 feature. 12526 SDValue 12527 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { 12528 SDLoc dl(Op); 12529 SDValue Vec = Op.getOperand(0); 12530 SDValue Elt = Op.getOperand(1); 12531 SDValue Idx = Op.getOperand(2); 12532 MVT VecVT = Vec.getSimpleValueType(); 12533 12534 if (!isa<ConstantSDNode>(Idx)) { 12535 // Non constant index. Extend source and destination, 12536 // insert element and then truncate the result. 12537 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); 12538 MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); 12539 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 12540 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), 12541 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); 12542 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); 12543 } 12544 12545 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 12546 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); 12547 if (IdxVal) 12548 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, 12549 DAG.getConstant(IdxVal, dl, MVT::i8)); 12550 if (Vec.isUndef()) 12551 return EltInVec; 12552 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); 12553 } 12554 12555 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 12556 SelectionDAG &DAG) const { 12557 MVT VT = Op.getSimpleValueType(); 12558 MVT EltVT = VT.getVectorElementType(); 12559 unsigned NumElts = VT.getVectorNumElements(); 12560 12561 if (EltVT == MVT::i1) 12562 return InsertBitToMaskVector(Op, DAG); 12563 12564 SDLoc dl(Op); 12565 SDValue N0 = Op.getOperand(0); 12566 SDValue N1 = Op.getOperand(1); 12567 SDValue N2 = Op.getOperand(2); 12568 if (!isa<ConstantSDNode>(N2)) 12569 return SDValue(); 12570 auto *N2C = cast<ConstantSDNode>(N2); 12571 unsigned IdxVal = N2C->getZExtValue(); 12572 12573 // If we are clearing out a element, we do this more efficiently with a 12574 // blend shuffle than a costly integer insertion. 12575 // TODO: would other rematerializable values (e.g. allbits) benefit as well? 12576 // TODO: pre-SSE41 targets will tend to use bit masking - this could still 12577 // be beneficial if we are inserting several zeros and can combine the masks. 12578 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) { 12579 SmallVector<int, 8> ClearMask; 12580 for (unsigned i = 0; i != NumElts; ++i) 12581 ClearMask.push_back(i == IdxVal ? i + NumElts : i); 12582 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl); 12583 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask); 12584 } 12585 12586 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert 12587 // into that, and then insert the subvector back into the result. 12588 if (VT.is256BitVector() || VT.is512BitVector()) { 12589 // With a 256-bit vector, we can insert into the zero element efficiently 12590 // using a blend if we have AVX or AVX2 and the right data type. 12591 if (VT.is256BitVector() && IdxVal == 0) { 12592 // TODO: It is worthwhile to cast integer to floating point and back 12593 // and incur a domain crossing penalty if that's what we'll end up 12594 // doing anyway after extracting to a 128-bit vector. 12595 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || 12596 (Subtarget.hasAVX2() && EltVT == MVT::i32)) { 12597 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); 12598 N2 = DAG.getIntPtrConstant(1, dl); 12599 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); 12600 } 12601 } 12602 12603 // Get the desired 128-bit vector chunk. 12604 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); 12605 12606 // Insert the element into the desired chunk. 12607 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); 12608 assert(isPowerOf2_32(NumEltsIn128)); 12609 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. 
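  // E.g. when inserting element 9 of a v16i32, NumEltsIn128 is 4 and the
  // element goes into its 128-bit chunk at index 9 & 3 == 1.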
12610 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); 12611 12612 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 12613 DAG.getConstant(IdxIn128, dl, MVT::i32)); 12614 12615 // Insert the changed part back into the bigger vector 12616 return insert128BitVector(N0, V, IdxVal, DAG, dl); 12617 } 12618 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); 12619 12620 if (Subtarget.hasSSE41()) { 12621 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { 12622 unsigned Opc; 12623 if (VT == MVT::v8i16) { 12624 Opc = X86ISD::PINSRW; 12625 } else { 12626 assert(VT == MVT::v16i8); 12627 Opc = X86ISD::PINSRB; 12628 } 12629 12630 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 12631 // argument. 12632 if (N1.getValueType() != MVT::i32) 12633 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 12634 if (N2.getValueType() != MVT::i32) 12635 N2 = DAG.getIntPtrConstant(IdxVal, dl); 12636 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 12637 } 12638 12639 if (EltVT == MVT::f32) { 12640 // Bits [7:6] of the constant are the source select. This will always be 12641 // zero here. The DAG Combiner may combine an extract_elt index into 12642 // these bits. For example (insert (extract, 3), 2) could be matched by 12643 // putting the '3' into bits [7:6] of X86ISD::INSERTPS. 12644 // Bits [5:4] of the constant are the destination select. This is the 12645 // value of the incoming immediate. 12646 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 12647 // combine either bitwise AND or insert of float 0.0 to set these bits. 12648 12649 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); 12650 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { 12651 // If this is an insertion of 32-bits into the low 32-bits of 12652 // a vector, we prefer to generate a blend with immediate rather 12653 // than an insertps. Blends are simpler operations in hardware and so 12654 // will always have equal or better performance than insertps. 12655 // But if optimizing for size and there's a load folding opportunity, 12656 // generate insertps because blendps does not have a 32-bit memory 12657 // operand form. 12658 N2 = DAG.getIntPtrConstant(1, dl); 12659 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 12660 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); 12661 } 12662 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); 12663 // Create this as a scalar to vector.. 12664 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 12665 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 12666 } 12667 12668 if (EltVT == MVT::i32 || EltVT == MVT::i64) { 12669 // PINSR* works with constant index. 12670 return Op; 12671 } 12672 } 12673 12674 if (EltVT == MVT::i8) 12675 return SDValue(); 12676 12677 if (EltVT.getSizeInBits() == 16) { 12678 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 12679 // as its second argument. 
12680 if (N1.getValueType() != MVT::i32) 12681 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 12682 if (N2.getValueType() != MVT::i32) 12683 N2 = DAG.getIntPtrConstant(IdxVal, dl); 12684 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 12685 } 12686 return SDValue(); 12687 } 12688 12689 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 12690 SDLoc dl(Op); 12691 MVT OpVT = Op.getSimpleValueType(); 12692 12693 // If this is a 256-bit vector result, first insert into a 128-bit 12694 // vector and then insert into the 256-bit vector. 12695 if (!OpVT.is128BitVector()) { 12696 // Insert into a 128-bit vector. 12697 unsigned SizeFactor = OpVT.getSizeInBits()/128; 12698 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), 12699 OpVT.getVectorNumElements() / SizeFactor); 12700 12701 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 12702 12703 // Insert the 128-bit vector. 12704 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 12705 } 12706 12707 if (OpVT == MVT::v1i64 && 12708 Op.getOperand(0).getValueType() == MVT::i64) 12709 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 12710 12711 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 12712 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 12713 return DAG.getBitcast( 12714 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); 12715 } 12716 12717 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 12718 // a simple subregister reference or explicit instructions to grab 12719 // upper bits of a vector. 12720 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, 12721 SelectionDAG &DAG) { 12722 SDLoc dl(Op); 12723 SDValue In = Op.getOperand(0); 12724 SDValue Idx = Op.getOperand(1); 12725 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 12726 MVT ResVT = Op.getSimpleValueType(); 12727 MVT InVT = In.getSimpleValueType(); 12728 12729 if (Subtarget.hasFp256()) { 12730 if (ResVT.is128BitVector() && 12731 (InVT.is256BitVector() || InVT.is512BitVector()) && 12732 isa<ConstantSDNode>(Idx)) { 12733 return extract128BitVector(In, IdxVal, DAG, dl); 12734 } 12735 if (ResVT.is256BitVector() && InVT.is512BitVector() && 12736 isa<ConstantSDNode>(Idx)) { 12737 return extract256BitVector(In, IdxVal, DAG, dl); 12738 } 12739 } 12740 return SDValue(); 12741 } 12742 12743 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 12744 // simple superregister reference or explicit instructions to insert 12745 // the upper bits of a vector. 
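// With AVX this typically selects to a VINSERTF128/VINSERTI128 (or the wider
// AVX-512 insert forms for 512-bit results); i1 mask vectors take a separate
// path through insert1BitVector.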
12746 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, 12747 SelectionDAG &DAG) { 12748 if (!Subtarget.hasAVX()) 12749 return SDValue(); 12750 12751 SDLoc dl(Op); 12752 SDValue Vec = Op.getOperand(0); 12753 SDValue SubVec = Op.getOperand(1); 12754 SDValue Idx = Op.getOperand(2); 12755 12756 if (!isa<ConstantSDNode>(Idx)) 12757 return SDValue(); 12758 12759 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 12760 MVT OpVT = Op.getSimpleValueType(); 12761 MVT SubVecVT = SubVec.getSimpleValueType(); 12762 12763 // Fold two 16-byte subvector loads into one 32-byte load: 12764 // (insert_subvector (insert_subvector undef, (load addr), 0), 12765 // (load addr + 16), Elts/2) 12766 // --> load32 addr 12767 if ((IdxVal == OpVT.getVectorNumElements() / 2) && 12768 Vec.getOpcode() == ISD::INSERT_SUBVECTOR && 12769 OpVT.is256BitVector() && SubVecVT.is128BitVector()) { 12770 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); 12771 if (Idx2 && Idx2->getZExtValue() == 0) { 12772 // If needed, look through bitcasts to get to the load. 12773 SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1)); 12774 if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { 12775 bool Fast; 12776 unsigned Alignment = FirstLd->getAlignment(); 12777 unsigned AS = FirstLd->getAddressSpace(); 12778 const X86TargetLowering *TLI = Subtarget.getTargetLowering(); 12779 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), 12780 OpVT, AS, Alignment, &Fast) && Fast) { 12781 SDValue Ops[] = { SubVec2, SubVec }; 12782 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) 12783 return Ld; 12784 } 12785 } 12786 } 12787 } 12788 12789 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && 12790 SubVecVT.is128BitVector()) 12791 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); 12792 12793 if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) 12794 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); 12795 12796 if (OpVT.getVectorElementType() == MVT::i1) 12797 return insert1BitVector(Op, DAG, Subtarget); 12798 12799 return SDValue(); 12800 } 12801 12802 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 12803 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 12804 // one of the above mentioned nodes. It has to be wrapped because otherwise 12805 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 12806 // be used to form addressing mode. These wrapped nodes will be selected 12807 // into MOV32ri. 12808 SDValue 12809 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 12810 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 12811 12812 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 12813 // global base reg. 12814 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); 12815 unsigned WrapperKind = X86ISD::Wrapper; 12816 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12817 12818 if (Subtarget.isPICStyleRIPRel() && 12819 (M == CodeModel::Small || M == CodeModel::Kernel)) 12820 WrapperKind = X86ISD::WrapperRIP; 12821 12822 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12823 SDValue Result = DAG.getTargetConstantPool( 12824 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); 12825 SDLoc DL(CP); 12826 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); 12827 // With PIC, the address is actually $g + Offset. 
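  // Here $g is the PIC global base register materialized by the
  // X86ISD::GlobalBaseReg node, and Offset is the wrapped constant-pool
  // address computed above.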
12828 if (OpFlag) { 12829 Result = 12830 DAG.getNode(ISD::ADD, DL, PtrVT, 12831 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); 12832 } 12833 12834 return Result; 12835 } 12836 12837 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 12838 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 12839 12840 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 12841 // global base reg. 12842 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); 12843 unsigned WrapperKind = X86ISD::Wrapper; 12844 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12845 12846 if (Subtarget.isPICStyleRIPRel() && 12847 (M == CodeModel::Small || M == CodeModel::Kernel)) 12848 WrapperKind = X86ISD::WrapperRIP; 12849 12850 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12851 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); 12852 SDLoc DL(JT); 12853 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); 12854 12855 // With PIC, the address is actually $g + Offset. 12856 if (OpFlag) 12857 Result = 12858 DAG.getNode(ISD::ADD, DL, PtrVT, 12859 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); 12860 12861 return Result; 12862 } 12863 12864 SDValue 12865 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 12866 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 12867 12868 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 12869 // global base reg. 12870 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); 12871 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); 12872 unsigned WrapperKind = X86ISD::Wrapper; 12873 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12874 12875 if (Subtarget.isPICStyleRIPRel() && 12876 (M == CodeModel::Small || M == CodeModel::Kernel)) 12877 WrapperKind = X86ISD::WrapperRIP; 12878 12879 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12880 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); 12881 12882 SDLoc DL(Op); 12883 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); 12884 12885 // With PIC, the address is actually $g + Offset. 12886 if (isPositionIndependent() && !Subtarget.is64Bit()) { 12887 Result = 12888 DAG.getNode(ISD::ADD, DL, PtrVT, 12889 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); 12890 } 12891 12892 // For symbols that require a load from a stub to get the address, emit the 12893 // load. 12894 if (isGlobalStubReference(OpFlag)) 12895 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 12896 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 12897 false, false, false, 0); 12898 12899 return Result; 12900 } 12901 12902 SDValue 12903 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 12904 // Create the TargetBlockAddressAddress node. 
12905 unsigned char OpFlags = 12906 Subtarget.classifyBlockAddressReference(); 12907 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12908 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 12909 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); 12910 SDLoc dl(Op); 12911 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12912 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); 12913 12914 if (Subtarget.isPICStyleRIPRel() && 12915 (M == CodeModel::Small || M == CodeModel::Kernel)) 12916 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); 12917 else 12918 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); 12919 12920 // With PIC, the address is actually $g + Offset. 12921 if (isGlobalRelativeToPICBase(OpFlags)) { 12922 Result = DAG.getNode(ISD::ADD, dl, PtrVT, 12923 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); 12924 } 12925 12926 return Result; 12927 } 12928 12929 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, 12930 const SDLoc &dl, int64_t Offset, 12931 SelectionDAG &DAG) const { 12932 // Create the TargetGlobalAddress node, folding in the constant 12933 // offset if it is legal. 12934 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); 12935 CodeModel::Model M = DAG.getTarget().getCodeModel(); 12936 auto PtrVT = getPointerTy(DAG.getDataLayout()); 12937 SDValue Result; 12938 if (OpFlags == X86II::MO_NO_FLAG && 12939 X86::isOffsetSuitableForCodeModel(Offset, M)) { 12940 // A direct static reference to a global. 12941 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); 12942 Offset = 0; 12943 } else { 12944 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); 12945 } 12946 12947 if (Subtarget.isPICStyleRIPRel() && 12948 (M == CodeModel::Small || M == CodeModel::Kernel)) 12949 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); 12950 else 12951 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); 12952 12953 // With PIC, the address is actually $g + Offset. 12954 if (isGlobalRelativeToPICBase(OpFlags)) { 12955 Result = DAG.getNode(ISD::ADD, dl, PtrVT, 12956 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); 12957 } 12958 12959 // For globals that require a load from a stub to get the address, emit the 12960 // load. 12961 if (isGlobalStubReference(OpFlags)) 12962 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 12963 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 12964 false, false, false, 0); 12965 12966 // If there was a non-zero offset that we didn't fold, create an explicit 12967 // addition for it. 
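  // This happens when the reference needs a stub/GOT indirection or the offset
  // is not representable under the current code model, in which case the
  // TargetGlobalAddress above was created with an offset of zero.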
12968 if (Offset != 0) 12969 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, 12970 DAG.getConstant(Offset, dl, PtrVT)); 12971 12972 return Result; 12973 } 12974 12975 SDValue 12976 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 12977 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 12978 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 12979 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); 12980 } 12981 12982 static SDValue 12983 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 12984 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 12985 unsigned char OperandFlags, bool LocalDynamic = false) { 12986 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 12987 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 12988 SDLoc dl(GA); 12989 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 12990 GA->getValueType(0), 12991 GA->getOffset(), 12992 OperandFlags); 12993 12994 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 12995 : X86ISD::TLSADDR; 12996 12997 if (InFlag) { 12998 SDValue Ops[] = { Chain, TGA, *InFlag }; 12999 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 13000 } else { 13001 SDValue Ops[] = { Chain, TGA }; 13002 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 13003 } 13004 13005 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 13006 MFI->setAdjustsStack(true); 13007 MFI->setHasCalls(true); 13008 13009 SDValue Flag = Chain.getValue(1); 13010 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 13011 } 13012 13013 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 13014 static SDValue 13015 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 13016 const EVT PtrVT) { 13017 SDValue InFlag; 13018 SDLoc dl(GA); // ? function entry point might be better 13019 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 13020 DAG.getNode(X86ISD::GlobalBaseReg, 13021 SDLoc(), PtrVT), InFlag); 13022 InFlag = Chain.getValue(1); 13023 13024 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 13025 } 13026 13027 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 13028 static SDValue 13029 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 13030 const EVT PtrVT) { 13031 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, 13032 X86::RAX, X86II::MO_TLSGD); 13033 } 13034 13035 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 13036 SelectionDAG &DAG, 13037 const EVT PtrVT, 13038 bool is64Bit) { 13039 SDLoc dl(GA); 13040 13041 // Get the start address of the TLS block for this module. 13042 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 13043 .getInfo<X86MachineFunctionInfo>(); 13044 MFI->incNumLocalDynamicTLSAccesses(); 13045 13046 SDValue Base; 13047 if (is64Bit) { 13048 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, 13049 X86II::MO_TLSLD, /*LocalDynamic=*/true); 13050 } else { 13051 SDValue InFlag; 13052 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 13053 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); 13054 InFlag = Chain.getValue(1); 13055 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 13056 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 13057 } 13058 13059 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 13060 // of Base. 13061 13062 // Build x@dtpoff. 
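  // (x@dtpoff is the variable's offset from the start of this module's TLS
  // block, so the whole local-dynamic sequence is roughly, on 32-bit ELF:
  //    leal  x@tlsldm(%ebx), %eax
  //    call  ___tls_get_addr@PLT     ; returns the module's TLS base
  //    leal  x@dtpoff(%eax), %eax    ; base + offset = &x
  //  with the call having been emitted by GetTLSADDR above.)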
13063 unsigned char OperandFlags = X86II::MO_DTPOFF; 13064 unsigned WrapperKind = X86ISD::Wrapper; 13065 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 13066 GA->getValueType(0), 13067 GA->getOffset(), OperandFlags); 13068 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 13069 13070 // Add x@dtpoff with the base. 13071 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 13072 } 13073 13074 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 13075 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 13076 const EVT PtrVT, TLSModel::Model model, 13077 bool is64Bit, bool isPIC) { 13078 SDLoc dl(GA); 13079 13080 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 13081 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 13082 is64Bit ? 257 : 256)); 13083 13084 SDValue ThreadPointer = 13085 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), 13086 MachinePointerInfo(Ptr), false, false, false, 0); 13087 13088 unsigned char OperandFlags = 0; 13089 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 13090 // initialexec. 13091 unsigned WrapperKind = X86ISD::Wrapper; 13092 if (model == TLSModel::LocalExec) { 13093 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 13094 } else if (model == TLSModel::InitialExec) { 13095 if (is64Bit) { 13096 OperandFlags = X86II::MO_GOTTPOFF; 13097 WrapperKind = X86ISD::WrapperRIP; 13098 } else { 13099 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 13100 } 13101 } else { 13102 llvm_unreachable("Unexpected model"); 13103 } 13104 13105 // emit "addl x@ntpoff,%eax" (local exec) 13106 // or "addl x@indntpoff,%eax" (initial exec) 13107 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 13108 SDValue TGA = 13109 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), 13110 GA->getOffset(), OperandFlags); 13111 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 13112 13113 if (model == TLSModel::InitialExec) { 13114 if (isPIC && !is64Bit) { 13115 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 13116 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), 13117 Offset); 13118 } 13119 13120 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 13121 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 13122 false, false, false, 0); 13123 } 13124 13125 // The address of the thread local variable is the add of the thread 13126 // pointer with the offset of the variable. 
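  // For example, the local-exec case on 32-bit ELF ends up as roughly:
  //    movl %gs:0, %eax
  //    leal x@ntpoff(%eax), %eax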
13127 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 13128 } 13129 13130 SDValue 13131 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 13132 13133 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 13134 13135 if (DAG.getTarget().Options.EmulatedTLS) 13136 return LowerToTLSEmulatedModel(GA, DAG); 13137 13138 const GlobalValue *GV = GA->getGlobal(); 13139 auto PtrVT = getPointerTy(DAG.getDataLayout()); 13140 bool PositionIndependent = isPositionIndependent(); 13141 13142 if (Subtarget.isTargetELF()) { 13143 TLSModel::Model model = DAG.getTarget().getTLSModel(GV); 13144 switch (model) { 13145 case TLSModel::GeneralDynamic: 13146 if (Subtarget.is64Bit()) 13147 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); 13148 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); 13149 case TLSModel::LocalDynamic: 13150 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, 13151 Subtarget.is64Bit()); 13152 case TLSModel::InitialExec: 13153 case TLSModel::LocalExec: 13154 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), 13155 PositionIndependent); 13156 } 13157 llvm_unreachable("Unknown TLS model."); 13158 } 13159 13160 if (Subtarget.isTargetDarwin()) { 13161 // Darwin only has one model of TLS. Lower to that. 13162 unsigned char OpFlag = 0; 13163 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? 13164 X86ISD::WrapperRIP : X86ISD::Wrapper; 13165 13166 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 13167 // global base reg. 13168 bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); 13169 if (PIC32) 13170 OpFlag = X86II::MO_TLVP_PIC_BASE; 13171 else 13172 OpFlag = X86II::MO_TLVP; 13173 SDLoc DL(Op); 13174 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 13175 GA->getValueType(0), 13176 GA->getOffset(), OpFlag); 13177 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); 13178 13179 // With PIC32, the address is actually $g + Offset. 13180 if (PIC32) 13181 Offset = DAG.getNode(ISD::ADD, DL, PtrVT, 13182 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), 13183 Offset); 13184 13185 // Lowering the machine isd will make sure everything is in the right 13186 // location. 13187 SDValue Chain = DAG.getEntryNode(); 13188 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 13189 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); 13190 SDValue Args[] = { Chain, Offset }; 13191 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); 13192 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), 13193 DAG.getIntPtrConstant(0, DL, true), 13194 Chain.getValue(1), DL); 13195 13196 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 13197 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 13198 MFI->setAdjustsStack(true); 13199 13200 // And our return value (tls address) is in the standard call return value 13201 // location. 13202 unsigned Reg = Subtarget.is64Bit() ? 
                                            X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                ; from TEB
    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    //   Windows 64bit: gs:0x58
    //   Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
                    false, false, 0);

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32, false, false,
                             false, 0);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
                          false, false, 0);

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
                      false, 0);

    // Get the offset of the start of the .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
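/// For example, an i64 shift on a 32-bit target is split into an i32 pair
/// (Lo, Hi). For SHL_PARTS the computed result is roughly
///   Hi = shld(Hi, Lo, Amt), Lo = Lo << (Amt & 31)
/// with a CMOV on (Amt & 32) selecting (Lo << (Amt & 31), 0) instead once the
/// shift amount reaches the width of one part, mirroring the code below.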
13283 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { 13284 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 13285 MVT VT = Op.getSimpleValueType(); 13286 unsigned VTBits = VT.getSizeInBits(); 13287 SDLoc dl(Op); 13288 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 13289 SDValue ShOpLo = Op.getOperand(0); 13290 SDValue ShOpHi = Op.getOperand(1); 13291 SDValue ShAmt = Op.getOperand(2); 13292 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the 13293 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away 13294 // during isel. 13295 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 13296 DAG.getConstant(VTBits - 1, dl, MVT::i8)); 13297 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 13298 DAG.getConstant(VTBits - 1, dl, MVT::i8)) 13299 : DAG.getConstant(0, dl, VT); 13300 13301 SDValue Tmp2, Tmp3; 13302 if (Op.getOpcode() == ISD::SHL_PARTS) { 13303 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 13304 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); 13305 } else { 13306 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 13307 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); 13308 } 13309 13310 // If the shift amount is larger or equal than the width of a part we can't 13311 // rely on the results of shld/shrd. Insert a test and select the appropriate 13312 // values for large shift amounts. 13313 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 13314 DAG.getConstant(VTBits, dl, MVT::i8)); 13315 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 13316 AndNode, DAG.getConstant(0, dl, MVT::i8)); 13317 13318 SDValue Hi, Lo; 13319 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); 13320 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 13321 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 13322 13323 if (Op.getOpcode() == ISD::SHL_PARTS) { 13324 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 13325 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 13326 } else { 13327 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 13328 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 13329 } 13330 13331 SDValue Ops[2] = { Lo, Hi }; 13332 return DAG.getMergeValues(Ops, dl); 13333 } 13334 13335 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 13336 SelectionDAG &DAG) const { 13337 SDValue Src = Op.getOperand(0); 13338 MVT SrcVT = Src.getSimpleValueType(); 13339 MVT VT = Op.getSimpleValueType(); 13340 SDLoc dl(Op); 13341 13342 if (SrcVT.isVector()) { 13343 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { 13344 return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, 13345 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, 13346 DAG.getUNDEF(SrcVT))); 13347 } 13348 if (SrcVT.getVectorElementType() == MVT::i1) { 13349 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); 13350 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 13351 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); 13352 } 13353 return SDValue(); 13354 } 13355 13356 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && 13357 "Unknown SINT_TO_FP to lower!"); 13358 13359 // These are really Legal; return the operand so the caller accepts it as 13360 // Legal. 
13361 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 13362 return Op; 13363 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 13364 Subtarget.is64Bit()) { 13365 return Op; 13366 } 13367 13368 SDValue ValueToStore = Op.getOperand(0); 13369 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 13370 !Subtarget.is64Bit()) 13371 // Bitcasting to f64 here allows us to do a single 64-bit store from 13372 // an SSE register, avoiding the store forwarding penalty that would come 13373 // with two 32-bit stores. 13374 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); 13375 13376 unsigned Size = SrcVT.getSizeInBits()/8; 13377 MachineFunction &MF = DAG.getMachineFunction(); 13378 auto PtrVT = getPointerTy(MF.getDataLayout()); 13379 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 13380 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 13381 SDValue Chain = DAG.getStore( 13382 DAG.getEntryNode(), dl, ValueToStore, StackSlot, 13383 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, 13384 false, 0); 13385 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 13386 } 13387 13388 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 13389 SDValue StackSlot, 13390 SelectionDAG &DAG) const { 13391 // Build the FILD 13392 SDLoc DL(Op); 13393 SDVTList Tys; 13394 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 13395 if (useSSE) 13396 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 13397 else 13398 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 13399 13400 unsigned ByteSize = SrcVT.getSizeInBits()/8; 13401 13402 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 13403 MachineMemOperand *MMO; 13404 if (FI) { 13405 int SSFI = FI->getIndex(); 13406 MMO = DAG.getMachineFunction().getMachineMemOperand( 13407 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), 13408 MachineMemOperand::MOLoad, ByteSize, ByteSize); 13409 } else { 13410 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 13411 StackSlot = StackSlot.getOperand(1); 13412 } 13413 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 13414 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 13415 X86ISD::FILD, DL, 13416 Tys, Ops, SrcVT, MMO); 13417 13418 if (useSSE) { 13419 Chain = Result.getValue(1); 13420 SDValue InFlag = Result.getValue(2); 13421 13422 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 13423 // shouldn't be necessary except that RFP cannot be live across 13424 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
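    // What follows is, in effect, an x87 -> SSE hand-off: the FILD result
    // lives in an x87 register, so it is stored out to a fresh stack slot
    // (X86ISD::FST) and immediately reloaded into an SSE register.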
13425 MachineFunction &MF = DAG.getMachineFunction(); 13426 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 13427 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 13428 auto PtrVT = getPointerTy(MF.getDataLayout()); 13429 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 13430 Tys = DAG.getVTList(MVT::Other); 13431 SDValue Ops[] = { 13432 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 13433 }; 13434 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( 13435 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), 13436 MachineMemOperand::MOStore, SSFISize, SSFISize); 13437 13438 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 13439 Ops, Op.getValueType(), MMO); 13440 Result = DAG.getLoad( 13441 Op.getValueType(), DL, Chain, StackSlot, 13442 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), 13443 false, false, false, 0); 13444 } 13445 13446 return Result; 13447 } 13448 13449 /// 64-bit unsigned integer to double expansion. 13450 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 13451 SelectionDAG &DAG) const { 13452 // This algorithm is not obvious. Here it is what we're trying to output: 13453 /* 13454 movq %rax, %xmm0 13455 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 13456 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 13457 #ifdef __SSE3__ 13458 haddpd %xmm0, %xmm0 13459 #else 13460 pshufd $0x4e, %xmm0, %xmm1 13461 addpd %xmm1, %xmm0 13462 #endif 13463 */ 13464 13465 SDLoc dl(Op); 13466 LLVMContext *Context = DAG.getContext(); 13467 13468 // Build some magic constants. 13469 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 13470 Constant *C0 = ConstantDataVector::get(*Context, CV0); 13471 auto PtrVT = getPointerTy(DAG.getDataLayout()); 13472 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); 13473 13474 SmallVector<Constant*,2> CV1; 13475 CV1.push_back( 13476 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 13477 APInt(64, 0x4330000000000000ULL)))); 13478 CV1.push_back( 13479 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 13480 APInt(64, 0x4530000000000000ULL)))); 13481 Constant *C1 = ConstantVector::get(CV1); 13482 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); 13483 13484 // Load the 64-bit value into an XMM register. 13485 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 13486 Op.getOperand(0)); 13487 SDValue CLod0 = 13488 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 13489 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 13490 false, false, false, 16); 13491 SDValue Unpck1 = 13492 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); 13493 13494 SDValue CLod1 = 13495 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 13496 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 13497 false, false, false, 16); 13498 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); 13499 // TODO: Are there any fast-math-flags to propagate here? 13500 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 13501 SDValue Result; 13502 13503 if (Subtarget.hasSSE3()) { 13504 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 
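    // (At this point the two v2f64 lanes hold (double)lo and
    // (double)hi * 2^32 exactly, so a horizontal add of the two lanes
    // produces the final value.)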
13505 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 13506 } else { 13507 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); 13508 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 13509 S2F, 0x4E, DAG); 13510 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 13511 DAG.getBitcast(MVT::v2f64, Shuffle), Sub); 13512 } 13513 13514 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 13515 DAG.getIntPtrConstant(0, dl)); 13516 } 13517 13518 /// 32-bit unsigned integer to float expansion. 13519 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 13520 SelectionDAG &DAG) const { 13521 SDLoc dl(Op); 13522 // FP constant to bias correct the final result. 13523 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, 13524 MVT::f64); 13525 13526 // Load the 32-bit value into an XMM register. 13527 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 13528 Op.getOperand(0)); 13529 13530 // Zero out the upper parts of the register. 13531 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 13532 13533 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 13534 DAG.getBitcast(MVT::v2f64, Load), 13535 DAG.getIntPtrConstant(0, dl)); 13536 13537 // Or the load with the bias. 13538 SDValue Or = DAG.getNode( 13539 ISD::OR, dl, MVT::v2i64, 13540 DAG.getBitcast(MVT::v2i64, 13541 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), 13542 DAG.getBitcast(MVT::v2i64, 13543 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); 13544 Or = 13545 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 13546 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); 13547 13548 // Subtract the bias. 13549 // TODO: Are there any fast-math-flags to propagate here? 13550 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 13551 13552 // Handle final rounding. 13553 MVT DestVT = Op.getSimpleValueType(); 13554 13555 if (DestVT.bitsLT(MVT::f64)) 13556 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 13557 DAG.getIntPtrConstant(0, dl)); 13558 if (DestVT.bitsGT(MVT::f64)) 13559 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 13560 13561 // Handle final rounding. 13562 return Sub; 13563 } 13564 13565 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, 13566 const X86Subtarget &Subtarget) { 13567 // The algorithm is the following: 13568 // #ifdef __SSE4_1__ 13569 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); 13570 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), 13571 // (uint4) 0x53000000, 0xaa); 13572 // #else 13573 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; 13574 // uint4 hi = (v >> 16) | (uint4) 0x53000000; 13575 // #endif 13576 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); 13577 // return (float4) lo + fhi; 13578 13579 // We shouldn't use it when unsafe-fp-math is enabled though: we might later 13580 // reassociate the two FADDs, and if we do that, the algorithm fails 13581 // spectacularly (PR24512). 13582 // FIXME: If we ever have some kind of Machine FMF, this should be marked 13583 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because 13584 // there's also the MachineCombiner reassociations happening on Machine IR. 13585 if (DAG.getTarget().Options.UnsafeFPMath) 13586 return SDValue(); 13587 13588 SDLoc DL(Op); 13589 SDValue V = Op->getOperand(0); 13590 MVT VecIntVT = V.getSimpleValueType(); 13591 bool Is128 = VecIntVT == MVT::v4i32; 13592 MVT VecFloatVT = Is128 ? 
                               MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);

  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
13652 SDValue FHigh = 13653 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); 13654 // return (float4) lo + fhi; 13655 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); 13656 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); 13657 } 13658 13659 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, 13660 SelectionDAG &DAG) const { 13661 SDValue N0 = Op.getOperand(0); 13662 MVT SVT = N0.getSimpleValueType(); 13663 SDLoc dl(Op); 13664 13665 switch (SVT.SimpleTy) { 13666 default: 13667 llvm_unreachable("Custom UINT_TO_FP is not supported!"); 13668 case MVT::v4i8: 13669 case MVT::v4i16: 13670 case MVT::v8i8: 13671 case MVT::v8i16: { 13672 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); 13673 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 13674 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); 13675 } 13676 case MVT::v4i32: 13677 case MVT::v8i32: 13678 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); 13679 case MVT::v16i8: 13680 case MVT::v16i16: 13681 assert(Subtarget.hasAVX512()); 13682 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), 13683 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); 13684 } 13685 } 13686 13687 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 13688 SelectionDAG &DAG) const { 13689 SDValue N0 = Op.getOperand(0); 13690 SDLoc dl(Op); 13691 auto PtrVT = getPointerTy(DAG.getDataLayout()); 13692 13693 if (Op.getSimpleValueType().isVector()) 13694 return lowerUINT_TO_FP_vec(Op, DAG); 13695 13696 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 13697 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 13698 // the optimization here. 13699 if (DAG.SignBitIsZero(N0)) 13700 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 13701 13702 MVT SrcVT = N0.getSimpleValueType(); 13703 MVT DstVT = Op.getSimpleValueType(); 13704 13705 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && 13706 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { 13707 // Conversions from unsigned i32 to f32/f64 are legal, 13708 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. 13709 return Op; 13710 } 13711 13712 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 13713 return LowerUINT_TO_FP_i64(Op, DAG); 13714 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 13715 return LowerUINT_TO_FP_i32(Op, DAG); 13716 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 13717 return SDValue(); 13718 13719 // Make a 64-bit buffer, and use it to build an FILD. 13720 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 13721 if (SrcVT == MVT::i32) { 13722 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); 13723 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 13724 StackSlot, MachinePointerInfo(), 13725 false, false, 0); 13726 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), 13727 OffsetSlot, MachinePointerInfo(), 13728 false, false, 0); 13729 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 13730 return Fild; 13731 } 13732 13733 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 13734 SDValue ValueToStore = Op.getOperand(0); 13735 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) 13736 // Bitcasting to f64 here allows us to do a single 64-bit store from 13737 // an SSE register, avoiding the store forwarding penalty that would come 13738 // with two 32-bit stores. 
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      false, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl));
}

// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
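// For example, an f64 -> i64 FP_TO_SINT with SSE2 on a 32-bit target is not
// legal and cannot be finished here, so this returns <FIST, StackSlot>, where
// FIST is an X86ISD::FP_TO_INT64_IN_MEM chain that stores the result into
// StackSlot; LowerFP_TO_SINT then issues the load.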
13798 std::pair<SDValue,SDValue> 13799 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, 13800 bool IsSigned, bool IsReplace) const { 13801 SDLoc DL(Op); 13802 13803 EVT DstTy = Op.getValueType(); 13804 EVT TheVT = Op.getOperand(0).getValueType(); 13805 auto PtrVT = getPointerTy(DAG.getDataLayout()); 13806 13807 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { 13808 // f16 must be promoted before using the lowering in this routine. 13809 // fp128 does not use this lowering. 13810 return std::make_pair(SDValue(), SDValue()); 13811 } 13812 13813 // If using FIST to compute an unsigned i64, we'll need some fixup 13814 // to handle values above the maximum signed i64. A FIST is always 13815 // used for the 32-bit subtarget, but also for f80 on a 64-bit target. 13816 bool UnsignedFixup = !IsSigned && 13817 DstTy == MVT::i64 && 13818 (!Subtarget.is64Bit() || 13819 !isScalarFPTypeInSSEReg(TheVT)); 13820 13821 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { 13822 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. 13823 // The low 32 bits of the fist result will have the correct uint32 result. 13824 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 13825 DstTy = MVT::i64; 13826 } 13827 13828 assert(DstTy.getSimpleVT() <= MVT::i64 && 13829 DstTy.getSimpleVT() >= MVT::i16 && 13830 "Unknown FP_TO_INT to lower!"); 13831 13832 // These are really Legal. 13833 if (DstTy == MVT::i32 && 13834 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 13835 return std::make_pair(SDValue(), SDValue()); 13836 if (Subtarget.is64Bit() && 13837 DstTy == MVT::i64 && 13838 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 13839 return std::make_pair(SDValue(), SDValue()); 13840 13841 // We lower FP->int64 into FISTP64 followed by a load from a temporary 13842 // stack slot. 13843 MachineFunction &MF = DAG.getMachineFunction(); 13844 unsigned MemSize = DstTy.getSizeInBits()/8; 13845 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 13846 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 13847 13848 unsigned Opc; 13849 switch (DstTy.getSimpleVT().SimpleTy) { 13850 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 13851 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 13852 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 13853 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 13854 } 13855 13856 SDValue Chain = DAG.getEntryNode(); 13857 SDValue Value = Op.getOperand(0); 13858 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. 13859 13860 if (UnsignedFixup) { 13861 // 13862 // Conversion to unsigned i64 is implemented with a select, 13863 // depending on whether the source value fits in the range 13864 // of a signed i64. Let Thresh be the FP equivalent of 13865 // 0x8000000000000000ULL. 13866 // 13867 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; 13868 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); 13869 // Fist-to-mem64 FistSrc 13870 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent 13871 // to XOR'ing the high 32 bits with Adjust. 13872 // 13873 // Being a power of 2, Thresh is exactly representable in all FP formats. 13874 // For X87 we'd like to use the smallest FP type for this constant, but 13875 // for DAG type consistency we have to match the FP operand type. 
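    // For illustration: converting the f64 value 2^63 + 4096 (which is
    // exactly representable), Value >= Thresh, so Adjust = 0x80000000 and
    // FistSrc = 4096; the FIST stores 0x0000000000001000, and XOR'ing the
    // high 32 bits with Adjust restores 0x8000000000001000 = 2^63 + 4096.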
13876 13877 APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); 13878 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; 13879 bool LosesInfo = false; 13880 if (TheVT == MVT::f64) 13881 // The rounding mode is irrelevant as the conversion should be exact. 13882 Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, 13883 &LosesInfo); 13884 else if (TheVT == MVT::f80) 13885 Status = Thresh.convert(APFloat::x87DoubleExtended, 13886 APFloat::rmNearestTiesToEven, &LosesInfo); 13887 13888 assert(Status == APFloat::opOK && !LosesInfo && 13889 "FP conversion should have been exact"); 13890 13891 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); 13892 13893 SDValue Cmp = DAG.getSetCC(DL, 13894 getSetCCResultType(DAG.getDataLayout(), 13895 *DAG.getContext(), TheVT), 13896 Value, ThreshVal, ISD::SETLT); 13897 Adjust = DAG.getSelect(DL, MVT::i32, Cmp, 13898 DAG.getConstant(0, DL, MVT::i32), 13899 DAG.getConstant(0x80000000, DL, MVT::i32)); 13900 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); 13901 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), 13902 *DAG.getContext(), TheVT), 13903 Value, ThreshVal, ISD::SETLT); 13904 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); 13905 } 13906 13907 // FIXME This causes a redundant load/store if the SSE-class value is already 13908 // in memory, such as if it is on the callstack. 13909 if (isScalarFPTypeInSSEReg(TheVT)) { 13910 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 13911 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 13912 MachinePointerInfo::getFixedStack(MF, SSFI), false, 13913 false, 0); 13914 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 13915 SDValue Ops[] = { 13916 Chain, StackSlot, DAG.getValueType(TheVT) 13917 }; 13918 13919 MachineMemOperand *MMO = 13920 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), 13921 MachineMemOperand::MOLoad, MemSize, MemSize); 13922 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); 13923 Chain = Value.getValue(1); 13924 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 13925 StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 13926 } 13927 13928 MachineMemOperand *MMO = 13929 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), 13930 MachineMemOperand::MOStore, MemSize, MemSize); 13931 13932 if (UnsignedFixup) { 13933 13934 // Insert the FIST, load its result as two i32's, 13935 // and XOR the high i32 with Adjust. 13936 13937 SDValue FistOps[] = { Chain, Value, StackSlot }; 13938 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 13939 FistOps, DstTy, MMO); 13940 13941 SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, 13942 MachinePointerInfo(), 13943 false, false, false, 0); 13944 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); 13945 13946 SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, 13947 MachinePointerInfo(), 13948 false, false, false, 0); 13949 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); 13950 13951 if (Subtarget.is64Bit()) { 13952 // Join High32 and Low32 into a 64-bit result. 
13953 // (High32 << 32) | Low32 13954 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); 13955 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); 13956 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, 13957 DAG.getConstant(32, DL, MVT::i8)); 13958 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); 13959 return std::make_pair(Result, SDValue()); 13960 } 13961 13962 SDValue ResultOps[] = { Low32, High32 }; 13963 13964 SDValue pair = IsReplace 13965 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) 13966 : DAG.getMergeValues(ResultOps, DL); 13967 return std::make_pair(pair, SDValue()); 13968 } else { 13969 // Build the FP_TO_INT*_IN_MEM 13970 SDValue Ops[] = { Chain, Value, StackSlot }; 13971 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 13972 Ops, DstTy, MMO); 13973 return std::make_pair(FIST, StackSlot); 13974 } 13975 } 13976 13977 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, 13978 const X86Subtarget &Subtarget) { 13979 MVT VT = Op->getSimpleValueType(0); 13980 SDValue In = Op->getOperand(0); 13981 MVT InVT = In.getSimpleValueType(); 13982 SDLoc dl(Op); 13983 13984 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) 13985 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); 13986 13987 // Optimize vectors in AVX mode: 13988 // 13989 // v8i16 -> v8i32 13990 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 13991 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 13992 // Concat upper and lower parts. 13993 // 13994 // v4i32 -> v4i64 13995 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 13996 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 13997 // Concat upper and lower parts. 13998 // 13999 14000 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && 14001 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && 14002 ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) 14003 return SDValue(); 14004 14005 if (Subtarget.hasInt256()) 14006 return DAG.getNode(X86ISD::VZEXT, dl, VT, In); 14007 14008 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); 14009 SDValue Undef = DAG.getUNDEF(InVT); 14010 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; 14011 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 14012 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 14013 14014 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), 14015 VT.getVectorNumElements()/2); 14016 14017 OpLo = DAG.getBitcast(HVT, OpLo); 14018 OpHi = DAG.getBitcast(HVT, OpHi); 14019 14020 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 14021 } 14022 14023 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, 14024 const X86Subtarget &Subtarget, SelectionDAG &DAG) { 14025 MVT VT = Op->getSimpleValueType(0); 14026 SDValue In = Op->getOperand(0); 14027 MVT InVT = In.getSimpleValueType(); 14028 SDLoc DL(Op); 14029 unsigned int NumElts = VT.getVectorNumElements(); 14030 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) 14031 return SDValue(); 14032 14033 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 14034 return DAG.getNode(X86ISD::VZEXT, DL, VT, In); 14035 14036 assert(InVT.getVectorElementType() == MVT::i1); 14037 14038 // Extend VT if the target is 256 or 128bit vector and VLX is not supported. 
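  // (Without VLX the VSELECT below is done on a 512-bit type: e.g. a
  // zext v8i1 -> v8i32 selects between splat(1) and splat(0) in v8i64 and
  // then truncates the result back down to v8i32.)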
14039 MVT ExtVT = VT; 14040 if (!VT.is512BitVector() && !Subtarget.hasVLX()) 14041 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); 14042 14043 SDValue One = 14044 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); 14045 SDValue Zero = 14046 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); 14047 14048 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); 14049 if (VT == ExtVT) 14050 return SelectedVal; 14051 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); 14052 } 14053 14054 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, 14055 SelectionDAG &DAG) { 14056 if (Subtarget.hasFp256()) 14057 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) 14058 return Res; 14059 14060 return SDValue(); 14061 } 14062 14063 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, 14064 SelectionDAG &DAG) { 14065 SDLoc DL(Op); 14066 MVT VT = Op.getSimpleValueType(); 14067 SDValue In = Op.getOperand(0); 14068 MVT SVT = In.getSimpleValueType(); 14069 14070 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) 14071 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); 14072 14073 if (Subtarget.hasFp256()) 14074 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) 14075 return Res; 14076 14077 assert(!VT.is256BitVector() || !SVT.is128BitVector() || 14078 VT.getVectorNumElements() != SVT.getVectorNumElements()); 14079 return SDValue(); 14080 } 14081 14082 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, 14083 const X86Subtarget &Subtarget) { 14084 14085 SDLoc DL(Op); 14086 MVT VT = Op.getSimpleValueType(); 14087 SDValue In = Op.getOperand(0); 14088 MVT InVT = In.getSimpleValueType(); 14089 14090 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); 14091 14092 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. 14093 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; 14094 if (InVT.getScalarSizeInBits() <= 16) { 14095 if (Subtarget.hasBWI()) { 14096 // legal, will go to VPMOVB2M, VPMOVW2M 14097 // Shift packed bytes not supported natively, bitcast to word 14098 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); 14099 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, 14100 DAG.getBitcast(ExtVT, In), 14101 DAG.getConstant(ShiftInx, DL, ExtVT)); 14102 ShiftNode = DAG.getBitcast(InVT, ShiftNode); 14103 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); 14104 } 14105 // Use TESTD/Q, extended vector to packed dword/qword. 
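    // (e.g. a v8i16 -> v8i1 truncate without BWI sign-extends to v8i64,
    // shifts each lane left by 63 so bit 0 becomes the MSB, and TESTM(x, x)
    // then sets a mask bit exactly for the lanes whose original low bit
    // was set.)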
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
                                  DAG.getConstant(ShiftInx, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}

SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (VT == MVT::i1) {
    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
           "Invalid scalar TRUNCATE operation");
    if (InVT.getSizeInBits() >= 32)
      return SDValue();
    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
  }
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // word to byte only under BWI
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
                         DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  }
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
                                ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14171 if (Subtarget.hasInt256()) { 14172 In = DAG.getBitcast(MVT::v32i8, In); 14173 14174 SmallVector<SDValue,32> pshufbMask; 14175 for (unsigned i = 0; i < 2; ++i) { 14176 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); 14177 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); 14178 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); 14179 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); 14180 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); 14181 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); 14182 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); 14183 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); 14184 for (unsigned j = 0; j < 8; ++j) 14185 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); 14186 } 14187 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask); 14188 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); 14189 In = DAG.getBitcast(MVT::v4i64, In); 14190 14191 static const int ShufMask[] = {0, 2, -1, -1}; 14192 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), 14193 ShufMask); 14194 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 14195 DAG.getIntPtrConstant(0, DL)); 14196 return DAG.getBitcast(VT, In); 14197 } 14198 14199 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 14200 DAG.getIntPtrConstant(0, DL)); 14201 14202 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 14203 DAG.getIntPtrConstant(4, DL)); 14204 14205 OpLo = DAG.getBitcast(MVT::v16i8, OpLo); 14206 OpHi = DAG.getBitcast(MVT::v16i8, OpHi); 14207 14208 // The PSHUFB mask: 14209 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 14210 -1, -1, -1, -1, -1, -1, -1, -1}; 14211 14212 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 14213 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 14214 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 14215 14216 OpLo = DAG.getBitcast(MVT::v4i32, OpLo); 14217 OpHi = DAG.getBitcast(MVT::v4i32, OpHi); 14218 14219 // The MOVLHPS Mask: 14220 static const int ShufMask2[] = {0, 1, 4, 5}; 14221 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 14222 return DAG.getBitcast(MVT::v8i16, res); 14223 } 14224 14225 // Handle truncation of V256 to V128 using shuffles. 14226 if (!VT.is128BitVector() || !InVT.is256BitVector()) 14227 return SDValue(); 14228 14229 assert(Subtarget.hasFp256() && "256-bit vector without AVX!"); 14230 14231 unsigned NumElems = VT.getVectorNumElements(); 14232 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); 14233 14234 SmallVector<int, 16> MaskVec(NumElems * 2, -1); 14235 // Prepare truncation shuffle mask 14236 for (unsigned i = 0; i != NumElems; ++i) 14237 MaskVec[i] = i * 2; 14238 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), 14239 DAG.getUNDEF(NVT), MaskVec); 14240 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, 14241 DAG.getIntPtrConstant(0, DL)); 14242 } 14243 14244 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 14245 SelectionDAG &DAG) const { 14246 assert(!Op.getSimpleValueType().isVector()); 14247 14248 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 14249 /*IsSigned=*/ true, /*IsReplace=*/ false); 14250 SDValue FIST = Vals.first, StackSlot = Vals.second; 14251 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 14252 if (!FIST.getNode()) 14253 return Op; 14254 14255 if (StackSlot.getNode()) 14256 // Load the result. 
14257 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 14258 FIST, StackSlot, MachinePointerInfo(), 14259 false, false, false, 0); 14260 14261 // The node is the result. 14262 return FIST; 14263 } 14264 14265 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 14266 SelectionDAG &DAG) const { 14267 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 14268 /*IsSigned=*/ false, /*IsReplace=*/ false); 14269 SDValue FIST = Vals.first, StackSlot = Vals.second; 14270 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 14271 if (!FIST.getNode()) 14272 return Op; 14273 14274 if (StackSlot.getNode()) 14275 // Load the result. 14276 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 14277 FIST, StackSlot, MachinePointerInfo(), 14278 false, false, false, 0); 14279 14280 // The node is the result. 14281 return FIST; 14282 } 14283 14284 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { 14285 SDLoc DL(Op); 14286 MVT VT = Op.getSimpleValueType(); 14287 SDValue In = Op.getOperand(0); 14288 MVT SVT = In.getSimpleValueType(); 14289 14290 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 14291 14292 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 14293 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 14294 In, DAG.getUNDEF(SVT))); 14295 } 14296 14297 /// The only differences between FABS and FNEG are the mask and the logic op. 14298 /// FNEG also has a folding opportunity for FNEG(FABS(x)). 14299 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { 14300 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && 14301 "Wrong opcode for lowering FABS or FNEG."); 14302 14303 bool IsFABS = (Op.getOpcode() == ISD::FABS); 14304 14305 // If this is a FABS and it has an FNEG user, bail out to fold the combination 14306 // into an FNABS. We'll lower the FABS after that if it is still in use. 14307 if (IsFABS) 14308 for (SDNode *User : Op->uses()) 14309 if (User->getOpcode() == ISD::FNEG) 14310 return Op; 14311 14312 SDLoc dl(Op); 14313 MVT VT = Op.getSimpleValueType(); 14314 14315 bool IsF128 = (VT == MVT::f128); 14316 14317 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to 14318 // decide if we should generate a 16-byte constant mask when we only need 4 or 14319 // 8 bytes for the scalar case. 14320 14321 MVT LogicVT; 14322 MVT EltVT; 14323 unsigned NumElts; 14324 14325 if (VT.isVector()) { 14326 LogicVT = VT; 14327 EltVT = VT.getVectorElementType(); 14328 NumElts = VT.getVectorNumElements(); 14329 } else if (IsF128) { 14330 // SSE instructions are used for optimized f128 logical operations. 14331 LogicVT = MVT::f128; 14332 EltVT = VT; 14333 NumElts = 1; 14334 } else { 14335 // There are no scalar bitwise logical SSE/AVX instructions, so we 14336 // generate a 16-byte vector constant and logic op even for the scalar case. 14337 // Using a 16-byte mask allows folding the load of the mask with 14338 // the logic op, so it can save (~4 bytes) on code size. 14339 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; 14340 EltVT = VT; 14341 NumElts = (VT == MVT::f64) ? 2 : 4; 14342 } 14343 14344 unsigned EltBits = EltVT.getSizeInBits(); 14345 LLVMContext *Context = DAG.getContext(); 14346 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... 14347 APInt MaskElt = 14348 IsFABS ? 
APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); 14349 Constant *C = ConstantInt::get(*Context, MaskElt); 14350 C = ConstantVector::getSplat(NumElts, C); 14351 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14352 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); 14353 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 14354 SDValue Mask = 14355 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, 14356 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 14357 false, false, false, Alignment); 14358 14359 SDValue Op0 = Op.getOperand(0); 14360 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); 14361 unsigned LogicOp = 14362 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; 14363 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; 14364 14365 if (VT.isVector() || IsF128) 14366 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); 14367 14368 // For the scalar case extend to a 128-bit vector, perform the logic op, 14369 // and extract the scalar result back out. 14370 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); 14371 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); 14372 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, 14373 DAG.getIntPtrConstant(0, dl)); 14374 } 14375 14376 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 14377 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14378 LLVMContext *Context = DAG.getContext(); 14379 SDValue Op0 = Op.getOperand(0); 14380 SDValue Op1 = Op.getOperand(1); 14381 SDLoc dl(Op); 14382 MVT VT = Op.getSimpleValueType(); 14383 MVT SrcVT = Op1.getSimpleValueType(); 14384 bool IsF128 = (VT == MVT::f128); 14385 14386 // If second operand is smaller, extend it first. 14387 if (SrcVT.bitsLT(VT)) { 14388 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 14389 SrcVT = VT; 14390 } 14391 // And if it is bigger, shrink it first. 14392 if (SrcVT.bitsGT(VT)) { 14393 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); 14394 SrcVT = VT; 14395 } 14396 14397 // At this point the operands and the result should have the same 14398 // type, and that won't be f80 since that is not custom lowered. 14399 assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && 14400 "Unexpected type in LowerFCOPYSIGN"); 14401 14402 const fltSemantics &Sem = 14403 VT == MVT::f64 ? APFloat::IEEEdouble : 14404 (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); 14405 const unsigned SizeInBits = VT.getSizeInBits(); 14406 14407 SmallVector<Constant *, 4> CV( 14408 VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), 14409 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); 14410 14411 // First, clear all bits but the sign bit from the second operand (sign). 14412 CV[0] = ConstantFP::get(*Context, 14413 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); 14414 Constant *C = ConstantVector::get(CV); 14415 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); 14416 SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); 14417 14418 // Perform all logic operations as 16-byte vectors because there are no 14419 // scalar FP logic instructions in SSE. This allows load folding of the 14420 // constants into the logic instructions. 14421 MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? 
MVT::f128 : MVT::v4f32); 14422 SDValue Mask1 = 14423 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, 14424 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 14425 false, false, false, 16); 14426 if (!IsF128) 14427 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); 14428 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); 14429 14430 // Next, clear the sign bit from the first operand (magnitude). 14431 // If it's a constant, we can clear it here. 14432 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { 14433 APFloat APF = Op0CN->getValueAPF(); 14434 // If the magnitude is a positive zero, the sign bit alone is enough. 14435 if (APF.isPosZero()) 14436 return IsF128 ? SignBit : 14437 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, 14438 DAG.getIntPtrConstant(0, dl)); 14439 APF.clearSign(); 14440 CV[0] = ConstantFP::get(*Context, APF); 14441 } else { 14442 CV[0] = ConstantFP::get( 14443 *Context, 14444 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); 14445 } 14446 C = ConstantVector::get(CV); 14447 CPIdx = DAG.getConstantPool(C, PtrVT, 16); 14448 SDValue Val = 14449 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, 14450 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 14451 false, false, false, 16); 14452 // If the magnitude operand wasn't a constant, we need to AND out the sign. 14453 if (!isa<ConstantFPSDNode>(Op0)) { 14454 if (!IsF128) 14455 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); 14456 Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); 14457 } 14458 // OR the magnitude value with the sign bit. 14459 Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); 14460 return IsF128 ? Val : 14461 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, 14462 DAG.getIntPtrConstant(0, dl)); 14463 } 14464 14465 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 14466 SDValue N0 = Op.getOperand(0); 14467 SDLoc dl(Op); 14468 MVT VT = Op.getSimpleValueType(); 14469 14470 MVT OpVT = N0.getSimpleValueType(); 14471 assert((OpVT == MVT::f32 || OpVT == MVT::f64) && 14472 "Unexpected type for FGETSIGN"); 14473 14474 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). 14475 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); 14476 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); 14477 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); 14478 Res = DAG.getZExtOrTrunc(Res, dl, VT); 14479 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); 14480 return Res; 14481 } 14482 14483 // Check whether an OR'd tree is PTEST-able. 14484 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, 14485 SelectionDAG &DAG) { 14486 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); 14487 14488 if (!Subtarget.hasSSE41()) 14489 return SDValue(); 14490 14491 if (!Op->hasOneUse()) 14492 return SDValue(); 14493 14494 SDNode *N = Op.getNode(); 14495 SDLoc DL(N); 14496 14497 SmallVector<SDValue, 8> Opnds; 14498 DenseMap<SDValue, unsigned> VecInMap; 14499 SmallVector<SDValue, 8> VecIns; 14500 EVT VT = MVT::Other; 14501 14502 // Recognize a special case where a vector is casted into wide integer to 14503 // test all 0s. 14504 Opnds.push_back(N->getOperand(0)); 14505 Opnds.push_back(N->getOperand(1)); 14506 14507 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { 14508 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; 14509 // BFS traverse all OR'd operands. 
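// Each OR seen here pushes its two operands onto the worklist; any operand
// that is not an EXTRACT_VECTOR_ELT with a constant index makes the
// transform below bail out.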
14510 if (I->getOpcode() == ISD::OR) { 14511 Opnds.push_back(I->getOperand(0)); 14512 Opnds.push_back(I->getOperand(1)); 14513 // Re-evaluate the number of nodes to be traversed. 14514 e += 2; // 2 more nodes (LHS and RHS) are pushed. 14515 continue; 14516 } 14517 14518 // Quit if a non-EXTRACT_VECTOR_ELT 14519 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 14520 return SDValue(); 14521 14522 // Quit if without a constant index. 14523 SDValue Idx = I->getOperand(1); 14524 if (!isa<ConstantSDNode>(Idx)) 14525 return SDValue(); 14526 14527 SDValue ExtractedFromVec = I->getOperand(0); 14528 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); 14529 if (M == VecInMap.end()) { 14530 VT = ExtractedFromVec.getValueType(); 14531 // Quit if not 128/256-bit vector. 14532 if (!VT.is128BitVector() && !VT.is256BitVector()) 14533 return SDValue(); 14534 // Quit if not the same type. 14535 if (VecInMap.begin() != VecInMap.end() && 14536 VT != VecInMap.begin()->first.getValueType()) 14537 return SDValue(); 14538 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; 14539 VecIns.push_back(ExtractedFromVec); 14540 } 14541 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); 14542 } 14543 14544 assert((VT.is128BitVector() || VT.is256BitVector()) && 14545 "Not extracted from 128-/256-bit vector."); 14546 14547 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; 14548 14549 for (DenseMap<SDValue, unsigned>::const_iterator 14550 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { 14551 // Quit if not all elements are used. 14552 if (I->second != FullMask) 14553 return SDValue(); 14554 } 14555 14556 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 14557 14558 // Cast all vectors into TestVT for PTEST. 14559 for (unsigned i = 0, e = VecIns.size(); i < e; ++i) 14560 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); 14561 14562 // If more than one full vectors are evaluated, OR them first before PTEST. 14563 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { 14564 // Each iteration will OR 2 nodes and append the result until there is only 14565 // 1 node left, i.e. the final OR'd value of all vectors. 14566 SDValue LHS = VecIns[Slot]; 14567 SDValue RHS = VecIns[Slot + 1]; 14568 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); 14569 } 14570 14571 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, 14572 VecIns.back(), VecIns.back()); 14573 } 14574 14575 /// \brief return true if \c Op has a use that doesn't just read flags. 14576 static bool hasNonFlagsUse(SDValue Op) { 14577 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; 14578 ++UI) { 14579 SDNode *User = *UI; 14580 unsigned UOpNo = UI.getOperandNo(); 14581 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 14582 // Look pass truncate. 
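// A single-use truncate is transparent for flag purposes: step through it
// and examine its user instead.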
14583 UOpNo = User->use_begin().getOperandNo(); 14584 User = *User->use_begin(); 14585 } 14586 14587 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && 14588 !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) 14589 return true; 14590 } 14591 return false; 14592 } 14593 14594 // Emit KTEST instruction for bit vectors on AVX-512 14595 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, 14596 const X86Subtarget &Subtarget) { 14597 if (Op.getOpcode() == ISD::BITCAST) { 14598 auto hasKTEST = [&](MVT VT) { 14599 unsigned SizeInBits = VT.getSizeInBits(); 14600 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || 14601 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); 14602 }; 14603 SDValue Op0 = Op.getOperand(0); 14604 MVT Op0VT = Op0.getValueType().getSimpleVT(); 14605 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && 14606 hasKTEST(Op0VT)) 14607 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); 14608 } 14609 return SDValue(); 14610 } 14611 14612 /// Emit nodes that will be selected as "test Op0,Op0", or something 14613 /// equivalent. 14614 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, 14615 SelectionDAG &DAG) const { 14616 if (Op.getValueType() == MVT::i1) { 14617 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); 14618 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, 14619 DAG.getConstant(0, dl, MVT::i8)); 14620 } 14621 // CF and OF aren't always set the way we want. Determine which 14622 // of these we need. 14623 bool NeedCF = false; 14624 bool NeedOF = false; 14625 switch (X86CC) { 14626 default: break; 14627 case X86::COND_A: case X86::COND_AE: 14628 case X86::COND_B: case X86::COND_BE: 14629 NeedCF = true; 14630 break; 14631 case X86::COND_G: case X86::COND_GE: 14632 case X86::COND_L: case X86::COND_LE: 14633 case X86::COND_O: case X86::COND_NO: { 14634 // Check if we really need to set the 14635 // Overflow flag. If NoSignedWrap is present 14636 // that is not actually needed. 14637 switch (Op->getOpcode()) { 14638 case ISD::ADD: 14639 case ISD::SUB: 14640 case ISD::MUL: 14641 case ISD::SHL: { 14642 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode()); 14643 if (BinNode->Flags.hasNoSignedWrap()) 14644 break; 14645 } 14646 default: 14647 NeedOF = true; 14648 break; 14649 } 14650 break; 14651 } 14652 } 14653 // See if we can use the EFLAGS value from the operand instead of 14654 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 14655 // we prove that the arithmetic won't overflow, we can't use OF or CF. 14656 if (Op.getResNo() != 0 || NeedOF || NeedCF) { 14657 // Emit KTEST for bit vectors 14658 if (auto Node = EmitKTEST(Op, DAG, Subtarget)) 14659 return Node; 14660 // Emit a CMP with 0, which is the TEST pattern. 14661 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 14662 DAG.getConstant(0, dl, Op.getValueType())); 14663 } 14664 unsigned Opcode = 0; 14665 unsigned NumOperands = 0; 14666 14667 // Truncate operations may prevent the merge of the SETCC instruction 14668 // and the arithmetic instruction before it. Attempt to truncate the operands 14669 // of the arithmetic instruction and use a reduced bit-width instruction. 14670 bool NeedTruncation = false; 14671 SDValue ArithOp = Op; 14672 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { 14673 SDValue Arith = Op->getOperand(0); 14674 // Both the trunc and the arithmetic op need to have one user each. 
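// Only simple integer arithmetic and logic ops (ADD, SUB, AND, OR, XOR) are
// candidates for narrowing; see the switch that follows.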
14675 if (Arith->hasOneUse()) 14676 switch (Arith.getOpcode()) { 14677 default: break; 14678 case ISD::ADD: 14679 case ISD::SUB: 14680 case ISD::AND: 14681 case ISD::OR: 14682 case ISD::XOR: { 14683 NeedTruncation = true; 14684 ArithOp = Arith; 14685 } 14686 } 14687 } 14688 14689 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation 14690 // which may be the result of a CAST. We use the variable 'Op', which is the 14691 // non-casted variable when we check for possible users. 14692 switch (ArithOp.getOpcode()) { 14693 case ISD::ADD: 14694 // Due to an isel shortcoming, be conservative if this add is likely to be 14695 // selected as part of a load-modify-store instruction. When the root node 14696 // in a match is a store, isel doesn't know how to remap non-chain non-flag 14697 // uses of other nodes in the match, such as the ADD in this case. This 14698 // leads to the ADD being left around and reselected, with the result being 14699 // two adds in the output. Alas, even if none our users are stores, that 14700 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 14701 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 14702 // climbing the DAG back to the root, and it doesn't seem to be worth the 14703 // effort. 14704 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 14705 UE = Op.getNode()->use_end(); UI != UE; ++UI) 14706 if (UI->getOpcode() != ISD::CopyToReg && 14707 UI->getOpcode() != ISD::SETCC && 14708 UI->getOpcode() != ISD::STORE) 14709 goto default_case; 14710 14711 if (ConstantSDNode *C = 14712 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { 14713 // An add of one will be selected as an INC. 14714 if (C->isOne() && !Subtarget.slowIncDec()) { 14715 Opcode = X86ISD::INC; 14716 NumOperands = 1; 14717 break; 14718 } 14719 14720 // An add of negative one (subtract of one) will be selected as a DEC. 14721 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) { 14722 Opcode = X86ISD::DEC; 14723 NumOperands = 1; 14724 break; 14725 } 14726 } 14727 14728 // Otherwise use a regular EFLAGS-setting add. 14729 Opcode = X86ISD::ADD; 14730 NumOperands = 2; 14731 break; 14732 case ISD::SHL: 14733 case ISD::SRL: 14734 // If we have a constant logical shift that's only used in a comparison 14735 // against zero turn it into an equivalent AND. This allows turning it into 14736 // a TEST instruction later. 14737 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && 14738 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { 14739 EVT VT = Op.getValueType(); 14740 unsigned BitWidth = VT.getSizeInBits(); 14741 unsigned ShAmt = Op->getConstantOperandVal(1); 14742 if (ShAmt >= BitWidth) // Avoid undefined shifts. 14743 break; 14744 APInt Mask = ArithOp.getOpcode() == ISD::SRL 14745 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) 14746 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); 14747 if (!Mask.isSignedIntN(32)) // Avoid large immediates. 14748 break; 14749 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), 14750 DAG.getConstant(Mask, dl, VT)); 14751 } 14752 break; 14753 14754 case ISD::AND: 14755 // If the primary 'and' result isn't used, don't bother using X86ISD::AND, 14756 // because a TEST instruction will be better. 
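// The exception is an AND that can be matched as an ANDN (requires BMI, an
// i32/i64 type, and a NOT-ed operand); that case falls through below and
// emits the EFLAGS-setting AND instead.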
14757 if (!hasNonFlagsUse(Op)) { 14758 SDValue Op0 = ArithOp->getOperand(0); 14759 SDValue Op1 = ArithOp->getOperand(1); 14760 EVT VT = ArithOp.getValueType(); 14761 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); 14762 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; 14763 14764 // But if we can combine this into an ANDN operation, then create an AND 14765 // now and allow it to be pattern matched into an ANDN. 14766 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) 14767 break; 14768 } 14769 // FALL THROUGH 14770 case ISD::SUB: 14771 case ISD::OR: 14772 case ISD::XOR: 14773 // Due to the ISEL shortcoming noted above, be conservative if this op is 14774 // likely to be selected as part of a load-modify-store instruction. 14775 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 14776 UE = Op.getNode()->use_end(); UI != UE; ++UI) 14777 if (UI->getOpcode() == ISD::STORE) 14778 goto default_case; 14779 14780 // Otherwise use a regular EFLAGS-setting instruction. 14781 switch (ArithOp.getOpcode()) { 14782 default: llvm_unreachable("unexpected operator!"); 14783 case ISD::SUB: Opcode = X86ISD::SUB; break; 14784 case ISD::XOR: Opcode = X86ISD::XOR; break; 14785 case ISD::AND: Opcode = X86ISD::AND; break; 14786 case ISD::OR: { 14787 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { 14788 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) 14789 return EFLAGS; 14790 } 14791 Opcode = X86ISD::OR; 14792 break; 14793 } 14794 } 14795 14796 NumOperands = 2; 14797 break; 14798 case X86ISD::ADD: 14799 case X86ISD::SUB: 14800 case X86ISD::INC: 14801 case X86ISD::DEC: 14802 case X86ISD::OR: 14803 case X86ISD::XOR: 14804 case X86ISD::AND: 14805 return SDValue(Op.getNode(), 1); 14806 default: 14807 default_case: 14808 break; 14809 } 14810 14811 // If we found that truncation is beneficial, perform the truncation and 14812 // update 'Op'. 14813 if (NeedTruncation) { 14814 EVT VT = Op.getValueType(); 14815 SDValue WideVal = Op->getOperand(0); 14816 EVT WideVT = WideVal.getValueType(); 14817 unsigned ConvertedOp = 0; 14818 // Use a target machine opcode to prevent further DAGCombine 14819 // optimizations that may separate the arithmetic operations 14820 // from the setcc node. 14821 switch (WideVal.getOpcode()) { 14822 default: break; 14823 case ISD::ADD: ConvertedOp = X86ISD::ADD; break; 14824 case ISD::SUB: ConvertedOp = X86ISD::SUB; break; 14825 case ISD::AND: ConvertedOp = X86ISD::AND; break; 14826 case ISD::OR: ConvertedOp = X86ISD::OR; break; 14827 case ISD::XOR: ConvertedOp = X86ISD::XOR; break; 14828 } 14829 14830 if (ConvertedOp) { 14831 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14832 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { 14833 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); 14834 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); 14835 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); 14836 } 14837 } 14838 } 14839 14840 if (Opcode == 0) { 14841 // Emit KTEST for bit vectors 14842 if (auto Node = EmitKTEST(Op, DAG, Subtarget)) 14843 return Node; 14844 14845 // Emit a CMP with 0, which is the TEST pattern. 
14846 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 14847 DAG.getConstant(0, dl, Op.getValueType())); 14848 } 14849 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 14850 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); 14851 14852 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); 14853 DAG.ReplaceAllUsesWith(Op, New); 14854 return SDValue(New.getNode(), 1); 14855 } 14856 14857 /// Emit nodes that will be selected as "cmp Op0,Op1", or something 14858 /// equivalent. 14859 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 14860 const SDLoc &dl, SelectionDAG &DAG) const { 14861 if (isNullConstant(Op1)) 14862 return EmitTest(Op0, X86CC, dl, DAG); 14863 14864 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && 14865 "Unexpected comparison operation for MVT::i1 operands"); 14866 14867 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 14868 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 14869 // Only promote the compare up to I32 if it is a 16 bit operation 14870 // with an immediate. 16 bit immediates are to be avoided. 14871 if ((Op0.getValueType() == MVT::i16 && 14872 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) && 14873 !DAG.getMachineFunction().getFunction()->optForMinSize() && 14874 !Subtarget.isAtom()) { 14875 unsigned ExtendOp = 14876 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; 14877 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); 14878 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); 14879 } 14880 // Use SUB instead of CMP to enable CSE between SUB and CMP. 14881 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 14882 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 14883 Op0, Op1); 14884 return SDValue(Sub.getNode(), 1); 14885 } 14886 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 14887 } 14888 14889 /// Convert a comparison if required by the subtarget. 14890 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 14891 SelectionDAG &DAG) const { 14892 // If the subtarget does not support the FUCOMI instruction, floating-point 14893 // comparisons have to be converted. 14894 if (Subtarget.hasCMov() || 14895 Cmp.getOpcode() != X86ISD::CMP || 14896 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 14897 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 14898 return Cmp; 14899 14900 // The instruction selector will select an FUCOM instruction instead of 14901 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence 14902 // build an SDNode sequence that transfers the result from FPSW into EFLAGS: 14903 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) 14904 SDLoc dl(Cmp); 14905 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); 14906 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); 14907 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, 14908 DAG.getConstant(8, dl, MVT::i8)); 14909 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); 14910 14911 // Some 64-bit targets lack SAHF support, but they do support FCOMI. 14912 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); 14913 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); 14914 } 14915 14916 /// The minimum architected relative accuracy is 2^-12. We need one 14917 /// Newton-Raphson step to have a good float result (24 bits of precision). 
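/// A single Newton-Raphson step refines an estimate x0 of 1/sqrt(A) as
/// x1 = x0 * (1.5 - 0.5 * A * x0 * x0); the RefinementSteps value returned
/// here tells the caller how many such steps to apply.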
14918 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, 14919 DAGCombinerInfo &DCI, 14920 unsigned &RefinementSteps, 14921 bool &UseOneConstNR) const { 14922 EVT VT = Op.getValueType(); 14923 const char *RecipOp; 14924 14925 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. 14926 // TODO: Add support for AVX512 (v16f32). 14927 // It is likely not profitable to do this for f64 because a double-precision 14928 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 14929 // instructions: convert to single, rsqrtss, convert back to double, refine 14930 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA 14931 // along with FMA, this could be a throughput win. 14932 if (VT == MVT::f32 && Subtarget.hasSSE1()) 14933 RecipOp = "sqrtf"; 14934 else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) || 14935 (VT == MVT::v8f32 && Subtarget.hasAVX())) 14936 RecipOp = "vec-sqrtf"; 14937 else 14938 return SDValue(); 14939 14940 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; 14941 if (!Recips.isEnabled(RecipOp)) 14942 return SDValue(); 14943 14944 RefinementSteps = Recips.getRefinementSteps(RecipOp); 14945 UseOneConstNR = false; 14946 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); 14947 } 14948 14949 /// The minimum architected relative accuracy is 2^-12. We need one 14950 /// Newton-Raphson step to have a good float result (24 bits of precision). 14951 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, 14952 DAGCombinerInfo &DCI, 14953 unsigned &RefinementSteps) const { 14954 EVT VT = Op.getValueType(); 14955 const char *RecipOp; 14956 14957 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. 14958 // TODO: Add support for AVX512 (v16f32). 14959 // It is likely not profitable to do this for f64 because a double-precision 14960 // reciprocal estimate with refinement on x86 prior to FMA requires 14961 // 15 instructions: convert to single, rcpss, convert back to double, refine 14962 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA 14963 // along with FMA, this could be a throughput win. 14964 if (VT == MVT::f32 && Subtarget.hasSSE1()) 14965 RecipOp = "divf"; 14966 else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) || 14967 (VT == MVT::v8f32 && Subtarget.hasAVX())) 14968 RecipOp = "vec-divf"; 14969 else 14970 return SDValue(); 14971 14972 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; 14973 if (!Recips.isEnabled(RecipOp)) 14974 return SDValue(); 14975 14976 RefinementSteps = Recips.getRefinementSteps(RecipOp); 14977 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); 14978 } 14979 14980 /// If we have at least two divisions that use the same divisor, convert to 14981 /// multplication by a reciprocal. This may need to be adjusted for a given 14982 /// CPU if a division's cost is not at least twice the cost of a multiplication. 14983 /// This is because we still need one division to calculate the reciprocal and 14984 /// then we need two multiplies by that reciprocal as replacements for the 14985 /// original divisions. 14986 unsigned X86TargetLowering::combineRepeatedFPDivisors() const { 14987 return 2; 14988 } 14989 14990 /// Result of 'and' is compared against zero. Change to a BT node if possible. 
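/// For example, (X & (1 << N)) == 0 can be tested with a single BT X, N,
/// reading the result out of the carry flag (see LowerSETCC).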
14991 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 14992 const SDLoc &dl, SelectionDAG &DAG) const { 14993 SDValue Op0 = And.getOperand(0); 14994 SDValue Op1 = And.getOperand(1); 14995 if (Op0.getOpcode() == ISD::TRUNCATE) 14996 Op0 = Op0.getOperand(0); 14997 if (Op1.getOpcode() == ISD::TRUNCATE) 14998 Op1 = Op1.getOperand(0); 14999 15000 SDValue LHS, RHS; 15001 if (Op1.getOpcode() == ISD::SHL) 15002 std::swap(Op0, Op1); 15003 if (Op0.getOpcode() == ISD::SHL) { 15004 if (isOneConstant(Op0.getOperand(0))) { 15005 // If we looked past a truncate, check that it's only truncating away 15006 // known zeros. 15007 unsigned BitWidth = Op0.getValueSizeInBits(); 15008 unsigned AndBitWidth = And.getValueSizeInBits(); 15009 if (BitWidth > AndBitWidth) { 15010 APInt Zeros, Ones; 15011 DAG.computeKnownBits(Op0, Zeros, Ones); 15012 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 15013 return SDValue(); 15014 } 15015 LHS = Op1; 15016 RHS = Op0.getOperand(1); 15017 } 15018 } else if (Op1.getOpcode() == ISD::Constant) { 15019 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 15020 uint64_t AndRHSVal = AndRHS->getZExtValue(); 15021 SDValue AndLHS = Op0; 15022 15023 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 15024 LHS = AndLHS.getOperand(0); 15025 RHS = AndLHS.getOperand(1); 15026 } 15027 15028 // Use BT if the immediate can't be encoded in a TEST instruction. 15029 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 15030 LHS = AndLHS; 15031 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); 15032 } 15033 } 15034 15035 if (LHS.getNode()) { 15036 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 15037 // instruction. Since the shift amount is in-range-or-undefined, we know 15038 // that doing a bittest on the i32 value is ok. We extend to i32 because 15039 // the encoding for the i16 version is larger than the i32 version. 15040 // Also promote i16 to i32 for performance / code size reason. 15041 if (LHS.getValueType() == MVT::i8 || 15042 LHS.getValueType() == MVT::i16) 15043 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 15044 15045 // If the operand types disagree, extend the shift amount to match. Since 15046 // BT ignores high bits (like shifts) we can use anyextend. 15047 if (LHS.getValueType() != RHS.getValueType()) 15048 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 15049 15050 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 15051 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 15052 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15053 DAG.getConstant(Cond, dl, MVT::i8), BT); 15054 } 15055 15056 return SDValue(); 15057 } 15058 15059 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask 15060 /// CMPs. 
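/// A return value of 8 means the condition (SETUEQ or SETONE) has no single
/// SSE predicate and must be lowered as two compares joined by a logic op.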
15061 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, 15062 SDValue &Op1) { 15063 unsigned SSECC; 15064 bool Swap = false; 15065 15066 // SSE Condition code mapping: 15067 // 0 - EQ 15068 // 1 - LT 15069 // 2 - LE 15070 // 3 - UNORD 15071 // 4 - NEQ 15072 // 5 - NLT 15073 // 6 - NLE 15074 // 7 - ORD 15075 switch (SetCCOpcode) { 15076 default: llvm_unreachable("Unexpected SETCC condition"); 15077 case ISD::SETOEQ: 15078 case ISD::SETEQ: SSECC = 0; break; 15079 case ISD::SETOGT: 15080 case ISD::SETGT: Swap = true; // Fallthrough 15081 case ISD::SETLT: 15082 case ISD::SETOLT: SSECC = 1; break; 15083 case ISD::SETOGE: 15084 case ISD::SETGE: Swap = true; // Fallthrough 15085 case ISD::SETLE: 15086 case ISD::SETOLE: SSECC = 2; break; 15087 case ISD::SETUO: SSECC = 3; break; 15088 case ISD::SETUNE: 15089 case ISD::SETNE: SSECC = 4; break; 15090 case ISD::SETULE: Swap = true; // Fallthrough 15091 case ISD::SETUGE: SSECC = 5; break; 15092 case ISD::SETULT: Swap = true; // Fallthrough 15093 case ISD::SETUGT: SSECC = 6; break; 15094 case ISD::SETO: SSECC = 7; break; 15095 case ISD::SETUEQ: 15096 case ISD::SETONE: SSECC = 8; break; 15097 } 15098 if (Swap) 15099 std::swap(Op0, Op1); 15100 15101 return SSECC; 15102 } 15103 15104 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then 15105 /// concatenate the result back. 15106 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 15107 MVT VT = Op.getSimpleValueType(); 15108 15109 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && 15110 "Unsupported value type for operation"); 15111 15112 unsigned NumElems = VT.getVectorNumElements(); 15113 SDLoc dl(Op); 15114 SDValue CC = Op.getOperand(2); 15115 15116 // Extract the LHS vectors 15117 SDValue LHS = Op.getOperand(0); 15118 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); 15119 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); 15120 15121 // Extract the RHS vectors 15122 SDValue RHS = Op.getOperand(1); 15123 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); 15124 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); 15125 15126 // Issue the operation on the smaller types and concatenate the result back 15127 MVT EltVT = VT.getVectorElementType(); 15128 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 15129 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 15130 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 15131 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 15132 } 15133 15134 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { 15135 SDValue Op0 = Op.getOperand(0); 15136 SDValue Op1 = Op.getOperand(1); 15137 SDValue CC = Op.getOperand(2); 15138 MVT VT = Op.getSimpleValueType(); 15139 SDLoc dl(Op); 15140 15141 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && 15142 "Unexpected type for boolean compare operation"); 15143 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 15144 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, 15145 DAG.getConstant(-1, dl, VT)); 15146 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, 15147 DAG.getConstant(-1, dl, VT)); 15148 switch (SetCCOpcode) { 15149 default: llvm_unreachable("Unexpected SETCC condition"); 15150 case ISD::SETEQ: 15151 // (x == y) -> ~(x ^ y) 15152 return DAG.getNode(ISD::XOR, dl, VT, 15153 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), 15154 DAG.getConstant(-1, dl, VT)); 15155 case ISD::SETNE: 15156 // (x != y) -> (x ^ y) 15157 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); 15158 case ISD::SETUGT: 
15159 case ISD::SETGT: 15160 // (x > y) -> (x & ~y) 15161 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); 15162 case ISD::SETULT: 15163 case ISD::SETLT: 15164 // (x < y) -> (~x & y) 15165 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); 15166 case ISD::SETULE: 15167 case ISD::SETLE: 15168 // (x <= y) -> (~x | y) 15169 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); 15170 case ISD::SETUGE: 15171 case ISD::SETGE: 15172 // (x >=y) -> (x | ~y) 15173 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); 15174 } 15175 } 15176 15177 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { 15178 15179 SDValue Op0 = Op.getOperand(0); 15180 SDValue Op1 = Op.getOperand(1); 15181 SDValue CC = Op.getOperand(2); 15182 MVT VT = Op.getSimpleValueType(); 15183 SDLoc dl(Op); 15184 15185 assert(VT.getVectorElementType() == MVT::i1 && 15186 "Cannot set masked compare for this operation"); 15187 15188 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 15189 unsigned Opc = 0; 15190 bool Unsigned = false; 15191 bool Swap = false; 15192 unsigned SSECC; 15193 switch (SetCCOpcode) { 15194 default: llvm_unreachable("Unexpected SETCC condition"); 15195 case ISD::SETNE: SSECC = 4; break; 15196 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; 15197 case ISD::SETUGT: SSECC = 6; Unsigned = true; break; 15198 case ISD::SETLT: Swap = true; //fall-through 15199 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; 15200 case ISD::SETULT: SSECC = 1; Unsigned = true; break; 15201 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT 15202 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap 15203 case ISD::SETULE: Unsigned = true; //fall-through 15204 case ISD::SETLE: SSECC = 2; break; 15205 } 15206 15207 if (Swap) 15208 std::swap(Op0, Op1); 15209 if (Opc) 15210 return DAG.getNode(Opc, dl, VT, Op0, Op1); 15211 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; 15212 return DAG.getNode(Opc, dl, VT, Op0, Op1, 15213 DAG.getConstant(SSECC, dl, MVT::i8)); 15214 } 15215 15216 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second 15217 /// operand \p Op1. If non-trivial (for example because it's not constant) 15218 /// return an empty value. 15219 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1, 15220 SelectionDAG &DAG) { 15221 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); 15222 if (!BV) 15223 return SDValue(); 15224 15225 MVT VT = Op1.getSimpleValueType(); 15226 MVT EVT = VT.getVectorElementType(); 15227 unsigned n = VT.getVectorNumElements(); 15228 SmallVector<SDValue, 8> ULTOp1; 15229 15230 for (unsigned i = 0; i < n; ++i) { 15231 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); 15232 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) 15233 return SDValue(); 15234 15235 // Avoid underflow. 
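// A zero element cannot be decremented without wrapping, which would make
// the ULT-to-ULE rewrite incorrect, so bail out.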
15236 APInt Val = Elt->getAPIntValue(); 15237 if (Val == 0) 15238 return SDValue(); 15239 15240 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); 15241 } 15242 15243 return DAG.getBuildVector(VT, dl, ULTOp1); 15244 } 15245 15246 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, 15247 SelectionDAG &DAG) { 15248 SDValue Op0 = Op.getOperand(0); 15249 SDValue Op1 = Op.getOperand(1); 15250 SDValue CC = Op.getOperand(2); 15251 MVT VT = Op.getSimpleValueType(); 15252 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 15253 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); 15254 SDLoc dl(Op); 15255 15256 if (isFP) { 15257 #ifndef NDEBUG 15258 MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); 15259 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 15260 #endif 15261 15262 unsigned Opc; 15263 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { 15264 assert(VT.getVectorNumElements() <= 16); 15265 Opc = X86ISD::CMPM; 15266 } else { 15267 Opc = X86ISD::CMPP; 15268 // The SSE/AVX packed FP comparison nodes are defined with a 15269 // floating-point vector result that matches the operand type. This allows 15270 // them to work with an SSE1 target (integer vector types are not legal). 15271 VT = Op0.getSimpleValueType(); 15272 } 15273 15274 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), 15275 // emit two comparisons and a logic op to tie them together. 15276 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is 15277 // available. 15278 SDValue Cmp; 15279 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); 15280 if (SSECC == 8) { 15281 // LLVM predicate is SETUEQ or SETONE. 15282 unsigned CC0, CC1; 15283 unsigned CombineOpc; 15284 if (SetCCOpcode == ISD::SETUEQ) { 15285 CC0 = 3; // UNORD 15286 CC1 = 0; // EQ 15287 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : 15288 static_cast<unsigned>(ISD::OR); 15289 } else { 15290 assert(SetCCOpcode == ISD::SETONE); 15291 CC0 = 7; // ORD 15292 CC1 = 4; // NEQ 15293 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : 15294 static_cast<unsigned>(ISD::AND); 15295 } 15296 15297 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, 15298 DAG.getConstant(CC0, dl, MVT::i8)); 15299 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, 15300 DAG.getConstant(CC1, dl, MVT::i8)); 15301 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 15302 } else { 15303 // Handle all other FP comparisons here. 15304 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, 15305 DAG.getConstant(SSECC, dl, MVT::i8)); 15306 } 15307 15308 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the 15309 // result type of SETCC. The bitcast is expected to be optimized away 15310 // during combining/isel. 15311 if (Opc == X86ISD::CMPP) 15312 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); 15313 15314 return Cmp; 15315 } 15316 15317 MVT VTOp0 = Op0.getSimpleValueType(); 15318 assert(VTOp0 == Op1.getSimpleValueType() && 15319 "Expected operands with same type!"); 15320 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && 15321 "Invalid number of packed elements for source and destination!"); 15322 15323 if (VT.is128BitVector() && VTOp0.is256BitVector()) { 15324 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type 15325 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the 15326 // legalizer firstly checks if the first operand in input to the setcc has 15327 // a legal type. 
If so, then it promotes the return type to that same type. 15328 // Otherwise, the return type is promoted to the 'next legal type' which, 15329 // for a vector of MVT::i1 is always a 128-bit integer vector type. 15330 // 15331 // We reach this code only if the following two conditions are met: 15332 // 1. Both return type and operand type have been promoted to wider types 15333 // by the type legalizer. 15334 // 2. The original operand type has been promoted to a 256-bit vector. 15335 // 15336 // Note that condition 2. only applies for AVX targets. 15337 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); 15338 return DAG.getZExtOrTrunc(NewOp, dl, VT); 15339 } 15340 15341 // The non-AVX512 code below works under the assumption that source and 15342 // destination types are the same. 15343 assert((Subtarget.hasAVX512() || (VT == VTOp0)) && 15344 "Value types for source and destination must be the same!"); 15345 15346 // Break 256-bit integer vector compare into smaller ones. 15347 if (VT.is256BitVector() && !Subtarget.hasInt256()) 15348 return Lower256IntVSETCC(Op, DAG); 15349 15350 // Operands are boolean (vectors of i1) 15351 MVT OpVT = Op1.getSimpleValueType(); 15352 if (OpVT.getVectorElementType() == MVT::i1) 15353 return LowerBoolVSETCC_AVX512(Op, DAG); 15354 15355 // The result is boolean, but operands are int/float 15356 if (VT.getVectorElementType() == MVT::i1) { 15357 // In AVX-512 architecture setcc returns mask with i1 elements, 15358 // But there is no compare instruction for i8 and i16 elements in KNL. 15359 // In this case use SSE compare 15360 bool UseAVX512Inst = 15361 (OpVT.is512BitVector() || 15362 OpVT.getVectorElementType().getSizeInBits() >= 32 || 15363 (Subtarget.hasBWI() && Subtarget.hasVLX())); 15364 15365 if (UseAVX512Inst) 15366 return LowerIntVSETCC_AVX512(Op, DAG); 15367 15368 return DAG.getNode(ISD::TRUNCATE, dl, VT, 15369 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); 15370 } 15371 15372 // Lower using XOP integer comparisons. 15373 if ((VT == MVT::v16i8 || VT == MVT::v8i16 || 15374 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { 15375 // Translate compare code to XOP PCOM compare mode. 15376 unsigned CmpMode = 0; 15377 switch (SetCCOpcode) { 15378 default: llvm_unreachable("Unexpected SETCC condition"); 15379 case ISD::SETULT: 15380 case ISD::SETLT: CmpMode = 0x00; break; 15381 case ISD::SETULE: 15382 case ISD::SETLE: CmpMode = 0x01; break; 15383 case ISD::SETUGT: 15384 case ISD::SETGT: CmpMode = 0x02; break; 15385 case ISD::SETUGE: 15386 case ISD::SETGE: CmpMode = 0x03; break; 15387 case ISD::SETEQ: CmpMode = 0x04; break; 15388 case ISD::SETNE: CmpMode = 0x05; break; 15389 } 15390 15391 // Are we comparing unsigned or signed integers? 15392 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) 15393 ? X86ISD::VPCOMU : X86ISD::VPCOM; 15394 15395 return DAG.getNode(Opc, dl, VT, Op0, Op1, 15396 DAG.getConstant(CmpMode, dl, MVT::i8)); 15397 } 15398 15399 // We are handling one of the integer comparisons here. Since SSE only has 15400 // GT and EQ comparisons for integer, swapping operands and multiple 15401 // operations may be required for some comparisons. 
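// For example, a signed LE is lowered as a GT with the result inverted, and
// unsigned compares additionally flip the sign bits of both operands first.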
15402 unsigned Opc; 15403 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; 15404 bool Subus = false; 15405 15406 switch (SetCCOpcode) { 15407 default: llvm_unreachable("Unexpected SETCC condition"); 15408 case ISD::SETNE: Invert = true; 15409 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 15410 case ISD::SETLT: Swap = true; 15411 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 15412 case ISD::SETGE: Swap = true; 15413 case ISD::SETLE: Opc = X86ISD::PCMPGT; 15414 Invert = true; break; 15415 case ISD::SETULT: Swap = true; 15416 case ISD::SETUGT: Opc = X86ISD::PCMPGT; 15417 FlipSigns = true; break; 15418 case ISD::SETUGE: Swap = true; 15419 case ISD::SETULE: Opc = X86ISD::PCMPGT; 15420 FlipSigns = true; Invert = true; break; 15421 } 15422 15423 // Special case: Use min/max operations for SETULE/SETUGE 15424 MVT VET = VT.getVectorElementType(); 15425 bool hasMinMax = 15426 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) 15427 || (Subtarget.hasSSE2() && (VET == MVT::i8)); 15428 15429 if (hasMinMax) { 15430 switch (SetCCOpcode) { 15431 default: break; 15432 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; 15433 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; 15434 } 15435 15436 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } 15437 } 15438 15439 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); 15440 if (!MinMax && hasSubus) { 15441 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for 15442 // Op0 u<= Op1: 15443 // t = psubus Op0, Op1 15444 // pcmpeq t, <0..0> 15445 switch (SetCCOpcode) { 15446 default: break; 15447 case ISD::SETULT: { 15448 // If the comparison is against a constant we can turn this into a 15449 // setule. With psubus, setule does not require a swap. This is 15450 // beneficial because the constant in the register is no longer 15451 // destructed as the destination so it can be hoisted out of a loop. 15452 // Only do this pre-AVX since vpcmp* is no longer destructive. 15453 if (Subtarget.hasAVX()) 15454 break; 15455 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) { 15456 Op1 = ULEOp1; 15457 Subus = true; Invert = false; Swap = false; 15458 } 15459 break; 15460 } 15461 // Psubus is better than flip-sign because it requires no inversion. 15462 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; 15463 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; 15464 } 15465 15466 if (Subus) { 15467 Opc = X86ISD::SUBUS; 15468 FlipSigns = false; 15469 } 15470 } 15471 15472 if (Swap) 15473 std::swap(Op0, Op1); 15474 15475 // Check that the operation in question is available (most are plain SSE2, 15476 // but PCMPGTQ and PCMPEQQ have different requirements). 15477 if (VT == MVT::v2i64) { 15478 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { 15479 assert(Subtarget.hasSSE2() && "Don't know how to lower!"); 15480 15481 // First cast everything to the right type. 15482 Op0 = DAG.getBitcast(MVT::v4i32, Op0); 15483 Op1 = DAG.getBitcast(MVT::v4i32, Op1); 15484 15485 // Since SSE has no unsigned integer comparisons, we need to flip the sign 15486 // bits of the inputs before performing those operations. The lower 15487 // compare is always unsigned. 
15488 SDValue SB; 15489 if (FlipSigns) { 15490 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); 15491 } else { 15492 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); 15493 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); 15494 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero}); 15495 } 15496 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); 15497 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); 15498 15499 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) 15500 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); 15501 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); 15502 15503 // Create masks for only the low parts/high parts of the 64 bit integers. 15504 static const int MaskHi[] = { 1, 1, 3, 3 }; 15505 static const int MaskLo[] = { 0, 0, 2, 2 }; 15506 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); 15507 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); 15508 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); 15509 15510 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); 15511 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); 15512 15513 if (Invert) 15514 Result = DAG.getNOT(dl, Result, MVT::v4i32); 15515 15516 return DAG.getBitcast(VT, Result); 15517 } 15518 15519 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { 15520 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with 15521 // pcmpeqd + pshufd + pand. 15522 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); 15523 15524 // First cast everything to the right type. 15525 Op0 = DAG.getBitcast(MVT::v4i32, Op0); 15526 Op1 = DAG.getBitcast(MVT::v4i32, Op1); 15527 15528 // Do the compare. 15529 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); 15530 15531 // Make sure the lower and upper halves are both all-ones. 15532 static const int Mask[] = { 1, 0, 3, 2 }; 15533 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); 15534 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); 15535 15536 if (Invert) 15537 Result = DAG.getNOT(dl, Result, MVT::v4i32); 15538 15539 return DAG.getBitcast(VT, Result); 15540 } 15541 } 15542 15543 // Since SSE has no unsigned integer comparisons, we need to flip the sign 15544 // bits of the inputs before performing those operations. 15545 if (FlipSigns) { 15546 MVT EltVT = VT.getVectorElementType(); 15547 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, 15548 VT); 15549 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); 15550 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); 15551 } 15552 15553 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 15554 15555 // If the logical-not of the result is required, perform that now. 
15556 if (Invert) 15557 Result = DAG.getNOT(dl, Result, VT); 15558 15559 if (MinMax) 15560 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); 15561 15562 if (Subus) 15563 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, 15564 getZeroVector(VT, Subtarget, DAG, dl)); 15565 15566 return Result; 15567 } 15568 15569 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 15570 15571 MVT VT = Op.getSimpleValueType(); 15572 15573 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); 15574 15575 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) 15576 && "SetCC type must be 8-bit or 1-bit integer"); 15577 SDValue Op0 = Op.getOperand(0); 15578 SDValue Op1 = Op.getOperand(1); 15579 SDLoc dl(Op); 15580 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 15581 15582 // Optimize to BT if possible. 15583 // Lower (X & (1 << N)) == 0 to BT(X, N). 15584 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 15585 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 15586 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 15587 isNullConstant(Op1) && 15588 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 15589 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { 15590 if (VT == MVT::i1) { 15591 NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC, 15592 DAG.getValueType(MVT::i1)); 15593 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); 15594 } 15595 return NewSetCC; 15596 } 15597 } 15598 15599 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 15600 // these. 15601 if ((isOneConstant(Op1) || isNullConstant(Op1)) && 15602 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 15603 15604 // If the input is a setcc, then reuse the input setcc or use a new one with 15605 // the inverted condition. 
15606 if (Op0.getOpcode() == X86ISD::SETCC) { 15607 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 15608 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); 15609 if (!Invert) 15610 return Op0; 15611 15612 CCode = X86::GetOppositeBranchCondition(CCode); 15613 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15614 DAG.getConstant(CCode, dl, MVT::i8), 15615 Op0.getOperand(1)); 15616 if (VT == MVT::i1) { 15617 SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, 15618 DAG.getValueType(MVT::i1)); 15619 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); 15620 } 15621 return SetCC; 15622 } 15623 } 15624 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 15625 if (isOneConstant(Op1)) { 15626 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); 15627 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); 15628 } 15629 if (!isNullConstant(Op1)) { 15630 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1); 15631 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC); 15632 } 15633 } 15634 15635 bool isFP = Op1.getSimpleValueType().isFloatingPoint(); 15636 unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG); 15637 if (X86CC == X86::COND_INVALID) 15638 return SDValue(); 15639 15640 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); 15641 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 15642 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15643 DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); 15644 if (VT == MVT::i1) { 15645 SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, 15646 DAG.getValueType(MVT::i1)); 15647 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); 15648 } 15649 return SetCC; 15650 } 15651 15652 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { 15653 SDValue LHS = Op.getOperand(0); 15654 SDValue RHS = Op.getOperand(1); 15655 SDValue Carry = Op.getOperand(2); 15656 SDValue Cond = Op.getOperand(3); 15657 SDLoc DL(Op); 15658 15659 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); 15660 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); 15661 15662 assert(Carry.getOpcode() != ISD::CARRY_FALSE); 15663 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 15664 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); 15665 SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 15666 DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); 15667 if (Op.getSimpleValueType() == MVT::i1) { 15668 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, 15669 DAG.getValueType(MVT::i1)); 15670 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); 15671 } 15672 return SetCC; 15673 } 15674 15675 /// Return true if opcode is a X86 logical comparison. 
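/// That is, a node whose EFLAGS result can feed a conditional move or branch
/// directly: CMP/COMI/UCOMI/SAHF, or the flag result of the arithmetic
/// X86ISD nodes listed below.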
15676 static bool isX86LogicalCmp(SDValue Op) { 15677 unsigned Opc = Op.getNode()->getOpcode(); 15678 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 15679 Opc == X86ISD::SAHF) 15680 return true; 15681 if (Op.getResNo() == 1 && 15682 (Opc == X86ISD::ADD || 15683 Opc == X86ISD::SUB || 15684 Opc == X86ISD::ADC || 15685 Opc == X86ISD::SBB || 15686 Opc == X86ISD::SMUL || 15687 Opc == X86ISD::UMUL || 15688 Opc == X86ISD::INC || 15689 Opc == X86ISD::DEC || 15690 Opc == X86ISD::OR || 15691 Opc == X86ISD::XOR || 15692 Opc == X86ISD::AND)) 15693 return true; 15694 15695 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 15696 return true; 15697 15698 return false; 15699 } 15700 15701 /// Returns the "condition" node, that may be wrapped with "truncate". 15702 /// Like this: (i1 (trunc (i8 X86ISD::SETCC))). 15703 static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 15704 if (V.getOpcode() != ISD::TRUNCATE) 15705 return V; 15706 15707 SDValue VOp0 = V.getOperand(0); 15708 if (VOp0.getOpcode() == ISD::AssertZext && 15709 V.getValueSizeInBits() == 15710 cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits()) 15711 return VOp0.getOperand(0); 15712 15713 unsigned InBits = VOp0.getValueSizeInBits(); 15714 unsigned Bits = V.getValueSizeInBits(); 15715 if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits))) 15716 return V.getOperand(0); 15717 return V; 15718 } 15719 15720 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 15721 bool addTest = true; 15722 SDValue Cond = Op.getOperand(0); 15723 SDValue Op1 = Op.getOperand(1); 15724 SDValue Op2 = Op.getOperand(2); 15725 SDLoc DL(Op); 15726 MVT VT = Op1.getSimpleValueType(); 15727 SDValue CC; 15728 15729 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops 15730 // are available or VBLENDV if AVX is available. 15731 // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 15732 if (Cond.getOpcode() == ISD::SETCC && 15733 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || 15734 (Subtarget.hasSSE1() && VT == MVT::f32)) && 15735 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { 15736 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); 15737 int SSECC = translateX86FSETCC( 15738 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); 15739 15740 if (SSECC != 8) { 15741 if (Subtarget.hasAVX512()) { 15742 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, 15743 DAG.getConstant(SSECC, DL, MVT::i8)); 15744 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); 15745 } 15746 15747 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, 15748 DAG.getConstant(SSECC, DL, MVT::i8)); 15749 15750 // If we have AVX, we can use a variable vector select (VBLENDV) instead 15751 // of 3 logic instructions for size savings and potentially speed. 15752 // Unfortunately, there is no scalar form of VBLENDV. 15753 15754 // If either operand is a constant, don't try this. We can expect to 15755 // optimize away at least one of the logic instructions later in that 15756 // case, so that sequence would be faster than a variable blend. 15757 15758 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly 15759 // uses XMM0 as the selection register. That may need just as many 15760 // instructions as the AND/ANDN/OR sequence due to register moves, so 15761 // don't bother. 
15762 15763 if (Subtarget.hasAVX() && 15764 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { 15765 15766 // Convert to vectors, do a VSELECT, and convert back to scalar. 15767 // All of the conversions should be optimized away. 15768 15769 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; 15770 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); 15771 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); 15772 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); 15773 15774 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; 15775 VCmp = DAG.getBitcast(VCmpVT, VCmp); 15776 15777 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); 15778 15779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, 15780 VSel, DAG.getIntPtrConstant(0, DL)); 15781 } 15782 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); 15783 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); 15784 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); 15785 } 15786 } 15787 15788 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { 15789 SDValue Op1Scalar; 15790 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) 15791 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); 15792 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) 15793 Op1Scalar = Op1.getOperand(0); 15794 SDValue Op2Scalar; 15795 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) 15796 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); 15797 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) 15798 Op2Scalar = Op2.getOperand(0); 15799 if (Op1Scalar.getNode() && Op2Scalar.getNode()) { 15800 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, 15801 Op1Scalar.getValueType(), 15802 Cond, Op1Scalar, Op2Scalar); 15803 if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) 15804 return DAG.getBitcast(VT, newSelect); 15805 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); 15806 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, 15807 DAG.getIntPtrConstant(0, DL)); 15808 } 15809 } 15810 15811 if (VT == MVT::v4i1 || VT == MVT::v2i1) { 15812 SDValue zeroConst = DAG.getIntPtrConstant(0, DL); 15813 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, 15814 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); 15815 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, 15816 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); 15817 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, 15818 Cond, Op1, Op2); 15819 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); 15820 } 15821 15822 if (Cond.getOpcode() == ISD::SETCC) { 15823 if (SDValue NewCond = LowerSETCC(Cond, DAG)) 15824 Cond = NewCond; 15825 } 15826 15827 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 15828 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 15829 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 15830 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 15831 if (Cond.getOpcode() == X86ISD::SETCC && 15832 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 15833 isNullConstant(Cond.getOperand(1).getOperand(1))) { 15834 SDValue Cmp = Cond.getOperand(1); 15835 15836 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 15837 15838 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && 15839 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 15840 SDValue Y = isAllOnesConstant(Op2) ? 
Op1 : Op2; 15841 15842 SDValue CmpOp0 = Cmp.getOperand(0); 15843 // Apply further optimizations for special cases 15844 // (select (x != 0), -1, 0) -> neg & sbb 15845 // (select (x == 0), 0, -1) -> neg & sbb 15846 if (isNullConstant(Y) && 15847 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { 15848 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 15849 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 15850 DAG.getConstant(0, DL, 15851 CmpOp0.getValueType()), 15852 CmpOp0); 15853 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 15854 DAG.getConstant(X86::COND_B, DL, MVT::i8), 15855 SDValue(Neg.getNode(), 1)); 15856 return Res; 15857 } 15858 15859 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 15860 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); 15861 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 15862 15863 SDValue Res = // Res = 0 or -1. 15864 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 15865 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); 15866 15867 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) 15868 Res = DAG.getNOT(DL, Res, Res.getValueType()); 15869 15870 if (!isNullConstant(Op2)) 15871 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 15872 return Res; 15873 } 15874 } 15875 15876 // Look past (and (setcc_carry (cmp ...)), 1). 15877 if (Cond.getOpcode() == ISD::AND && 15878 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && 15879 isOneConstant(Cond.getOperand(1))) 15880 Cond = Cond.getOperand(0); 15881 15882 // If condition flag is set by a X86ISD::CMP, then use it as the condition 15883 // setting operand in place of the X86ISD::SETCC. 15884 unsigned CondOpcode = Cond.getOpcode(); 15885 if (CondOpcode == X86ISD::SETCC || 15886 CondOpcode == X86ISD::SETCC_CARRY) { 15887 CC = Cond.getOperand(0); 15888 15889 SDValue Cmp = Cond.getOperand(1); 15890 unsigned Opc = Cmp.getOpcode(); 15891 MVT VT = Op.getSimpleValueType(); 15892 15893 bool IllegalFPCMov = false; 15894 if (VT.isFloatingPoint() && !VT.isVector() && 15895 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
15896 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 15897 15898 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 15899 Opc == X86ISD::BT) { // FIXME 15900 Cond = Cmp; 15901 addTest = false; 15902 } 15903 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 15904 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 15905 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 15906 Cond.getOperand(0).getValueType() != MVT::i8)) { 15907 SDValue LHS = Cond.getOperand(0); 15908 SDValue RHS = Cond.getOperand(1); 15909 unsigned X86Opcode; 15910 unsigned X86Cond; 15911 SDVTList VTs; 15912 switch (CondOpcode) { 15913 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 15914 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 15915 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 15916 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 15917 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 15918 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 15919 default: llvm_unreachable("unexpected overflowing operator"); 15920 } 15921 if (CondOpcode == ISD::UMULO) 15922 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 15923 MVT::i32); 15924 else 15925 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 15926 15927 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 15928 15929 if (CondOpcode == ISD::UMULO) 15930 Cond = X86Op.getValue(2); 15931 else 15932 Cond = X86Op.getValue(1); 15933 15934 CC = DAG.getConstant(X86Cond, DL, MVT::i8); 15935 addTest = false; 15936 } 15937 15938 if (addTest) { 15939 // Look past the truncate if the high bits are known zero. 15940 Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); 15941 15942 // We know the result of AND is compared against zero. Try to match 15943 // it to BT. 15944 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 15945 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { 15946 CC = NewSetCC.getOperand(0); 15947 Cond = NewSetCC.getOperand(1); 15948 addTest = false; 15949 } 15950 } 15951 } 15952 15953 if (addTest) { 15954 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); 15955 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); 15956 } 15957 15958 // a < b ? -1 : 0 -> RES = ~setcc_carry 15959 // a < b ? 0 : -1 -> RES = setcc_carry 15960 // a >= b ? -1 : 0 -> RES = setcc_carry 15961 // a >= b ? 0 : -1 -> RES = ~setcc_carry 15962 if (Cond.getOpcode() == X86ISD::SUB) { 15963 Cond = ConvertCmpIfNecessary(Cond, DAG); 15964 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 15965 15966 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 15967 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && 15968 (isNullConstant(Op1) || isNullConstant(Op2))) { 15969 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 15970 DAG.getConstant(X86::COND_B, DL, MVT::i8), 15971 Cond); 15972 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) 15973 return DAG.getNOT(DL, Res, Res.getValueType()); 15974 return Res; 15975 } 15976 } 15977 15978 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate 15979 // widen the cmov and push the truncate through. This avoids introducing a new 15980 // branch during isel and doesn't add any extensions. 
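// For example, with i32 inputs:
//   (select Cond, (trunc i32 %t1 to i8), (trunc i32 %t2 to i8))
//     -> (trunc (X86ISD::CMOV i32 %t2, %t1, CC, Flags) to i8)
// CopyFromReg operands are deliberately skipped below to avoid introducing
// partial-register stalls.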
15981 if (Op.getValueType() == MVT::i8 && 15982 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { 15983 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); 15984 if (T1.getValueType() == T2.getValueType() && 15985 // Blacklist CopyFromReg to avoid partial register stalls. 15986 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ 15987 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); 15988 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); 15989 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); 15990 } 15991 } 15992 15993 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 15994 // condition is true. 15995 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 15996 SDValue Ops[] = { Op2, Op1, CC, Cond }; 15997 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); 15998 } 15999 16000 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, 16001 const X86Subtarget &Subtarget, 16002 SelectionDAG &DAG) { 16003 MVT VT = Op->getSimpleValueType(0); 16004 SDValue In = Op->getOperand(0); 16005 MVT InVT = In.getSimpleValueType(); 16006 MVT VTElt = VT.getVectorElementType(); 16007 MVT InVTElt = InVT.getVectorElementType(); 16008 SDLoc dl(Op); 16009 16010 // SKX processor 16011 if ((InVTElt == MVT::i1) && 16012 (((Subtarget.hasBWI() && Subtarget.hasVLX() && 16013 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || 16014 16015 ((Subtarget.hasBWI() && VT.is512BitVector() && 16016 VTElt.getSizeInBits() <= 16)) || 16017 16018 ((Subtarget.hasDQI() && Subtarget.hasVLX() && 16019 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || 16020 16021 ((Subtarget.hasDQI() && VT.is512BitVector() && 16022 VTElt.getSizeInBits() >= 32)))) 16023 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 16024 16025 unsigned int NumElts = VT.getVectorNumElements(); 16026 16027 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) 16028 return SDValue(); 16029 16030 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { 16031 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) 16032 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); 16033 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 16034 } 16035 16036 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 16037 MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32; 16038 SDValue NegOne = 16039 DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, 16040 ExtVT); 16041 SDValue Zero = 16042 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); 16043 16044 SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); 16045 if (VT.is512BitVector()) 16046 return V; 16047 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); 16048 } 16049 16050 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, 16051 const X86Subtarget &Subtarget, 16052 SelectionDAG &DAG) { 16053 SDValue In = Op->getOperand(0); 16054 MVT VT = Op->getSimpleValueType(0); 16055 MVT InVT = In.getSimpleValueType(); 16056 assert(VT.getSizeInBits() == InVT.getSizeInBits()); 16057 16058 MVT SVT = VT.getVectorElementType(); 16059 MVT InSVT = InVT.getVectorElementType(); 16060 assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); 16061 16062 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) 16063 return SDValue(); 16064 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) 16065 return SDValue(); 16066 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && 16067 !(VT.is256BitVector() && Subtarget.hasInt256())) 16068 return SDValue(); 16069 16070 SDLoc dl(Op); 16071 16072 // For 256-bit vectors, we only need the lower (128-bit) half of the input. 16073 if (VT.is256BitVector()) 16074 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, 16075 MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2), 16076 In, DAG.getIntPtrConstant(0, dl)); 16077 16078 // SSE41 targets can use the pmovsx* instructions directly. 16079 if (Subtarget.hasSSE41()) 16080 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 16081 16082 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. 16083 SDValue Curr = In; 16084 MVT CurrVT = InVT; 16085 16086 // As SRAI is only available on i16/i32 types, we expand only up to i32 16087 // and handle i64 separately. 
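// For example, sign-extending v16i8 lanes without SSE4.1: the UNPCKL with
// undef moves the low input bytes into the high byte of each i16 lane, and
// the VSRAI by 8 below then replicates the sign bit across the widened
// element.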
16088 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { 16089 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); 16090 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); 16091 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); 16092 Curr = DAG.getBitcast(CurrVT, Curr); 16093 } 16094 16095 SDValue SignExt = Curr; 16096 if (CurrVT != InVT) { 16097 unsigned SignExtShift = 16098 CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); 16099 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, 16100 DAG.getConstant(SignExtShift, dl, MVT::i8)); 16101 } 16102 16103 if (CurrVT == VT) 16104 return SignExt; 16105 16106 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { 16107 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, 16108 DAG.getConstant(31, dl, MVT::i8)); 16109 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); 16110 return DAG.getBitcast(VT, Ext); 16111 } 16112 16113 return SDValue(); 16114 } 16115 16116 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, 16117 SelectionDAG &DAG) { 16118 MVT VT = Op->getSimpleValueType(0); 16119 SDValue In = Op->getOperand(0); 16120 MVT InVT = In.getSimpleValueType(); 16121 SDLoc dl(Op); 16122 16123 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) 16124 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); 16125 16126 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && 16127 (VT != MVT::v8i32 || InVT != MVT::v8i16) && 16128 (VT != MVT::v16i16 || InVT != MVT::v16i8)) 16129 return SDValue(); 16130 16131 if (Subtarget.hasInt256()) 16132 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 16133 16134 // Optimize vectors in AVX mode 16135 // Sign extend v8i16 to v8i32 and 16136 // v4i32 to v4i64 16137 // 16138 // Divide input vector into two parts 16139 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 16140 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 16141 // concat the vectors to original VT 16142 16143 unsigned NumElems = InVT.getVectorNumElements(); 16144 SDValue Undef = DAG.getUNDEF(InVT); 16145 16146 SmallVector<int,8> ShufMask1(NumElems, -1); 16147 for (unsigned i = 0; i != NumElems/2; ++i) 16148 ShufMask1[i] = i; 16149 16150 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1); 16151 16152 SmallVector<int,8> ShufMask2(NumElems, -1); 16153 for (unsigned i = 0; i != NumElems/2; ++i) 16154 ShufMask2[i] = i + NumElems/2; 16155 16156 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2); 16157 16158 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), 16159 VT.getVectorNumElements()/2); 16160 16161 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); 16162 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); 16163 16164 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 16165 } 16166 16167 // Lower truncating store. 
We need a special lowering to vXi1 vectors 16168 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, 16169 SelectionDAG &DAG) { 16170 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode()); 16171 SDLoc dl(St); 16172 EVT MemVT = St->getMemoryVT(); 16173 assert(St->isTruncatingStore() && "We only custom truncating store."); 16174 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 && 16175 "Expected truncstore of i1 vector"); 16176 16177 SDValue Op = St->getValue(); 16178 MVT OpVT = Op.getValueType().getSimpleVT(); 16179 unsigned NumElts = OpVT.getVectorNumElements(); 16180 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || 16181 NumElts == 16) { 16182 // Truncate and store - everything is legal 16183 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op); 16184 if (MemVT.getSizeInBits() < 8) 16185 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, 16186 DAG.getUNDEF(MVT::v8i1), Op, 16187 DAG.getIntPtrConstant(0, dl)); 16188 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), 16189 St->getMemOperand()); 16190 } 16191 16192 // A subset, assume that we have only AVX-512F 16193 if (NumElts <= 8) { 16194 if (NumElts < 8) { 16195 // Extend to 8-elts vector 16196 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8); 16197 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT, 16198 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl)); 16199 } 16200 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op); 16201 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), 16202 St->getMemOperand()); 16203 } 16204 // v32i8 16205 assert(OpVT == MVT::v32i8 && "Unexpected operand type"); 16206 // Divide the vector into 2 parts and store each part separately 16207 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, 16208 DAG.getIntPtrConstant(0, dl)); 16209 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo); 16210 SDValue BasePtr = St->getBasePtr(); 16211 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr, 16212 St->getMemOperand()); 16213 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, 16214 DAG.getIntPtrConstant(16, dl)); 16215 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi); 16216 16217 SDValue BasePtrHi = 16218 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 16219 DAG.getConstant(2, dl, BasePtr.getValueType())); 16220 16221 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, 16222 BasePtrHi, St->getMemOperand()); 16223 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); 16224 } 16225 16226 static SDValue LowerExtended1BitVectorLoad(SDValue Op, 16227 const X86Subtarget &Subtarget, 16228 SelectionDAG &DAG) { 16229 16230 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); 16231 SDLoc dl(Ld); 16232 EVT MemVT = Ld->getMemoryVT(); 16233 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 && 16234 "Expected i1 vector load"); 16235 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ? 16236 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; 16237 MVT VT = Op.getValueType().getSimpleVT(); 16238 unsigned NumElts = VT.getVectorNumElements(); 16239 16240 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || 16241 NumElts == 16) { 16242 // Load and extend - everything is legal 16243 if (NumElts < 8) { 16244 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(), 16245 Ld->getBasePtr(), 16246 Ld->getMemOperand()); 16247 // Replace chain users with the new chain. 
16248 assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); 16249 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); 16250 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); 16251 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load); 16252 16253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, 16254 DAG.getIntPtrConstant(0, dl)); 16255 } 16256 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(), 16257 Ld->getBasePtr(), 16258 Ld->getMemOperand()); 16259 // Replace chain users with the new chain. 16260 assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); 16261 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); 16262 16263 // Finally, do a normal sign-extend to the desired register. 16264 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load); 16265 } 16266 16267 if (NumElts <= 8) { 16268 // A subset, assume that we have only AVX-512F 16269 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts; 16270 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); 16271 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), 16272 Ld->getBasePtr(), 16273 Ld->getMemOperand()); 16274 // Replace chain users with the new chain. 16275 assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); 16276 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); 16277 16278 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad); 16279 SDValue BitVec = DAG.getBitcast(MaskVT, Load); 16280 16281 if (NumElts == 8) 16282 return DAG.getNode(ExtOpcode, dl, VT, BitVec); 16283 16284 // we should take care to v4i1 and v2i1 16285 16286 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); 16287 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec); 16288 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, 16289 DAG.getIntPtrConstant(0, dl)); 16290 } 16291 16292 assert(VT == MVT::v32i8 && "Unexpected extload type"); 16293 16294 SmallVector<SDValue, 2> Chains; 16295 16296 SDValue BasePtr = Ld->getBasePtr(); 16297 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), 16298 Ld->getBasePtr(), 16299 Ld->getMemOperand()); 16300 Chains.push_back(LoadLo.getValue(1)); 16301 16302 SDValue BasePtrHi = 16303 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 16304 DAG.getConstant(2, dl, BasePtr.getValueType())); 16305 16306 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), 16307 BasePtrHi, 16308 Ld->getMemOperand()); 16309 Chains.push_back(LoadHi.getValue(1)); 16310 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 16311 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 16312 16313 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo); 16314 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi); 16315 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi); 16316 } 16317 16318 // Lower vector extended loads using a shuffle. If SSSE3 is not available we 16319 // may emit an illegal shuffle but the expansion is still better than scalar 16320 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise 16321 // we'll emit a shuffle and a arithmetic shift. 16322 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. 16323 // TODO: It is possible to support ZExt by zeroing the undef values during 16324 // the shuffle phase or after the shuffle. 
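// The general strategy below: reload the memory value with the widest legal
// scalar loads, reassemble it as a vector, and then shuffle (and, for
// sextloads, arithmetically shift) the elements into their final positions.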
16325 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, 16326 SelectionDAG &DAG) { 16327 MVT RegVT = Op.getSimpleValueType(); 16328 assert(RegVT.isVector() && "We only custom lower vector sext loads."); 16329 assert(RegVT.isInteger() && 16330 "We only custom lower integer vector sext loads."); 16331 16332 // Nothing useful we can do without SSE2 shuffles. 16333 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); 16334 16335 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); 16336 SDLoc dl(Ld); 16337 EVT MemVT = Ld->getMemoryVT(); 16338 if (MemVT.getScalarType() == MVT::i1) 16339 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG); 16340 16341 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16342 unsigned RegSz = RegVT.getSizeInBits(); 16343 16344 ISD::LoadExtType Ext = Ld->getExtensionType(); 16345 16346 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) 16347 && "Only anyext and sext are currently implemented."); 16348 assert(MemVT != RegVT && "Cannot extend to the same type"); 16349 assert(MemVT.isVector() && "Must load a vector from memory"); 16350 16351 unsigned NumElems = RegVT.getVectorNumElements(); 16352 unsigned MemSz = MemVT.getSizeInBits(); 16353 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 16354 16355 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { 16356 // The only way in which we have a legal 256-bit vector result but not the 16357 // integer 256-bit operations needed to directly lower a sextload is if we 16358 // have AVX1 but not AVX2. In that case, we can always emit a sextload to 16359 // a 128-bit vector and a normal sign_extend to 256-bits that should get 16360 // correctly legalized. We do this late to allow the canonical form of 16361 // sextload to persist throughout the rest of the DAG combiner -- it wants 16362 // to fold together any extensions it can, and so will fuse a sign_extend 16363 // of an sextload into a sextload targeting a wider value. 16364 SDValue Load; 16365 if (MemSz == 128) { 16366 // Just switch this to a normal load. 16367 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " 16368 "it must be a legal 128-bit vector " 16369 "type!"); 16370 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), 16371 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), 16372 Ld->isInvariant(), Ld->getAlignment()); 16373 } else { 16374 assert(MemSz < 128 && 16375 "Can't extend a type wider than 128 bits to a 256 bit vector!"); 16376 // Do an sext load to a 128-bit vector type. We want to use the same 16377 // number of elements, but elements half as wide. This will end up being 16378 // recursively lowered by this routine, but will succeed as we definitely 16379 // have all the necessary features if we're using AVX1. 16380 EVT HalfEltVT = 16381 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); 16382 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); 16383 Load = 16384 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), 16385 Ld->getPointerInfo(), MemVT, Ld->isVolatile(), 16386 Ld->isNonTemporal(), Ld->isInvariant(), 16387 Ld->getAlignment()); 16388 } 16389 16390 // Replace chain users with the new chain. 16391 assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); 16392 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); 16393 16394 // Finally, do a normal sign-extend to the desired register. 
16395 return DAG.getSExtOrTrunc(Load, dl, RegVT); 16396 } 16397 16398 // All sizes must be a power of two. 16399 assert(isPowerOf2_32(RegSz * MemSz * NumElems) && 16400 "Non-power-of-two elements are not custom lowered!"); 16401 16402 // Attempt to load the original value using scalar loads. 16403 // Find the largest scalar type that divides the total loaded size. 16404 MVT SclrLoadTy = MVT::i8; 16405 for (MVT Tp : MVT::integer_valuetypes()) { 16406 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 16407 SclrLoadTy = Tp; 16408 } 16409 } 16410 16411 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 16412 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 16413 (64 <= MemSz)) 16414 SclrLoadTy = MVT::f64; 16415 16416 // Calculate the number of scalar loads that we need to perform 16417 // in order to load our vector from memory. 16418 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 16419 16420 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && 16421 "Can only lower sext loads with a single scalar load!"); 16422 16423 unsigned loadRegZize = RegSz; 16424 if (Ext == ISD::SEXTLOAD && RegSz >= 256) 16425 loadRegZize = 128; 16426 16427 // Represent our vector as a sequence of elements which are the 16428 // largest scalar that we can load. 16429 EVT LoadUnitVecVT = EVT::getVectorVT( 16430 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); 16431 16432 // Represent the data using the same element type that is stored in 16433 // memory. In practice, we ''widen'' MemVT. 16434 EVT WideVecVT = 16435 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 16436 loadRegZize / MemVT.getScalarSizeInBits()); 16437 16438 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 16439 "Invalid vector type"); 16440 16441 // We can't shuffle using an illegal type. 16442 assert(TLI.isTypeLegal(WideVecVT) && 16443 "We only lower types that form legal widened vector types"); 16444 16445 SmallVector<SDValue, 8> Chains; 16446 SDValue Ptr = Ld->getBasePtr(); 16447 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, 16448 TLI.getPointerTy(DAG.getDataLayout())); 16449 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 16450 16451 for (unsigned i = 0; i < NumLoads; ++i) { 16452 // Perform a single load. 16453 SDValue ScalarLoad = 16454 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 16455 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), 16456 Ld->getAlignment()); 16457 Chains.push_back(ScalarLoad.getValue(1)); 16458 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 16459 // another round of DAGCombining. 16460 if (i == 0) 16461 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 16462 else 16463 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 16464 ScalarLoad, DAG.getIntPtrConstant(i, dl)); 16465 16466 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 16467 } 16468 16469 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 16470 16471 // Bitcast the loaded value to a vector of the original element type, in 16472 // the size of the target vector type. 16473 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); 16474 unsigned SizeRatio = RegSz / MemSz; 16475 16476 if (Ext == ISD::SEXTLOAD) { 16477 // If we have SSE4.1, we can directly emit a VSEXT node. 
16478 if (Subtarget.hasSSE41()) { 16479 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); 16480 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 16481 return Sext; 16482 } 16483 16484 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest 16485 // lanes. 16486 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && 16487 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); 16488 16489 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); 16490 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 16491 return Shuff; 16492 } 16493 16494 // Redistribute the loaded elements into the different locations. 16495 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 16496 for (unsigned i = 0; i != NumElems; ++i) 16497 ShuffleVec[i * SizeRatio] = i; 16498 16499 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 16500 DAG.getUNDEF(WideVecVT), ShuffleVec); 16501 16502 // Bitcast to the requested type. 16503 Shuff = DAG.getBitcast(RegVT, Shuff); 16504 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 16505 return Shuff; 16506 } 16507 16508 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes 16509 /// each of which has no other use apart from the AND / OR. 16510 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 16511 Opc = Op.getOpcode(); 16512 if (Opc != ISD::OR && Opc != ISD::AND) 16513 return false; 16514 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 16515 Op.getOperand(0).hasOneUse() && 16516 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 16517 Op.getOperand(1).hasOneUse()); 16518 } 16519 16520 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the 16521 /// SETCC node has a single use. 16522 static bool isXor1OfSetCC(SDValue Op) { 16523 if (Op.getOpcode() != ISD::XOR) 16524 return false; 16525 if (isOneConstant(Op.getOperand(1))) 16526 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 16527 Op.getOperand(0).hasOneUse(); 16528 return false; 16529 } 16530 16531 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 16532 bool addTest = true; 16533 SDValue Chain = Op.getOperand(0); 16534 SDValue Cond = Op.getOperand(1); 16535 SDValue Dest = Op.getOperand(2); 16536 SDLoc dl(Op); 16537 SDValue CC; 16538 bool Inverted = false; 16539 16540 if (Cond.getOpcode() == ISD::SETCC) { 16541 // Check for setcc([su]{add,sub,mul}o == 0). 16542 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 16543 isNullConstant(Cond.getOperand(1)) && 16544 Cond.getOperand(0).getResNo() == 1 && 16545 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 16546 Cond.getOperand(0).getOpcode() == ISD::UADDO || 16547 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 16548 Cond.getOperand(0).getOpcode() == ISD::USUBO || 16549 Cond.getOperand(0).getOpcode() == ISD::SMULO || 16550 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 16551 Inverted = true; 16552 Cond = Cond.getOperand(0); 16553 } else { 16554 if (SDValue NewCond = LowerSETCC(Cond, DAG)) 16555 Cond = NewCond; 16556 } 16557 } 16558 #if 0 16559 // FIXME: LowerXALUO doesn't handle these!! 16560 else if (Cond.getOpcode() == X86ISD::ADD || 16561 Cond.getOpcode() == X86ISD::SUB || 16562 Cond.getOpcode() == X86ISD::SMUL || 16563 Cond.getOpcode() == X86ISD::UMUL) 16564 Cond = LowerXALUO(Cond, DAG); 16565 #endif 16566 16567 // Look pass (and (setcc_carry (cmp ...)), 1). 
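// The AND with 1 only materializes the boolean result; the setcc_carry
// underneath already carries the flag we need, so branch on it directly.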
16568 if (Cond.getOpcode() == ISD::AND && 16569 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && 16570 isOneConstant(Cond.getOperand(1))) 16571 Cond = Cond.getOperand(0); 16572 16573 // If condition flag is set by a X86ISD::CMP, then use it as the condition 16574 // setting operand in place of the X86ISD::SETCC. 16575 unsigned CondOpcode = Cond.getOpcode(); 16576 if (CondOpcode == X86ISD::SETCC || 16577 CondOpcode == X86ISD::SETCC_CARRY) { 16578 CC = Cond.getOperand(0); 16579 16580 SDValue Cmp = Cond.getOperand(1); 16581 unsigned Opc = Cmp.getOpcode(); 16582 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 16583 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 16584 Cond = Cmp; 16585 addTest = false; 16586 } else { 16587 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 16588 default: break; 16589 case X86::COND_O: 16590 case X86::COND_B: 16591 // These can only come from an arithmetic instruction with overflow, 16592 // e.g. SADDO, UADDO. 16593 Cond = Cond.getNode()->getOperand(1); 16594 addTest = false; 16595 break; 16596 } 16597 } 16598 } 16599 CondOpcode = Cond.getOpcode(); 16600 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 16601 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 16602 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 16603 Cond.getOperand(0).getValueType() != MVT::i8)) { 16604 SDValue LHS = Cond.getOperand(0); 16605 SDValue RHS = Cond.getOperand(1); 16606 unsigned X86Opcode; 16607 unsigned X86Cond; 16608 SDVTList VTs; 16609 // Keep this in sync with LowerXALUO, otherwise we might create redundant 16610 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and 16611 // X86ISD::INC). 16612 switch (CondOpcode) { 16613 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 16614 case ISD::SADDO: 16615 if (isOneConstant(RHS)) { 16616 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; 16617 break; 16618 } 16619 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 16620 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 16621 case ISD::SSUBO: 16622 if (isOneConstant(RHS)) { 16623 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; 16624 break; 16625 } 16626 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 16627 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 16628 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 16629 default: llvm_unreachable("unexpected overflowing operator"); 16630 } 16631 if (Inverted) 16632 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 16633 if (CondOpcode == ISD::UMULO) 16634 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 16635 MVT::i32); 16636 else 16637 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 16638 16639 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 16640 16641 if (CondOpcode == ISD::UMULO) 16642 Cond = X86Op.getValue(2); 16643 else 16644 Cond = X86Op.getValue(1); 16645 16646 CC = DAG.getConstant(X86Cond, dl, MVT::i8); 16647 addTest = false; 16648 } else { 16649 unsigned CondOpc; 16650 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 16651 SDValue Cmp = Cond.getOperand(0).getOperand(1); 16652 if (CondOpc == ISD::OR) { 16653 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 16654 // two branches instead of an explicit OR instruction with a 16655 // separate test. 
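// e.g. (brcond (or (setcc CC1, Flags), (setcc CC2, Flags)), Dest) becomes a
// BRCOND on CC1 followed by a BRCOND on CC2, both targeting Dest.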
16656 if (Cmp == Cond.getOperand(1).getOperand(1) && 16657 isX86LogicalCmp(Cmp)) { 16658 CC = Cond.getOperand(0).getOperand(0); 16659 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 16660 Chain, Dest, CC, Cmp); 16661 CC = Cond.getOperand(1).getOperand(0); 16662 Cond = Cmp; 16663 addTest = false; 16664 } 16665 } else { // ISD::AND 16666 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 16667 // two branches instead of an explicit AND instruction with a 16668 // separate test. However, we only do this if this block doesn't 16669 // have a fall-through edge, because this requires an explicit 16670 // jmp when the condition is false. 16671 if (Cmp == Cond.getOperand(1).getOperand(1) && 16672 isX86LogicalCmp(Cmp) && 16673 Op.getNode()->hasOneUse()) { 16674 X86::CondCode CCode = 16675 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 16676 CCode = X86::GetOppositeBranchCondition(CCode); 16677 CC = DAG.getConstant(CCode, dl, MVT::i8); 16678 SDNode *User = *Op.getNode()->use_begin(); 16679 // Look for an unconditional branch following this conditional branch. 16680 // We need this because we need to reverse the successors in order 16681 // to implement FCMP_OEQ. 16682 if (User->getOpcode() == ISD::BR) { 16683 SDValue FalseBB = User->getOperand(1); 16684 SDNode *NewBR = 16685 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 16686 assert(NewBR == User); 16687 (void)NewBR; 16688 Dest = FalseBB; 16689 16690 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 16691 Chain, Dest, CC, Cmp); 16692 X86::CondCode CCode = 16693 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 16694 CCode = X86::GetOppositeBranchCondition(CCode); 16695 CC = DAG.getConstant(CCode, dl, MVT::i8); 16696 Cond = Cmp; 16697 addTest = false; 16698 } 16699 } 16700 } 16701 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 16702 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 16703 // It should be transformed during dag combiner except when the condition 16704 // is set by a arithmetics with overflow node. 16705 X86::CondCode CCode = 16706 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 16707 CCode = X86::GetOppositeBranchCondition(CCode); 16708 CC = DAG.getConstant(CCode, dl, MVT::i8); 16709 Cond = Cond.getOperand(0).getOperand(1); 16710 addTest = false; 16711 } else if (Cond.getOpcode() == ISD::SETCC && 16712 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 16713 // For FCMP_OEQ, we can emit 16714 // two branches instead of an explicit AND instruction with a 16715 // separate test. However, we only do this if this block doesn't 16716 // have a fall-through edge, because this requires an explicit 16717 // jmp when the condition is false. 16718 if (Op.getNode()->hasOneUse()) { 16719 SDNode *User = *Op.getNode()->use_begin(); 16720 // Look for an unconditional branch following this conditional branch. 16721 // We need this because we need to reverse the successors in order 16722 // to implement FCMP_OEQ. 
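// With the successors swapped, OEQ lowers to: branch to the false block on
// NE, then branch to the false block on P; execution reaches the original
// destination only when the compare is ordered and equal.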
16723 if (User->getOpcode() == ISD::BR) { 16724 SDValue FalseBB = User->getOperand(1); 16725 SDNode *NewBR = 16726 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 16727 assert(NewBR == User); 16728 (void)NewBR; 16729 Dest = FalseBB; 16730 16731 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 16732 Cond.getOperand(0), Cond.getOperand(1)); 16733 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 16734 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); 16735 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 16736 Chain, Dest, CC, Cmp); 16737 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); 16738 Cond = Cmp; 16739 addTest = false; 16740 } 16741 } 16742 } else if (Cond.getOpcode() == ISD::SETCC && 16743 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 16744 // For FCMP_UNE, we can emit 16745 // two branches instead of an explicit AND instruction with a 16746 // separate test. However, we only do this if this block doesn't 16747 // have a fall-through edge, because this requires an explicit 16748 // jmp when the condition is false. 16749 if (Op.getNode()->hasOneUse()) { 16750 SDNode *User = *Op.getNode()->use_begin(); 16751 // Look for an unconditional branch following this conditional branch. 16752 // We need this because we need to reverse the successors in order 16753 // to implement FCMP_UNE. 16754 if (User->getOpcode() == ISD::BR) { 16755 SDValue FalseBB = User->getOperand(1); 16756 SDNode *NewBR = 16757 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 16758 assert(NewBR == User); 16759 (void)NewBR; 16760 16761 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 16762 Cond.getOperand(0), Cond.getOperand(1)); 16763 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 16764 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); 16765 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 16766 Chain, Dest, CC, Cmp); 16767 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); 16768 Cond = Cmp; 16769 addTest = false; 16770 Dest = FalseBB; 16771 } 16772 } 16773 } 16774 } 16775 16776 if (addTest) { 16777 // Look pass the truncate if the high bits are known zero. 16778 Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); 16779 16780 // We know the result of AND is compared against zero. Try to match 16781 // it to BT. 16782 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 16783 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { 16784 CC = NewSetCC.getOperand(0); 16785 Cond = NewSetCC.getOperand(1); 16786 addTest = false; 16787 } 16788 } 16789 } 16790 16791 if (addTest) { 16792 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; 16793 CC = DAG.getConstant(X86Cond, dl, MVT::i8); 16794 Cond = EmitTest(Cond, X86Cond, dl, DAG); 16795 } 16796 Cond = ConvertCmpIfNecessary(Cond, DAG); 16797 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 16798 Chain, Dest, CC, Cond); 16799 } 16800 16801 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 16802 // Calls to _alloca are needed to probe the stack when allocating more than 4k 16803 // bytes in one go. Touching the stack at 4K increments is necessary to ensure 16804 // that the guard pages used by the OS virtual memory manager are allocated in 16805 // correct sequence. 
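// Three strategies are used below: plain stack-pointer arithmetic when no
// probing is required, X86ISD::SEG_ALLOCA for segmented (split) stacks, and
// X86ISD::WIN_ALLOCA (a chkstk-style probe call) for Windows targets.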
16806 SDValue 16807 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 16808 SelectionDAG &DAG) const { 16809 MachineFunction &MF = DAG.getMachineFunction(); 16810 bool SplitStack = MF.shouldSplitStack(); 16811 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || 16812 SplitStack; 16813 SDLoc dl(Op); 16814 16815 // Get the inputs. 16816 SDNode *Node = Op.getNode(); 16817 SDValue Chain = Op.getOperand(0); 16818 SDValue Size = Op.getOperand(1); 16819 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 16820 EVT VT = Node->getValueType(0); 16821 16822 // Chain the dynamic stack allocation so that it doesn't modify the stack 16823 // pointer when other instructions are using the stack. 16824 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); 16825 16826 bool Is64Bit = Subtarget.is64Bit(); 16827 MVT SPTy = getPointerTy(DAG.getDataLayout()); 16828 16829 SDValue Result; 16830 if (!Lower) { 16831 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16832 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); 16833 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" 16834 " not tell us which reg is the stack pointer!"); 16835 16836 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); 16837 Chain = SP.getValue(1); 16838 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); 16839 unsigned StackAlign = TFI.getStackAlignment(); 16840 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value 16841 if (Align > StackAlign) 16842 Result = DAG.getNode(ISD::AND, dl, VT, Result, 16843 DAG.getConstant(-(uint64_t)Align, dl, VT)); 16844 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain 16845 } else if (SplitStack) { 16846 MachineRegisterInfo &MRI = MF.getRegInfo(); 16847 16848 if (Is64Bit) { 16849 // The 64 bit implementation of segmented stacks needs to clobber both r10 16850 // r11. This makes it impossible to use it along with nested parameters. 
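// The 'nest' parameter is normally passed in r10 on x86-64, so reject any
// function whose arguments carry that attribute.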
16851 const Function *F = MF.getFunction(); 16852 for (const auto &A : F->args()) { 16853 if (A.hasNestAttr()) 16854 report_fatal_error("Cannot use segmented stacks with functions that " 16855 "have nested arguments."); 16856 } 16857 } 16858 16859 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); 16860 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 16861 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 16862 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 16863 DAG.getRegister(Vreg, SPTy)); 16864 } else { 16865 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 16866 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); 16867 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true); 16868 16869 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 16870 unsigned SPReg = RegInfo->getStackRegister(); 16871 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); 16872 Chain = SP.getValue(1); 16873 16874 if (Align) { 16875 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 16876 DAG.getConstant(-(uint64_t)Align, dl, VT)); 16877 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); 16878 } 16879 16880 Result = SP; 16881 } 16882 16883 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 16884 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); 16885 16886 SDValue Ops[2] = {Result, Chain}; 16887 return DAG.getMergeValues(Ops, dl); 16888 } 16889 16890 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 16891 MachineFunction &MF = DAG.getMachineFunction(); 16892 auto PtrVT = getPointerTy(MF.getDataLayout()); 16893 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 16894 16895 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 16896 SDLoc DL(Op); 16897 16898 if (!Subtarget.is64Bit() || 16899 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) { 16900 // vastart just stores the address of the VarArgsFrameIndex slot into the 16901 // memory location argument. 16902 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 16903 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 16904 MachinePointerInfo(SV), false, false, 0); 16905 } 16906 16907 // __va_list_tag: 16908 // gp_offset (0 - 6 * 8) 16909 // fp_offset (48 - 48 + 8 * 16) 16910 // overflow_arg_area (point to parameters coming in memory). 16911 // reg_save_area 16912 SmallVector<SDValue, 8> MemOps; 16913 SDValue FIN = Op.getOperand(1); 16914 // Store gp_offset 16915 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 16916 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 16917 DL, MVT::i32), 16918 FIN, MachinePointerInfo(SV), false, false, 0); 16919 MemOps.push_back(Store); 16920 16921 // Store fp_offset 16922 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); 16923 Store = DAG.getStore(Op.getOperand(0), DL, 16924 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, 16925 MVT::i32), 16926 FIN, MachinePointerInfo(SV, 4), false, false, 0); 16927 MemOps.push_back(Store); 16928 16929 // Store ptr to overflow_arg_area 16930 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); 16931 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 16932 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 16933 MachinePointerInfo(SV, 8), 16934 false, false, 0); 16935 MemOps.push_back(Store); 16936 16937 // Store ptr to reg_save_area. 16938 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( 16939 Subtarget.isTarget64BitLP64() ? 
8 : 4, DL)); 16940 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); 16941 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( 16942 SV, Subtarget.isTarget64BitLP64() ? 16 : 12), false, false, 0); 16943 MemOps.push_back(Store); 16944 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 16945 } 16946 16947 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 16948 assert(Subtarget.is64Bit() && 16949 "LowerVAARG only handles 64-bit va_arg!"); 16950 assert(Op.getNode()->getNumOperands() == 4); 16951 16952 MachineFunction &MF = DAG.getMachineFunction(); 16953 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) 16954 // The Win64 ABI uses char* instead of a structure. 16955 return DAG.expandVAArg(Op.getNode()); 16956 16957 SDValue Chain = Op.getOperand(0); 16958 SDValue SrcPtr = Op.getOperand(1); 16959 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 16960 unsigned Align = Op.getConstantOperandVal(3); 16961 SDLoc dl(Op); 16962 16963 EVT ArgVT = Op.getNode()->getValueType(0); 16964 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 16965 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 16966 uint8_t ArgMode; 16967 16968 // Decide which area this value should be read from. 16969 // TODO: Implement the AMD64 ABI in its entirety. This simple 16970 // selection mechanism works only for the basic types. 16971 if (ArgVT == MVT::f80) { 16972 llvm_unreachable("va_arg for f80 not yet implemented"); 16973 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 16974 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 16975 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 16976 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 16977 } else { 16978 llvm_unreachable("Unhandled argument type in LowerVAARG"); 16979 } 16980 16981 if (ArgMode == 2) { 16982 // Sanity Check: Make sure using fp_offset makes sense. 16983 assert(!Subtarget.useSoftFloat() && 16984 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && 16985 Subtarget.hasSSE1()); 16986 } 16987 16988 // Insert VAARG_64 node into the DAG 16989 // VAARG_64 returns two values: Variable Argument Address, Chain 16990 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), 16991 DAG.getConstant(ArgMode, dl, MVT::i8), 16992 DAG.getConstant(Align, dl, MVT::i32)}; 16993 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); 16994 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 16995 VTs, InstOps, MVT::i64, 16996 MachinePointerInfo(SV), 16997 /*Align=*/0, 16998 /*Volatile=*/false, 16999 /*ReadMem=*/true, 17000 /*WriteMem=*/true); 17001 Chain = VAARG.getValue(1); 17002 17003 // Load the next argument and return it 17004 return DAG.getLoad(ArgVT, dl, 17005 Chain, 17006 VAARG, 17007 MachinePointerInfo(), 17008 false, false, false, 0); 17009 } 17010 17011 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, 17012 SelectionDAG &DAG) { 17013 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, 17014 // where a va_list is still an i8*. 17015 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); 17016 if (Subtarget.isCallingConvWin64( 17017 DAG.getMachineFunction().getFunction()->getCallingConv())) 17018 // Probably a Win64 va_copy. 
17019 return DAG.expandVACopy(Op.getNode()); 17020 17021 SDValue Chain = Op.getOperand(0); 17022 SDValue DstPtr = Op.getOperand(1); 17023 SDValue SrcPtr = Op.getOperand(2); 17024 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 17025 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 17026 SDLoc DL(Op); 17027 17028 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 17029 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, 17030 false, false, 17031 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 17032 } 17033 17034 /// Handle vector element shifts where the shift amount is a constant. 17035 /// Takes immediate version of shift as input. 17036 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, 17037 SDValue SrcOp, uint64_t ShiftAmt, 17038 SelectionDAG &DAG) { 17039 MVT ElementType = VT.getVectorElementType(); 17040 17041 // Fold this packed shift into its first operand if ShiftAmt is 0. 17042 if (ShiftAmt == 0) 17043 return SrcOp; 17044 17045 // Check for ShiftAmt >= element width 17046 if (ShiftAmt >= ElementType.getSizeInBits()) { 17047 if (Opc == X86ISD::VSRAI) 17048 ShiftAmt = ElementType.getSizeInBits() - 1; 17049 else 17050 return DAG.getConstant(0, dl, VT); 17051 } 17052 17053 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) 17054 && "Unknown target vector shift-by-constant node"); 17055 17056 // Fold this packed vector shift into a build vector if SrcOp is a 17057 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 17058 if (VT == SrcOp.getSimpleValueType() && 17059 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { 17060 SmallVector<SDValue, 8> Elts; 17061 unsigned NumElts = SrcOp->getNumOperands(); 17062 ConstantSDNode *ND; 17063 17064 switch(Opc) { 17065 default: llvm_unreachable("Unknown opcode!"); 17066 case X86ISD::VSHLI: 17067 for (unsigned i=0; i!=NumElts; ++i) { 17068 SDValue CurrentOp = SrcOp->getOperand(i); 17069 if (CurrentOp->isUndef()) { 17070 Elts.push_back(CurrentOp); 17071 continue; 17072 } 17073 ND = cast<ConstantSDNode>(CurrentOp); 17074 const APInt &C = ND->getAPIntValue(); 17075 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); 17076 } 17077 break; 17078 case X86ISD::VSRLI: 17079 for (unsigned i=0; i!=NumElts; ++i) { 17080 SDValue CurrentOp = SrcOp->getOperand(i); 17081 if (CurrentOp->isUndef()) { 17082 Elts.push_back(CurrentOp); 17083 continue; 17084 } 17085 ND = cast<ConstantSDNode>(CurrentOp); 17086 const APInt &C = ND->getAPIntValue(); 17087 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); 17088 } 17089 break; 17090 case X86ISD::VSRAI: 17091 for (unsigned i=0; i!=NumElts; ++i) { 17092 SDValue CurrentOp = SrcOp->getOperand(i); 17093 if (CurrentOp->isUndef()) { 17094 Elts.push_back(CurrentOp); 17095 continue; 17096 } 17097 ND = cast<ConstantSDNode>(CurrentOp); 17098 const APInt &C = ND->getAPIntValue(); 17099 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); 17100 } 17101 break; 17102 } 17103 17104 return DAG.getBuildVector(VT, dl, Elts); 17105 } 17106 17107 return DAG.getNode(Opc, dl, VT, SrcOp, 17108 DAG.getConstant(ShiftAmt, dl, MVT::i8)); 17109 } 17110 17111 /// Handle vector element shifts where the shift amount may or may not be a 17112 /// constant. Takes immediate version of shift as input. 
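/// Constant amounts are forwarded to getTargetVShiftByConstNode; otherwise the
/// opcode is switched to its non-immediate form (e.g. VSHLI -> VSHL) and the
/// amount is placed in the low element of a 128-bit vector, since SSE/AVX
/// packed shifts only read the lower 64 bits of the count operand.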
17113 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, 17114 SDValue SrcOp, SDValue ShAmt, 17115 SelectionDAG &DAG) { 17116 MVT SVT = ShAmt.getSimpleValueType(); 17117 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); 17118 17119 // Catch shift-by-constant. 17120 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) 17121 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, 17122 CShAmt->getZExtValue(), DAG); 17123 17124 // Change opcode to non-immediate version 17125 switch (Opc) { 17126 default: llvm_unreachable("Unknown target vector shift node"); 17127 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 17128 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 17129 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 17130 } 17131 17132 const X86Subtarget &Subtarget = 17133 static_cast<const X86Subtarget &>(DAG.getSubtarget()); 17134 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && 17135 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { 17136 // Let the shuffle legalizer expand this shift amount node. 17137 SDValue Op0 = ShAmt.getOperand(0); 17138 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); 17139 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG); 17140 } else { 17141 // Need to build a vector containing shift amount. 17142 // SSE/AVX packed shifts only use the lower 64-bit of the shift count. 17143 SmallVector<SDValue, 4> ShOps; 17144 ShOps.push_back(ShAmt); 17145 if (SVT == MVT::i32) { 17146 ShOps.push_back(DAG.getConstant(0, dl, SVT)); 17147 ShOps.push_back(DAG.getUNDEF(SVT)); 17148 } 17149 ShOps.push_back(DAG.getUNDEF(SVT)); 17150 17151 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; 17152 ShAmt = DAG.getBuildVector(BVT, dl, ShOps); 17153 } 17154 17155 // The return type has to be a 128-bit type with the same element 17156 // type as the input type. 17157 MVT EltVT = VT.getVectorElementType(); 17158 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 17159 17160 ShAmt = DAG.getBitcast(ShVT, ShAmt); 17161 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 17162 } 17163 17164 /// \brief Return Mask with the necessary casting or extending 17165 /// for \p Mask according to \p MaskVT when lowering masking intrinsics 17166 static SDValue getMaskNode(SDValue Mask, MVT MaskVT, 17167 const X86Subtarget &Subtarget, SelectionDAG &DAG, 17168 const SDLoc &dl) { 17169 17170 if (isAllOnesConstant(Mask)) 17171 return DAG.getTargetConstant(1, dl, MaskVT); 17172 if (X86::isZeroNode(Mask)) 17173 return DAG.getTargetConstant(0, dl, MaskVT); 17174 17175 if (MaskVT.bitsGT(Mask.getSimpleValueType())) { 17176 // Mask should be extended 17177 Mask = DAG.getNode(ISD::ANY_EXTEND, dl, 17178 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); 17179 } 17180 17181 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { 17182 if (MaskVT == MVT::v64i1) { 17183 assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); 17184 // In case 32bit mode, bitcast i64 is illegal, extend/split it. 17185 SDValue Lo, Hi; 17186 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, 17187 DAG.getConstant(0, dl, MVT::i32)); 17188 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, 17189 DAG.getConstant(1, dl, MVT::i32)); 17190 17191 Lo = DAG.getBitcast(MVT::v32i1, Lo); 17192 Hi = DAG.getBitcast(MVT::v32i1, Hi); 17193 17194 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); 17195 } else { 17196 // MaskVT require < 64bit. 
Truncate mask (should succeed in any case), 17197 // and bitcast. 17198 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); 17199 return DAG.getBitcast(MaskVT, 17200 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); 17201 } 17202 17203 } else { 17204 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 17205 Mask.getSimpleValueType().getSizeInBits()); 17206 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements 17207 // are extracted by EXTRACT_SUBVECTOR. 17208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, 17209 DAG.getBitcast(BitcastVT, Mask), 17210 DAG.getIntPtrConstant(0, dl)); 17211 } 17212 } 17213 17214 /// \brief Return (and \p Op, \p Mask) for compare instructions or 17215 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the 17216 /// necessary casting or extending for \p Mask when lowering masking intrinsics 17217 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, 17218 SDValue PreservedSrc, 17219 const X86Subtarget &Subtarget, 17220 SelectionDAG &DAG) { 17221 MVT VT = Op.getSimpleValueType(); 17222 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 17223 unsigned OpcodeSelect = ISD::VSELECT; 17224 SDLoc dl(Op); 17225 17226 if (isAllOnesConstant(Mask)) 17227 return Op; 17228 17229 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); 17230 17231 switch (Op.getOpcode()) { 17232 default: break; 17233 case X86ISD::PCMPEQM: 17234 case X86ISD::PCMPGTM: 17235 case X86ISD::CMPM: 17236 case X86ISD::CMPMU: 17237 return DAG.getNode(ISD::AND, dl, VT, Op, VMask); 17238 case X86ISD::VFPCLASS: 17239 case X86ISD::VFPCLASSS: 17240 return DAG.getNode(ISD::OR, dl, VT, Op, VMask); 17241 case X86ISD::VTRUNC: 17242 case X86ISD::VTRUNCS: 17243 case X86ISD::VTRUNCUS: 17244 case ISD::FP_TO_FP16: 17245 // We can't use ISD::VSELECT here because it is not always "Legal" 17246 // for the destination type. For example vpmovqb require only AVX512 17247 // and vselect that can operate on byte element type require BWI 17248 OpcodeSelect = X86ISD::SELECT; 17249 break; 17250 } 17251 if (PreservedSrc.isUndef()) 17252 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); 17253 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); 17254 } 17255 17256 /// \brief Creates an SDNode for a predicated scalar operation. 17257 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). 17258 /// The mask is coming as MVT::i8 and it should be truncated 17259 /// to MVT::i1 while lowering masking intrinsics. 17260 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using 17261 /// "X86select" instead of "vselect". We just can't create the "vselect" node 17262 /// for a scalar instruction. 
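/// FSETCC results are combined with the mask via AND and VFPCLASS[S] via OR;
/// all other opcodes are lowered to X86ISD::SELECT with PreservedSrc (or a
/// zero vector when PreservedSrc is undef) as the false operand.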
17263 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, 17264 SDValue PreservedSrc, 17265 const X86Subtarget &Subtarget, 17266 SelectionDAG &DAG) { 17267 if (isAllOnesConstant(Mask)) 17268 return Op; 17269 17270 MVT VT = Op.getSimpleValueType(); 17271 SDLoc dl(Op); 17272 // The mask should be of type MVT::i1 17273 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); 17274 17275 if (Op.getOpcode() == X86ISD::FSETCC) 17276 return DAG.getNode(ISD::AND, dl, VT, Op, IMask); 17277 if (Op.getOpcode() == X86ISD::VFPCLASS || 17278 Op.getOpcode() == X86ISD::VFPCLASSS) 17279 return DAG.getNode(ISD::OR, dl, VT, Op, IMask); 17280 17281 if (PreservedSrc.isUndef()) 17282 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); 17283 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); 17284 } 17285 17286 static int getSEHRegistrationNodeSize(const Function *Fn) { 17287 if (!Fn->hasPersonalityFn()) 17288 report_fatal_error( 17289 "querying registration node size for function without personality"); 17290 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See 17291 // WinEHStatePass for the full struct definition. 17292 switch (classifyEHPersonality(Fn->getPersonalityFn())) { 17293 case EHPersonality::MSVC_X86SEH: return 24; 17294 case EHPersonality::MSVC_CXX: return 16; 17295 default: break; 17296 } 17297 report_fatal_error( 17298 "can only recover FP for 32-bit MSVC EH personality functions"); 17299 } 17300 17301 /// When the MSVC runtime transfers control to us, either to an outlined 17302 /// function or when returning to a parent frame after catching an exception, we 17303 /// recover the parent frame pointer by doing arithmetic on the incoming EBP. 17304 /// Here's the math: 17305 /// RegNodeBase = EntryEBP - RegNodeSize 17306 /// ParentFP = RegNodeBase - ParentFrameOffset 17307 /// Subtracting RegNodeSize takes us to the offset of the registration node, and 17308 /// subtracting the offset (negative on x86) takes us back to the parent FP. 17309 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, 17310 SDValue EntryEBP) { 17311 MachineFunction &MF = DAG.getMachineFunction(); 17312 SDLoc dl; 17313 17314 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17315 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); 17316 17317 // It's possible that the parent function no longer has a personality function 17318 // if the exceptional code was optimized away, in which case we just return 17319 // the incoming EBP. 17320 if (!Fn->hasPersonalityFn()) 17321 return EntryEBP; 17322 17323 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH 17324 // registration, or the .set_setframe offset. 17325 MCSymbol *OffsetSym = 17326 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( 17327 GlobalValue::getRealLinkageName(Fn->getName())); 17328 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); 17329 SDValue ParentFrameOffset = 17330 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); 17331 17332 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after 17333 // prologue to RBP in the parent function. 
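// On 32-bit targets (handled below) the parent frame pointer is instead
// recovered as (EntryEBP - RegNodeSize) - ParentFrameOffset.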
17334 const X86Subtarget &Subtarget =
17335 static_cast<const X86Subtarget &>(DAG.getSubtarget());
17336 if (Subtarget.is64Bit())
17337 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
17338
17339 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
17340 // RegNodeBase = EntryEBP - RegNodeSize
17341 // ParentFP = RegNodeBase - ParentFrameOffset
17342 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
17343 DAG.getConstant(RegNodeSize, dl, PtrVT));
17344 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
17345 }
17346
17347 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
17348 SelectionDAG &DAG) {
17349 SDLoc dl(Op);
17350 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17351 MVT VT = Op.getSimpleValueType();
17352 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17353 if (IntrData) {
17354 switch(IntrData->Type) {
17355 case INTR_TYPE_1OP:
17356 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17357 case INTR_TYPE_2OP:
17358 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17359 Op.getOperand(2));
17360 case INTR_TYPE_2OP_IMM8:
17361 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17362 DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
17363 case INTR_TYPE_3OP:
17364 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17365 Op.getOperand(2), Op.getOperand(3));
17366 case INTR_TYPE_4OP:
17367 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17368 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
17369 case INTR_TYPE_1OP_MASK_RM: {
17370 SDValue Src = Op.getOperand(1);
17371 SDValue PassThru = Op.getOperand(2);
17372 SDValue Mask = Op.getOperand(3);
17373 SDValue RoundingMode;
17374 // We always add a rounding mode to the node.
17375 // If the rounding mode is not specified, we add the
17376 // "current direction" mode.
17377 if (Op.getNumOperands() == 4)
17378 RoundingMode =
17379 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17380 else
17381 RoundingMode = Op.getOperand(4);
17382 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17383 if (IntrWithRoundingModeOpcode != 0)
17384 if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
17385 X86::STATIC_ROUNDING::CUR_DIRECTION)
17386 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17387 dl, Op.getValueType(), Src, RoundingMode),
17388 Mask, PassThru, Subtarget, DAG);
17389 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17390 RoundingMode),
17391 Mask, PassThru, Subtarget, DAG);
17392 }
17393 case INTR_TYPE_1OP_MASK: {
17394 SDValue Src = Op.getOperand(1);
17395 SDValue PassThru = Op.getOperand(2);
17396 SDValue Mask = Op.getOperand(3);
17397 // We add a rounding mode to the node when
17398 // - the RM opcode is specified and
17399 // - RM is not "current direction".
17400 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 17401 if (IntrWithRoundingModeOpcode != 0) { 17402 SDValue Rnd = Op.getOperand(4); 17403 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); 17404 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { 17405 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 17406 dl, Op.getValueType(), 17407 Src, Rnd), 17408 Mask, PassThru, Subtarget, DAG); 17409 } 17410 } 17411 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), 17412 Mask, PassThru, Subtarget, DAG); 17413 } 17414 case INTR_TYPE_SCALAR_MASK: { 17415 SDValue Src1 = Op.getOperand(1); 17416 SDValue Src2 = Op.getOperand(2); 17417 SDValue passThru = Op.getOperand(3); 17418 SDValue Mask = Op.getOperand(4); 17419 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), 17420 Mask, passThru, Subtarget, DAG); 17421 } 17422 case INTR_TYPE_SCALAR_MASK_RM: { 17423 SDValue Src1 = Op.getOperand(1); 17424 SDValue Src2 = Op.getOperand(2); 17425 SDValue Src0 = Op.getOperand(3); 17426 SDValue Mask = Op.getOperand(4); 17427 // There are 2 kinds of intrinsics in this group: 17428 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands 17429 // (2) With rounding mode and sae - 7 operands. 17430 if (Op.getNumOperands() == 6) { 17431 SDValue Sae = Op.getOperand(5); 17432 unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0; 17433 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, 17434 Sae), 17435 Mask, Src0, Subtarget, DAG); 17436 } 17437 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); 17438 SDValue RoundingMode = Op.getOperand(5); 17439 SDValue Sae = Op.getOperand(6); 17440 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, 17441 RoundingMode, Sae), 17442 Mask, Src0, Subtarget, DAG); 17443 } 17444 case INTR_TYPE_2OP_MASK: 17445 case INTR_TYPE_2OP_IMM8_MASK: { 17446 SDValue Src1 = Op.getOperand(1); 17447 SDValue Src2 = Op.getOperand(2); 17448 SDValue PassThru = Op.getOperand(3); 17449 SDValue Mask = Op.getOperand(4); 17450 17451 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) 17452 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); 17453 17454 // We specify 2 possible opcodes for intrinsics with rounding modes. 17455 // First, we check if the intrinsic may have non-default rounding mode, 17456 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 17457 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 17458 if (IntrWithRoundingModeOpcode != 0) { 17459 SDValue Rnd = Op.getOperand(5); 17460 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); 17461 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { 17462 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 17463 dl, Op.getValueType(), 17464 Src1, Src2, Rnd), 17465 Mask, PassThru, Subtarget, DAG); 17466 } 17467 } 17468 // TODO: Intrinsics should have fast-math-flags to propagate. 17469 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), 17470 Mask, PassThru, Subtarget, DAG); 17471 } 17472 case INTR_TYPE_2OP_MASK_RM: { 17473 SDValue Src1 = Op.getOperand(1); 17474 SDValue Src2 = Op.getOperand(2); 17475 SDValue PassThru = Op.getOperand(3); 17476 SDValue Mask = Op.getOperand(4); 17477 // We specify 2 possible modes for intrinsics, with/without rounding 17478 // modes. 17479 // First, we check if the intrinsic have rounding mode (6 operands), 17480 // if not, we set rounding mode to "current". 
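      // Operand layout as read out above: (id, src1, src2, passthru, mask[, rnd]);
      // the optional trailing operand is the rounding mode.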
17481 SDValue Rnd; 17482 if (Op.getNumOperands() == 6) 17483 Rnd = Op.getOperand(5); 17484 else 17485 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); 17486 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17487 Src1, Src2, Rnd), 17488 Mask, PassThru, Subtarget, DAG); 17489 } 17490 case INTR_TYPE_3OP_SCALAR_MASK_RM: { 17491 SDValue Src1 = Op.getOperand(1); 17492 SDValue Src2 = Op.getOperand(2); 17493 SDValue Src3 = Op.getOperand(3); 17494 SDValue PassThru = Op.getOperand(4); 17495 SDValue Mask = Op.getOperand(5); 17496 SDValue Sae = Op.getOperand(6); 17497 17498 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, 17499 Src2, Src3, Sae), 17500 Mask, PassThru, Subtarget, DAG); 17501 } 17502 case INTR_TYPE_3OP_MASK_RM: { 17503 SDValue Src1 = Op.getOperand(1); 17504 SDValue Src2 = Op.getOperand(2); 17505 SDValue Imm = Op.getOperand(3); 17506 SDValue PassThru = Op.getOperand(4); 17507 SDValue Mask = Op.getOperand(5); 17508 // We specify 2 possible modes for intrinsics, with/without rounding 17509 // modes. 17510 // First, we check if the intrinsic have rounding mode (7 operands), 17511 // if not, we set rounding mode to "current". 17512 SDValue Rnd; 17513 if (Op.getNumOperands() == 7) 17514 Rnd = Op.getOperand(6); 17515 else 17516 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); 17517 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17518 Src1, Src2, Imm, Rnd), 17519 Mask, PassThru, Subtarget, DAG); 17520 } 17521 case INTR_TYPE_3OP_IMM8_MASK: 17522 case INTR_TYPE_3OP_MASK: 17523 case INSERT_SUBVEC: { 17524 SDValue Src1 = Op.getOperand(1); 17525 SDValue Src2 = Op.getOperand(2); 17526 SDValue Src3 = Op.getOperand(3); 17527 SDValue PassThru = Op.getOperand(4); 17528 SDValue Mask = Op.getOperand(5); 17529 17530 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) 17531 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); 17532 else if (IntrData->Type == INSERT_SUBVEC) { 17533 // imm should be adapted to ISD::INSERT_SUBVECTOR behavior 17534 assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); 17535 unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); 17536 Imm *= Src2.getSimpleValueType().getVectorNumElements(); 17537 Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); 17538 } 17539 17540 // We specify 2 possible opcodes for intrinsics with rounding modes. 17541 // First, we check if the intrinsic may have non-default rounding mode, 17542 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
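      // (For the INSERT_SUBVEC path above, the immediate is rescaled from a
      // subvector index to an element index; e.g. inserting a 4-element
      // subvector at intrinsic index 1 becomes ISD::INSERT_SUBVECTOR index
      // 4 = 1 * 4. The example values are illustrative only.)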
17543 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 17544 if (IntrWithRoundingModeOpcode != 0) { 17545 SDValue Rnd = Op.getOperand(6); 17546 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); 17547 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { 17548 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 17549 dl, Op.getValueType(), 17550 Src1, Src2, Src3, Rnd), 17551 Mask, PassThru, Subtarget, DAG); 17552 } 17553 } 17554 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17555 Src1, Src2, Src3), 17556 Mask, PassThru, Subtarget, DAG); 17557 } 17558 case VPERM_2OP_MASK : { 17559 SDValue Src1 = Op.getOperand(1); 17560 SDValue Src2 = Op.getOperand(2); 17561 SDValue PassThru = Op.getOperand(3); 17562 SDValue Mask = Op.getOperand(4); 17563 17564 // Swap Src1 and Src2 in the node creation 17565 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1), 17566 Mask, PassThru, Subtarget, DAG); 17567 } 17568 case VPERM_3OP_MASKZ: 17569 case VPERM_3OP_MASK:{ 17570 // Src2 is the PassThru 17571 SDValue Src1 = Op.getOperand(1); 17572 SDValue Src2 = Op.getOperand(2); 17573 SDValue Src3 = Op.getOperand(3); 17574 SDValue Mask = Op.getOperand(4); 17575 MVT VT = Op.getSimpleValueType(); 17576 SDValue PassThru = SDValue(); 17577 17578 // set PassThru element 17579 if (IntrData->Type == VPERM_3OP_MASKZ) 17580 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 17581 else 17582 PassThru = DAG.getBitcast(VT, Src2); 17583 17584 // Swap Src1 and Src2 in the node creation 17585 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, 17586 dl, Op.getValueType(), 17587 Src2, Src1, Src3), 17588 Mask, PassThru, Subtarget, DAG); 17589 } 17590 case FMA_OP_MASK3: 17591 case FMA_OP_MASKZ: 17592 case FMA_OP_MASK: { 17593 SDValue Src1 = Op.getOperand(1); 17594 SDValue Src2 = Op.getOperand(2); 17595 SDValue Src3 = Op.getOperand(3); 17596 SDValue Mask = Op.getOperand(4); 17597 MVT VT = Op.getSimpleValueType(); 17598 SDValue PassThru = SDValue(); 17599 17600 // set PassThru element 17601 if (IntrData->Type == FMA_OP_MASKZ) 17602 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 17603 else if (IntrData->Type == FMA_OP_MASK3) 17604 PassThru = Src3; 17605 else 17606 PassThru = Src1; 17607 17608 // We specify 2 possible opcodes for intrinsics with rounding modes. 17609 // First, we check if the intrinsic may have non-default rounding mode, 17610 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
17611 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; 17612 if (IntrWithRoundingModeOpcode != 0) { 17613 SDValue Rnd = Op.getOperand(5); 17614 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 17615 X86::STATIC_ROUNDING::CUR_DIRECTION) 17616 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, 17617 dl, Op.getValueType(), 17618 Src1, Src2, Src3, Rnd), 17619 Mask, PassThru, Subtarget, DAG); 17620 } 17621 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, 17622 dl, Op.getValueType(), 17623 Src1, Src2, Src3), 17624 Mask, PassThru, Subtarget, DAG); 17625 } 17626 case FMA_OP_SCALAR_MASK: 17627 case FMA_OP_SCALAR_MASK3: 17628 case FMA_OP_SCALAR_MASKZ: { 17629 SDValue Src1 = Op.getOperand(1); 17630 SDValue Src2 = Op.getOperand(2); 17631 SDValue Src3 = Op.getOperand(3); 17632 SDValue Mask = Op.getOperand(4); 17633 MVT VT = Op.getSimpleValueType(); 17634 SDValue PassThru = SDValue(); 17635 17636 // set PassThru element 17637 if (IntrData->Type == FMA_OP_SCALAR_MASKZ) 17638 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 17639 else if (IntrData->Type == FMA_OP_SCALAR_MASK3) 17640 PassThru = Src3; 17641 else 17642 PassThru = Src1; 17643 17644 SDValue Rnd = Op.getOperand(5); 17645 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, 17646 Op.getValueType(), Src1, Src2, 17647 Src3, Rnd), 17648 Mask, PassThru, Subtarget, DAG); 17649 } 17650 case TERLOG_OP_MASK: 17651 case TERLOG_OP_MASKZ: { 17652 SDValue Src1 = Op.getOperand(1); 17653 SDValue Src2 = Op.getOperand(2); 17654 SDValue Src3 = Op.getOperand(3); 17655 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); 17656 SDValue Mask = Op.getOperand(5); 17657 MVT VT = Op.getSimpleValueType(); 17658 SDValue PassThru = Src1; 17659 // Set PassThru element. 17660 if (IntrData->Type == TERLOG_OP_MASKZ) 17661 PassThru = getZeroVector(VT, Subtarget, DAG, dl); 17662 17663 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17664 Src1, Src2, Src3, Src4), 17665 Mask, PassThru, Subtarget, DAG); 17666 } 17667 case FPCLASS: { 17668 // FPclass intrinsics with mask 17669 SDValue Src1 = Op.getOperand(1); 17670 MVT VT = Src1.getSimpleValueType(); 17671 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 17672 SDValue Imm = Op.getOperand(2); 17673 SDValue Mask = Op.getOperand(3); 17674 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 17675 Mask.getSimpleValueType().getSizeInBits()); 17676 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); 17677 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, 17678 DAG.getTargetConstant(0, dl, MaskVT), 17679 Subtarget, DAG); 17680 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, 17681 DAG.getUNDEF(BitcastVT), FPclassMask, 17682 DAG.getIntPtrConstant(0, dl)); 17683 return DAG.getBitcast(Op.getValueType(), Res); 17684 } 17685 case FPCLASSS: { 17686 SDValue Src1 = Op.getOperand(1); 17687 SDValue Imm = Op.getOperand(2); 17688 SDValue Mask = Op.getOperand(3); 17689 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); 17690 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, 17691 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); 17692 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); 17693 } 17694 case CMP_MASK: 17695 case CMP_MASK_CC: { 17696 // Comparison intrinsics with masks. 
17697 // Example of transformation: 17698 // (i8 (int_x86_avx512_mask_pcmpeq_q_128 17699 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> 17700 // (i8 (bitcast 17701 // (v8i1 (insert_subvector undef, 17702 // (v2i1 (and (PCMPEQM %a, %b), 17703 // (extract_subvector 17704 // (v8i1 (bitcast %mask)), 0))), 0)))) 17705 MVT VT = Op.getOperand(1).getSimpleValueType(); 17706 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 17707 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); 17708 MVT BitcastVT = MVT::getVectorVT(MVT::i1, 17709 Mask.getSimpleValueType().getSizeInBits()); 17710 SDValue Cmp; 17711 if (IntrData->Type == CMP_MASK_CC) { 17712 SDValue CC = Op.getOperand(3); 17713 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); 17714 // We specify 2 possible opcodes for intrinsics with rounding modes. 17715 // First, we check if the intrinsic may have non-default rounding mode, 17716 // (IntrData->Opc1 != 0), then we check the rounding mode operand. 17717 if (IntrData->Opc1 != 0) { 17718 SDValue Rnd = Op.getOperand(5); 17719 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 17720 X86::STATIC_ROUNDING::CUR_DIRECTION) 17721 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), 17722 Op.getOperand(2), CC, Rnd); 17723 } 17724 //default rounding mode 17725 if(!Cmp.getNode()) 17726 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), 17727 Op.getOperand(2), CC); 17728 17729 } else { 17730 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); 17731 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), 17732 Op.getOperand(2)); 17733 } 17734 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, 17735 DAG.getTargetConstant(0, dl, 17736 MaskVT), 17737 Subtarget, DAG); 17738 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, 17739 DAG.getUNDEF(BitcastVT), CmpMask, 17740 DAG.getIntPtrConstant(0, dl)); 17741 return DAG.getBitcast(Op.getValueType(), Res); 17742 } 17743 case CMP_MASK_SCALAR_CC: { 17744 SDValue Src1 = Op.getOperand(1); 17745 SDValue Src2 = Op.getOperand(2); 17746 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); 17747 SDValue Mask = Op.getOperand(4); 17748 17749 SDValue Cmp; 17750 if (IntrData->Opc1 != 0) { 17751 SDValue Rnd = Op.getOperand(5); 17752 if (cast<ConstantSDNode>(Rnd)->getZExtValue() != 17753 X86::STATIC_ROUNDING::CUR_DIRECTION) 17754 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); 17755 } 17756 //default rounding mode 17757 if(!Cmp.getNode()) 17758 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); 17759 17760 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, 17761 DAG.getTargetConstant(0, dl, 17762 MVT::i1), 17763 Subtarget, DAG); 17764 17765 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); 17766 } 17767 case COMI: { // Comparison intrinsics 17768 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; 17769 SDValue LHS = Op.getOperand(1); 17770 SDValue RHS = Op.getOperand(2); 17771 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); 17772 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); 17773 SDValue SetCC; 17774 switch (CC) { 17775 case ISD::SETEQ: { // (ZF = 0 and PF = 0) 17776 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17777 DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi); 17778 SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17779 DAG.getConstant(X86::COND_NP, dl, MVT::i8), 17780 Comi); 17781 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); 17782 break; 17783 } 17784 case 
ISD::SETNE: { // (ZF = 1 or PF = 1) 17785 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17786 DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi); 17787 SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17788 DAG.getConstant(X86::COND_P, dl, MVT::i8), 17789 Comi); 17790 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); 17791 break; 17792 } 17793 case ISD::SETGT: // (CF = 0 and ZF = 0) 17794 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17795 DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi); 17796 break; 17797 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. 17798 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17799 DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi); 17800 break; 17801 } 17802 case ISD::SETGE: // CF = 0 17803 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17804 DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi); 17805 break; 17806 case ISD::SETLE: // The condition is opposite to GE. Swap the operands. 17807 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17808 DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi); 17809 break; 17810 default: 17811 llvm_unreachable("Unexpected illegal condition!"); 17812 } 17813 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 17814 } 17815 case COMI_RM: { // Comparison intrinsics with Sae 17816 SDValue LHS = Op.getOperand(1); 17817 SDValue RHS = Op.getOperand(2); 17818 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 17819 SDValue Sae = Op.getOperand(4); 17820 17821 SDValue FCmp; 17822 if (cast<ConstantSDNode>(Sae)->getZExtValue() == 17823 X86::STATIC_ROUNDING::CUR_DIRECTION) 17824 FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS, 17825 DAG.getConstant(CondVal, dl, MVT::i8)); 17826 else 17827 FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS, 17828 DAG.getConstant(CondVal, dl, MVT::i8), Sae); 17829 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" 17830 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); 17831 } 17832 case VSHIFT: 17833 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), 17834 Op.getOperand(1), Op.getOperand(2), DAG); 17835 case COMPRESS_EXPAND_IN_REG: { 17836 SDValue Mask = Op.getOperand(3); 17837 SDValue DataToCompress = Op.getOperand(1); 17838 SDValue PassThru = Op.getOperand(2); 17839 if (isAllOnesConstant(Mask)) // return data as is 17840 return Op.getOperand(1); 17841 17842 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17843 DataToCompress), 17844 Mask, PassThru, Subtarget, DAG); 17845 } 17846 case BROADCASTM: { 17847 SDValue Mask = Op.getOperand(1); 17848 MVT MaskVT = MVT::getVectorVT(MVT::i1, 17849 Mask.getSimpleValueType().getSizeInBits()); 17850 Mask = DAG.getBitcast(MaskVT, Mask); 17851 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); 17852 } 17853 case KUNPCK: { 17854 MVT VT = Op.getSimpleValueType(); 17855 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); 17856 17857 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); 17858 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); 17859 // Arguments should be swapped. 
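    // Sketch of the intended result, inferred from the types above: for
    // kunpckbw, two v8i1 halves are concatenated into a v16i1 node and then
    // bitcast back to the i16 result type.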
17860 SDValue Res = DAG.getNode(IntrData->Opc0, dl, 17861 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), 17862 Src2, Src1); 17863 return DAG.getBitcast(VT, Res); 17864 } 17865 case FIXUPIMMS: 17866 case FIXUPIMMS_MASKZ: 17867 case FIXUPIMM: 17868 case FIXUPIMM_MASKZ:{ 17869 SDValue Src1 = Op.getOperand(1); 17870 SDValue Src2 = Op.getOperand(2); 17871 SDValue Src3 = Op.getOperand(3); 17872 SDValue Imm = Op.getOperand(4); 17873 SDValue Mask = Op.getOperand(5); 17874 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? 17875 Src1 : getZeroVector(VT, Subtarget, DAG, dl); 17876 // We specify 2 possible modes for intrinsics, with/without rounding 17877 // modes. 17878 // First, we check if the intrinsic have rounding mode (7 operands), 17879 // if not, we set rounding mode to "current". 17880 SDValue Rnd; 17881 if (Op.getNumOperands() == 7) 17882 Rnd = Op.getOperand(6); 17883 else 17884 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); 17885 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) 17886 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17887 Src1, Src2, Src3, Imm, Rnd), 17888 Mask, Passthru, Subtarget, DAG); 17889 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ 17890 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17891 Src1, Src2, Src3, Imm, Rnd), 17892 Mask, Passthru, Subtarget, DAG); 17893 } 17894 case CONVERT_TO_MASK: { 17895 MVT SrcVT = Op.getOperand(1).getSimpleValueType(); 17896 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); 17897 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); 17898 17899 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, 17900 Op.getOperand(1)); 17901 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, 17902 DAG.getUNDEF(BitcastVT), CvtMask, 17903 DAG.getIntPtrConstant(0, dl)); 17904 return DAG.getBitcast(Op.getValueType(), Res); 17905 } 17906 case CONVERT_MASK_TO_VEC: { 17907 SDValue Mask = Op.getOperand(1); 17908 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 17909 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); 17910 return DAG.getNode(IntrData->Opc0, dl, VT, VMask); 17911 } 17912 case BRCST_SUBVEC_TO_VEC: { 17913 SDValue Src = Op.getOperand(1); 17914 SDValue Passthru = Op.getOperand(2); 17915 SDValue Mask = Op.getOperand(3); 17916 EVT resVT = Passthru.getValueType(); 17917 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, 17918 DAG.getUNDEF(resVT), Src, 17919 DAG.getIntPtrConstant(0, dl)); 17920 SDValue immVal; 17921 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) 17922 immVal = DAG.getConstant(0x44, dl, MVT::i8); 17923 else 17924 immVal = DAG.getConstant(0, dl, MVT::i8); 17925 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, 17926 subVec, subVec, immVal), 17927 Mask, Passthru, Subtarget, DAG); 17928 } 17929 case BRCST32x2_TO_VEC: { 17930 SDValue Src = Op.getOperand(1); 17931 SDValue PassThru = Op.getOperand(2); 17932 SDValue Mask = Op.getOperand(3); 17933 17934 assert((VT.getScalarType() == MVT::i32 || 17935 VT.getScalarType() == MVT::f32) && "Unexpected type!"); 17936 //bitcast Src to packed 64 17937 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? 
MVT::i64 : MVT::f64;
17938 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
17939 Src = DAG.getBitcast(BitcastVT, Src);
17940
17941 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17942 Mask, PassThru, Subtarget, DAG);
17943 }
17944 default:
17945 break;
17946 }
17947 }
17948
17949 switch (IntNo) {
17950 default: return SDValue(); // Don't custom lower most intrinsics.
17951
17952 case Intrinsic::x86_avx2_permd:
17953 case Intrinsic::x86_avx2_permps:
17954 // Operands intentionally swapped. Mask is last operand to intrinsic,
17955 // but second operand for node/instruction.
17956 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17957 Op.getOperand(2), Op.getOperand(1));
17958
17959 // ptest and testp intrinsics. The intrinsics these come from are designed to
17960 // return an integer value, not just set flags, so lower them to the ptest
17961 // or testp pattern plus a setcc for the result.
17962 case Intrinsic::x86_sse41_ptestz:
17963 case Intrinsic::x86_sse41_ptestc:
17964 case Intrinsic::x86_sse41_ptestnzc:
17965 case Intrinsic::x86_avx_ptestz_256:
17966 case Intrinsic::x86_avx_ptestc_256:
17967 case Intrinsic::x86_avx_ptestnzc_256:
17968 case Intrinsic::x86_avx_vtestz_ps:
17969 case Intrinsic::x86_avx_vtestc_ps:
17970 case Intrinsic::x86_avx_vtestnzc_ps:
17971 case Intrinsic::x86_avx_vtestz_pd:
17972 case Intrinsic::x86_avx_vtestc_pd:
17973 case Intrinsic::x86_avx_vtestnzc_pd:
17974 case Intrinsic::x86_avx_vtestz_ps_256:
17975 case Intrinsic::x86_avx_vtestc_ps_256:
17976 case Intrinsic::x86_avx_vtestnzc_ps_256:
17977 case Intrinsic::x86_avx_vtestz_pd_256:
17978 case Intrinsic::x86_avx_vtestc_pd_256:
17979 case Intrinsic::x86_avx_vtestnzc_pd_256: {
17980 bool IsTestPacked = false;
17981 unsigned X86CC;
17982 switch (IntNo) {
17983 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17984 case Intrinsic::x86_avx_vtestz_ps:
17985 case Intrinsic::x86_avx_vtestz_pd:
17986 case Intrinsic::x86_avx_vtestz_ps_256:
17987 case Intrinsic::x86_avx_vtestz_pd_256:
17988 IsTestPacked = true; // Fallthrough
17989 case Intrinsic::x86_sse41_ptestz:
17990 case Intrinsic::x86_avx_ptestz_256:
17991 // ZF = 1
17992 X86CC = X86::COND_E;
17993 break;
17994 case Intrinsic::x86_avx_vtestc_ps:
17995 case Intrinsic::x86_avx_vtestc_pd:
17996 case Intrinsic::x86_avx_vtestc_ps_256:
17997 case Intrinsic::x86_avx_vtestc_pd_256:
17998 IsTestPacked = true; // Fallthrough
17999 case Intrinsic::x86_sse41_ptestc:
18000 case Intrinsic::x86_avx_ptestc_256:
18001 // CF = 1
18002 X86CC = X86::COND_B;
18003 break;
18004 case Intrinsic::x86_avx_vtestnzc_ps:
18005 case Intrinsic::x86_avx_vtestnzc_pd:
18006 case Intrinsic::x86_avx_vtestnzc_ps_256:
18007 case Intrinsic::x86_avx_vtestnzc_pd_256:
18008 IsTestPacked = true; // Fallthrough
18009 case Intrinsic::x86_sse41_ptestnzc:
18010 case Intrinsic::x86_avx_ptestnzc_256:
18011 // ZF and CF = 0
18012 X86CC = X86::COND_A;
18013 break;
18014 }
18015
18016 SDValue LHS = Op.getOperand(1);
18017 SDValue RHS = Op.getOperand(2);
18018 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
18019 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
18020 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18021 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18022 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18023 }
18024 case Intrinsic::x86_avx512_kortestz_w:
18025 case Intrinsic::x86_avx512_kortestc_w: {
18026 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)?
X86::COND_E: X86::COND_B; 18027 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); 18028 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); 18029 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); 18030 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); 18031 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 18032 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 18033 } 18034 18035 case Intrinsic::x86_sse42_pcmpistria128: 18036 case Intrinsic::x86_sse42_pcmpestria128: 18037 case Intrinsic::x86_sse42_pcmpistric128: 18038 case Intrinsic::x86_sse42_pcmpestric128: 18039 case Intrinsic::x86_sse42_pcmpistrio128: 18040 case Intrinsic::x86_sse42_pcmpestrio128: 18041 case Intrinsic::x86_sse42_pcmpistris128: 18042 case Intrinsic::x86_sse42_pcmpestris128: 18043 case Intrinsic::x86_sse42_pcmpistriz128: 18044 case Intrinsic::x86_sse42_pcmpestriz128: { 18045 unsigned Opcode; 18046 unsigned X86CC; 18047 switch (IntNo) { 18048 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 18049 case Intrinsic::x86_sse42_pcmpistria128: 18050 Opcode = X86ISD::PCMPISTRI; 18051 X86CC = X86::COND_A; 18052 break; 18053 case Intrinsic::x86_sse42_pcmpestria128: 18054 Opcode = X86ISD::PCMPESTRI; 18055 X86CC = X86::COND_A; 18056 break; 18057 case Intrinsic::x86_sse42_pcmpistric128: 18058 Opcode = X86ISD::PCMPISTRI; 18059 X86CC = X86::COND_B; 18060 break; 18061 case Intrinsic::x86_sse42_pcmpestric128: 18062 Opcode = X86ISD::PCMPESTRI; 18063 X86CC = X86::COND_B; 18064 break; 18065 case Intrinsic::x86_sse42_pcmpistrio128: 18066 Opcode = X86ISD::PCMPISTRI; 18067 X86CC = X86::COND_O; 18068 break; 18069 case Intrinsic::x86_sse42_pcmpestrio128: 18070 Opcode = X86ISD::PCMPESTRI; 18071 X86CC = X86::COND_O; 18072 break; 18073 case Intrinsic::x86_sse42_pcmpistris128: 18074 Opcode = X86ISD::PCMPISTRI; 18075 X86CC = X86::COND_S; 18076 break; 18077 case Intrinsic::x86_sse42_pcmpestris128: 18078 Opcode = X86ISD::PCMPESTRI; 18079 X86CC = X86::COND_S; 18080 break; 18081 case Intrinsic::x86_sse42_pcmpistriz128: 18082 Opcode = X86ISD::PCMPISTRI; 18083 X86CC = X86::COND_E; 18084 break; 18085 case Intrinsic::x86_sse42_pcmpestriz128: 18086 Opcode = X86ISD::PCMPESTRI; 18087 X86CC = X86::COND_E; 18088 break; 18089 } 18090 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 18091 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 18092 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); 18093 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 18094 DAG.getConstant(X86CC, dl, MVT::i8), 18095 SDValue(PCMP.getNode(), 1)); 18096 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 18097 } 18098 18099 case Intrinsic::x86_sse42_pcmpistri128: 18100 case Intrinsic::x86_sse42_pcmpestri128: { 18101 unsigned Opcode; 18102 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 18103 Opcode = X86ISD::PCMPISTRI; 18104 else 18105 Opcode = X86ISD::PCMPESTRI; 18106 18107 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 18108 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 18109 return DAG.getNode(Opcode, dl, VTs, NewOps); 18110 } 18111 18112 case Intrinsic::eh_sjlj_lsda: { 18113 MachineFunction &MF = DAG.getMachineFunction(); 18114 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18115 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); 18116 auto &Context = MF.getMMI().getContext(); 18117 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + 18118 Twine(MF.getFunctionNumber())); 18119 return 
DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT)); 18120 } 18121 18122 case Intrinsic::x86_seh_lsda: { 18123 // Compute the symbol for the LSDA. We know it'll get emitted later. 18124 MachineFunction &MF = DAG.getMachineFunction(); 18125 SDValue Op1 = Op.getOperand(1); 18126 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); 18127 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( 18128 GlobalValue::getRealLinkageName(Fn->getName())); 18129 18130 // Generate a simple absolute symbol reference. This intrinsic is only 18131 // supported on 32-bit Windows, which isn't PIC. 18132 SDValue Result = DAG.getMCSymbol(LSDASym, VT); 18133 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); 18134 } 18135 18136 case Intrinsic::x86_seh_recoverfp: { 18137 SDValue FnOp = Op.getOperand(1); 18138 SDValue IncomingFPOp = Op.getOperand(2); 18139 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); 18140 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); 18141 if (!Fn) 18142 report_fatal_error( 18143 "llvm.x86.seh.recoverfp must take a function as the first argument"); 18144 return recoverFramePointer(DAG, Fn, IncomingFPOp); 18145 } 18146 18147 case Intrinsic::localaddress: { 18148 // Returns one of the stack, base, or frame pointer registers, depending on 18149 // which is used to reference local variables. 18150 MachineFunction &MF = DAG.getMachineFunction(); 18151 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 18152 unsigned Reg; 18153 if (RegInfo->hasBasePointer(MF)) 18154 Reg = RegInfo->getBaseRegister(); 18155 else // This function handles the SP or FP case. 18156 Reg = RegInfo->getPtrSizedFrameRegister(MF); 18157 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 18158 } 18159 } 18160 } 18161 18162 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 18163 SDValue Src, SDValue Mask, SDValue Base, 18164 SDValue Index, SDValue ScaleOp, SDValue Chain, 18165 const X86Subtarget &Subtarget) { 18166 SDLoc dl(Op); 18167 auto *C = cast<ConstantSDNode>(ScaleOp); 18168 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); 18169 MVT MaskVT = MVT::getVectorVT(MVT::i1, 18170 Index.getSimpleValueType().getVectorNumElements()); 18171 18172 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); 18173 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 18174 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); 18175 SDValue Segment = DAG.getRegister(0, MVT::i32); 18176 if (Src.isUndef()) 18177 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); 18178 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; 18179 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 18180 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 18181 return DAG.getMergeValues(RetOps, dl); 18182 } 18183 18184 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 18185 SDValue Src, SDValue Mask, SDValue Base, 18186 SDValue Index, SDValue ScaleOp, SDValue Chain, 18187 const X86Subtarget &Subtarget) { 18188 SDLoc dl(Op); 18189 auto *C = cast<ConstantSDNode>(ScaleOp); 18190 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); 18191 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); 18192 SDValue Segment = DAG.getRegister(0, MVT::i32); 18193 MVT MaskVT = MVT::getVectorVT(MVT::i1, 18194 Index.getSimpleValueType().getVectorNumElements()); 18195 18196 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, 
DAG, dl); 18197 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 18198 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; 18199 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 18200 return SDValue(Res, 1); 18201 } 18202 18203 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 18204 SDValue Mask, SDValue Base, SDValue Index, 18205 SDValue ScaleOp, SDValue Chain, 18206 const X86Subtarget &Subtarget) { 18207 SDLoc dl(Op); 18208 auto *C = cast<ConstantSDNode>(ScaleOp); 18209 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); 18210 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); 18211 SDValue Segment = DAG.getRegister(0, MVT::i32); 18212 MVT MaskVT = 18213 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); 18214 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); 18215 //SDVTList VTs = DAG.getVTList(MVT::Other); 18216 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; 18217 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); 18218 return SDValue(Res, 0); 18219 } 18220 18221 /// Handles the lowering of builtin intrinsics that read performance monitor 18222 /// counters (x86_rdpmc). 18223 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, 18224 SelectionDAG &DAG, 18225 const X86Subtarget &Subtarget, 18226 SmallVectorImpl<SDValue> &Results) { 18227 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 18228 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 18229 SDValue LO, HI; 18230 18231 // The ECX register is used to select the index of the performance counter 18232 // to read. 18233 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, 18234 N->getOperand(2)); 18235 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); 18236 18237 // Reads the content of a 64-bit performance counter and returns it in the 18238 // registers EDX:EAX. 18239 if (Subtarget.is64Bit()) { 18240 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 18241 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 18242 LO.getValue(2)); 18243 } else { 18244 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 18245 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 18246 LO.getValue(2)); 18247 } 18248 Chain = HI.getValue(1); 18249 18250 if (Subtarget.is64Bit()) { 18251 // The EAX register is loaded with the low-order 32 bits. The EDX register 18252 // is loaded with the supported high-order bits of the counter. 18253 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 18254 DAG.getConstant(32, DL, MVT::i8)); 18255 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 18256 Results.push_back(Chain); 18257 return; 18258 } 18259 18260 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 18261 SDValue Ops[] = { LO, HI }; 18262 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 18263 Results.push_back(Pair); 18264 Results.push_back(Chain); 18265 } 18266 18267 /// Handles the lowering of builtin intrinsics that read the time stamp counter 18268 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower 18269 /// READCYCLECOUNTER nodes. 
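/// In both cases the counter comes back in EDX:EAX (RDX:RAX on 64-bit); the
/// code below merges the halves as (HI << 32) | LO on 64-bit targets and as a
/// BUILD_PAIR on 32-bit targets (a summary of the implementation that follows).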
18270 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, 18271 SelectionDAG &DAG, 18272 const X86Subtarget &Subtarget, 18273 SmallVectorImpl<SDValue> &Results) { 18274 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 18275 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); 18276 SDValue LO, HI; 18277 18278 // The processor's time-stamp counter (a 64-bit MSR) is stored into the 18279 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR 18280 // and the EAX register is loaded with the low-order 32 bits. 18281 if (Subtarget.is64Bit()) { 18282 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 18283 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 18284 LO.getValue(2)); 18285 } else { 18286 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 18287 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 18288 LO.getValue(2)); 18289 } 18290 SDValue Chain = HI.getValue(1); 18291 18292 if (Opcode == X86ISD::RDTSCP_DAG) { 18293 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 18294 18295 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into 18296 // the ECX register. Add 'ecx' explicitly to the chain. 18297 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, 18298 HI.getValue(2)); 18299 // Explicitly store the content of ECX at the location passed in input 18300 // to the 'rdtscp' intrinsic. 18301 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), 18302 MachinePointerInfo(), false, false, 0); 18303 } 18304 18305 if (Subtarget.is64Bit()) { 18306 // The EDX register is loaded with the high-order 32 bits of the MSR, and 18307 // the EAX register is loaded with the low-order 32 bits. 18308 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 18309 DAG.getConstant(32, DL, MVT::i8)); 18310 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 18311 Results.push_back(Chain); 18312 return; 18313 } 18314 18315 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 18316 SDValue Ops[] = { LO, HI }; 18317 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 18318 Results.push_back(Pair); 18319 Results.push_back(Chain); 18320 } 18321 18322 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, 18323 SelectionDAG &DAG) { 18324 SmallVector<SDValue, 2> Results; 18325 SDLoc DL(Op); 18326 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, 18327 Results); 18328 return DAG.getMergeValues(Results, DL); 18329 } 18330 18331 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { 18332 MachineFunction &MF = DAG.getMachineFunction(); 18333 SDValue Chain = Op.getOperand(0); 18334 SDValue RegNode = Op.getOperand(2); 18335 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); 18336 if (!EHInfo) 18337 report_fatal_error("EH registrations only live in functions using WinEH"); 18338 18339 // Cast the operand to an alloca, and remember the frame index. 18340 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); 18341 if (!FINode) 18342 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); 18343 EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); 18344 18345 // Return the chain operand without making any DAG nodes. 
18346 return Chain; 18347 } 18348 18349 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { 18350 MachineFunction &MF = DAG.getMachineFunction(); 18351 SDValue Chain = Op.getOperand(0); 18352 SDValue EHGuard = Op.getOperand(2); 18353 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); 18354 if (!EHInfo) 18355 report_fatal_error("EHGuard only live in functions using WinEH"); 18356 18357 // Cast the operand to an alloca, and remember the frame index. 18358 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard); 18359 if (!FINode) 18360 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); 18361 EHInfo->EHGuardFrameIndex = FINode->getIndex(); 18362 18363 // Return the chain operand without making any DAG nodes. 18364 return Chain; 18365 } 18366 18367 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, 18368 SelectionDAG &DAG) { 18369 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 18370 18371 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); 18372 if (!IntrData) { 18373 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) 18374 return MarkEHRegistrationNode(Op, DAG); 18375 if (IntNo == llvm::Intrinsic::x86_seh_ehguard) 18376 return MarkEHGuard(Op, DAG); 18377 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || 18378 IntNo == llvm::Intrinsic::x86_flags_read_u64 || 18379 IntNo == llvm::Intrinsic::x86_flags_write_u32 || 18380 IntNo == llvm::Intrinsic::x86_flags_write_u64) { 18381 // We need a frame pointer because this will get lowered to a PUSH/POP 18382 // sequence. 18383 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 18384 MFI->setHasCopyImplyingStackAdjustment(true); 18385 // Don't do anything here, we will expand these intrinsics out later 18386 // during ExpandISelPseudos in EmitInstrWithCustomInserter. 18387 return SDValue(); 18388 } 18389 return SDValue(); 18390 } 18391 18392 SDLoc dl(Op); 18393 switch(IntrData->Type) { 18394 default: llvm_unreachable("Unknown Intrinsic Type"); 18395 case RDSEED: 18396 case RDRAND: { 18397 // Emit the node with the right value type. 18398 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 18399 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 18400 18401 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. 18402 // Otherwise return the value from Rand, which is always 0, casted to i32. 18403 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 18404 DAG.getConstant(1, dl, Op->getValueType(1)), 18405 DAG.getConstant(X86::COND_B, dl, MVT::i32), 18406 SDValue(Result.getNode(), 1) }; 18407 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 18408 DAG.getVTList(Op->getValueType(1), MVT::Glue), 18409 Ops); 18410 18411 // Return { result, isValid, chain }. 
18412 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 18413 SDValue(Result.getNode(), 2)); 18414 } 18415 case GATHER: { 18416 //gather(v1, mask, index, base, scale); 18417 SDValue Chain = Op.getOperand(0); 18418 SDValue Src = Op.getOperand(2); 18419 SDValue Base = Op.getOperand(3); 18420 SDValue Index = Op.getOperand(4); 18421 SDValue Mask = Op.getOperand(5); 18422 SDValue Scale = Op.getOperand(6); 18423 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, 18424 Chain, Subtarget); 18425 } 18426 case SCATTER: { 18427 //scatter(base, mask, index, v1, scale); 18428 SDValue Chain = Op.getOperand(0); 18429 SDValue Base = Op.getOperand(2); 18430 SDValue Mask = Op.getOperand(3); 18431 SDValue Index = Op.getOperand(4); 18432 SDValue Src = Op.getOperand(5); 18433 SDValue Scale = Op.getOperand(6); 18434 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, 18435 Scale, Chain, Subtarget); 18436 } 18437 case PREFETCH: { 18438 SDValue Hint = Op.getOperand(6); 18439 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); 18440 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); 18441 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); 18442 SDValue Chain = Op.getOperand(0); 18443 SDValue Mask = Op.getOperand(2); 18444 SDValue Index = Op.getOperand(3); 18445 SDValue Base = Op.getOperand(4); 18446 SDValue Scale = Op.getOperand(5); 18447 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, 18448 Subtarget); 18449 } 18450 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). 18451 case RDTSC: { 18452 SmallVector<SDValue, 2> Results; 18453 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, 18454 Results); 18455 return DAG.getMergeValues(Results, dl); 18456 } 18457 // Read Performance Monitoring Counters. 18458 case RDPMC: { 18459 SmallVector<SDValue, 2> Results; 18460 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); 18461 return DAG.getMergeValues(Results, dl); 18462 } 18463 // XTEST intrinsics. 
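  // Background note on the instruction (not taken from this file): XTEST sets
  // ZF = 0 when executed inside an RTM/HLE transactional region and ZF = 1
  // otherwise, so the SETCC(COND_NE) below yields 1 exactly when the code is
  // running transactionally.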
18464 case XTEST: { 18465 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 18466 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 18467 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 18468 DAG.getConstant(X86::COND_NE, dl, MVT::i8), 18469 InTrans); 18470 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); 18471 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), 18472 Ret, SDValue(InTrans.getNode(), 1)); 18473 } 18474 // ADC/ADCX/SBB 18475 case ADX: { 18476 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 18477 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); 18478 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), 18479 DAG.getConstant(-1, dl, MVT::i8)); 18480 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), 18481 Op.getOperand(4), GenCF.getValue(1)); 18482 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), 18483 Op.getOperand(5), MachinePointerInfo(), 18484 false, false, 0); 18485 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 18486 DAG.getConstant(X86::COND_B, dl, MVT::i8), 18487 Res.getValue(1)); 18488 SDValue Results[] = { SetCC, Store }; 18489 return DAG.getMergeValues(Results, dl); 18490 } 18491 case COMPRESS_TO_MEM: { 18492 SDValue Mask = Op.getOperand(4); 18493 SDValue DataToCompress = Op.getOperand(3); 18494 SDValue Addr = Op.getOperand(2); 18495 SDValue Chain = Op.getOperand(0); 18496 MVT VT = DataToCompress.getSimpleValueType(); 18497 18498 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); 18499 assert(MemIntr && "Expected MemIntrinsicSDNode!"); 18500 18501 if (isAllOnesConstant(Mask)) // return just a store 18502 return DAG.getStore(Chain, dl, DataToCompress, Addr, 18503 MemIntr->getMemOperand()); 18504 18505 SDValue Compressed = 18506 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), 18507 Mask, DAG.getUNDEF(VT), Subtarget, DAG); 18508 return DAG.getStore(Chain, dl, Compressed, Addr, 18509 MemIntr->getMemOperand()); 18510 } 18511 case TRUNCATE_TO_MEM_VI8: 18512 case TRUNCATE_TO_MEM_VI16: 18513 case TRUNCATE_TO_MEM_VI32: { 18514 SDValue Mask = Op.getOperand(4); 18515 SDValue DataToTruncate = Op.getOperand(3); 18516 SDValue Addr = Op.getOperand(2); 18517 SDValue Chain = Op.getOperand(0); 18518 18519 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); 18520 assert(MemIntr && "Expected MemIntrinsicSDNode!"); 18521 18522 EVT VT = MemIntr->getMemoryVT(); 18523 18524 if (isAllOnesConstant(Mask)) // return just a truncate store 18525 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT, 18526 MemIntr->getMemOperand()); 18527 18528 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 18529 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); 18530 18531 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT, 18532 MemIntr->getMemOperand(), true); 18533 } 18534 case EXPAND_FROM_MEM: { 18535 SDValue Mask = Op.getOperand(4); 18536 SDValue PassThru = Op.getOperand(3); 18537 SDValue Addr = Op.getOperand(2); 18538 SDValue Chain = Op.getOperand(0); 18539 MVT VT = Op.getSimpleValueType(); 18540 18541 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); 18542 assert(MemIntr && "Expected MemIntrinsicSDNode!"); 18543 18544 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, 18545 MemIntr->getMemOperand()); 18546 18547 if (isAllOnesConstant(Mask)) // return just a load 18548 return DataToExpand; 18549 
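    // Otherwise apply the expand operation to the loaded vector under the
    // mask, falling back to PassThru for masked-off elements, and return it
    // together with the chain.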
18550 SDValue Results[] = { 18551 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), 18552 Mask, PassThru, Subtarget, DAG), Chain}; 18553 return DAG.getMergeValues(Results, dl); 18554 } 18555 } 18556 } 18557 18558 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 18559 SelectionDAG &DAG) const { 18560 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 18561 MFI->setReturnAddressIsTaken(true); 18562 18563 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 18564 return SDValue(); 18565 18566 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 18567 SDLoc dl(Op); 18568 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 18569 18570 if (Depth > 0) { 18571 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 18572 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 18573 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); 18574 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 18575 DAG.getNode(ISD::ADD, dl, PtrVT, 18576 FrameAddr, Offset), 18577 MachinePointerInfo(), false, false, false, 0); 18578 } 18579 18580 // Just load the return address. 18581 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 18582 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 18583 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 18584 } 18585 18586 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 18587 MachineFunction &MF = DAG.getMachineFunction(); 18588 MachineFrameInfo *MFI = MF.getFrameInfo(); 18589 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 18590 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 18591 EVT VT = Op.getValueType(); 18592 18593 MFI->setFrameAddressIsTaken(true); 18594 18595 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { 18596 // Depth > 0 makes no sense on targets which use Windows unwind codes. It 18597 // is not possible to crawl up the stack without looking at the unwind codes 18598 // simultaneously. 18599 int FrameAddrIndex = FuncInfo->getFAIndex(); 18600 if (!FrameAddrIndex) { 18601 // Set up a frame object for the return address. 18602 unsigned SlotSize = RegInfo->getSlotSize(); 18603 FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( 18604 SlotSize, /*Offset=*/0, /*IsImmutable=*/false); 18605 FuncInfo->setFAIndex(FrameAddrIndex); 18606 } 18607 return DAG.getFrameIndex(FrameAddrIndex, VT); 18608 } 18609 18610 unsigned FrameReg = 18611 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); 18612 SDLoc dl(Op); // FIXME probably not meaningful 18613 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 18614 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 18615 (FrameReg == X86::EBP && VT == MVT::i32)) && 18616 "Invalid Frame Register!"); 18617 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 18618 while (Depth--) 18619 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 18620 MachinePointerInfo(), 18621 false, false, false, 0); 18622 return FrameAddr; 18623 } 18624 18625 // FIXME? Maybe this could be a TableGen attribute on some registers and 18626 // this table could be generated automatically from RegInfo. 
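// Typical callers are the llvm.read_register / llvm.write_register intrinsics
// (an illustrative note, not an exhaustive list); only esp/rsp/ebp/rbp are
// recognized below, and ebp/rbp are rejected unless the function keeps a
// frame pointer.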
18627 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, 18628 SelectionDAG &DAG) const { 18629 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); 18630 const MachineFunction &MF = DAG.getMachineFunction(); 18631 18632 unsigned Reg = StringSwitch<unsigned>(RegName) 18633 .Case("esp", X86::ESP) 18634 .Case("rsp", X86::RSP) 18635 .Case("ebp", X86::EBP) 18636 .Case("rbp", X86::RBP) 18637 .Default(0); 18638 18639 if (Reg == X86::EBP || Reg == X86::RBP) { 18640 if (!TFI.hasFP(MF)) 18641 report_fatal_error("register " + StringRef(RegName) + 18642 " is allocatable: function has no frame pointer"); 18643 #ifndef NDEBUG 18644 else { 18645 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 18646 unsigned FrameReg = 18647 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); 18648 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && 18649 "Invalid Frame Register!"); 18650 } 18651 #endif 18652 } 18653 18654 if (Reg) 18655 return Reg; 18656 18657 report_fatal_error("Invalid register name global variable"); 18658 } 18659 18660 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 18661 SelectionDAG &DAG) const { 18662 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 18663 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); 18664 } 18665 18666 unsigned X86TargetLowering::getExceptionPointerRegister( 18667 const Constant *PersonalityFn) const { 18668 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) 18669 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; 18670 18671 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; 18672 } 18673 18674 unsigned X86TargetLowering::getExceptionSelectorRegister( 18675 const Constant *PersonalityFn) const { 18676 // Funclet personalities don't use selectors (the runtime does the selection). 18677 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); 18678 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; 18679 } 18680 18681 bool X86TargetLowering::needsFixedCatchObjects() const { 18682 return Subtarget.isTargetWin64(); 18683 } 18684 18685 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 18686 SDValue Chain = Op.getOperand(0); 18687 SDValue Offset = Op.getOperand(1); 18688 SDValue Handler = Op.getOperand(2); 18689 SDLoc dl (Op); 18690 18691 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 18692 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 18693 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 18694 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 18695 (FrameReg == X86::EBP && PtrVT == MVT::i32)) && 18696 "Invalid Frame Register!"); 18697 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); 18698 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; 18699 18700 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, 18701 DAG.getIntPtrConstant(RegInfo->getSlotSize(), 18702 dl)); 18703 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); 18704 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 18705 false, false, 0); 18706 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 18707 18708 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, 18709 DAG.getRegister(StoreAddrReg, PtrVT)); 18710 } 18711 18712 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 18713 SelectionDAG &DAG) const { 18714 SDLoc DL(Op); 18715 // If the subtarget is not 64bit, we may need the global base reg 18716 // after isel expand pseudo, i.e., after CGBR pass ran. 18717 // Therefore, ask for the GlobalBaseReg now, so that the pass 18718 // inserts the code for us in case we need it. 18719 // Otherwise, we will end up in a situation where we will 18720 // reference a virtual register that is not defined! 18721 if (!Subtarget.is64Bit()) { 18722 const X86InstrInfo *TII = Subtarget.getInstrInfo(); 18723 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); 18724 } 18725 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 18726 DAG.getVTList(MVT::i32, MVT::Other), 18727 Op.getOperand(0), Op.getOperand(1)); 18728 } 18729 18730 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 18731 SelectionDAG &DAG) const { 18732 SDLoc DL(Op); 18733 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 18734 Op.getOperand(0), Op.getOperand(1)); 18735 } 18736 18737 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 18738 SelectionDAG &DAG) const { 18739 SDLoc DL(Op); 18740 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, 18741 Op.getOperand(0)); 18742 } 18743 18744 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 18745 return Op.getOperand(0); 18746 } 18747 18748 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 18749 SelectionDAG &DAG) const { 18750 SDValue Root = Op.getOperand(0); 18751 SDValue Trmp = Op.getOperand(1); // trampoline 18752 SDValue FPtr = Op.getOperand(2); // nested function 18753 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 18754 SDLoc dl (Op); 18755 18756 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 18757 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 18758 18759 if (Subtarget.is64Bit()) { 18760 SDValue OutChains[6]; 18761 18762 // Large code-model. 18763 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 18764 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 18765 18766 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 18767 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 18768 18769 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 18770 18771 // Load the pointer to the nested function into R11. 18772 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 18773 SDValue Addr = Trmp; 18774 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), 18775 Addr, MachinePointerInfo(TrmpAddr), 18776 false, false, 0); 18777 18778 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 18779 DAG.getConstant(2, dl, MVT::i64)); 18780 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 18781 MachinePointerInfo(TrmpAddr, 2), 18782 false, false, 2); 18783 18784 // Load the 'nest' parameter value into R10. 
18785 // R10 is specified in X86CallingConv.td 18786 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 18787 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 18788 DAG.getConstant(10, dl, MVT::i64)); 18789 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), 18790 Addr, MachinePointerInfo(TrmpAddr, 10), 18791 false, false, 0); 18792 18793 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 18794 DAG.getConstant(12, dl, MVT::i64)); 18795 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 18796 MachinePointerInfo(TrmpAddr, 12), 18797 false, false, 2); 18798 18799 // Jump to the nested function. 18800 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 18801 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 18802 DAG.getConstant(20, dl, MVT::i64)); 18803 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), 18804 Addr, MachinePointerInfo(TrmpAddr, 20), 18805 false, false, 0); 18806 18807 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 18808 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 18809 DAG.getConstant(22, dl, MVT::i64)); 18810 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), 18811 Addr, MachinePointerInfo(TrmpAddr, 22), 18812 false, false, 0); 18813 18814 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 18815 } else { 18816 const Function *Func = 18817 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 18818 CallingConv::ID CC = Func->getCallingConv(); 18819 unsigned NestReg; 18820 18821 switch (CC) { 18822 default: 18823 llvm_unreachable("Unsupported calling convention"); 18824 case CallingConv::C: 18825 case CallingConv::X86_StdCall: { 18826 // Pass 'nest' parameter in ECX. 18827 // Must be kept in sync with X86CallingConv.td 18828 NestReg = X86::ECX; 18829 18830 // Check that ECX wasn't needed by an 'inreg' parameter. 18831 FunctionType *FTy = Func->getFunctionType(); 18832 const AttributeSet &Attrs = Func->getAttributes(); 18833 18834 if (!Attrs.isEmpty() && !Func->isVarArg()) { 18835 unsigned InRegCount = 0; 18836 unsigned Idx = 1; 18837 18838 for (FunctionType::param_iterator I = FTy->param_begin(), 18839 E = FTy->param_end(); I != E; ++I, ++Idx) 18840 if (Attrs.hasAttribute(Idx, Attribute::InReg)) { 18841 auto &DL = DAG.getDataLayout(); 18842 // FIXME: should only count parameters that are lowered to integers. 18843 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; 18844 } 18845 18846 if (InRegCount > 2) { 18847 report_fatal_error("Nest register in use - reduce number of inreg" 18848 " parameters!"); 18849 } 18850 } 18851 break; 18852 } 18853 case CallingConv::X86_FastCall: 18854 case CallingConv::X86_ThisCall: 18855 case CallingConv::Fast: 18856 // Pass 'nest' parameter in EAX. 18857 // Must be kept in sync with X86CallingConv.td 18858 NestReg = X86::EAX; 18859 break; 18860 } 18861 18862 SDValue OutChains[4]; 18863 SDValue Addr, Disp; 18864 18865 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 18866 DAG.getConstant(10, dl, MVT::i32)); 18867 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 18868 18869 // This is storing the opcode for MOV32ri. 18870 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 
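    // (For reference, a sketch of the 10-byte 32-bit trampoline emitted below:
    //    +0: 0xB8+r  imm32   movl $Nest, %NestReg
    //    +5: 0xE9    rel32   jmp  FPtr
    //  where rel32 is the Disp computed above, i.e. FPtr - (Trmp + 10), taken
    //  relative to the byte following the trampoline.)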
18871 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 18872 OutChains[0] = DAG.getStore(Root, dl, 18873 DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), 18874 Trmp, MachinePointerInfo(TrmpAddr), 18875 false, false, 0); 18876 18877 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 18878 DAG.getConstant(1, dl, MVT::i32)); 18879 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 18880 MachinePointerInfo(TrmpAddr, 1), 18881 false, false, 1); 18882 18883 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 18884 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 18885 DAG.getConstant(5, dl, MVT::i32)); 18886 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), 18887 Addr, MachinePointerInfo(TrmpAddr, 5), 18888 false, false, 1); 18889 18890 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 18891 DAG.getConstant(6, dl, MVT::i32)); 18892 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 18893 MachinePointerInfo(TrmpAddr, 6), 18894 false, false, 1); 18895 18896 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 18897 } 18898 } 18899 18900 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 18901 SelectionDAG &DAG) const { 18902 /* 18903 The rounding mode is in bits 11:10 of FPSR, and has the following 18904 settings: 18905 00 Round to nearest 18906 01 Round to -inf 18907 10 Round to +inf 18908 11 Round to 0 18909 18910 FLT_ROUNDS, on the other hand, expects the following: 18911 -1 Undefined 18912 0 Round to 0 18913 1 Round to nearest 18914 2 Round to +inf 18915 3 Round to -inf 18916 18917 To perform the conversion, we do: 18918 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 18919 */ 18920 18921 MachineFunction &MF = DAG.getMachineFunction(); 18922 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); 18923 unsigned StackAlignment = TFI.getStackAlignment(); 18924 MVT VT = Op.getSimpleValueType(); 18925 SDLoc DL(Op); 18926 18927 // Save FP Control Word to stack slot 18928 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 18929 SDValue StackSlot = 18930 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); 18931 18932 MachineMemOperand *MMO = 18933 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), 18934 MachineMemOperand::MOStore, 2, 2); 18935 18936 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 18937 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 18938 DAG.getVTList(MVT::Other), 18939 Ops, MVT::i16, MMO); 18940 18941 // Load FP Control Word from stack slot 18942 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 18943 MachinePointerInfo(), false, false, false, 0); 18944 18945 // Transform as necessary 18946 SDValue CWD1 = 18947 DAG.getNode(ISD::SRL, DL, MVT::i16, 18948 DAG.getNode(ISD::AND, DL, MVT::i16, 18949 CWD, DAG.getConstant(0x800, DL, MVT::i16)), 18950 DAG.getConstant(11, DL, MVT::i8)); 18951 SDValue CWD2 = 18952 DAG.getNode(ISD::SRL, DL, MVT::i16, 18953 DAG.getNode(ISD::AND, DL, MVT::i16, 18954 CWD, DAG.getConstant(0x400, DL, MVT::i16)), 18955 DAG.getConstant(9, DL, MVT::i8)); 18956 18957 SDValue RetVal = 18958 DAG.getNode(ISD::AND, DL, MVT::i16, 18959 DAG.getNode(ISD::ADD, DL, MVT::i16, 18960 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 18961 DAG.getConstant(1, DL, MVT::i16)), 18962 DAG.getConstant(3, DL, MVT::i16)); 18963 18964 return DAG.getNode((VT.getSizeInBits() < 16 ? 18965 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 18966 } 18967 18968 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. 
18969 //
18970 // 1. i32/i64 128/256-bit vector (native support requires VLX) are extended
18971 // to a 512-bit vector.
18972 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
18973 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
18974 // split the vector, perform the operation on its Lo and Hi parts and
18975 // concatenate the results.
18976 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
18977 assert(Op.getOpcode() == ISD::CTLZ);
18978 SDLoc dl(Op);
18979 MVT VT = Op.getSimpleValueType();
18980 MVT EltVT = VT.getVectorElementType();
18981 unsigned NumElems = VT.getVectorNumElements();
18982
18983 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
18984 // Extend to a 512-bit vector.
18985 assert((VT.is256BitVector() || VT.is128BitVector()) &&
18986 "Unsupported value type for operation");
18987
18988 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
18989 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
18990 DAG.getUNDEF(NewVT),
18991 Op.getOperand(0),
18992 DAG.getIntPtrConstant(0, dl));
18993 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
18994
18995 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
18996 DAG.getIntPtrConstant(0, dl));
18997 }
18998
18999 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
19000 "Unsupported element type");
19001
19002 if (16 < NumElems) {
19003 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
19004 SDValue Lo, Hi;
19005 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
19006 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
19007
19008 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
19009 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
19010
19011 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
19012 }
19013
19014 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
19015
19016 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
19017 "Unsupported value type for operation");
19018
19019 // Use the natively supported vector instruction vplzcntd.
19020 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
19021 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
19022 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
19023 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
19024
19025 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
19026 }
19027
19028 // Lower CTLZ using a PSHUFB lookup table implementation.
19029 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
19030 const X86Subtarget &Subtarget,
19031 SelectionDAG &DAG) {
19032 MVT VT = Op.getSimpleValueType();
19033 int NumElts = VT.getVectorNumElements();
19034 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
19035 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
19036
19037 // Per-nibble leading zero PSHUFB lookup table.
19038 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
19039 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
19040 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
19041 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
19042
19043 SmallVector<SDValue, 64> LUTVec;
19044 for (int i = 0; i < NumBytes; ++i)
19045 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19046 SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
19047
19048 // Begin by bitcasting the input to byte vector, then split those bytes
19049 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
19050 // If the hi input nibble is zero then we add both results together, otherwise 19051 // we just take the hi result (by masking the lo result to zero before the 19052 // add). 19053 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); 19054 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL); 19055 19056 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT); 19057 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); 19058 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); 19059 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); 19060 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); 19061 19062 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); 19063 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); 19064 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); 19065 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); 19066 19067 // Merge result back from vXi8 back to VT, working on the lo/hi halves 19068 // of the current vector width in the same way we did for the nibbles. 19069 // If the upper half of the input element is zero then add the halves' 19070 // leading zero counts together, otherwise just use the upper half's. 19071 // Double the width of the result until we are at target width. 19072 while (CurrVT != VT) { 19073 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); 19074 int CurrNumElts = CurrVT.getVectorNumElements(); 19075 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); 19076 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); 19077 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); 19078 19079 // Check if the upper half of the input element is zero. 19080 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), 19081 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); 19082 HiZ = DAG.getBitcast(NextVT, HiZ); 19083 19084 // Move the upper/lower halves to the lower bits as we'll be extending to 19085 // NextVT. Mask the lower result to zero if HiZ is true and add the results 19086 // together. 19087 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); 19088 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); 19089 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); 19090 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); 19091 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); 19092 CurrVT = NextVT; 19093 } 19094 19095 return Res; 19096 } 19097 19098 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, 19099 const X86Subtarget &Subtarget, 19100 SelectionDAG &DAG) { 19101 MVT VT = Op.getSimpleValueType(); 19102 SDValue Op0 = Op.getOperand(0); 19103 19104 if (Subtarget.hasAVX512()) 19105 return LowerVectorCTLZ_AVX512(Op, DAG); 19106 19107 // Decompose 256-bit ops into smaller 128-bit ops. 19108 if (VT.is256BitVector() && !Subtarget.hasInt256()) { 19109 unsigned NumElems = VT.getVectorNumElements(); 19110 19111 // Extract each 128-bit vector, perform ctlz and concat the result. 
19112 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL); 19113 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL); 19114 19115 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, 19116 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS), 19117 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS)); 19118 } 19119 19120 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); 19121 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); 19122 } 19123 19124 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, 19125 SelectionDAG &DAG) { 19126 MVT VT = Op.getSimpleValueType(); 19127 MVT OpVT = VT; 19128 unsigned NumBits = VT.getSizeInBits(); 19129 SDLoc dl(Op); 19130 unsigned Opc = Op.getOpcode(); 19131 19132 if (VT.isVector()) 19133 return LowerVectorCTLZ(Op, dl, Subtarget, DAG); 19134 19135 Op = Op.getOperand(0); 19136 if (VT == MVT::i8) { 19137 // Zero extend to i32 since there is not an i8 bsr. 19138 OpVT = MVT::i32; 19139 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 19140 } 19141 19142 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 19143 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 19144 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 19145 19146 if (Opc == ISD::CTLZ) { 19147 // If src is zero (i.e. bsr sets ZF), returns NumBits. 19148 SDValue Ops[] = { 19149 Op, 19150 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), 19151 DAG.getConstant(X86::COND_E, dl, MVT::i8), 19152 Op.getValue(1) 19153 }; 19154 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); 19155 } 19156 19157 // Finally xor with NumBits-1. 19158 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, 19159 DAG.getConstant(NumBits - 1, dl, OpVT)); 19160 19161 if (VT == MVT::i8) 19162 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 19163 return Op; 19164 } 19165 19166 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 19167 MVT VT = Op.getSimpleValueType(); 19168 unsigned NumBits = VT.getScalarSizeInBits(); 19169 SDLoc dl(Op); 19170 19171 if (VT.isVector()) { 19172 SDValue N0 = Op.getOperand(0); 19173 SDValue Zero = DAG.getConstant(0, dl, VT); 19174 19175 // lsb(x) = (x & -x) 19176 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, 19177 DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); 19178 19179 // cttz_undef(x) = (width - 1) - ctlz(lsb) 19180 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { 19181 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); 19182 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, 19183 DAG.getNode(ISD::CTLZ, dl, VT, LSB)); 19184 } 19185 19186 // cttz(x) = ctpop(lsb - 1) 19187 SDValue One = DAG.getConstant(1, dl, VT); 19188 return DAG.getNode(ISD::CTPOP, dl, VT, 19189 DAG.getNode(ISD::SUB, dl, VT, LSB, One)); 19190 } 19191 19192 assert(Op.getOpcode() == ISD::CTTZ && 19193 "Only scalar CTTZ requires custom lowering"); 19194 19195 // Issue a bsf (scan bits forward) which also sets EFLAGS. 19196 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 19197 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); 19198 19199 // If src is zero (i.e. bsf sets ZF), returns NumBits. 19200 SDValue Ops[] = { 19201 Op, 19202 DAG.getConstant(NumBits, dl, VT), 19203 DAG.getConstant(X86::COND_E, dl, MVT::i8), 19204 Op.getValue(1) 19205 }; 19206 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); 19207 } 19208 19209 /// Break a 256-bit integer operation into two new 128-bit ones and then 19210 /// concatenate the result back. 
19211 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 19212 MVT VT = Op.getSimpleValueType(); 19213 19214 assert(VT.is256BitVector() && VT.isInteger() && 19215 "Unsupported value type for operation"); 19216 19217 unsigned NumElems = VT.getVectorNumElements(); 19218 SDLoc dl(Op); 19219 19220 // Extract the LHS vectors 19221 SDValue LHS = Op.getOperand(0); 19222 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); 19223 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); 19224 19225 // Extract the RHS vectors 19226 SDValue RHS = Op.getOperand(1); 19227 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); 19228 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); 19229 19230 MVT EltVT = VT.getVectorElementType(); 19231 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 19232 19233 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 19234 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 19235 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 19236 } 19237 19238 /// Break a 512-bit integer operation into two new 256-bit ones and then 19239 /// concatenate the result back. 19240 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { 19241 MVT VT = Op.getSimpleValueType(); 19242 19243 assert(VT.is512BitVector() && VT.isInteger() && 19244 "Unsupported value type for operation"); 19245 19246 unsigned NumElems = VT.getVectorNumElements(); 19247 SDLoc dl(Op); 19248 19249 // Extract the LHS vectors 19250 SDValue LHS = Op.getOperand(0); 19251 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); 19252 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); 19253 19254 // Extract the RHS vectors 19255 SDValue RHS = Op.getOperand(1); 19256 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); 19257 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); 19258 19259 MVT EltVT = VT.getVectorElementType(); 19260 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 19261 19262 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 19263 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 19264 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 19265 } 19266 19267 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 19268 if (Op.getValueType() == MVT::i1) 19269 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), 19270 Op.getOperand(0), Op.getOperand(1)); 19271 assert(Op.getSimpleValueType().is256BitVector() && 19272 Op.getSimpleValueType().isInteger() && 19273 "Only handle AVX 256-bit vector integer operation"); 19274 return Lower256IntArith(Op, DAG); 19275 } 19276 19277 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 19278 if (Op.getValueType() == MVT::i1) 19279 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), 19280 Op.getOperand(0), Op.getOperand(1)); 19281 assert(Op.getSimpleValueType().is256BitVector() && 19282 Op.getSimpleValueType().isInteger() && 19283 "Only handle AVX 256-bit vector integer operation"); 19284 return Lower256IntArith(Op, DAG); 19285 } 19286 19287 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { 19288 assert(Op.getSimpleValueType().is256BitVector() && 19289 Op.getSimpleValueType().isInteger() && 19290 "Only handle AVX 256-bit vector integer operation"); 19291 return Lower256IntArith(Op, DAG); 19292 } 19293 19294 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, 19295 SelectionDAG &DAG) { 19296 SDLoc dl(Op); 19297 MVT VT = Op.getSimpleValueType(); 19298 19299 if (VT == MVT::i1) 19300 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); 19301 19302 // 
Decompose 256-bit ops into smaller 128-bit ops. 19303 if (VT.is256BitVector() && !Subtarget.hasInt256()) 19304 return Lower256IntArith(Op, DAG); 19305 19306 SDValue A = Op.getOperand(0); 19307 SDValue B = Op.getOperand(1); 19308 19309 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 19310 // vector pairs, multiply and truncate. 19311 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { 19312 if (Subtarget.hasInt256()) { 19313 // For 512-bit vectors, split into 256-bit vectors to allow the 19314 // sign-extension to occur. 19315 if (VT == MVT::v64i8) 19316 return Lower512IntArith(Op, DAG); 19317 19318 // For 256-bit vectors, split into 128-bit vectors to allow the 19319 // sign-extension to occur. We don't need this on AVX512BW as we can 19320 // safely sign-extend to v32i16. 19321 if (VT == MVT::v32i8 && !Subtarget.hasBWI()) 19322 return Lower256IntArith(Op, DAG); 19323 19324 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); 19325 return DAG.getNode( 19326 ISD::TRUNCATE, dl, VT, 19327 DAG.getNode(ISD::MUL, dl, ExVT, 19328 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), 19329 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); 19330 } 19331 19332 assert(VT == MVT::v16i8 && 19333 "Pre-AVX2 support only supports v16i8 multiplication"); 19334 MVT ExVT = MVT::v8i16; 19335 19336 // Extract the lo parts and sign extend to i16 19337 SDValue ALo, BLo; 19338 if (Subtarget.hasSSE41()) { 19339 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); 19340 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); 19341 } else { 19342 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, 19343 -1, 4, -1, 5, -1, 6, -1, 7}; 19344 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 19345 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 19346 ALo = DAG.getBitcast(ExVT, ALo); 19347 BLo = DAG.getBitcast(ExVT, BLo); 19348 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); 19349 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); 19350 } 19351 19352 // Extract the hi parts and sign extend to i16 19353 SDValue AHi, BHi; 19354 if (Subtarget.hasSSE41()) { 19355 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, 19356 -1, -1, -1, -1, -1, -1, -1, -1}; 19357 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 19358 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 19359 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); 19360 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); 19361 } else { 19362 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, 19363 -1, 12, -1, 13, -1, 14, -1, 15}; 19364 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 19365 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 19366 AHi = DAG.getBitcast(ExVT, AHi); 19367 BHi = DAG.getBitcast(ExVT, BHi); 19368 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); 19369 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); 19370 } 19371 19372 // Multiply, mask the lower 8bits of the lo/hi results and pack 19373 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); 19374 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); 19375 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); 19376 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); 19377 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); 19378 } 19379 19380 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 
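  // Illustrative sketch: for A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>,
  //   Evens = PMULUDQ(A, B)                     = <a0*b0, a2*b2>   (v2i64)
  //   Odds  = PMULUDQ(<a1,_,a3,_>, <b1,_,b3,_>) = <a1*b1, a3*b3>   (v2i64)
  // After bitcasting both back to v4i32, the <0,4,2,6> shuffle picks the low
  // 32 bits of each 64-bit product, giving the desired v4i32 multiply result.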
19381 if (VT == MVT::v4i32) { 19382 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && 19383 "Should not custom lower when pmuldq is available!"); 19384 19385 // Extract the odd parts. 19386 static const int UnpackMask[] = { 1, -1, 3, -1 }; 19387 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 19388 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 19389 19390 // Multiply the even parts. 19391 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 19392 // Now multiply odd parts. 19393 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 19394 19395 Evens = DAG.getBitcast(VT, Evens); 19396 Odds = DAG.getBitcast(VT, Odds); 19397 19398 // Merge the two vectors back together with a shuffle. This expands into 2 19399 // shuffles. 19400 static const int ShufMask[] = { 0, 4, 2, 6 }; 19401 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 19402 } 19403 19404 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 19405 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 19406 19407 // Ahi = psrlqi(a, 32); 19408 // Bhi = psrlqi(b, 32); 19409 // 19410 // AloBlo = pmuludq(a, b); 19411 // AloBhi = pmuludq(a, Bhi); 19412 // AhiBlo = pmuludq(Ahi, b); 19413 19414 // AloBhi = psllqi(AloBhi, 32); 19415 // AhiBlo = psllqi(AhiBlo, 32); 19416 // return AloBlo + AloBhi + AhiBlo; 19417 19418 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 19419 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 19420 19421 SDValue AhiBlo = Ahi; 19422 SDValue AloBhi = Bhi; 19423 // Bit cast to 32-bit vectors for MULUDQ 19424 MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 19425 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; 19426 A = DAG.getBitcast(MulVT, A); 19427 B = DAG.getBitcast(MulVT, B); 19428 Ahi = DAG.getBitcast(MulVT, Ahi); 19429 Bhi = DAG.getBitcast(MulVT, Bhi); 19430 19431 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 19432 // After shifting right const values the result may be all-zero. 19433 if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { 19434 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 19435 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); 19436 } 19437 if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { 19438 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 19439 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); 19440 } 19441 19442 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 19443 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 19444 } 19445 19446 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, 19447 SelectionDAG &DAG) { 19448 SDLoc dl(Op); 19449 MVT VT = Op.getSimpleValueType(); 19450 19451 // Decompose 256-bit ops into smaller 128-bit ops. 19452 if (VT.is256BitVector() && !Subtarget.hasInt256()) 19453 return Lower256IntArith(Op, DAG); 19454 19455 // Only i8 vectors should need custom lowering after this. 19456 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) && 19457 "Unsupported vector type"); 19458 19459 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, 19460 // logical shift down the upper half and pack back to i8. 19461 SDValue A = Op.getOperand(0); 19462 SDValue B = Op.getOperand(1); 19463 19464 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack 19465 // and then ashr/lshr the upper bits down to the lower bits before multiply. 
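  // (Sketch of the pre-SSE41 idea: the unpack shuffles below place each byte
  // in the high byte of an i16 lane, so an i16 shift right by 8 (arithmetic
  // for MULHS, logical for MULHU) yields the same sign/zero extended values
  // that the SSE41 VSEXT/VZEXT path produces directly.)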
19466 unsigned Opcode = Op.getOpcode(); 19467 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA); 19468 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT); 19469 19470 // AVX2 implementations - extend xmm subvectors to ymm. 19471 if (Subtarget.hasInt256()) { 19472 SDValue Lo = DAG.getIntPtrConstant(0, dl); 19473 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); 19474 19475 if (VT == MVT::v32i8) { 19476 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo); 19477 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo); 19478 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi); 19479 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi); 19480 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo); 19481 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo); 19482 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi); 19483 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi); 19484 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16, 19485 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo), 19486 DAG.getConstant(8, dl, MVT::v16i16)); 19487 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16, 19488 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi), 19489 DAG.getConstant(8, dl, MVT::v16i16)); 19490 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before 19491 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane. 19492 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7, 19493 16, 17, 18, 19, 20, 21, 22, 23}; 19494 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15, 19495 24, 25, 26, 27, 28, 29, 30, 31}; 19496 return DAG.getNode(X86ISD::PACKUS, dl, VT, 19497 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask), 19498 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); 19499 } 19500 19501 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A); 19502 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B); 19503 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); 19504 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, 19505 DAG.getConstant(8, dl, MVT::v16i16)); 19506 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo); 19507 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi); 19508 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); 19509 } 19510 19511 assert(VT == MVT::v16i8 && 19512 "Pre-AVX2 support only supports v16i8 multiplication"); 19513 MVT ExVT = MVT::v8i16; 19514 19515 // Extract the lo parts and zero/sign extend to i16. 19516 SDValue ALo, BLo; 19517 if (Subtarget.hasSSE41()) { 19518 ALo = DAG.getNode(ExSSE41, dl, ExVT, A); 19519 BLo = DAG.getNode(ExSSE41, dl, ExVT, B); 19520 } else { 19521 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, 19522 -1, 4, -1, 5, -1, 6, -1, 7}; 19523 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 19524 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 19525 ALo = DAG.getBitcast(ExVT, ALo); 19526 BLo = DAG.getBitcast(ExVT, BLo); 19527 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); 19528 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); 19529 } 19530 19531 // Extract the hi parts and zero/sign extend to i16. 
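  // (For the SSE41 path the shuffle below first moves bytes 8..15 into the
  // low half of the vector so the same VZEXT/VSEXT node can widen them.)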
19532 SDValue AHi, BHi; 19533 if (Subtarget.hasSSE41()) { 19534 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, 19535 -1, -1, -1, -1, -1, -1, -1, -1}; 19536 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 19537 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 19538 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); 19539 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); 19540 } else { 19541 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, 19542 -1, 12, -1, 13, -1, 14, -1, 15}; 19543 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); 19544 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); 19545 AHi = DAG.getBitcast(ExVT, AHi); 19546 BHi = DAG.getBitcast(ExVT, BHi); 19547 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); 19548 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); 19549 } 19550 19551 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and 19552 // pack back to v16i8. 19553 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); 19554 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); 19555 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT)); 19556 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT)); 19557 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); 19558 } 19559 19560 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { 19561 assert(Subtarget.isTargetWin64() && "Unexpected target"); 19562 EVT VT = Op.getValueType(); 19563 assert(VT.isInteger() && VT.getSizeInBits() == 128 && 19564 "Unexpected return type for lowering"); 19565 19566 RTLIB::Libcall LC; 19567 bool isSigned; 19568 switch (Op->getOpcode()) { 19569 default: llvm_unreachable("Unexpected request for libcall!"); 19570 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; 19571 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; 19572 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; 19573 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; 19574 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; 19575 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; 19576 } 19577 19578 SDLoc dl(Op); 19579 SDValue InChain = DAG.getEntryNode(); 19580 19581 TargetLowering::ArgListTy Args; 19582 TargetLowering::ArgListEntry Entry; 19583 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 19584 EVT ArgVT = Op->getOperand(i).getValueType(); 19585 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && 19586 "Unexpected argument type for lowering"); 19587 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); 19588 Entry.Node = StackPtr; 19589 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), 19590 false, false, 16); 19591 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 19592 Entry.Ty = PointerType::get(ArgTy,0); 19593 Entry.isSExt = false; 19594 Entry.isZExt = false; 19595 Args.push_back(Entry); 19596 } 19597 19598 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 19599 getPointerTy(DAG.getDataLayout())); 19600 19601 TargetLowering::CallLoweringInfo CLI(DAG); 19602 CLI.setDebugLoc(dl).setChain(InChain) 19603 .setCallee(getLibcallCallingConv(LC), 19604 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), 19605 Callee, std::move(Args)) 19606 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 19607 19608 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 19609 return DAG.getBitcast(VT, CallInfo.first); 
19610 }
19611
19612 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
19613 SelectionDAG &DAG) {
19614 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
19615 MVT VT = Op0.getSimpleValueType();
19616 SDLoc dl(Op);
19617
19618 // Decompose 256-bit ops into smaller 128-bit ops.
19619 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19620 unsigned Opcode = Op.getOpcode();
19621 unsigned NumElems = VT.getVectorNumElements();
19622 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
19623 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
19624 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
19625 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
19626 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
19627 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
19628 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
19629 SDValue Ops[] = {
19630 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
19631 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
19632 };
19633 return DAG.getMergeValues(Ops, dl);
19634 }
19635
19636 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
19637 (VT == MVT::v8i32 && Subtarget.hasInt256()));
19638
19639 // PMULxD operations multiply each even value (starting at 0) of LHS with
19640 // the related value of RHS and produce a widened result.
19641 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19642 // => <2 x i64> <ae|cg>
19643 //
19644 // In other words, to have all the results, we need to perform two PMULxD:
19645 // 1. one with the even values.
19646 // 2. one with the odd values.
19647 // To achieve #2, we need to place the odd values at an even position.
19648 //
19649 // Place the odd values at an even position (basically, shift all values 1
19650 // step to the left):
19651 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
19652 // <a|b|c|d> => <b|undef|d|undef>
19653 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
19654 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19655 // <e|f|g|h> => <f|undef|h|undef>
19656 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
19657 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19658
19659 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
19660 // ints.
19661 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
19662 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
19663 unsigned Opcode =
19664 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
19665 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19666 // => <2 x i64> <ae|cg>
19667 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
19668 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
19669 // => <2 x i64> <bf|dh>
19670 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
19671
19672 // Shuffle it back into the right order.
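  // E.g. (sketch) for v4i32: viewed as v4i32, Mul1 = <lo(ae),hi(ae),lo(cg),hi(cg)>
  // and Mul2 = <lo(bf),hi(bf),lo(dh),hi(dh)>, so the <1,5,3,7> shuffle collects
  // the high halves <hi(ae),hi(bf),hi(cg),hi(dh)> and the <0,4,2,6> shuffle
  // collects the low halves in element order.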
19673 SDValue Highs, Lows; 19674 if (VT == MVT::v8i32) { 19675 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; 19676 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); 19677 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; 19678 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); 19679 } else { 19680 const int HighMask[] = {1, 5, 3, 7}; 19681 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); 19682 const int LowMask[] = {0, 4, 2, 6}; 19683 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); 19684 } 19685 19686 // If we have a signed multiply but no PMULDQ fix up the high parts of a 19687 // unsigned multiply. 19688 if (IsSigned && !Subtarget.hasSSE41()) { 19689 SDValue ShAmt = DAG.getConstant( 19690 31, dl, 19691 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); 19692 SDValue T1 = DAG.getNode(ISD::AND, dl, VT, 19693 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); 19694 SDValue T2 = DAG.getNode(ISD::AND, dl, VT, 19695 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); 19696 19697 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); 19698 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); 19699 } 19700 19701 // The first result of MUL_LOHI is actually the low value, followed by the 19702 // high value. 19703 SDValue Ops[] = {Lows, Highs}; 19704 return DAG.getMergeValues(Ops, dl); 19705 } 19706 19707 // Return true if the required (according to Opcode) shift-imm form is natively 19708 // supported by the Subtarget 19709 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, 19710 unsigned Opcode) { 19711 if (VT.getScalarSizeInBits() < 16) 19712 return false; 19713 19714 if (VT.is512BitVector() && 19715 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) 19716 return true; 19717 19718 bool LShift = VT.is128BitVector() || 19719 (VT.is256BitVector() && Subtarget.hasInt256()); 19720 19721 bool AShift = LShift && (Subtarget.hasVLX() || 19722 (VT != MVT::v2i64 && VT != MVT::v4i64)); 19723 return (Opcode == ISD::SRA) ? AShift : LShift; 19724 } 19725 19726 // The shift amount is a variable, but it is the same for all vector lanes. 19727 // These instructions are defined together with shift-immediate. 19728 static 19729 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, 19730 unsigned Opcode) { 19731 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); 19732 } 19733 19734 // Return true if the required (according to Opcode) variable-shift form is 19735 // natively supported by the Subtarget 19736 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, 19737 unsigned Opcode) { 19738 19739 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) 19740 return false; 19741 19742 // vXi16 supported only on AVX-512, BWI 19743 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) 19744 return false; 19745 19746 if (VT.is512BitVector() || Subtarget.hasVLX()) 19747 return true; 19748 19749 bool LShift = VT.is128BitVector() || VT.is256BitVector(); 19750 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; 19751 return (Opcode == ISD::SRA) ? AShift : LShift; 19752 } 19753 19754 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 19755 const X86Subtarget &Subtarget) { 19756 MVT VT = Op.getSimpleValueType(); 19757 SDLoc dl(Op); 19758 SDValue R = Op.getOperand(0); 19759 SDValue Amt = Op.getOperand(1); 19760 19761 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : 19762 (Op.getOpcode() == ISD::SRL) ? 
X86ISD::VSRLI : X86ISD::VSRAI; 19763 19764 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { 19765 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); 19766 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); 19767 SDValue Ex = DAG.getBitcast(ExVT, R); 19768 19769 if (ShiftAmt >= 32) { 19770 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. 19771 SDValue Upper = 19772 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); 19773 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 19774 ShiftAmt - 32, DAG); 19775 if (VT == MVT::v2i64) 19776 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); 19777 if (VT == MVT::v4i64) 19778 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, 19779 {9, 1, 11, 3, 13, 5, 15, 7}); 19780 } else { 19781 // SRA upper i32, SHL whole i64 and select lower i32. 19782 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 19783 ShiftAmt, DAG); 19784 SDValue Lower = 19785 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); 19786 Lower = DAG.getBitcast(ExVT, Lower); 19787 if (VT == MVT::v2i64) 19788 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); 19789 if (VT == MVT::v4i64) 19790 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, 19791 {8, 1, 10, 3, 12, 5, 14, 7}); 19792 } 19793 return DAG.getBitcast(VT, Ex); 19794 }; 19795 19796 // Optimize shl/srl/sra with constant shift amount. 19797 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { 19798 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { 19799 uint64_t ShiftAmt = ShiftConst->getZExtValue(); 19800 19801 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) 19802 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); 19803 19804 // i64 SRA needs to be performed as partial shifts. 19805 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) && 19806 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP()) 19807 return ArithmeticShiftRight64(ShiftAmt); 19808 19809 if (VT == MVT::v16i8 || 19810 (Subtarget.hasInt256() && VT == MVT::v32i8) || 19811 VT == MVT::v64i8) { 19812 unsigned NumElts = VT.getVectorNumElements(); 19813 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); 19814 19815 // Simple i8 add case 19816 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) 19817 return DAG.getNode(ISD::ADD, dl, VT, R, R); 19818 19819 // ashr(R, 7) === cmp_slt(R, 0) 19820 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { 19821 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 19822 if (VT.is512BitVector()) { 19823 assert(VT == MVT::v64i8 && "Unexpected element type!"); 19824 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R); 19825 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); 19826 } 19827 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 19828 } 19829 19830 // XOP can shift v16i8 directly instead of as shift v8i16 + mask. 19831 if (VT == MVT::v16i8 && Subtarget.hasXOP()) 19832 return SDValue(); 19833 19834 if (Op.getOpcode() == ISD::SHL) { 19835 // Make a large shift. 19836 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, 19837 R, ShiftAmt, DAG); 19838 SHL = DAG.getBitcast(VT, SHL); 19839 // Zero out the rightmost bits. 19840 return DAG.getNode(ISD::AND, dl, VT, SHL, 19841 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); 19842 } 19843 if (Op.getOpcode() == ISD::SRL) { 19844 // Make a large shift. 
19845 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, 19846 R, ShiftAmt, DAG); 19847 SRL = DAG.getBitcast(VT, SRL); 19848 // Zero out the leftmost bits. 19849 return DAG.getNode(ISD::AND, dl, VT, SRL, 19850 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); 19851 } 19852 if (Op.getOpcode() == ISD::SRA) { 19853 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) 19854 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 19855 19856 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); 19857 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 19858 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 19859 return Res; 19860 } 19861 llvm_unreachable("Unknown shift opcode."); 19862 } 19863 } 19864 } 19865 19866 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 19867 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && 19868 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) { 19869 19870 // Peek through any splat that was introduced for i64 shift vectorization. 19871 int SplatIndex = -1; 19872 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) 19873 if (SVN->isSplat()) { 19874 SplatIndex = SVN->getSplatIndex(); 19875 Amt = Amt.getOperand(0); 19876 assert(SplatIndex < (int)VT.getVectorNumElements() && 19877 "Splat shuffle referencing second operand"); 19878 } 19879 19880 if (Amt.getOpcode() != ISD::BITCAST || 19881 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) 19882 return SDValue(); 19883 19884 Amt = Amt.getOperand(0); 19885 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 19886 VT.getVectorNumElements(); 19887 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 19888 uint64_t ShiftAmt = 0; 19889 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); 19890 for (unsigned i = 0; i != Ratio; ++i) { 19891 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); 19892 if (!C) 19893 return SDValue(); 19894 // 6 == Log2(64) 19895 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 19896 } 19897 19898 // Check remaining shift amounts (if not a splat). 19899 if (SplatIndex < 0) { 19900 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 19901 uint64_t ShAmt = 0; 19902 for (unsigned j = 0; j != Ratio; ++j) { 19903 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); 19904 if (!C) 19905 return SDValue(); 19906 // 6 == Log2(64) 19907 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); 19908 } 19909 if (ShAmt != ShiftAmt) 19910 return SDValue(); 19911 } 19912 } 19913 19914 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) 19915 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); 19916 19917 if (Op.getOpcode() == ISD::SRA) 19918 return ArithmeticShiftRight64(ShiftAmt); 19919 } 19920 19921 return SDValue(); 19922 } 19923 19924 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, 19925 const X86Subtarget &Subtarget) { 19926 MVT VT = Op.getSimpleValueType(); 19927 SDLoc dl(Op); 19928 SDValue R = Op.getOperand(0); 19929 SDValue Amt = Op.getOperand(1); 19930 19931 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : 19932 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; 19933 19934 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : 19935 (Op.getOpcode() == ISD::SRL) ? 
X86ISD::VSRL : X86ISD::VSRA; 19936 19937 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { 19938 SDValue BaseShAmt; 19939 MVT EltVT = VT.getVectorElementType(); 19940 19941 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { 19942 // Check if this build_vector node is doing a splat. 19943 // If so, then set BaseShAmt equal to the splat value. 19944 BaseShAmt = BV->getSplatValue(); 19945 if (BaseShAmt && BaseShAmt.isUndef()) 19946 BaseShAmt = SDValue(); 19947 } else { 19948 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) 19949 Amt = Amt.getOperand(0); 19950 19951 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); 19952 if (SVN && SVN->isSplat()) { 19953 unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); 19954 SDValue InVec = Amt.getOperand(0); 19955 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 19956 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && 19957 "Unexpected shuffle index found!"); 19958 BaseShAmt = InVec.getOperand(SplatIdx); 19959 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 19960 if (ConstantSDNode *C = 19961 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 19962 if (C->getZExtValue() == SplatIdx) 19963 BaseShAmt = InVec.getOperand(1); 19964 } 19965 } 19966 19967 if (!BaseShAmt) 19968 // Avoid introducing an extract element from a shuffle. 19969 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, 19970 DAG.getIntPtrConstant(SplatIdx, dl)); 19971 } 19972 } 19973 19974 if (BaseShAmt.getNode()) { 19975 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); 19976 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) 19977 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); 19978 else if (EltVT.bitsLT(MVT::i32)) 19979 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); 19980 19981 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); 19982 } 19983 } 19984 19985 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 
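  // On 32-bit targets the i64 shift amount has been legalized into a bitcast
  // of a v4i32 build_vector; look through the bitcast and check that every
  // i64 lane is built from the same pair of i32s, in which case the original
  // operand can still be used as the single shift amount for VSHL/VSRL/VSRA.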
19986 if (!Subtarget.is64Bit() && VT == MVT::v2i64 && 19987 Amt.getOpcode() == ISD::BITCAST && 19988 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 19989 Amt = Amt.getOperand(0); 19990 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 19991 VT.getVectorNumElements(); 19992 std::vector<SDValue> Vals(Ratio); 19993 for (unsigned i = 0; i != Ratio; ++i) 19994 Vals[i] = Amt.getOperand(i); 19995 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 19996 for (unsigned j = 0; j != Ratio; ++j) 19997 if (Vals[j] != Amt.getOperand(i + j)) 19998 return SDValue(); 19999 } 20000 20001 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) 20002 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); 20003 } 20004 return SDValue(); 20005 } 20006 20007 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, 20008 SelectionDAG &DAG) { 20009 MVT VT = Op.getSimpleValueType(); 20010 SDLoc dl(Op); 20011 SDValue R = Op.getOperand(0); 20012 SDValue Amt = Op.getOperand(1); 20013 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); 20014 20015 assert(VT.isVector() && "Custom lowering only for vector shifts!"); 20016 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); 20017 20018 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) 20019 return V; 20020 20021 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) 20022 return V; 20023 20024 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) 20025 return Op; 20026 20027 // XOP has 128-bit variable logical/arithmetic shifts. 20028 // +ve/-ve Amt = shift left/right. 20029 if (Subtarget.hasXOP() && 20030 (VT == MVT::v2i64 || VT == MVT::v4i32 || 20031 VT == MVT::v8i16 || VT == MVT::v16i8)) { 20032 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { 20033 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); 20034 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); 20035 } 20036 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) 20037 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); 20038 if (Op.getOpcode() == ISD::SRA) 20039 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); 20040 } 20041 20042 // 2i64 vector logical shifts can efficiently avoid scalarization - do the 20043 // shifts per-lane and then shuffle the partial results back together. 20044 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { 20045 // Splat the shift amounts so the scalar shifts above will catch it. 20046 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); 20047 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); 20048 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); 20049 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); 20050 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); 20051 } 20052 20053 // i64 vector arithmetic shift can be emulated with the transform: 20054 // M = lshr(SIGN_BIT, Amt) 20055 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) 20056 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && 20057 Op.getOpcode() == ISD::SRA) { 20058 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); 20059 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); 20060 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 20061 R = DAG.getNode(ISD::XOR, dl, VT, R, M); 20062 R = DAG.getNode(ISD::SUB, dl, VT, R, M); 20063 return R; 20064 } 20065 20066 // If possible, lower this packed shift into a vector multiply instead of 20067 // expanding it into a sequence of scalar shifts. 
20068 // Do this only if the vector shift count is a constant build_vector. 20069 if (ConstantAmt && Op.getOpcode() == ISD::SHL && 20070 (VT == MVT::v8i16 || VT == MVT::v4i32 || 20071 (Subtarget.hasInt256() && VT == MVT::v16i16))) { 20072 SmallVector<SDValue, 8> Elts; 20073 MVT SVT = VT.getVectorElementType(); 20074 unsigned SVTBits = SVT.getSizeInBits(); 20075 APInt One(SVTBits, 1); 20076 unsigned NumElems = VT.getVectorNumElements(); 20077 20078 for (unsigned i=0; i !=NumElems; ++i) { 20079 SDValue Op = Amt->getOperand(i); 20080 if (Op->isUndef()) { 20081 Elts.push_back(Op); 20082 continue; 20083 } 20084 20085 ConstantSDNode *ND = cast<ConstantSDNode>(Op); 20086 APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); 20087 uint64_t ShAmt = C.getZExtValue(); 20088 if (ShAmt >= SVTBits) { 20089 Elts.push_back(DAG.getUNDEF(SVT)); 20090 continue; 20091 } 20092 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); 20093 } 20094 SDValue BV = DAG.getBuildVector(VT, dl, Elts); 20095 return DAG.getNode(ISD::MUL, dl, VT, R, BV); 20096 } 20097 20098 // Lower SHL with variable shift amount. 20099 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 20100 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); 20101 20102 Op = DAG.getNode(ISD::ADD, dl, VT, Op, 20103 DAG.getConstant(0x3f800000U, dl, VT)); 20104 Op = DAG.getBitcast(MVT::v4f32, Op); 20105 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 20106 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 20107 } 20108 20109 // If possible, lower this shift as a sequence of two shifts by 20110 // constant plus a MOVSS/MOVSD instead of scalarizing it. 20111 // Example: 20112 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) 20113 // 20114 // Could be rewritten as: 20115 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) 20116 // 20117 // The advantage is that the two shifts from the example would be 20118 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing 20119 // the vector shift into four scalar shifts plus four pairs of vector 20120 // insert/extract. 20121 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) { 20122 unsigned TargetOpcode = X86ISD::MOVSS; 20123 bool CanBeSimplified; 20124 // The splat value for the first packed shift (the 'X' from the example). 20125 SDValue Amt1 = Amt->getOperand(0); 20126 // The splat value for the second packed shift (the 'Y' from the example). 20127 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); 20128 20129 // See if it is possible to replace this node with a sequence of 20130 // two shifts followed by a MOVSS/MOVSD 20131 if (VT == MVT::v4i32) { 20132 // Check if it is legal to use a MOVSS. 20133 CanBeSimplified = Amt2 == Amt->getOperand(2) && 20134 Amt2 == Amt->getOperand(3); 20135 if (!CanBeSimplified) { 20136 // Otherwise, check if we can still simplify this node using a MOVSD. 20137 CanBeSimplified = Amt1 == Amt->getOperand(1) && 20138 Amt->getOperand(2) == Amt->getOperand(3); 20139 TargetOpcode = X86ISD::MOVSD; 20140 Amt2 = Amt->getOperand(2); 20141 } 20142 } else { 20143 // Do similar checks for the case where the machine value type 20144 // is MVT::v8i16. 
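      // (I.e. MOVSS requires the pattern <X,X,Y,Y,Y,Y,Y,Y>, so that the v4i32
      // view is <XX,YY,YY,YY>; MOVSD requires <X,X,X,X,Y,Y,Y,Y>, so that the
      // v2i64 view is <XXXX,YYYY>.)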
20145 CanBeSimplified = Amt1 == Amt->getOperand(1); 20146 for (unsigned i=3; i != 8 && CanBeSimplified; ++i) 20147 CanBeSimplified = Amt2 == Amt->getOperand(i); 20148 20149 if (!CanBeSimplified) { 20150 TargetOpcode = X86ISD::MOVSD; 20151 CanBeSimplified = true; 20152 Amt2 = Amt->getOperand(4); 20153 for (unsigned i=0; i != 4 && CanBeSimplified; ++i) 20154 CanBeSimplified = Amt1 == Amt->getOperand(i); 20155 for (unsigned j=4; j != 8 && CanBeSimplified; ++j) 20156 CanBeSimplified = Amt2 == Amt->getOperand(j); 20157 } 20158 } 20159 20160 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && 20161 isa<ConstantSDNode>(Amt2)) { 20162 // Replace this node with two shifts followed by a MOVSS/MOVSD. 20163 MVT CastVT = MVT::v4i32; 20164 SDValue Splat1 = 20165 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); 20166 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); 20167 SDValue Splat2 = 20168 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); 20169 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); 20170 if (TargetOpcode == X86ISD::MOVSD) 20171 CastVT = MVT::v2i64; 20172 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); 20173 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); 20174 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, 20175 BitCast1, DAG); 20176 return DAG.getBitcast(VT, Result); 20177 } 20178 } 20179 20180 // v4i32 Non Uniform Shifts. 20181 // If the shift amount is constant we can shift each lane using the SSE2 20182 // immediate shifts, else we need to zero-extend each lane to the lower i64 20183 // and shift using the SSE2 variable shifts. 20184 // The separate results can then be blended together. 20185 if (VT == MVT::v4i32) { 20186 unsigned Opc = Op.getOpcode(); 20187 SDValue Amt0, Amt1, Amt2, Amt3; 20188 if (ConstantAmt) { 20189 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); 20190 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); 20191 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); 20192 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); 20193 } else { 20194 // ISD::SHL is handled above but we include it here for completeness. 20195 switch (Opc) { 20196 default: 20197 llvm_unreachable("Unknown target vector shift node"); 20198 case ISD::SHL: 20199 Opc = X86ISD::VSHL; 20200 break; 20201 case ISD::SRL: 20202 Opc = X86ISD::VSRL; 20203 break; 20204 case ISD::SRA: 20205 Opc = X86ISD::VSRA; 20206 break; 20207 } 20208 // The SSE2 shifts use the lower i64 as the same shift amount for 20209 // all lanes and the upper i64 is ignored. These shuffle masks 20210 // optimally zero-extend each lanes on SSE2/SSE41/AVX targets. 
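    // E.g. the <0,4,-1,-1> shuffle with the zero vector Z places Amt[0] in
    // element 0 and zero in element 1, so the low i64 that the VSHL/VSRL/VSRA
    // nodes read as the shift count is exactly zext(Amt[0]).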
20211 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); 20212 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); 20213 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); 20214 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); 20215 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); 20216 } 20217 20218 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); 20219 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); 20220 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); 20221 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); 20222 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); 20223 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); 20224 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); 20225 } 20226 20227 if (VT == MVT::v16i8 || 20228 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { 20229 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); 20230 unsigned ShiftOpcode = Op->getOpcode(); 20231 20232 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { 20233 // On SSE41 targets we make use of the fact that VSELECT lowers 20234 // to PBLENDVB which selects bytes based just on the sign bit. 20235 if (Subtarget.hasSSE41()) { 20236 V0 = DAG.getBitcast(VT, V0); 20237 V1 = DAG.getBitcast(VT, V1); 20238 Sel = DAG.getBitcast(VT, Sel); 20239 return DAG.getBitcast(SelVT, 20240 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); 20241 } 20242 // On pre-SSE41 targets we test for the sign bit by comparing to 20243 // zero - a negative value will set all bits of the lanes to true 20244 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. 20245 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); 20246 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); 20247 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); 20248 }; 20249 20250 // Turn 'a' into a mask suitable for VSELECT: a = a << 5; 20251 // We can safely do this using i16 shifts as we're only interested in 20252 // the 3 lower bits of each byte. 20253 Amt = DAG.getBitcast(ExtVT, Amt); 20254 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); 20255 Amt = DAG.getBitcast(VT, Amt); 20256 20257 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { 20258 // r = VSELECT(r, shift(r, 4), a); 20259 SDValue M = 20260 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); 20261 R = SignBitSelect(VT, Amt, M, R); 20262 20263 // a += a 20264 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 20265 20266 // r = VSELECT(r, shift(r, 2), a); 20267 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); 20268 R = SignBitSelect(VT, Amt, M, R); 20269 20270 // a += a 20271 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 20272 20273 // return VSELECT(r, shift(r, 1), a); 20274 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); 20275 R = SignBitSelect(VT, Amt, M, R); 20276 return R; 20277 } 20278 20279 if (Op->getOpcode() == ISD::SRA) { 20280 // For SRA we need to unpack each byte to the higher byte of a i16 vector 20281 // so we can correctly sign extend. We don't care what happens to the 20282 // lower byte. 
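      // (For illustration: a byte 0x80 placed in the high half of an i16 is
      // 0x8000; an arithmetic i16 shift right by 4 gives 0xF800, whose high
      // byte 0xF8 is the correct arithmetic shift of the original byte. The
      // final SRL by 8 and PACKUS below move that byte back into place.)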
20283 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); 20284 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); 20285 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); 20286 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); 20287 ALo = DAG.getBitcast(ExtVT, ALo); 20288 AHi = DAG.getBitcast(ExtVT, AHi); 20289 RLo = DAG.getBitcast(ExtVT, RLo); 20290 RHi = DAG.getBitcast(ExtVT, RHi); 20291 20292 // r = VSELECT(r, shift(r, 4), a); 20293 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, 20294 DAG.getConstant(4, dl, ExtVT)); 20295 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, 20296 DAG.getConstant(4, dl, ExtVT)); 20297 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); 20298 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); 20299 20300 // a += a 20301 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); 20302 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); 20303 20304 // r = VSELECT(r, shift(r, 2), a); 20305 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, 20306 DAG.getConstant(2, dl, ExtVT)); 20307 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, 20308 DAG.getConstant(2, dl, ExtVT)); 20309 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); 20310 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); 20311 20312 // a += a 20313 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); 20314 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); 20315 20316 // r = VSELECT(r, shift(r, 1), a); 20317 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, 20318 DAG.getConstant(1, dl, ExtVT)); 20319 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, 20320 DAG.getConstant(1, dl, ExtVT)); 20321 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); 20322 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); 20323 20324 // Logical shift the result back to the lower byte, leaving a zero upper 20325 // byte 20326 // meaning that we can safely pack with PACKUSWB. 20327 RLo = 20328 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); 20329 RHi = 20330 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); 20331 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); 20332 } 20333 } 20334 20335 // It's worth extending once and using the v8i32 shifts for 16-bit types, but 20336 // the extra overheads to get from v16i8 to v8i32 make the existing SSE 20337 // solution better. 20338 if (Subtarget.hasInt256() && VT == MVT::v8i16) { 20339 MVT ExtVT = MVT::v8i32; 20340 unsigned ExtOpc = 20341 Op.getOpcode() == ISD::SRA ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 20342 R = DAG.getNode(ExtOpc, dl, ExtVT, R); 20343 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); 20344 return DAG.getNode(ISD::TRUNCATE, dl, VT, 20345 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); 20346 } 20347 20348 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { 20349 MVT ExtVT = MVT::v8i32; 20350 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); 20351 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); 20352 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); 20353 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R); 20354 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R); 20355 ALo = DAG.getBitcast(ExtVT, ALo); 20356 AHi = DAG.getBitcast(ExtVT, AHi); 20357 RLo = DAG.getBitcast(ExtVT, RLo); 20358 RHi = DAG.getBitcast(ExtVT, RHi); 20359 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); 20360 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); 20361 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); 20362 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); 20363 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); 20364 } 20365 20366 if (VT == MVT::v8i16) { 20367 unsigned ShiftOpcode = Op->getOpcode(); 20368 20369 // If we have a constant shift amount, the non-SSE41 path is best as 20370 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW. 20371 bool UseSSE41 = Subtarget.hasSSE41() && 20372 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); 20373 20374 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { 20375 // On SSE41 targets we make use of the fact that VSELECT lowers 20376 // to PBLENDVB which selects bytes based just on the sign bit. 20377 if (UseSSE41) { 20378 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); 20379 V0 = DAG.getBitcast(ExtVT, V0); 20380 V1 = DAG.getBitcast(ExtVT, V1); 20381 Sel = DAG.getBitcast(ExtVT, Sel); 20382 return DAG.getBitcast( 20383 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); 20384 } 20385 // On pre-SSE41 targets we splat the sign bit - a negative value will 20386 // set all bits of the lanes to true and VSELECT uses that in 20387 // its OR(AND(V0,C),AND(V1,~C)) lowering. 20388 SDValue C = 20389 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); 20390 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); 20391 }; 20392 20393 // Turn 'a' into a mask suitable for VSELECT: a = a << 12; 20394 if (UseSSE41) { 20395 // On SSE41 targets we need to replicate the shift mask in both 20396 // bytes for PBLENDVB. 
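      // (Illustrative detail: only bits 0..3 of each i16 shift amount matter,
      // so the shift left by 12 puts bit 3 into bit 15, the sign bit of the
      // high byte, while the extra shift left by 4 also places it into bit 7,
      // the sign bit of the low byte, so PBLENDVB selects both bytes of a lane
      // consistently.)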
20397 Amt = DAG.getNode( 20398 ISD::OR, dl, VT, 20399 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), 20400 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); 20401 } else { 20402 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); 20403 } 20404 20405 // r = VSELECT(r, shift(r, 8), a); 20406 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); 20407 R = SignBitSelect(Amt, M, R); 20408 20409 // a += a 20410 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 20411 20412 // r = VSELECT(r, shift(r, 4), a); 20413 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); 20414 R = SignBitSelect(Amt, M, R); 20415 20416 // a += a 20417 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 20418 20419 // r = VSELECT(r, shift(r, 2), a); 20420 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); 20421 R = SignBitSelect(Amt, M, R); 20422 20423 // a += a 20424 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); 20425 20426 // return VSELECT(r, shift(r, 1), a); 20427 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); 20428 R = SignBitSelect(Amt, M, R); 20429 return R; 20430 } 20431 20432 // Decompose 256-bit shifts into smaller 128-bit shifts. 20433 if (VT.is256BitVector()) 20434 return Lower256IntArith(Op, DAG); 20435 20436 return SDValue(); 20437 } 20438 20439 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, 20440 SelectionDAG &DAG) { 20441 MVT VT = Op.getSimpleValueType(); 20442 SDLoc DL(Op); 20443 SDValue R = Op.getOperand(0); 20444 SDValue Amt = Op.getOperand(1); 20445 20446 assert(VT.isVector() && "Custom lowering only for vector rotates!"); 20447 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); 20448 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); 20449 20450 // XOP has 128-bit vector variable + immediate rotates. 20451 // +ve/-ve Amt = rotate left/right. 20452 20453 // Split 256-bit integers. 20454 if (VT.is256BitVector()) 20455 return Lower256IntArith(Op, DAG); 20456 20457 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); 20458 20459 // Attempt to rotate by immediate. 20460 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { 20461 if (auto *RotateConst = BVAmt->getConstantSplatNode()) { 20462 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); 20463 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); 20464 return DAG.getNode(X86ISD::VPROTI, DL, VT, R, 20465 DAG.getConstant(RotateAmt, DL, MVT::i8)); 20466 } 20467 } 20468 20469 // Use general rotate by variable (per-element). 20470 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); 20471 } 20472 20473 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 20474 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 20475 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 20476 // looks for this combo and may remove the "setcc" instruction if the "setcc" 20477 // has only one use. 20478 SDNode *N = Op.getNode(); 20479 SDValue LHS = N->getOperand(0); 20480 SDValue RHS = N->getOperand(1); 20481 unsigned BaseOp = 0; 20482 unsigned Cond = 0; 20483 SDLoc DL(Op); 20484 switch (Op.getOpcode()) { 20485 default: llvm_unreachable("Unknown ovf instruction!"); 20486 case ISD::SADDO: 20487 // A subtract of one will be selected as a INC. Note that INC doesn't 20488 // set CF, so we can't do this for UADDO. 
20489 if (isOneConstant(RHS)) { 20490 BaseOp = X86ISD::INC; 20491 Cond = X86::COND_O; 20492 break; 20493 } 20494 BaseOp = X86ISD::ADD; 20495 Cond = X86::COND_O; 20496 break; 20497 case ISD::UADDO: 20498 BaseOp = X86ISD::ADD; 20499 Cond = X86::COND_B; 20500 break; 20501 case ISD::SSUBO: 20502 // A subtract of one will be selected as a DEC. Note that DEC doesn't 20503 // set CF, so we can't do this for USUBO. 20504 if (isOneConstant(RHS)) { 20505 BaseOp = X86ISD::DEC; 20506 Cond = X86::COND_O; 20507 break; 20508 } 20509 BaseOp = X86ISD::SUB; 20510 Cond = X86::COND_O; 20511 break; 20512 case ISD::USUBO: 20513 BaseOp = X86ISD::SUB; 20514 Cond = X86::COND_B; 20515 break; 20516 case ISD::SMULO: 20517 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; 20518 Cond = X86::COND_O; 20519 break; 20520 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 20521 if (N->getValueType(0) == MVT::i8) { 20522 BaseOp = X86ISD::UMUL8; 20523 Cond = X86::COND_O; 20524 break; 20525 } 20526 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 20527 MVT::i32); 20528 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 20529 20530 SDValue SetCC = 20531 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 20532 DAG.getConstant(X86::COND_O, DL, MVT::i32), 20533 SDValue(Sum.getNode(), 2)); 20534 20535 if (N->getValueType(1) == MVT::i1) { 20536 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, 20537 DAG.getValueType(MVT::i1)); 20538 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); 20539 } 20540 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 20541 } 20542 } 20543 20544 // Also sets EFLAGS. 20545 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 20546 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 20547 20548 SDValue SetCC = 20549 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 20550 DAG.getConstant(Cond, DL, MVT::i32), 20551 SDValue(Sum.getNode(), 1)); 20552 20553 if (N->getValueType(1) == MVT::i1) { 20554 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, 20555 DAG.getValueType(MVT::i1)); 20556 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); 20557 } 20558 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 20559 } 20560 20561 /// Returns true if the operand type is exactly twice the native width, and 20562 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. 20563 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations 20564 /// (otherwise we leave them alone to become __sync_fetch_and_... calls). 20565 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { 20566 unsigned OpWidth = MemType->getPrimitiveSizeInBits(); 20567 20568 if (OpWidth == 64) 20569 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b 20570 else if (OpWidth == 128) 20571 return Subtarget.hasCmpxchg16b(); 20572 else 20573 return false; 20574 } 20575 20576 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 20577 return needsCmpXchgNb(SI->getValueOperand()->getType()); 20578 } 20579 20580 // Note: this turns large loads into lock cmpxchg8b/16b. 20581 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. 20582 TargetLowering::AtomicExpansionKind 20583 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 20584 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); 20585 return needsCmpXchgNb(PTy->getElementType()) ? 
AtomicExpansionKind::CmpXChg 20586 : AtomicExpansionKind::None; 20587 } 20588 20589 TargetLowering::AtomicExpansionKind 20590 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 20591 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; 20592 Type *MemType = AI->getType(); 20593 20594 // If the operand is too big, we must see if cmpxchg8/16b is available 20595 // and default to library calls otherwise. 20596 if (MemType->getPrimitiveSizeInBits() > NativeWidth) { 20597 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg 20598 : AtomicExpansionKind::None; 20599 } 20600 20601 AtomicRMWInst::BinOp Op = AI->getOperation(); 20602 switch (Op) { 20603 default: 20604 llvm_unreachable("Unknown atomic operation"); 20605 case AtomicRMWInst::Xchg: 20606 case AtomicRMWInst::Add: 20607 case AtomicRMWInst::Sub: 20608 // It's better to use xadd, xsub or xchg for these in all cases. 20609 return AtomicExpansionKind::None; 20610 case AtomicRMWInst::Or: 20611 case AtomicRMWInst::And: 20612 case AtomicRMWInst::Xor: 20613 // If the atomicrmw's result isn't actually used, we can just add a "lock" 20614 // prefix to a normal instruction for these operations. 20615 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg 20616 : AtomicExpansionKind::None; 20617 case AtomicRMWInst::Nand: 20618 case AtomicRMWInst::Max: 20619 case AtomicRMWInst::Min: 20620 case AtomicRMWInst::UMax: 20621 case AtomicRMWInst::UMin: 20622 // These always require a non-trivial set of data operations on x86. We must 20623 // use a cmpxchg loop. 20624 return AtomicExpansionKind::CmpXChg; 20625 } 20626 } 20627 20628 LoadInst * 20629 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { 20630 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; 20631 Type *MemType = AI->getType(); 20632 // Accesses larger than the native width are turned into cmpxchg/libcalls, so 20633 // there is no benefit in turning such RMWs into loads, and it is actually 20634 // harmful as it introduces a mfence. 20635 if (MemType->getPrimitiveSizeInBits() > NativeWidth) 20636 return nullptr; 20637 20638 auto Builder = IRBuilder<>(AI); 20639 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 20640 auto SynchScope = AI->getSynchScope(); 20641 // We must restrict the ordering to avoid generating loads with Release or 20642 // ReleaseAcquire orderings. 20643 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); 20644 auto Ptr = AI->getPointerOperand(); 20645 20646 // Before the load we need a fence. Here is an example lifted from 20647 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence 20648 // is required: 20649 // Thread 0: 20650 // x.store(1, relaxed); 20651 // r1 = y.fetch_add(0, release); 20652 // Thread 1: 20653 // y.fetch_add(42, acquire); 20654 // r2 = x.load(relaxed); 20655 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is 20656 // lowered to just a load without a fence. A mfence flushes the store buffer, 20657 // making the optimization clearly correct. 20658 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear 20659 // otherwise, we might be able to be more aggressive on relaxed idempotent 20660 // rmw. In practice, they do not look useful, so we don't try to be 20661 // especially clever. 20662 if (SynchScope == SingleThread) 20663 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at 20664 // the IR level, so we must wrap it in an intrinsic. 
20665 return nullptr; 20666 20667 if (!Subtarget.hasMFence()) 20668 // FIXME: it might make sense to use a locked operation here but on a 20669 // different cache-line to prevent cache-line bouncing. In practice it 20670 // is probably a small win, and x86 processors without mfence are rare 20671 // enough that we do not bother. 20672 return nullptr; 20673 20674 Function *MFence = 20675 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); 20676 Builder.CreateCall(MFence, {}); 20677 20678 // Finally we can emit the atomic load. 20679 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, 20680 AI->getType()->getPrimitiveSizeInBits()); 20681 Loaded->setAtomic(Order, SynchScope); 20682 AI->replaceAllUsesWith(Loaded); 20683 AI->eraseFromParent(); 20684 return Loaded; 20685 } 20686 20687 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, 20688 SelectionDAG &DAG) { 20689 SDLoc dl(Op); 20690 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 20691 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 20692 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 20693 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 20694 20695 // The only fence that needs an instruction is a sequentially-consistent 20696 // cross-thread fence. 20697 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && 20698 FenceScope == CrossThread) { 20699 if (Subtarget.hasMFence()) 20700 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 20701 20702 SDValue Chain = Op.getOperand(0); 20703 SDValue Zero = DAG.getConstant(0, dl, MVT::i32); 20704 SDValue Ops[] = { 20705 DAG.getRegister(X86::ESP, MVT::i32), // Base 20706 DAG.getTargetConstant(1, dl, MVT::i8), // Scale 20707 DAG.getRegister(0, MVT::i32), // Index 20708 DAG.getTargetConstant(0, dl, MVT::i32), // Disp 20709 DAG.getRegister(0, MVT::i32), // Segment. 20710 Zero, 20711 Chain 20712 }; 20713 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); 20714 return SDValue(Res, 0); 20715 } 20716 20717 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
20718 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 20719 } 20720 20721 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, 20722 SelectionDAG &DAG) { 20723 MVT T = Op.getSimpleValueType(); 20724 SDLoc DL(Op); 20725 unsigned Reg = 0; 20726 unsigned size = 0; 20727 switch(T.SimpleTy) { 20728 default: llvm_unreachable("Invalid value type!"); 20729 case MVT::i8: Reg = X86::AL; size = 1; break; 20730 case MVT::i16: Reg = X86::AX; size = 2; break; 20731 case MVT::i32: Reg = X86::EAX; size = 4; break; 20732 case MVT::i64: 20733 assert(Subtarget.is64Bit() && "Node not type legal!"); 20734 Reg = X86::RAX; size = 8; 20735 break; 20736 } 20737 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 20738 Op.getOperand(2), SDValue()); 20739 SDValue Ops[] = { cpIn.getValue(0), 20740 Op.getOperand(1), 20741 Op.getOperand(3), 20742 DAG.getTargetConstant(size, DL, MVT::i8), 20743 cpIn.getValue(1) }; 20744 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 20745 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 20746 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 20747 Ops, T, MMO); 20748 20749 SDValue cpOut = 20750 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 20751 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, 20752 MVT::i32, cpOut.getValue(2)); 20753 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), 20754 DAG.getConstant(X86::COND_E, DL, MVT::i8), 20755 EFLAGS); 20756 20757 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); 20758 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); 20759 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); 20760 return SDValue(); 20761 } 20762 20763 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, 20764 SelectionDAG &DAG) { 20765 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 20766 MVT DstVT = Op.getSimpleValueType(); 20767 20768 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || 20769 SrcVT == MVT::i64) { 20770 assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); 20771 if (DstVT != MVT::f64) 20772 // This conversion needs to be expanded. 20773 return SDValue(); 20774 20775 SDValue Op0 = Op->getOperand(0); 20776 SmallVector<SDValue, 16> Elts; 20777 SDLoc dl(Op); 20778 unsigned NumElts; 20779 MVT SVT; 20780 if (SrcVT.isVector()) { 20781 NumElts = SrcVT.getVectorNumElements(); 20782 SVT = SrcVT.getVectorElementType(); 20783 20784 // Widen the vector in input in the case of MVT::v2i32. 20785 // Example: from MVT::v2i32 to MVT::v4i32. 20786 for (unsigned i = 0, e = NumElts; i != e; ++i) 20787 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, 20788 DAG.getIntPtrConstant(i, dl))); 20789 } else { 20790 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && 20791 "Unexpected source type in LowerBITCAST"); 20792 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, 20793 DAG.getIntPtrConstant(0, dl))); 20794 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, 20795 DAG.getIntPtrConstant(1, dl))); 20796 NumElts = 2; 20797 SVT = MVT::i32; 20798 } 20799 // Explicitly mark the extra elements as Undef. 
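    // (For example: a v2i32 <a, b> input becomes the v4i32 build_vector
    // <a, b, undef, undef>; bitcasting that to v2f64 and extracting element 0
    // below yields an f64 whose bits are exactly the original 64-bit value.)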
20800 Elts.append(NumElts, DAG.getUNDEF(SVT)); 20801 20802 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); 20803 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts); 20804 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); 20805 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, 20806 DAG.getIntPtrConstant(0, dl)); 20807 } 20808 20809 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && 20810 Subtarget.hasMMX() && "Unexpected custom BITCAST"); 20811 assert((DstVT == MVT::i64 || 20812 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 20813 "Unexpected custom BITCAST"); 20814 // i64 <=> MMX conversions are Legal. 20815 if (SrcVT==MVT::i64 && DstVT.isVector()) 20816 return Op; 20817 if (DstVT==MVT::i64 && SrcVT.isVector()) 20818 return Op; 20819 // MMX <=> MMX conversions are Legal. 20820 if (SrcVT.isVector() && DstVT.isVector()) 20821 return Op; 20822 // All other conversions need to be expanded. 20823 return SDValue(); 20824 } 20825 20826 /// Compute the horizontal sum of bytes in V for the elements of VT. 20827 /// 20828 /// Requires V to be a byte vector and VT to be an integer vector type with 20829 /// wider elements than V's type. The width of the elements of VT determines 20830 /// how many bytes of V are summed horizontally to produce each element of the 20831 /// result. 20832 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, 20833 const X86Subtarget &Subtarget, 20834 SelectionDAG &DAG) { 20835 SDLoc DL(V); 20836 MVT ByteVecVT = V.getSimpleValueType(); 20837 MVT EltVT = VT.getVectorElementType(); 20838 assert(ByteVecVT.getVectorElementType() == MVT::i8 && 20839 "Expected value to have byte element type."); 20840 assert(EltVT != MVT::i8 && 20841 "Horizontal byte sum only makes sense for wider elements!"); 20842 unsigned VecSize = VT.getSizeInBits(); 20843 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); 20844 20845 // PSADBW instruction horizontally add all bytes and leave the result in i64 20846 // chunks, thus directly computes the pop count for v2i64 and v4i64. 20847 if (EltVT == MVT::i64) { 20848 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); 20849 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); 20850 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); 20851 return DAG.getBitcast(VT, V); 20852 } 20853 20854 if (EltVT == MVT::i32) { 20855 // We unpack the low half and high half into i32s interleaved with zeros so 20856 // that we can use PSADBW to horizontally sum them. The most useful part of 20857 // this is that it lines up the results of two PSADBW instructions to be 20858 // two v2i64 vectors which concatenated are the 4 population counts. We can 20859 // then use PACKUSWB to shrink and concatenate them into a v4i32 again. 20860 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); 20861 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros); 20862 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros); 20863 20864 // Do the horizontal sums into two v2i64s. 20865 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); 20866 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); 20867 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, 20868 DAG.getBitcast(ByteVecVT, Low), Zeros); 20869 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, 20870 DAG.getBitcast(ByteVecVT, High), Zeros); 20871 20872 // Merge them together. 
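    // (Illustration for the v4i32 case: after the PSADBW calls above, Low holds
    // the two i64 counts <c0, c1> and High holds <c2, c3>. Each count is the
    // sum of four byte pop counts, so it fits in a byte, and the PACKUS below
    // produces bytes that read back as the v4i32 <c0, c1, c2, c3>.)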
20873 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); 20874 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, 20875 DAG.getBitcast(ShortVecVT, Low), 20876 DAG.getBitcast(ShortVecVT, High)); 20877 20878 return DAG.getBitcast(VT, V); 20879 } 20880 20881 // The only element type left is i16. 20882 assert(EltVT == MVT::i16 && "Unknown how to handle type"); 20883 20884 // To obtain pop count for each i16 element starting from the pop count for 20885 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s 20886 // right by 8. It is important to shift as i16s as i8 vector shift isn't 20887 // directly supported. 20888 SDValue ShifterV = DAG.getConstant(8, DL, VT); 20889 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); 20890 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), 20891 DAG.getBitcast(ByteVecVT, V)); 20892 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); 20893 } 20894 20895 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, 20896 const X86Subtarget &Subtarget, 20897 SelectionDAG &DAG) { 20898 MVT VT = Op.getSimpleValueType(); 20899 MVT EltVT = VT.getVectorElementType(); 20900 unsigned VecSize = VT.getSizeInBits(); 20901 20902 // Implement a lookup table in register by using an algorithm based on: 20903 // http://wm.ite.pl/articles/sse-popcount.html 20904 // 20905 // The general idea is that every lower byte nibble in the input vector is an 20906 // index into a in-register pre-computed pop count table. We then split up the 20907 // input vector in two new ones: (1) a vector with only the shifted-right 20908 // higher nibbles for each byte and (2) a vector with the lower nibbles (and 20909 // masked out higher ones) for each byte. PSHUB is used separately with both 20910 // to index the in-register table. Next, both are added and the result is a 20911 // i8 vector where each element contains the pop count for input byte. 20912 // 20913 // To obtain the pop count for elements != i8, we follow up with the same 20914 // approach and use additional tricks as described below. 20915 // 20916 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 20917 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 20918 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 20919 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; 20920 20921 int NumByteElts = VecSize / 8; 20922 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); 20923 SDValue In = DAG.getBitcast(ByteVecVT, Op); 20924 SmallVector<SDValue, 64> LUTVec; 20925 for (int i = 0; i < NumByteElts; ++i) 20926 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); 20927 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec); 20928 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT); 20929 20930 // High nibbles 20931 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT); 20932 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); 20933 20934 // Low nibbles 20935 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); 20936 20937 // The input vector is used as the shuffle mask that index elements into the 20938 // LUT. After counting low and high nibbles, add the vector to obtain the 20939 // final pop count per i8 element. 
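  // (Worked example, illustrative only: for the byte 0xD3 = 0b11010011, the
  // high nibble 0xD indexes LUT entry 3 and the low nibble 0x3 indexes LUT
  // entry 2; their sum, 5, is popcount(0xD3).)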
20940 SDValue HighPopCnt = 20941 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); 20942 SDValue LowPopCnt = 20943 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); 20944 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); 20945 20946 if (EltVT == MVT::i8) 20947 return PopCnt; 20948 20949 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); 20950 } 20951 20952 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, 20953 const X86Subtarget &Subtarget, 20954 SelectionDAG &DAG) { 20955 MVT VT = Op.getSimpleValueType(); 20956 assert(VT.is128BitVector() && 20957 "Only 128-bit vector bitmath lowering supported."); 20958 20959 int VecSize = VT.getSizeInBits(); 20960 MVT EltVT = VT.getVectorElementType(); 20961 int Len = EltVT.getSizeInBits(); 20962 20963 // This is the vectorized version of the "best" algorithm from 20964 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 20965 // with a minor tweak to use a series of adds + shifts instead of vector 20966 // multiplications. Implemented for all integer vector types. We only use 20967 // this when we don't have SSSE3 which allows a LUT-based lowering that is 20968 // much faster, even faster than using native popcnt instructions. 20969 20970 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { 20971 MVT VT = V.getSimpleValueType(); 20972 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT); 20973 return DAG.getNode(OpCode, DL, VT, V, ShifterV); 20974 }; 20975 auto GetMask = [&](SDValue V, APInt Mask) { 20976 MVT VT = V.getSimpleValueType(); 20977 SDValue MaskV = DAG.getConstant(Mask, DL, VT); 20978 return DAG.getNode(ISD::AND, DL, VT, V, MaskV); 20979 }; 20980 20981 // We don't want to incur the implicit masks required to SRL vNi8 vectors on 20982 // x86, so set the SRL type to have elements at least i16 wide. This is 20983 // correct because all of our SRLs are followed immediately by a mask anyways 20984 // that handles any bits that sneak into the high bits of the byte elements. 20985 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); 20986 20987 SDValue V = Op; 20988 20989 // v = v - ((v >> 1) & 0x55555555...) 20990 SDValue Srl = 20991 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); 20992 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); 20993 V = DAG.getNode(ISD::SUB, DL, VT, V, And); 20994 20995 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) 20996 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); 20997 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); 20998 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); 20999 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); 21000 21001 // v = (v + (v >> 4)) & 0x0F0F0F0F... 21002 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); 21003 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); 21004 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); 21005 21006 // At this point, V contains the byte-wise population count, and we are 21007 // merely doing a horizontal sum if necessary to get the wider element 21008 // counts. 
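  // (Worked example on a single byte, for illustration: v = 0xB4 = 0b10110100.
  // The first step gives 0b01100100 (per-2-bit counts 1,2,1,0), the second
  // gives 0x31 (per-nibble counts 3 and 1), and the third gives 4, which is
  // popcount(0xB4).)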
21009 if (EltVT == MVT::i8) 21010 return V; 21011 21012 return LowerHorizontalByteSum( 21013 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, 21014 DAG); 21015 } 21016 21017 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, 21018 SelectionDAG &DAG) { 21019 MVT VT = Op.getSimpleValueType(); 21020 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && 21021 "Unknown CTPOP type to handle"); 21022 SDLoc DL(Op.getNode()); 21023 SDValue Op0 = Op.getOperand(0); 21024 21025 if (!Subtarget.hasSSSE3()) { 21026 // We can't use the fast LUT approach, so fall back on vectorized bitmath. 21027 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); 21028 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); 21029 } 21030 21031 if (VT.is256BitVector() && !Subtarget.hasInt256()) { 21032 unsigned NumElems = VT.getVectorNumElements(); 21033 21034 // Extract each 128-bit vector, compute pop count and concat the result. 21035 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL); 21036 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL); 21037 21038 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, 21039 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), 21040 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); 21041 } 21042 21043 if (VT.is512BitVector() && !Subtarget.hasBWI()) { 21044 unsigned NumElems = VT.getVectorNumElements(); 21045 21046 // Extract each 256-bit vector, compute pop count and concat the result. 21047 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL); 21048 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL); 21049 21050 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, 21051 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), 21052 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); 21053 } 21054 21055 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); 21056 } 21057 21058 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, 21059 SelectionDAG &DAG) { 21060 assert(Op.getSimpleValueType().isVector() && 21061 "We only do custom lowering for vector population count."); 21062 return LowerVectorCTPOP(Op, Subtarget, DAG); 21063 } 21064 21065 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { 21066 MVT VT = Op.getSimpleValueType(); 21067 SDValue In = Op.getOperand(0); 21068 SDLoc DL(Op); 21069 21070 // For scalars, its still beneficial to transfer to/from the SIMD unit to 21071 // perform the BITREVERSE. 21072 if (!VT.isVector()) { 21073 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); 21074 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); 21075 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); 21076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, 21077 DAG.getIntPtrConstant(0, DL)); 21078 } 21079 21080 MVT SVT = VT.getVectorElementType(); 21081 int NumElts = VT.getVectorNumElements(); 21082 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; 21083 21084 // Decompose 256-bit ops into smaller 128-bit ops. 
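  // (E.g. a 256-bit bitreverse is split below into two 128-bit halves, each
  // handled by the VPPERM path that follows, and the halves are concatenated
  // back together.)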
21085 if (VT.is256BitVector()) { 21086 SDValue Lo = extract128BitVector(In, 0, DAG, DL); 21087 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL); 21088 21089 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2); 21090 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, 21091 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo), 21092 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi)); 21093 } 21094 21095 assert(VT.is128BitVector() && 21096 "Only 128-bit vector bitreverse lowering supported."); 21097 21098 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we 21099 // perform the BSWAP in the shuffle. 21100 // Its best to shuffle using the second operand as this will implicitly allow 21101 // memory folding for multiple vectors. 21102 SmallVector<SDValue, 16> MaskElts; 21103 for (int i = 0; i != NumElts; ++i) { 21104 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { 21105 int SourceByte = 16 + (i * ScalarSizeInBytes) + j; 21106 int PermuteByte = SourceByte | (2 << 5); 21107 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); 21108 } 21109 } 21110 21111 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); 21112 SDValue Res = DAG.getBitcast(MVT::v16i8, In); 21113 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), 21114 Res, Mask); 21115 return DAG.getBitcast(VT, Res); 21116 } 21117 21118 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, 21119 SelectionDAG &DAG) { 21120 if (Subtarget.hasXOP()) 21121 return LowerBITREVERSE_XOP(Op, DAG); 21122 21123 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); 21124 21125 MVT VT = Op.getSimpleValueType(); 21126 SDValue In = Op.getOperand(0); 21127 SDLoc DL(Op); 21128 21129 unsigned NumElts = VT.getVectorNumElements(); 21130 assert(VT.getScalarType() == MVT::i8 && 21131 "Only byte vector BITREVERSE supported"); 21132 21133 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. 21134 if (VT.is256BitVector() && !Subtarget.hasInt256()) { 21135 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2); 21136 SDValue Lo = extract128BitVector(In, 0, DAG, DL); 21137 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL); 21138 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo); 21139 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi); 21140 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); 21141 } 21142 21143 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into 21144 // two nibbles and a PSHUFB lookup to find the bitreverse of each 21145 // 0-15 value (moved to the other nibble). 
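  // (Worked example, illustrative only: for the byte 0x1E = 0b00011110, the
  // low nibble 0xE maps to LoLUT[0xE] = 0x70 and the high nibble 0x1 maps to
  // HiLUT[0x1] = 0x08; ORing the two gives 0x78 = 0b01111000, the bit-reverse
  // of the input byte.)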
21146 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); 21147 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); 21148 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); 21149 21150 const int LoLUT[16] = { 21151 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, 21152 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, 21153 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, 21154 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; 21155 const int HiLUT[16] = { 21156 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, 21157 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, 21158 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, 21159 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; 21160 21161 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts; 21162 for (unsigned i = 0; i < NumElts; ++i) { 21163 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); 21164 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); 21165 } 21166 21167 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); 21168 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); 21169 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); 21170 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); 21171 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); 21172 } 21173 21174 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) { 21175 unsigned NewOpc = 0; 21176 switch (N->getOpcode()) { 21177 case ISD::ATOMIC_LOAD_ADD: 21178 NewOpc = X86ISD::LADD; 21179 break; 21180 case ISD::ATOMIC_LOAD_SUB: 21181 NewOpc = X86ISD::LSUB; 21182 break; 21183 case ISD::ATOMIC_LOAD_OR: 21184 NewOpc = X86ISD::LOR; 21185 break; 21186 case ISD::ATOMIC_LOAD_XOR: 21187 NewOpc = X86ISD::LXOR; 21188 break; 21189 case ISD::ATOMIC_LOAD_AND: 21190 NewOpc = X86ISD::LAND; 21191 break; 21192 default: 21193 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode"); 21194 } 21195 21196 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); 21197 return DAG.getMemIntrinsicNode( 21198 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), 21199 {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, 21200 /*MemVT=*/N->getSimpleValueType(0), MMO); 21201 } 21202 21203 /// Lower atomic_load_ops into LOCK-prefixed operations. 21204 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, 21205 const X86Subtarget &Subtarget) { 21206 SDValue Chain = N->getOperand(0); 21207 SDValue LHS = N->getOperand(1); 21208 SDValue RHS = N->getOperand(2); 21209 unsigned Opc = N->getOpcode(); 21210 MVT VT = N->getSimpleValueType(0); 21211 SDLoc DL(N); 21212 21213 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op 21214 // can only be lowered when the result is unused. They should have already 21215 // been transformed into a cmpxchg loop in AtomicExpand. 21216 if (N->hasAnyUseOfValue(0)) { 21217 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to 21218 // select LXADD if LOCK_SUB can't be selected. 
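    // (For example, an atomicrmw sub whose old value is used can still become a
    // single LOCK XADD with the negated operand, since subtracting v is the
    // same as adding 0 - v; that negation is materialized just below.)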
21219 if (Opc == ISD::ATOMIC_LOAD_SUB) { 21220 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); 21221 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); 21222 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, 21223 RHS, AN->getMemOperand(), AN->getOrdering(), 21224 AN->getSynchScope()); 21225 } 21226 assert(Opc == ISD::ATOMIC_LOAD_ADD && 21227 "Used AtomicRMW ops other than Add should have been expanded!"); 21228 return N; 21229 } 21230 21231 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG); 21232 // RAUW the chain, but don't worry about the result, as it's unused. 21233 assert(!N->hasAnyUseOfValue(0)); 21234 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); 21235 return SDValue(); 21236 } 21237 21238 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 21239 SDNode *Node = Op.getNode(); 21240 SDLoc dl(Node); 21241 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 21242 21243 // Convert seq_cst store -> xchg 21244 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 21245 // FIXME: On 32-bit, store -> fist or movq would be more efficient 21246 // (The only way to get a 16-byte store is cmpxchg16b) 21247 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 21248 if (cast<AtomicSDNode>(Node)->getOrdering() == 21249 AtomicOrdering::SequentiallyConsistent || 21250 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 21251 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 21252 cast<AtomicSDNode>(Node)->getMemoryVT(), 21253 Node->getOperand(0), 21254 Node->getOperand(1), Node->getOperand(2), 21255 cast<AtomicSDNode>(Node)->getMemOperand(), 21256 cast<AtomicSDNode>(Node)->getOrdering(), 21257 cast<AtomicSDNode>(Node)->getSynchScope()); 21258 return Swap.getValue(1); 21259 } 21260 // Other atomic stores have a simple pattern. 21261 return Op; 21262 } 21263 21264 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 21265 MVT VT = Op.getNode()->getSimpleValueType(0); 21266 21267 // Let legalize expand this if it isn't a legal type yet. 21268 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 21269 return SDValue(); 21270 21271 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 21272 21273 unsigned Opc; 21274 bool ExtraOp = false; 21275 switch (Op.getOpcode()) { 21276 default: llvm_unreachable("Invalid code"); 21277 case ISD::ADDC: Opc = X86ISD::ADD; break; 21278 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 21279 case ISD::SUBC: Opc = X86ISD::SUB; break; 21280 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 21281 } 21282 21283 if (!ExtraOp) 21284 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 21285 Op.getOperand(1)); 21286 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 21287 Op.getOperand(1), Op.getOperand(2)); 21288 } 21289 21290 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, 21291 SelectionDAG &DAG) { 21292 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); 21293 21294 // For MacOSX, we want to call an alternative entry point: __sincos_stret, 21295 // which returns the values as { float, float } (in XMM0) or 21296 // { double, double } (which is returned in XMM0, XMM1). 
21297 SDLoc dl(Op); 21298 SDValue Arg = Op.getOperand(0); 21299 EVT ArgVT = Arg.getValueType(); 21300 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 21301 21302 TargetLowering::ArgListTy Args; 21303 TargetLowering::ArgListEntry Entry; 21304 21305 Entry.Node = Arg; 21306 Entry.Ty = ArgTy; 21307 Entry.isSExt = false; 21308 Entry.isZExt = false; 21309 Args.push_back(Entry); 21310 21311 bool isF64 = ArgVT == MVT::f64; 21312 // Only optimize x86_64 for now. i386 is a bit messy. For f32, 21313 // the small struct {f32, f32} is returned in (eax, edx). For f64, 21314 // the results are returned via SRet in memory. 21315 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; 21316 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 21317 SDValue Callee = 21318 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); 21319 21320 Type *RetTy = isF64 21321 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) 21322 : (Type*)VectorType::get(ArgTy, 4); 21323 21324 TargetLowering::CallLoweringInfo CLI(DAG); 21325 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 21326 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args)); 21327 21328 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); 21329 21330 if (isF64) 21331 // Returned in xmm0 and xmm1. 21332 return CallResult.first; 21333 21334 // Returned in bits 0:31 and 32:64 xmm0. 21335 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 21336 CallResult.first, DAG.getIntPtrConstant(0, dl)); 21337 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 21338 CallResult.first, DAG.getIntPtrConstant(1, dl)); 21339 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 21340 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); 21341 } 21342 21343 /// Widen a vector input to a vector of NVT. The 21344 /// input vector must have the same element type as NVT. 21345 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, 21346 bool FillWithZeroes = false) { 21347 // Check if InOp already has the right width. 21348 MVT InVT = InOp.getSimpleValueType(); 21349 if (InVT == NVT) 21350 return InOp; 21351 21352 if (InOp.isUndef()) 21353 return DAG.getUNDEF(NVT); 21354 21355 assert(InVT.getVectorElementType() == NVT.getVectorElementType() && 21356 "input and widen element type must match"); 21357 21358 unsigned InNumElts = InVT.getVectorNumElements(); 21359 unsigned WidenNumElts = NVT.getVectorNumElements(); 21360 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && 21361 "Unexpected request for vector widening"); 21362 21363 EVT EltVT = NVT.getVectorElementType(); 21364 21365 SDLoc dl(InOp); 21366 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && 21367 InOp.getNumOperands() == 2) { 21368 SDValue N1 = InOp.getOperand(1); 21369 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || 21370 N1.isUndef()) { 21371 InOp = InOp.getOperand(0); 21372 InVT = InOp.getSimpleValueType(); 21373 InNumElts = InVT.getVectorNumElements(); 21374 } 21375 } 21376 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || 21377 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { 21378 SmallVector<SDValue, 16> Ops; 21379 for (unsigned i = 0; i < InNumElts; ++i) 21380 Ops.push_back(InOp.getOperand(i)); 21381 21382 SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : 21383 DAG.getUNDEF(EltVT); 21384 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) 21385 Ops.push_back(FillVal); 21386 return DAG.getBuildVector(NVT, dl, Ops); 21387 } 21388 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : 21389 DAG.getUNDEF(NVT); 21390 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, 21391 InOp, DAG.getIntPtrConstant(0, dl)); 21392 } 21393 21394 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, 21395 SelectionDAG &DAG) { 21396 assert(Subtarget.hasAVX512() && 21397 "MGATHER/MSCATTER are supported on AVX-512 arch only"); 21398 21399 // X86 scatter kills mask register, so its type should be added to 21400 // the list of return values. 21401 // If the "scatter" has 2 return values, it is already handled. 21402 if (Op.getNode()->getNumValues() == 2) 21403 return Op; 21404 21405 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); 21406 SDValue Src = N->getValue(); 21407 MVT VT = Src.getSimpleValueType(); 21408 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); 21409 SDLoc dl(Op); 21410 21411 SDValue NewScatter; 21412 SDValue Index = N->getIndex(); 21413 SDValue Mask = N->getMask(); 21414 SDValue Chain = N->getChain(); 21415 SDValue BasePtr = N->getBasePtr(); 21416 MVT MemVT = N->getMemoryVT().getSimpleVT(); 21417 MVT IndexVT = Index.getSimpleValueType(); 21418 MVT MaskVT = Mask.getSimpleValueType(); 21419 21420 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { 21421 // The v2i32 value was promoted to v2i64. 21422 // Now we "redo" the type legalizer's work and widen the original 21423 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 21424 // with a shuffle. 21425 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && 21426 "Unexpected memory type"); 21427 int ShuffleMask[] = {0, 2, -1, -1}; 21428 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), 21429 DAG.getUNDEF(MVT::v4i32), ShuffleMask); 21430 // Now we have 4 elements instead of 2. 21431 // Expand the index. 21432 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); 21433 Index = ExtendToType(Index, NewIndexVT, DAG); 21434 21435 // Expand the mask with zeroes 21436 // Mask may be <2 x i64> or <2 x i1> at this moment 21437 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && 21438 "Unexpected mask type"); 21439 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); 21440 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); 21441 VT = MVT::v4i32; 21442 } 21443 21444 unsigned NumElts = VT.getVectorNumElements(); 21445 if (!Subtarget.hasVLX() && !VT.is512BitVector() && 21446 !Index.getSimpleValueType().is512BitVector()) { 21447 // AVX512F supports only 512-bit vectors. Or data or index should 21448 // be 512 bit wide. 
// If both the index and the data are 256-bit but the vector contains 8
    // elements, we just sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimum number of elements in a scatter is 8.
      NumElts = 8;
      // Index.
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here; do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask.
      // At this point the mask operand has already been promoted.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here; do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point, truncate it to an i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by the scatter; add it to the result values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
                                    N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}

static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {

  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX; without VLX the vector
  // should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX; without VLX the vector
  // should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore());
}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // must be 512 bits wide. If both the index and the data are 256-bit but
    // the vector contains 8 elements, we just sign-extend the index.
    if (NumElts == 8) {
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                        N->getOperand(3), Index };
      DAG.UpdateNodeOperands(N, Ops);
      return Op;
    }

    // The minimum number of elements in a gather is 8.
    NumElts = 8;
    // Index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
    Index = ExtendToType(Index, NewIndexVT, DAG);
    if (IndexVT.getScalarType() == MVT::i32)
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

    // Mask.
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point the mask operand has already been promoted.
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

    // The pass-thru value.
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
    Src0 = ExtendToType(Src0, NewVT, DAG);

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
                                            N->getMemoryVT(), dl, Ops,
                                            N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
    return DAG.getMergeValues(RetOps, dl);
  }
  return Op;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
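  // As in LowerGC_TRANSITION_START above: forward the chain (and the glue
  // operand, if present) through a single NOOP.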
21646 SmallVector<SDValue, 2> Ops; 21647 21648 Ops.push_back(Op.getOperand(0)); 21649 if (Op->getGluedNode()) 21650 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); 21651 21652 SDLoc OpDL(Op); 21653 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 21654 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); 21655 21656 return NOOP; 21657 } 21658 21659 /// Provide custom lowering hooks for some operations. 21660 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 21661 switch (Op.getOpcode()) { 21662 default: llvm_unreachable("Should not custom lower this!"); 21663 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); 21664 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 21665 return LowerCMP_SWAP(Op, Subtarget, DAG); 21666 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); 21667 case ISD::ATOMIC_LOAD_ADD: 21668 case ISD::ATOMIC_LOAD_SUB: 21669 case ISD::ATOMIC_LOAD_OR: 21670 case ISD::ATOMIC_LOAD_XOR: 21671 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); 21672 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); 21673 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); 21674 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 21675 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); 21676 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); 21677 case ISD::VSELECT: return LowerVSELECT(Op, DAG); 21678 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 21679 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 21680 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); 21681 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); 21682 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 21683 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 21684 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 21685 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 21686 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 21687 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 21688 case ISD::SHL_PARTS: 21689 case ISD::SRA_PARTS: 21690 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 21691 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 21692 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 21693 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 21694 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); 21695 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); 21696 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); 21697 case ISD::SIGN_EXTEND_VECTOR_INREG: 21698 return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); 21699 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 21700 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 21701 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 21702 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); 21703 case ISD::FABS: 21704 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); 21705 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 21706 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 21707 case ISD::SETCC: return LowerSETCC(Op, DAG); 21708 case ISD::SETCCE: return LowerSETCCE(Op, DAG); 21709 case ISD::SELECT: return LowerSELECT(Op, DAG); 21710 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 21711 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 21712 case 
ISD::VASTART: return LowerVASTART(Op, DAG); 21713 case ISD::VAARG: return LowerVAARG(Op, DAG); 21714 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 21715 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); 21716 case ISD::INTRINSIC_VOID: 21717 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); 21718 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 21719 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 21720 case ISD::FRAME_TO_ARGS_OFFSET: 21721 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 21722 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 21723 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 21724 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 21725 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 21726 case ISD::EH_SJLJ_SETUP_DISPATCH: 21727 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 21728 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 21729 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 21730 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 21731 case ISD::CTLZ: 21732 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); 21733 case ISD::CTTZ: 21734 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); 21735 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 21736 case ISD::MULHS: 21737 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); 21738 case ISD::UMUL_LOHI: 21739 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); 21740 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); 21741 case ISD::SRA: 21742 case ISD::SRL: 21743 case ISD::SHL: return LowerShift(Op, Subtarget, DAG); 21744 case ISD::SADDO: 21745 case ISD::UADDO: 21746 case ISD::SSUBO: 21747 case ISD::USUBO: 21748 case ISD::SMULO: 21749 case ISD::UMULO: return LowerXALUO(Op, DAG); 21750 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 21751 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); 21752 case ISD::ADDC: 21753 case ISD::ADDE: 21754 case ISD::SUBC: 21755 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 21756 case ISD::ADD: return LowerADD(Op, DAG); 21757 case ISD::SUB: return LowerSUB(Op, DAG); 21758 case ISD::SMAX: 21759 case ISD::SMIN: 21760 case ISD::UMAX: 21761 case ISD::UMIN: return LowerMINMAX(Op, DAG); 21762 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); 21763 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); 21764 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); 21765 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); 21766 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); 21767 case ISD::GC_TRANSITION_START: 21768 return LowerGC_TRANSITION_START(Op, DAG); 21769 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); 21770 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG); 21771 } 21772 } 21773 21774 /// Places new result values for the node in Results (their number 21775 /// and types must exactly match those of the original return values of 21776 /// the node), or leaves Results empty, which indicates that the node is not 21777 /// to be custom lowered after all. 
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Res = LowerOperation(SDValue(N, 0), DAG);

  if (!Res.getNode())
    return;

  assert((N->getNumValues() <= Res->getNumValues()) &&
         "Lowering returned the wrong number of results!");

  // Place the new result values based on N's result number.
  // In some cases (LowerSINT_TO_FP for example) Res has more result values
  // than the original node; the chain (the last value) should be dropped.
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}

/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    auto InVTSize = InVT.getSizeInBits();
    const unsigned RegSize =
        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
    assert((!Subtarget.hasAVX512() || RegSize < 512) &&
           "512-bit vector requires AVX512");
    assert((!Subtarget.hasAVX2() || RegSize < 256) &&
           "256-bit vector requires AVX2");

    auto ElemVT = InVT.getVectorElementType();
    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
                                  RegSize / ElemVT.getSizeInBits());
    assert(RegSize % InVT.getSizeInBits() == 0);
    unsigned NumConcat = RegSize / InVT.getSizeInBits();

    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
    Ops[0] = N->getOperand(0);
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
    Ops[0] = N->getOperand(1);
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                                  DAG.getIntPtrConstant(0, dl)));
    return;
  }
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
    return;
  }
  case ISD::SIGN_EXTEND_INREG:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    // We don't want to expand or promote these.
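    // Returning with Results still empty tells the caller that these nodes
    // are not custom legalized here after all.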
21857 return; 21858 case ISD::SDIV: 21859 case ISD::UDIV: 21860 case ISD::SREM: 21861 case ISD::UREM: 21862 case ISD::SDIVREM: 21863 case ISD::UDIVREM: { 21864 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); 21865 Results.push_back(V); 21866 return; 21867 } 21868 case ISD::FP_TO_SINT: 21869 case ISD::FP_TO_UINT: { 21870 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 21871 21872 std::pair<SDValue,SDValue> Vals = 21873 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 21874 SDValue FIST = Vals.first, StackSlot = Vals.second; 21875 if (FIST.getNode()) { 21876 EVT VT = N->getValueType(0); 21877 // Return a load from the stack slot. 21878 if (StackSlot.getNode()) 21879 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 21880 MachinePointerInfo(), 21881 false, false, false, 0)); 21882 else 21883 Results.push_back(FIST); 21884 } 21885 return; 21886 } 21887 case ISD::UINT_TO_FP: { 21888 assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); 21889 if (N->getOperand(0).getValueType() != MVT::v2i32 || 21890 N->getValueType(0) != MVT::v2f32) 21891 return; 21892 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, 21893 N->getOperand(0)); 21894 SDValue VBias = 21895 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); 21896 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, 21897 DAG.getBitcast(MVT::v2i64, VBias)); 21898 Or = DAG.getBitcast(MVT::v2f64, Or); 21899 // TODO: Are there any fast-math-flags to propagate here? 21900 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); 21901 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); 21902 return; 21903 } 21904 case ISD::FP_ROUND: { 21905 if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) 21906 return; 21907 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); 21908 Results.push_back(V); 21909 return; 21910 } 21911 case ISD::FP_EXTEND: { 21912 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. 21913 // No other ValueType for FP_EXTEND should reach this point. 21914 assert(N->getValueType(0) == MVT::v2f32 && 21915 "Do not know how to legalize this Node"); 21916 return; 21917 } 21918 case ISD::INTRINSIC_W_CHAIN: { 21919 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 21920 switch (IntNo) { 21921 default : llvm_unreachable("Do not know how to custom type " 21922 "legalize this intrinsic operation!"); 21923 case Intrinsic::x86_rdtsc: 21924 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 21925 Results); 21926 case Intrinsic::x86_rdtscp: 21927 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, 21928 Results); 21929 case Intrinsic::x86_rdpmc: 21930 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); 21931 } 21932 } 21933 case ISD::INTRINSIC_WO_CHAIN: { 21934 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) 21935 Results.push_back(V); 21936 return; 21937 } 21938 case ISD::READCYCLECOUNTER: { 21939 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 21940 Results); 21941 } 21942 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { 21943 EVT T = N->getValueType(0); 21944 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 21945 bool Regs64bit = T == MVT::i128; 21946 MVT HalfT = Regs64bit ? 
MVT::i64 : MVT::i32; 21947 SDValue cpInL, cpInH; 21948 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 21949 DAG.getConstant(0, dl, HalfT)); 21950 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 21951 DAG.getConstant(1, dl, HalfT)); 21952 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 21953 Regs64bit ? X86::RAX : X86::EAX, 21954 cpInL, SDValue()); 21955 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 21956 Regs64bit ? X86::RDX : X86::EDX, 21957 cpInH, cpInL.getValue(1)); 21958 SDValue swapInL, swapInH; 21959 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 21960 DAG.getConstant(0, dl, HalfT)); 21961 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 21962 DAG.getConstant(1, dl, HalfT)); 21963 swapInH = 21964 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, 21965 swapInH, cpInH.getValue(1)); 21966 // If the current function needs the base pointer, RBX, 21967 // we shouldn't use cmpxchg directly. 21968 // Indeed the lowering of that instruction will clobber 21969 // that register and since RBX will be a reserved register 21970 // the register allocator will not make sure its value will 21971 // be properly saved and restored around this live-range. 21972 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 21973 SDValue Result; 21974 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 21975 unsigned BasePtr = TRI->getBaseRegister(); 21976 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 21977 if (TRI->hasBasePointer(DAG.getMachineFunction()) && 21978 (BasePtr == X86::RBX || BasePtr == X86::EBX)) { 21979 // ISel prefers the LCMPXCHG64 variant. 21980 // If that assert breaks, that means it is not the case anymore, 21981 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, 21982 // not just EBX. This is a matter of accepting i64 input for that 21983 // pseudo, and restoring into the register of the right wide 21984 // in expand pseudo. Everything else should just work. 21985 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && 21986 "Saving only half of the RBX"); 21987 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG 21988 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; 21989 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, 21990 Regs64bit ? X86::RBX : X86::EBX, 21991 HalfT, swapInH.getValue(1)); 21992 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, 21993 RBXSave, 21994 /*Glue*/ RBXSave.getValue(2)}; 21995 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); 21996 } else { 21997 unsigned Opcode = 21998 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; 21999 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, 22000 Regs64bit ? X86::RBX : X86::EBX, swapInL, 22001 swapInH.getValue(1)); 22002 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), 22003 swapInL.getValue(1)}; 22004 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); 22005 } 22006 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 22007 Regs64bit ? X86::RAX : X86::EAX, 22008 HalfT, Result.getValue(1)); 22009 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 22010 Regs64bit ? 
X86::RDX : X86::EDX, 22011 HalfT, cpOutL.getValue(2)); 22012 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 22013 22014 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, 22015 MVT::i32, cpOutH.getValue(2)); 22016 SDValue Success = 22017 DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 22018 DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS); 22019 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); 22020 22021 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); 22022 Results.push_back(Success); 22023 Results.push_back(EFLAGS.getValue(1)); 22024 return; 22025 } 22026 case ISD::ATOMIC_SWAP: 22027 case ISD::ATOMIC_LOAD_ADD: 22028 case ISD::ATOMIC_LOAD_SUB: 22029 case ISD::ATOMIC_LOAD_AND: 22030 case ISD::ATOMIC_LOAD_OR: 22031 case ISD::ATOMIC_LOAD_XOR: 22032 case ISD::ATOMIC_LOAD_NAND: 22033 case ISD::ATOMIC_LOAD_MIN: 22034 case ISD::ATOMIC_LOAD_MAX: 22035 case ISD::ATOMIC_LOAD_UMIN: 22036 case ISD::ATOMIC_LOAD_UMAX: 22037 case ISD::ATOMIC_LOAD: { 22038 // Delegate to generic TypeLegalization. Situations we can really handle 22039 // should have already been dealt with by AtomicExpandPass.cpp. 22040 break; 22041 } 22042 case ISD::BITCAST: { 22043 assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); 22044 EVT DstVT = N->getValueType(0); 22045 EVT SrcVT = N->getOperand(0)->getValueType(0); 22046 22047 if (SrcVT != MVT::f64 || 22048 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) 22049 return; 22050 22051 unsigned NumElts = DstVT.getVectorNumElements(); 22052 EVT SVT = DstVT.getVectorElementType(); 22053 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); 22054 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 22055 MVT::v2f64, N->getOperand(0)); 22056 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); 22057 22058 if (ExperimentalVectorWideningLegalization) { 22059 // If we are legalizing vectors by widening, we already have the desired 22060 // legal vector type, just return it. 
22061 Results.push_back(ToVecInt); 22062 return; 22063 } 22064 22065 SmallVector<SDValue, 8> Elts; 22066 for (unsigned i = 0, e = NumElts; i != e; ++i) 22067 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, 22068 ToVecInt, DAG.getIntPtrConstant(i, dl))); 22069 22070 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); 22071 } 22072 } 22073 } 22074 22075 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 22076 switch ((X86ISD::NodeType)Opcode) { 22077 case X86ISD::FIRST_NUMBER: break; 22078 case X86ISD::BSF: return "X86ISD::BSF"; 22079 case X86ISD::BSR: return "X86ISD::BSR"; 22080 case X86ISD::SHLD: return "X86ISD::SHLD"; 22081 case X86ISD::SHRD: return "X86ISD::SHRD"; 22082 case X86ISD::FAND: return "X86ISD::FAND"; 22083 case X86ISD::FANDN: return "X86ISD::FANDN"; 22084 case X86ISD::FOR: return "X86ISD::FOR"; 22085 case X86ISD::FXOR: return "X86ISD::FXOR"; 22086 case X86ISD::FILD: return "X86ISD::FILD"; 22087 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 22088 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 22089 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 22090 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 22091 case X86ISD::FLD: return "X86ISD::FLD"; 22092 case X86ISD::FST: return "X86ISD::FST"; 22093 case X86ISD::CALL: return "X86ISD::CALL"; 22094 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 22095 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; 22096 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; 22097 case X86ISD::BT: return "X86ISD::BT"; 22098 case X86ISD::CMP: return "X86ISD::CMP"; 22099 case X86ISD::COMI: return "X86ISD::COMI"; 22100 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 22101 case X86ISD::CMPM: return "X86ISD::CMPM"; 22102 case X86ISD::CMPMU: return "X86ISD::CMPMU"; 22103 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; 22104 case X86ISD::SETCC: return "X86ISD::SETCC"; 22105 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 22106 case X86ISD::FSETCC: return "X86ISD::FSETCC"; 22107 case X86ISD::CMOV: return "X86ISD::CMOV"; 22108 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 22109 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 22110 case X86ISD::IRET: return "X86ISD::IRET"; 22111 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 22112 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 22113 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 22114 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 22115 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 22116 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; 22117 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; 22118 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; 22119 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 22120 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 22121 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 22122 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 22123 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 22124 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 22125 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 22126 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 22127 case X86ISD::BLENDI: return "X86ISD::BLENDI"; 22128 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; 22129 case X86ISD::ADDUS: return "X86ISD::ADDUS"; 22130 case X86ISD::SUBUS: return "X86ISD::SUBUS"; 22131 case X86ISD::HADD: return "X86ISD::HADD"; 22132 case X86ISD::HSUB: return "X86ISD::HSUB"; 22133 case X86ISD::FHADD: return "X86ISD::FHADD"; 22134 case X86ISD::FHSUB: return 
"X86ISD::FHSUB"; 22135 case X86ISD::ABS: return "X86ISD::ABS"; 22136 case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; 22137 case X86ISD::FMAX: return "X86ISD::FMAX"; 22138 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; 22139 case X86ISD::FMIN: return "X86ISD::FMIN"; 22140 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; 22141 case X86ISD::FMAXC: return "X86ISD::FMAXC"; 22142 case X86ISD::FMINC: return "X86ISD::FMINC"; 22143 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 22144 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; 22145 case X86ISD::FRCP: return "X86ISD::FRCP"; 22146 case X86ISD::FRCPS: return "X86ISD::FRCPS"; 22147 case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; 22148 case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; 22149 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 22150 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 22151 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 22152 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; 22153 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; 22154 case X86ISD::EH_SJLJ_SETUP_DISPATCH: 22155 return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; 22156 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 22157 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 22158 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 22159 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 22160 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 22161 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 22162 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; 22163 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: 22164 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; 22165 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: 22166 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; 22167 case X86ISD::LADD: return "X86ISD::LADD"; 22168 case X86ISD::LSUB: return "X86ISD::LSUB"; 22169 case X86ISD::LOR: return "X86ISD::LOR"; 22170 case X86ISD::LXOR: return "X86ISD::LXOR"; 22171 case X86ISD::LAND: return "X86ISD::LAND"; 22172 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 22173 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 22174 case X86ISD::VZEXT: return "X86ISD::VZEXT"; 22175 case X86ISD::VSEXT: return "X86ISD::VSEXT"; 22176 case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; 22177 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; 22178 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; 22179 case X86ISD::VINSERT: return "X86ISD::VINSERT"; 22180 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 22181 case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; 22182 case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; 22183 case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; 22184 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; 22185 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 22186 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 22187 case X86ISD::VSHL: return "X86ISD::VSHL"; 22188 case X86ISD::VSRL: return "X86ISD::VSRL"; 22189 case X86ISD::VSRA: return "X86ISD::VSRA"; 22190 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 22191 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 22192 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 22193 case X86ISD::VSRAV: return "X86ISD::VSRAV"; 22194 case X86ISD::VROTLI: return "X86ISD::VROTLI"; 22195 case X86ISD::VROTRI: return "X86ISD::VROTRI"; 22196 case X86ISD::VPPERM: return "X86ISD::VPPERM"; 22197 case X86ISD::CMPP: return "X86ISD::CMPP"; 22198 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 22199 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 22200 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; 22201 case X86ISD::PCMPGTM: return 
"X86ISD::PCMPGTM"; 22202 case X86ISD::ADD: return "X86ISD::ADD"; 22203 case X86ISD::SUB: return "X86ISD::SUB"; 22204 case X86ISD::ADC: return "X86ISD::ADC"; 22205 case X86ISD::SBB: return "X86ISD::SBB"; 22206 case X86ISD::SMUL: return "X86ISD::SMUL"; 22207 case X86ISD::UMUL: return "X86ISD::UMUL"; 22208 case X86ISD::SMUL8: return "X86ISD::SMUL8"; 22209 case X86ISD::UMUL8: return "X86ISD::UMUL8"; 22210 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; 22211 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; 22212 case X86ISD::INC: return "X86ISD::INC"; 22213 case X86ISD::DEC: return "X86ISD::DEC"; 22214 case X86ISD::OR: return "X86ISD::OR"; 22215 case X86ISD::XOR: return "X86ISD::XOR"; 22216 case X86ISD::AND: return "X86ISD::AND"; 22217 case X86ISD::BEXTR: return "X86ISD::BEXTR"; 22218 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 22219 case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; 22220 case X86ISD::PTEST: return "X86ISD::PTEST"; 22221 case X86ISD::TESTP: return "X86ISD::TESTP"; 22222 case X86ISD::TESTM: return "X86ISD::TESTM"; 22223 case X86ISD::TESTNM: return "X86ISD::TESTNM"; 22224 case X86ISD::KORTEST: return "X86ISD::KORTEST"; 22225 case X86ISD::KTEST: return "X86ISD::KTEST"; 22226 case X86ISD::PACKSS: return "X86ISD::PACKSS"; 22227 case X86ISD::PACKUS: return "X86ISD::PACKUS"; 22228 case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; 22229 case X86ISD::VALIGN: return "X86ISD::VALIGN"; 22230 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 22231 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 22232 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 22233 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 22234 case X86ISD::SHUF128: return "X86ISD::SHUF128"; 22235 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 22236 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 22237 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 22238 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 22239 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 22240 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 22241 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 22242 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 22243 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 22244 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 22245 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 22246 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 22247 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 22248 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; 22249 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; 22250 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; 22251 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; 22252 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; 22253 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 22254 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 22255 case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; 22256 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; 22257 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 22258 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; 22259 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; 22260 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; 22261 case X86ISD::VRANGE: return "X86ISD::VRANGE"; 22262 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 22263 case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; 22264 case X86ISD::PSADBW: return "X86ISD::PSADBW"; 22265 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; 22266 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 22267 case X86ISD::VAARG_64: return 
"X86ISD::VAARG_64"; 22268 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 22269 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 22270 case X86ISD::MFENCE: return "X86ISD::MFENCE"; 22271 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 22272 case X86ISD::SAHF: return "X86ISD::SAHF"; 22273 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 22274 case X86ISD::RDSEED: return "X86ISD::RDSEED"; 22275 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; 22276 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; 22277 case X86ISD::VPROT: return "X86ISD::VPROT"; 22278 case X86ISD::VPROTI: return "X86ISD::VPROTI"; 22279 case X86ISD::VPSHA: return "X86ISD::VPSHA"; 22280 case X86ISD::VPSHL: return "X86ISD::VPSHL"; 22281 case X86ISD::VPCOM: return "X86ISD::VPCOM"; 22282 case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; 22283 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; 22284 case X86ISD::FMADD: return "X86ISD::FMADD"; 22285 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 22286 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 22287 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 22288 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 22289 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 22290 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; 22291 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; 22292 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; 22293 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; 22294 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; 22295 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; 22296 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; 22297 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; 22298 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; 22299 case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; 22300 case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; 22301 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; 22302 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; 22303 case X86ISD::XTEST: return "X86ISD::XTEST"; 22304 case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; 22305 case X86ISD::EXPAND: return "X86ISD::EXPAND"; 22306 case X86ISD::SELECT: return "X86ISD::SELECT"; 22307 case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; 22308 case X86ISD::RCP28: return "X86ISD::RCP28"; 22309 case X86ISD::EXP2: return "X86ISD::EXP2"; 22310 case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; 22311 case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; 22312 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; 22313 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; 22314 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; 22315 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; 22316 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; 22317 case X86ISD::SCALEF: return "X86ISD::SCALEF"; 22318 case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; 22319 case X86ISD::ADDS: return "X86ISD::ADDS"; 22320 case X86ISD::SUBS: return "X86ISD::SUBS"; 22321 case X86ISD::AVG: return "X86ISD::AVG"; 22322 case X86ISD::MULHRS: return "X86ISD::MULHRS"; 22323 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; 22324 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; 22325 case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; 22326 case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; 22327 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; 22328 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; 22329 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; 22330 case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND"; 22331 
case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND"; 22332 } 22333 return nullptr; 22334 } 22335 22336 /// Return true if the addressing mode represented by AM is legal for this 22337 /// target, for a load/store of the specified type. 22338 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, 22339 const AddrMode &AM, Type *Ty, 22340 unsigned AS) const { 22341 // X86 supports extremely general addressing modes. 22342 CodeModel::Model M = getTargetMachine().getCodeModel(); 22343 22344 // X86 allows a sign-extended 32-bit immediate field as a displacement. 22345 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) 22346 return false; 22347 22348 if (AM.BaseGV) { 22349 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); 22350 22351 // If a reference to this global requires an extra load, we can't fold it. 22352 if (isGlobalStubReference(GVFlags)) 22353 return false; 22354 22355 // If BaseGV requires a register for the PIC base, we cannot also have a 22356 // BaseReg specified. 22357 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 22358 return false; 22359 22360 // If lower 4G is not available, then we must use rip-relative addressing. 22361 if ((M != CodeModel::Small || isPositionIndependent()) && 22362 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 22363 return false; 22364 } 22365 22366 switch (AM.Scale) { 22367 case 0: 22368 case 1: 22369 case 2: 22370 case 4: 22371 case 8: 22372 // These scales always work. 22373 break; 22374 case 3: 22375 case 5: 22376 case 9: 22377 // These scales are formed with basereg+scalereg. Only accept if there is 22378 // no basereg yet. 22379 if (AM.HasBaseReg) 22380 return false; 22381 break; 22382 default: // Other stuff never works. 22383 return false; 22384 } 22385 22386 return true; 22387 } 22388 22389 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { 22390 unsigned Bits = Ty->getScalarSizeInBits(); 22391 22392 // 8-bit shifts are always expensive, but versions with a scalar amount aren't 22393 // particularly cheaper than those without. 22394 if (Bits == 8) 22395 return false; 22396 22397 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make 22398 // variable shifts just as cheap as scalar ones. 22399 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64)) 22400 return false; 22401 22402 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a 22403 // fully general vector. 22404 return true; 22405 } 22406 22407 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 22408 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 22409 return false; 22410 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 22411 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 22412 return NumBits1 > NumBits2; 22413 } 22414 22415 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 22416 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 22417 return false; 22418 22419 if (!isTypeLegal(EVT::getEVT(Ty1))) 22420 return false; 22421 22422 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 22423 22424 // Assuming the caller doesn't have a zeroext or signext return parameter, 22425 // truncation all the way down to i1 is valid. 
22426 return true; 22427 } 22428 22429 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 22430 return isInt<32>(Imm); 22431 } 22432 22433 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 22434 // Can also use sub to handle negated immediates. 22435 return isInt<32>(Imm); 22436 } 22437 22438 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 22439 if (!VT1.isInteger() || !VT2.isInteger()) 22440 return false; 22441 unsigned NumBits1 = VT1.getSizeInBits(); 22442 unsigned NumBits2 = VT2.getSizeInBits(); 22443 return NumBits1 > NumBits2; 22444 } 22445 22446 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 22447 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 22448 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); 22449 } 22450 22451 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 22452 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 22453 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); 22454 } 22455 22456 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 22457 EVT VT1 = Val.getValueType(); 22458 if (isZExtFree(VT1, VT2)) 22459 return true; 22460 22461 if (Val.getOpcode() != ISD::LOAD) 22462 return false; 22463 22464 if (!VT1.isSimple() || !VT1.isInteger() || 22465 !VT2.isSimple() || !VT2.isInteger()) 22466 return false; 22467 22468 switch (VT1.getSimpleVT().SimpleTy) { 22469 default: break; 22470 case MVT::i8: 22471 case MVT::i16: 22472 case MVT::i32: 22473 // X86 has 8, 16, and 32-bit zero-extending loads. 22474 return true; 22475 } 22476 22477 return false; 22478 } 22479 22480 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } 22481 22482 bool 22483 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 22484 if (!Subtarget.hasAnyFMA()) 22485 return false; 22486 22487 VT = VT.getScalarType(); 22488 22489 if (!VT.isSimple()) 22490 return false; 22491 22492 switch (VT.getSimpleVT().SimpleTy) { 22493 case MVT::f32: 22494 case MVT::f64: 22495 return true; 22496 default: 22497 break; 22498 } 22499 22500 return false; 22501 } 22502 22503 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 22504 // i16 instructions are longer (0x66 prefix) and potentially slower. 22505 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 22506 } 22507 22508 /// Targets can use this to indicate that they only support *some* 22509 /// VECTOR_SHUFFLE operations, those with specific masks. 22510 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 22511 /// are assumed to be legal. 22512 bool 22513 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 22514 EVT VT) const { 22515 if (!VT.isSimple()) 22516 return false; 22517 22518 // Not for i1 vectors 22519 if (VT.getSimpleVT().getScalarType() == MVT::i1) 22520 return false; 22521 22522 // Very little shuffling can be done for 64-bit vectors right now. 22523 if (VT.getSimpleVT().getSizeInBits() == 64) 22524 return false; 22525 22526 // We only care that the types being shuffled are legal. The lowering can 22527 // handle any possible shuffle mask that results. 22528 return isTypeLegal(VT.getSimpleVT()); 22529 } 22530 22531 bool 22532 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 22533 EVT VT) const { 22534 // Just delegate to the generic legality, clear masks aren't special. 
22535 return isShuffleMaskLegal(Mask, VT); 22536 } 22537 22538 //===----------------------------------------------------------------------===// 22539 // X86 Scheduler Hooks 22540 //===----------------------------------------------------------------------===// 22541 22542 /// Utility function to emit xbegin specifying the start of an RTM region. 22543 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, 22544 const TargetInstrInfo *TII) { 22545 DebugLoc DL = MI.getDebugLoc(); 22546 22547 const BasicBlock *BB = MBB->getBasicBlock(); 22548 MachineFunction::iterator I = ++MBB->getIterator(); 22549 22550 // For the v = xbegin(), we generate 22551 // 22552 // thisMBB: 22553 // xbegin sinkMBB 22554 // 22555 // mainMBB: 22556 // eax = -1 22557 // 22558 // sinkMBB: 22559 // v = eax 22560 22561 MachineBasicBlock *thisMBB = MBB; 22562 MachineFunction *MF = MBB->getParent(); 22563 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 22564 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 22565 MF->insert(I, mainMBB); 22566 MF->insert(I, sinkMBB); 22567 22568 // Transfer the remainder of BB and its successor edges to sinkMBB. 22569 sinkMBB->splice(sinkMBB->begin(), MBB, 22570 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 22571 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 22572 22573 // thisMBB: 22574 // xbegin sinkMBB 22575 // # fallthrough to mainMBB 22576 // # abortion to sinkMBB 22577 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); 22578 thisMBB->addSuccessor(mainMBB); 22579 thisMBB->addSuccessor(sinkMBB); 22580 22581 // mainMBB: 22582 // EAX = -1 22583 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); 22584 mainMBB->addSuccessor(sinkMBB); 22585 22586 // sinkMBB: 22587 // EAX is live into the sinkMBB 22588 sinkMBB->addLiveIn(X86::EAX); 22589 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), 22590 MI.getOperand(0).getReg()) 22591 .addReg(X86::EAX); 22592 22593 MI.eraseFromParent(); 22594 return sinkMBB; 22595 } 22596 22597 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 22598 // or XMM0_V32I8 in AVX all of this code can be replaced with that 22599 // in the .td file. 
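// Custom inserter for the PCMP[IE]STRM pseudos: pick the concrete SSE4.2 (or
// AVX) instruction, forward the explicit operands and the memory operand, and
// copy the implicit XMM0 result into the pseudo's destination register.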
22600 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, 22601 const TargetInstrInfo *TII) { 22602 unsigned Opc; 22603 switch (MI.getOpcode()) { 22604 default: llvm_unreachable("illegal opcode!"); 22605 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 22606 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 22607 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 22608 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 22609 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 22610 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 22611 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 22612 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 22613 } 22614 22615 DebugLoc dl = MI.getDebugLoc(); 22616 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 22617 22618 unsigned NumArgs = MI.getNumOperands(); 22619 for (unsigned i = 1; i < NumArgs; ++i) { 22620 MachineOperand &Op = MI.getOperand(i); 22621 if (!(Op.isReg() && Op.isImplicit())) 22622 MIB.addOperand(Op); 22623 } 22624 if (MI.hasOneMemOperand()) 22625 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 22626 22627 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) 22628 .addReg(X86::XMM0); 22629 22630 MI.eraseFromParent(); 22631 return BB; 22632 } 22633 22634 // FIXME: Custom handling because TableGen doesn't support multiple implicit 22635 // defs in an instruction pattern 22636 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, 22637 const TargetInstrInfo *TII) { 22638 unsigned Opc; 22639 switch (MI.getOpcode()) { 22640 default: llvm_unreachable("illegal opcode!"); 22641 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; 22642 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; 22643 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; 22644 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; 22645 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; 22646 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; 22647 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; 22648 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; 22649 } 22650 22651 DebugLoc dl = MI.getDebugLoc(); 22652 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 22653 22654 unsigned NumArgs = MI.getNumOperands(); // remove the results 22655 for (unsigned i = 1; i < NumArgs; ++i) { 22656 MachineOperand &Op = MI.getOperand(i); 22657 if (!(Op.isReg() && Op.isImplicit())) 22658 MIB.addOperand(Op); 22659 } 22660 if (MI.hasOneMemOperand()) 22661 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 22662 22663 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) 22664 .addReg(X86::ECX); 22665 22666 MI.eraseFromParent(); 22667 return BB; 22668 } 22669 22670 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, 22671 const X86Subtarget &Subtarget) { 22672 DebugLoc dl = MI.getDebugLoc(); 22673 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 22674 22675 // insert input VAL into EAX 22676 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 22677 .addReg(MI.getOperand(0).getReg()); 22678 // insert zero to ECX 22679 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); 22680 22681 // insert zero to EDX 22682 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); 22683 22684 // insert WRPKRU instruction 22685 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); 22686 22687 
MI.eraseFromParent(); // The pseudo is gone now. 22688 return BB; 22689 } 22690 22691 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, 22692 const X86Subtarget &Subtarget) { 22693 DebugLoc dl = MI.getDebugLoc(); 22694 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 22695 22696 // insert zero to ECX 22697 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); 22698 22699 // insert RDPKRU instruction 22700 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); 22701 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) 22702 .addReg(X86::EAX); 22703 22704 MI.eraseFromParent(); // The pseudo is gone now. 22705 return BB; 22706 } 22707 22708 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, 22709 const X86Subtarget &Subtarget, 22710 unsigned Opc) { 22711 DebugLoc dl = MI.getDebugLoc(); 22712 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 22713 // Address into RAX/EAX, other two args into ECX, EDX. 22714 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; 22715 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; 22716 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 22717 for (int i = 0; i < X86::AddrNumOperands; ++i) 22718 MIB.addOperand(MI.getOperand(i)); 22719 22720 unsigned ValOps = X86::AddrNumOperands; 22721 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 22722 .addReg(MI.getOperand(ValOps).getReg()); 22723 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 22724 .addReg(MI.getOperand(ValOps + 1).getReg()); 22725 22726 // The instruction doesn't actually take any operands though. 22727 BuildMI(*BB, MI, dl, TII->get(Opc)); 22728 22729 MI.eraseFromParent(); // The pseudo is gone now. 22730 return BB; 22731 } 22732 22733 MachineBasicBlock * 22734 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, 22735 MachineBasicBlock *MBB) const { 22736 // Emit va_arg instruction on X86-64. 
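  // The pseudo expands into a small diamond: offsetMBB pulls the argument
  // from the register save area while gp_offset/fp_offset is still in range,
  // overflowMBB pulls it from the overflow area, and endMBB merges the two
  // computed addresses with a PHI.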
22737 22738 // Operands to this pseudo-instruction: 22739 // 0 ) Output : destination address (reg) 22740 // 1-5) Input : va_list address (addr, i64mem) 22741 // 6 ) ArgSize : Size (in bytes) of vararg type 22742 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 22743 // 8 ) Align : Alignment of type 22744 // 9 ) EFLAGS (implicit-def) 22745 22746 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 22747 static_assert(X86::AddrNumOperands == 5, 22748 "VAARG_64 assumes 5 address operands"); 22749 22750 unsigned DestReg = MI.getOperand(0).getReg(); 22751 MachineOperand &Base = MI.getOperand(1); 22752 MachineOperand &Scale = MI.getOperand(2); 22753 MachineOperand &Index = MI.getOperand(3); 22754 MachineOperand &Disp = MI.getOperand(4); 22755 MachineOperand &Segment = MI.getOperand(5); 22756 unsigned ArgSize = MI.getOperand(6).getImm(); 22757 unsigned ArgMode = MI.getOperand(7).getImm(); 22758 unsigned Align = MI.getOperand(8).getImm(); 22759 22760 // Memory Reference 22761 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 22762 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 22763 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 22764 22765 // Machine Information 22766 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 22767 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 22768 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 22769 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 22770 DebugLoc DL = MI.getDebugLoc(); 22771 22772 // struct va_list { 22773 // i32 gp_offset 22774 // i32 fp_offset 22775 // i64 overflow_area (address) 22776 // i64 reg_save_area (address) 22777 // } 22778 // sizeof(va_list) = 24 22779 // alignment(va_list) = 8 22780 22781 unsigned TotalNumIntRegs = 6; 22782 unsigned TotalNumXMMRegs = 8; 22783 bool UseGPOffset = (ArgMode == 1); 22784 bool UseFPOffset = (ArgMode == 2); 22785 unsigned MaxOffset = TotalNumIntRegs * 8 + 22786 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 22787 22788 /* Align ArgSize to a multiple of 8 */ 22789 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 22790 bool NeedsAlign = (Align > 8); 22791 22792 MachineBasicBlock *thisMBB = MBB; 22793 MachineBasicBlock *overflowMBB; 22794 MachineBasicBlock *offsetMBB; 22795 MachineBasicBlock *endMBB; 22796 22797 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 22798 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 22799 unsigned OffsetReg = 0; 22800 22801 if (!UseGPOffset && !UseFPOffset) { 22802 // If we only pull from the overflow region, we don't create a branch. 22803 // We don't need to alter control flow. 22804 OffsetDestReg = 0; // unused 22805 OverflowDestReg = DestReg; 22806 22807 offsetMBB = nullptr; 22808 overflowMBB = thisMBB; 22809 endMBB = thisMBB; 22810 } else { 22811 // First emit code to check if gp_offset (or fp_offset) is below the bound. 22812 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 22813 // If not, pull from overflow_area. (branch to overflowMBB) 22814 // 22815 // thisMBB 22816 // | . 22817 // | . 22818 // offsetMBB overflowMBB 22819 // | . 22820 // | . 
22821 // endMBB 22822 22823 // Registers for the PHI in endMBB 22824 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 22825 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 22826 22827 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 22828 MachineFunction *MF = MBB->getParent(); 22829 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 22830 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 22831 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 22832 22833 MachineFunction::iterator MBBIter = ++MBB->getIterator(); 22834 22835 // Insert the new basic blocks 22836 MF->insert(MBBIter, offsetMBB); 22837 MF->insert(MBBIter, overflowMBB); 22838 MF->insert(MBBIter, endMBB); 22839 22840 // Transfer the remainder of MBB and its successor edges to endMBB. 22841 endMBB->splice(endMBB->begin(), thisMBB, 22842 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); 22843 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 22844 22845 // Make offsetMBB and overflowMBB successors of thisMBB 22846 thisMBB->addSuccessor(offsetMBB); 22847 thisMBB->addSuccessor(overflowMBB); 22848 22849 // endMBB is a successor of both offsetMBB and overflowMBB 22850 offsetMBB->addSuccessor(endMBB); 22851 overflowMBB->addSuccessor(endMBB); 22852 22853 // Load the offset value into a register 22854 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 22855 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 22856 .addOperand(Base) 22857 .addOperand(Scale) 22858 .addOperand(Index) 22859 .addDisp(Disp, UseFPOffset ? 4 : 0) 22860 .addOperand(Segment) 22861 .setMemRefs(MMOBegin, MMOEnd); 22862 22863 // Check if there is enough room left to pull this argument. 22864 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 22865 .addReg(OffsetReg) 22866 .addImm(MaxOffset + 8 - ArgSizeA8); 22867 22868 // Branch to "overflowMBB" if offset >= max 22869 // Fall through to "offsetMBB" otherwise 22870 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 22871 .addMBB(overflowMBB); 22872 } 22873 22874 // In offsetMBB, emit code to use the reg_save_area. 22875 if (offsetMBB) { 22876 assert(OffsetReg != 0); 22877 22878 // Read the reg_save_area address. 22879 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 22880 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 22881 .addOperand(Base) 22882 .addOperand(Scale) 22883 .addOperand(Index) 22884 .addDisp(Disp, 16) 22885 .addOperand(Segment) 22886 .setMemRefs(MMOBegin, MMOEnd); 22887 22888 // Zero-extend the offset 22889 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 22890 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 22891 .addImm(0) 22892 .addReg(OffsetReg) 22893 .addImm(X86::sub_32bit); 22894 22895 // Add the offset to the reg_save_area to get the final address. 22896 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 22897 .addReg(OffsetReg64) 22898 .addReg(RegSaveReg); 22899 22900 // Compute the offset for the next argument 22901 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 22902 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 22903 .addReg(OffsetReg) 22904 .addImm(UseFPOffset ? 16 : 8); 22905 22906 // Store it back into the va_list. 22907 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 22908 .addOperand(Base) 22909 .addOperand(Scale) 22910 .addOperand(Index) 22911 .addDisp(Disp, UseFPOffset ? 
4 : 0) 22912 .addOperand(Segment) 22913 .addReg(NextOffsetReg) 22914 .setMemRefs(MMOBegin, MMOEnd); 22915 22916 // Jump to endMBB 22917 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) 22918 .addMBB(endMBB); 22919 } 22920 22921 // 22922 // Emit code to use overflow area 22923 // 22924 22925 // Load the overflow_area address into a register. 22926 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 22927 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 22928 .addOperand(Base) 22929 .addOperand(Scale) 22930 .addOperand(Index) 22931 .addDisp(Disp, 8) 22932 .addOperand(Segment) 22933 .setMemRefs(MMOBegin, MMOEnd); 22934 22935 // If we need to align it, do so. Otherwise, just copy the address 22936 // to OverflowDestReg. 22937 if (NeedsAlign) { 22938 // Align the overflow address 22939 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); 22940 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 22941 22942 // aligned_addr = (addr + (align-1)) & ~(align-1) 22943 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 22944 .addReg(OverflowAddrReg) 22945 .addImm(Align-1); 22946 22947 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 22948 .addReg(TmpReg) 22949 .addImm(~(uint64_t)(Align-1)); 22950 } else { 22951 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 22952 .addReg(OverflowAddrReg); 22953 } 22954 22955 // Compute the next overflow address after this argument. 22956 // (the overflow address should be kept 8-byte aligned) 22957 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 22958 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 22959 .addReg(OverflowDestReg) 22960 .addImm(ArgSizeA8); 22961 22962 // Store the new overflow address. 22963 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 22964 .addOperand(Base) 22965 .addOperand(Scale) 22966 .addOperand(Index) 22967 .addDisp(Disp, 8) 22968 .addOperand(Segment) 22969 .addReg(NextAddrReg) 22970 .setMemRefs(MMOBegin, MMOEnd); 22971 22972 // If we branched, emit the PHI to the front of endMBB. 22973 if (offsetMBB) { 22974 BuildMI(*endMBB, endMBB->begin(), DL, 22975 TII->get(X86::PHI), DestReg) 22976 .addReg(OffsetDestReg).addMBB(offsetMBB) 22977 .addReg(OverflowDestReg).addMBB(overflowMBB); 22978 } 22979 22980 // Erase the pseudo instruction 22981 MI.eraseFromParent(); 22982 22983 return endMBB; 22984 } 22985 22986 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 22987 MachineInstr &MI, MachineBasicBlock *MBB) const { 22988 // Emit code to save XMM registers to the stack. The ABI says that the 22989 // number of registers to save is given in %al, so it's theoretically 22990 // possible to do an indirect jump trick to avoid saving all of them, 22991 // however this code takes a simpler approach and just executes all 22992 // of the stores if %al is non-zero. It's less code, and it's probably 22993 // easier on the hardware branch predictor, and stores aren't all that 22994 // expensive anyway. 22995 22996 // Create the new basic blocks. One block contains all the XMM stores, 22997 // and one block is the final destination regardless of whether any 22998 // stores were performed. 
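  // As a rough sketch (exact opcodes and addressing operands elided), the
  // control flow emitted below is:
  //
  //   MBB:        testb %al, %al
  //               je EndMBB                      ; only when not Win64
  //   XMMSaveMBB: movaps %xmm0, fpoff+0(FrameIndex)
  //               ...
  //               movaps %xmmN, fpoff+16*N(FrameIndex)
  //   EndMBB:     ; remainder of the original block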
22999 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 23000 MachineFunction *F = MBB->getParent(); 23001 MachineFunction::iterator MBBIter = ++MBB->getIterator(); 23002 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 23003 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 23004 F->insert(MBBIter, XMMSaveMBB); 23005 F->insert(MBBIter, EndMBB); 23006 23007 // Transfer the remainder of MBB and its successor edges to EndMBB. 23008 EndMBB->splice(EndMBB->begin(), MBB, 23009 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 23010 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 23011 23012 // The original block will now fall through to the XMM save block. 23013 MBB->addSuccessor(XMMSaveMBB); 23014 // The XMMSaveMBB will fall through to the end block. 23015 XMMSaveMBB->addSuccessor(EndMBB); 23016 23017 // Now add the instructions. 23018 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23019 DebugLoc DL = MI.getDebugLoc(); 23020 23021 unsigned CountReg = MI.getOperand(0).getReg(); 23022 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); 23023 int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); 23024 23025 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) { 23026 // If %al is 0, branch around the XMM save block. 23027 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 23028 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); 23029 MBB->addSuccessor(EndMBB); 23030 } 23031 23032 // Make sure the last operand is EFLAGS, which gets clobbered by the branch 23033 // that was just emitted, but clearly shouldn't be "saved". 23034 assert((MI.getNumOperands() <= 3 || 23035 !MI.getOperand(MI.getNumOperands() - 1).isReg() || 23036 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && 23037 "Expected last argument to be EFLAGS"); 23038 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; 23039 // In the XMM save block, save all the XMM argument registers. 23040 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { 23041 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 23042 MachineMemOperand *MMO = F->getMachineMemOperand( 23043 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), 23044 MachineMemOperand::MOStore, 23045 /*Size=*/16, /*Align=*/16); 23046 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 23047 .addFrameIndex(RegSaveFrameIndex) 23048 .addImm(/*Scale=*/1) 23049 .addReg(/*IndexReg=*/0) 23050 .addImm(/*Disp=*/Offset) 23051 .addReg(/*Segment=*/0) 23052 .addReg(MI.getOperand(i).getReg()) 23053 .addMemOperand(MMO); 23054 } 23055 23056 MI.eraseFromParent(); // The pseudo instruction is gone now. 23057 23058 return EndMBB; 23059 } 23060 23061 // The EFLAGS operand of SelectItr might be missing a kill marker 23062 // because there were multiple uses of EFLAGS, and ISel didn't know 23063 // which to mark. Figure out whether SelectItr should have had a 23064 // kill marker, and set it if it should. Returns the correct kill 23065 // marker value. 23066 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 23067 MachineBasicBlock* BB, 23068 const TargetRegisterInfo* TRI) { 23069 // Scan forward through BB for a use/def of EFLAGS. 
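  // For example, if the first EFLAGS event after SelectItr is a def (say a
  // later CMP that rewrites the flags), the flags die at this select and the
  // kill marker is added; if it is a read, or EFLAGS is live into a
  // successor block, the flags must stay live and no marker is added.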
23070 MachineBasicBlock::iterator miI(std::next(SelectItr)); 23071 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 23072 const MachineInstr& mi = *miI; 23073 if (mi.readsRegister(X86::EFLAGS)) 23074 return false; 23075 if (mi.definesRegister(X86::EFLAGS)) 23076 break; // Should have kill-flag - update below. 23077 } 23078 23079 // If we hit the end of the block, check whether EFLAGS is live into a 23080 // successor. 23081 if (miI == BB->end()) { 23082 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 23083 sEnd = BB->succ_end(); 23084 sItr != sEnd; ++sItr) { 23085 MachineBasicBlock* succ = *sItr; 23086 if (succ->isLiveIn(X86::EFLAGS)) 23087 return false; 23088 } 23089 } 23090 23091 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 23092 // out. SelectMI should have a kill flag on EFLAGS. 23093 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 23094 return true; 23095 } 23096 23097 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded 23098 // together with other CMOV pseudo-opcodes into a single basic-block with 23099 // conditional jump around it. 23100 static bool isCMOVPseudo(MachineInstr &MI) { 23101 switch (MI.getOpcode()) { 23102 case X86::CMOV_FR32: 23103 case X86::CMOV_FR64: 23104 case X86::CMOV_GR8: 23105 case X86::CMOV_GR16: 23106 case X86::CMOV_GR32: 23107 case X86::CMOV_RFP32: 23108 case X86::CMOV_RFP64: 23109 case X86::CMOV_RFP80: 23110 case X86::CMOV_V2F64: 23111 case X86::CMOV_V2I64: 23112 case X86::CMOV_V4F32: 23113 case X86::CMOV_V4F64: 23114 case X86::CMOV_V4I64: 23115 case X86::CMOV_V16F32: 23116 case X86::CMOV_V8F32: 23117 case X86::CMOV_V8F64: 23118 case X86::CMOV_V8I64: 23119 case X86::CMOV_V8I1: 23120 case X86::CMOV_V16I1: 23121 case X86::CMOV_V32I1: 23122 case X86::CMOV_V64I1: 23123 return true; 23124 23125 default: 23126 return false; 23127 } 23128 } 23129 23130 MachineBasicBlock * 23131 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, 23132 MachineBasicBlock *BB) const { 23133 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23134 DebugLoc DL = MI.getDebugLoc(); 23135 23136 // To "insert" a SELECT_CC instruction, we actually have to insert the 23137 // diamond control-flow pattern. The incoming instruction knows the 23138 // destination vreg to set, the condition code register to branch on, the 23139 // true/false values to select between, and a branch opcode to use. 23140 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 23141 MachineFunction::iterator It = ++BB->getIterator(); 23142 23143 // thisMBB: 23144 // ... 23145 // TrueVal = ... 23146 // cmpTY ccX, r1, r2 23147 // bCC copy1MBB 23148 // fallthrough --> copy0MBB 23149 MachineBasicBlock *thisMBB = BB; 23150 MachineFunction *F = BB->getParent(); 23151 23152 // This code lowers all pseudo-CMOV instructions. Generally it lowers these 23153 // as described above, by inserting a BB, and then making a PHI at the join 23154 // point to select the true and false operands of the CMOV in the PHI. 23155 // 23156 // The code also handles two different cases of multiple CMOV opcodes 23157 // in a row. 23158 // 23159 // Case 1: 23160 // In this case, there are multiple CMOVs in a row, all which are based on 23161 // the same condition setting (or the exact opposite condition setting). 23162 // In this case we can lower all the CMOVs using a single inserted BB, and 23163 // then make a number of PHIs at the join point to model the CMOVs. 
The only 23164 // trickiness here, is that in a case like: 23165 // 23166 // t2 = CMOV cond1 t1, f1 23167 // t3 = CMOV cond1 t2, f2 23168 // 23169 // when rewriting this into PHIs, we have to perform some renaming on the 23170 // temps since you cannot have a PHI operand refer to a PHI result earlier 23171 // in the same block. The "simple" but wrong lowering would be: 23172 // 23173 // t2 = PHI t1(BB1), f1(BB2) 23174 // t3 = PHI t2(BB1), f2(BB2) 23175 // 23176 // but clearly t2 is not defined in BB1, so that is incorrect. The proper 23177 // renaming is to note that on the path through BB1, t2 is really just a 23178 // copy of t1, and do that renaming, properly generating: 23179 // 23180 // t2 = PHI t1(BB1), f1(BB2) 23181 // t3 = PHI t1(BB1), f2(BB2) 23182 // 23183 // Case 2, we lower cascaded CMOVs such as 23184 // 23185 // (CMOV (CMOV F, T, cc1), T, cc2) 23186 // 23187 // to two successives branches. For that, we look for another CMOV as the 23188 // following instruction. 23189 // 23190 // Without this, we would add a PHI between the two jumps, which ends up 23191 // creating a few copies all around. For instance, for 23192 // 23193 // (sitofp (zext (fcmp une))) 23194 // 23195 // we would generate: 23196 // 23197 // ucomiss %xmm1, %xmm0 23198 // movss <1.0f>, %xmm0 23199 // movaps %xmm0, %xmm1 23200 // jne .LBB5_2 23201 // xorps %xmm1, %xmm1 23202 // .LBB5_2: 23203 // jp .LBB5_4 23204 // movaps %xmm1, %xmm0 23205 // .LBB5_4: 23206 // retq 23207 // 23208 // because this custom-inserter would have generated: 23209 // 23210 // A 23211 // | \ 23212 // | B 23213 // | / 23214 // C 23215 // | \ 23216 // | D 23217 // | / 23218 // E 23219 // 23220 // A: X = ...; Y = ... 23221 // B: empty 23222 // C: Z = PHI [X, A], [Y, B] 23223 // D: empty 23224 // E: PHI [X, C], [Z, D] 23225 // 23226 // If we lower both CMOVs in a single step, we can instead generate: 23227 // 23228 // A 23229 // | \ 23230 // | C 23231 // | /| 23232 // |/ | 23233 // | | 23234 // | D 23235 // | / 23236 // E 23237 // 23238 // A: X = ...; Y = ... 23239 // D: empty 23240 // E: PHI [X, A], [X, C], [Y, D] 23241 // 23242 // Which, in our sitofp/fcmp example, gives us something like: 23243 // 23244 // ucomiss %xmm1, %xmm0 23245 // movss <1.0f>, %xmm0 23246 // jne .LBB5_4 23247 // jp .LBB5_4 23248 // xorps %xmm0, %xmm0 23249 // .LBB5_4: 23250 // retq 23251 // 23252 MachineInstr *CascadedCMOV = nullptr; 23253 MachineInstr *LastCMOV = &MI; 23254 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); 23255 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); 23256 MachineBasicBlock::iterator NextMIIt = 23257 std::next(MachineBasicBlock::iterator(MI)); 23258 23259 // Check for case 1, where there are multiple CMOVs with the same condition 23260 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the 23261 // number of jumps the most. 23262 23263 if (isCMOVPseudo(MI)) { 23264 // See if we have a string of CMOVS with the same condition. 23265 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) && 23266 (NextMIIt->getOperand(3).getImm() == CC || 23267 NextMIIt->getOperand(3).getImm() == OppCC)) { 23268 LastCMOV = &*NextMIIt; 23269 ++NextMIIt; 23270 } 23271 } 23272 23273 // This checks for case 2, but only do this if we didn't already find 23274 // case 1, as indicated by LastCMOV == MI. 
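  // For illustration, the cascaded pair matched here looks roughly like:
  //
  //   %t = CMOV_GR32 %f, %tval, cc1            ; first CMOV (MI)
  //   %r = CMOV_GR32 killed %t, %tval, cc2     ; NextMIIt
  //
  // i.e. the second CMOV consumes the first one's result as its false
  // operand (operand 1) and reuses the same true operand (operand 2),
  // corresponding to (CMOV (CMOV F, T, cc1), T, cc2) above.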
23275 if (LastCMOV == &MI && NextMIIt != BB->end() && 23276 NextMIIt->getOpcode() == MI.getOpcode() && 23277 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && 23278 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && 23279 NextMIIt->getOperand(1).isKill()) { 23280 CascadedCMOV = &*NextMIIt; 23281 } 23282 23283 MachineBasicBlock *jcc1MBB = nullptr; 23284 23285 // If we have a cascaded CMOV, we lower it to two successive branches to 23286 // the same block. EFLAGS is used by both, so mark it as live in the second. 23287 if (CascadedCMOV) { 23288 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); 23289 F->insert(It, jcc1MBB); 23290 jcc1MBB->addLiveIn(X86::EFLAGS); 23291 } 23292 23293 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 23294 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 23295 F->insert(It, copy0MBB); 23296 F->insert(It, sinkMBB); 23297 23298 // If the EFLAGS register isn't dead in the terminator, then claim that it's 23299 // live into the sink and copy blocks. 23300 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 23301 23302 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; 23303 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && 23304 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { 23305 copy0MBB->addLiveIn(X86::EFLAGS); 23306 sinkMBB->addLiveIn(X86::EFLAGS); 23307 } 23308 23309 // Transfer the remainder of BB and its successor edges to sinkMBB. 23310 sinkMBB->splice(sinkMBB->begin(), BB, 23311 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); 23312 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 23313 23314 // Add the true and fallthrough blocks as its successors. 23315 if (CascadedCMOV) { 23316 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. 23317 BB->addSuccessor(jcc1MBB); 23318 23319 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and 23320 // jump to the sinkMBB. 23321 jcc1MBB->addSuccessor(copy0MBB); 23322 jcc1MBB->addSuccessor(sinkMBB); 23323 } else { 23324 BB->addSuccessor(copy0MBB); 23325 } 23326 23327 // The true block target of the first (or only) branch is always sinkMBB. 23328 BB->addSuccessor(sinkMBB); 23329 23330 // Create the conditional branch instruction. 23331 unsigned Opc = X86::GetCondBranchFromCond(CC); 23332 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 23333 23334 if (CascadedCMOV) { 23335 unsigned Opc2 = X86::GetCondBranchFromCond( 23336 (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); 23337 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); 23338 } 23339 23340 // copy0MBB: 23341 // %FalseValue = ... 23342 // # fallthrough to sinkMBB 23343 copy0MBB->addSuccessor(sinkMBB); 23344 23345 // sinkMBB: 23346 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 23347 // ... 23348 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); 23349 MachineBasicBlock::iterator MIItEnd = 23350 std::next(MachineBasicBlock::iterator(LastCMOV)); 23351 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); 23352 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; 23353 MachineInstrBuilder MIB; 23354 23355 // As we are creating the PHIs, we have to be careful if there is more than 23356 // one. Later CMOVs may reference the results of earlier CMOVs, but later 23357 // PHIs have to reference the individual true/false inputs from earlier PHIs. 
23358 // That also means that PHI construction must work forward from earlier to 23359 // later, and that the code must maintain a mapping from earlier PHI's 23360 // destination registers, and the registers that went into the PHI. 23361 23362 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { 23363 unsigned DestReg = MIIt->getOperand(0).getReg(); 23364 unsigned Op1Reg = MIIt->getOperand(1).getReg(); 23365 unsigned Op2Reg = MIIt->getOperand(2).getReg(); 23366 23367 // If this CMOV we are generating is the opposite condition from 23368 // the jump we generated, then we have to swap the operands for the 23369 // PHI that is going to be generated. 23370 if (MIIt->getOperand(3).getImm() == OppCC) 23371 std::swap(Op1Reg, Op2Reg); 23372 23373 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) 23374 Op1Reg = RegRewriteTable[Op1Reg].first; 23375 23376 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) 23377 Op2Reg = RegRewriteTable[Op2Reg].second; 23378 23379 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, 23380 TII->get(X86::PHI), DestReg) 23381 .addReg(Op1Reg).addMBB(copy0MBB) 23382 .addReg(Op2Reg).addMBB(thisMBB); 23383 23384 // Add this PHI to the rewrite table. 23385 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); 23386 } 23387 23388 // If we have a cascaded CMOV, the second Jcc provides the same incoming 23389 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). 23390 if (CascadedCMOV) { 23391 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB); 23392 // Copy the PHI result to the register defined by the second CMOV. 23393 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), 23394 DL, TII->get(TargetOpcode::COPY), 23395 CascadedCMOV->getOperand(0).getReg()) 23396 .addReg(MI.getOperand(0).getReg()); 23397 CascadedCMOV->eraseFromParent(); 23398 } 23399 23400 // Now remove the CMOV(s). 23401 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) 23402 (MIIt++)->eraseFromParent(); 23403 23404 return sinkMBB; 23405 } 23406 23407 MachineBasicBlock * 23408 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, 23409 MachineBasicBlock *BB) const { 23410 // Combine the following atomic floating-point modification pattern: 23411 // a.store(reg OP a.load(acquire), release) 23412 // Transform them into: 23413 // OPss (%gpr), %xmm 23414 // movss %xmm, (%gpr) 23415 // Or sd equivalent for 64-bit operations. 23416 unsigned MOp, FOp; 23417 switch (MI.getOpcode()) { 23418 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); 23419 case X86::RELEASE_FADD32mr: 23420 FOp = X86::ADDSSrm; 23421 MOp = X86::MOVSSmr; 23422 break; 23423 case X86::RELEASE_FADD64mr: 23424 FOp = X86::ADDSDrm; 23425 MOp = X86::MOVSDmr; 23426 break; 23427 } 23428 const X86InstrInfo *TII = Subtarget.getInstrInfo(); 23429 DebugLoc DL = MI.getDebugLoc(); 23430 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 23431 unsigned ValOpIdx = X86::AddrNumOperands; 23432 unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); 23433 MachineInstrBuilder MIB = 23434 BuildMI(*BB, MI, DL, TII->get(FOp), 23435 MRI.createVirtualRegister(MRI.getRegClass(VSrc))) 23436 .addReg(VSrc); 23437 for (int i = 0; i < X86::AddrNumOperands; ++i) { 23438 MachineOperand &Operand = MI.getOperand(i); 23439 // Clear any kill flags on register operands as we'll create a second 23440 // instruction using the same address operands. 
23441 if (Operand.isReg()) 23442 Operand.setIsKill(false); 23443 MIB.addOperand(Operand); 23444 } 23445 MachineInstr *FOpMI = MIB; 23446 MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); 23447 for (int i = 0; i < X86::AddrNumOperands; ++i) 23448 MIB.addOperand(MI.getOperand(i)); 23449 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); 23450 MI.eraseFromParent(); // The pseudo instruction is gone now. 23451 return BB; 23452 } 23453 23454 MachineBasicBlock * 23455 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, 23456 MachineBasicBlock *BB) const { 23457 MachineFunction *MF = BB->getParent(); 23458 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23459 DebugLoc DL = MI.getDebugLoc(); 23460 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 23461 23462 assert(MF->shouldSplitStack()); 23463 23464 const bool Is64Bit = Subtarget.is64Bit(); 23465 const bool IsLP64 = Subtarget.isTarget64BitLP64(); 23466 23467 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 23468 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; 23469 23470 // BB: 23471 // ... [Till the alloca] 23472 // If stacklet is not large enough, jump to mallocMBB 23473 // 23474 // bumpMBB: 23475 // Allocate by subtracting from RSP 23476 // Jump to continueMBB 23477 // 23478 // mallocMBB: 23479 // Allocate by call to runtime 23480 // 23481 // continueMBB: 23482 // ... 23483 // [rest of original BB] 23484 // 23485 23486 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 23487 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 23488 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 23489 23490 MachineRegisterInfo &MRI = MF->getRegInfo(); 23491 const TargetRegisterClass *AddrRegClass = 23492 getRegClassFor(getPointerTy(MF->getDataLayout())); 23493 23494 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 23495 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 23496 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 23497 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 23498 sizeVReg = MI.getOperand(1).getReg(), 23499 physSPReg = 23500 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; 23501 23502 MachineFunction::iterator MBBIter = ++BB->getIterator(); 23503 23504 MF->insert(MBBIter, bumpMBB); 23505 MF->insert(MBBIter, mallocMBB); 23506 MF->insert(MBBIter, continueMBB); 23507 23508 continueMBB->splice(continueMBB->begin(), BB, 23509 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 23510 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 23511 23512 // Add code to the main basic block to check if the stack limit has been hit, 23513 // and if so, jump to mallocMBB otherwise to bumpMBB. 23514 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 23515 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 23516 .addReg(tmpSPVReg).addReg(sizeVReg); 23517 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) 23518 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 23519 .addReg(SPLimitVReg); 23520 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); 23521 23522 // bumpMBB simply decreases the stack pointer, since we know the current 23523 // stacklet has enough space. 
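  // SPLimitVReg already holds (old SP - size) from the SUB above, so it is
  // used both as the new stack pointer and as the address of the allocated
  // block (bumpSPPtrVReg).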
23524 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 23525 .addReg(SPLimitVReg); 23526 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 23527 .addReg(SPLimitVReg); 23528 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); 23529 23530 // Calls into a routine in libgcc to allocate more space from the heap. 23531 const uint32_t *RegMask = 23532 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); 23533 if (IsLP64) { 23534 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 23535 .addReg(sizeVReg); 23536 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 23537 .addExternalSymbol("__morestack_allocate_stack_space") 23538 .addRegMask(RegMask) 23539 .addReg(X86::RDI, RegState::Implicit) 23540 .addReg(X86::RAX, RegState::ImplicitDefine); 23541 } else if (Is64Bit) { 23542 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) 23543 .addReg(sizeVReg); 23544 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 23545 .addExternalSymbol("__morestack_allocate_stack_space") 23546 .addRegMask(RegMask) 23547 .addReg(X86::EDI, RegState::Implicit) 23548 .addReg(X86::EAX, RegState::ImplicitDefine); 23549 } else { 23550 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 23551 .addImm(12); 23552 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 23553 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 23554 .addExternalSymbol("__morestack_allocate_stack_space") 23555 .addRegMask(RegMask) 23556 .addReg(X86::EAX, RegState::ImplicitDefine); 23557 } 23558 23559 if (!Is64Bit) 23560 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 23561 .addImm(16); 23562 23563 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 23564 .addReg(IsLP64 ? X86::RAX : X86::EAX); 23565 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); 23566 23567 // Set up the CFG correctly. 23568 BB->addSuccessor(bumpMBB); 23569 BB->addSuccessor(mallocMBB); 23570 mallocMBB->addSuccessor(continueMBB); 23571 bumpMBB->addSuccessor(continueMBB); 23572 23573 // Take care of the PHI nodes. 23574 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 23575 MI.getOperand(0).getReg()) 23576 .addReg(mallocPtrVReg) 23577 .addMBB(mallocMBB) 23578 .addReg(bumpSPPtrVReg) 23579 .addMBB(bumpMBB); 23580 23581 // Delete the original pseudo instruction. 23582 MI.eraseFromParent(); 23583 23584 // And we're done. 23585 return continueMBB; 23586 } 23587 23588 MachineBasicBlock * 23589 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, 23590 MachineBasicBlock *BB) const { 23591 MachineFunction *MF = BB->getParent(); 23592 const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); 23593 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); 23594 DebugLoc DL = MI.getDebugLoc(); 23595 23596 assert(!isAsynchronousEHPersonality( 23597 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && 23598 "SEH does not use catchret!"); 23599 23600 // Only 32-bit EH needs to worry about manually restoring stack pointers. 23601 if (!Subtarget.is32Bit()) 23602 return BB; 23603 23604 // C++ EH creates a new target block to hold the restore code, and wires up 23605 // the new block to the return destination with a normal JMP_4. 
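  // Roughly, the rewrite performed below is:
  //
  //   CATCHRET %TargetMBB          CATCHRET %RestoreMBB
  //                          ==>   RestoreMBB:
  //                                  EH_RESTORE          ; restore stack pointer
  //                                  JMP_4 %TargetMBB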
23606 MachineBasicBlock *RestoreMBB = 23607 MF->CreateMachineBasicBlock(BB->getBasicBlock()); 23608 assert(BB->succ_size() == 1); 23609 MF->insert(std::next(BB->getIterator()), RestoreMBB); 23610 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); 23611 BB->addSuccessor(RestoreMBB); 23612 MI.getOperand(0).setMBB(RestoreMBB); 23613 23614 auto RestoreMBBI = RestoreMBB->begin(); 23615 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); 23616 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); 23617 return BB; 23618 } 23619 23620 MachineBasicBlock * 23621 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, 23622 MachineBasicBlock *BB) const { 23623 MachineFunction *MF = BB->getParent(); 23624 const Constant *PerFn = MF->getFunction()->getPersonalityFn(); 23625 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); 23626 // Only 32-bit SEH requires special handling for catchpad. 23627 if (IsSEH && Subtarget.is32Bit()) { 23628 const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); 23629 DebugLoc DL = MI.getDebugLoc(); 23630 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); 23631 } 23632 MI.eraseFromParent(); 23633 return BB; 23634 } 23635 23636 MachineBasicBlock * 23637 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, 23638 MachineBasicBlock *BB) const { 23639 // So, here we replace TLSADDR with the sequence: 23640 // adjust_stackdown -> TLSADDR -> adjust_stackup. 23641 // We need this because TLSADDR is lowered into calls 23642 // inside MC, therefore without the two markers shrink-wrapping 23643 // may push the prologue/epilogue pass them. 23644 const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); 23645 DebugLoc DL = MI.getDebugLoc(); 23646 MachineFunction &MF = *BB->getParent(); 23647 23648 // Emit CALLSEQ_START right before the instruction. 23649 unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); 23650 MachineInstrBuilder CallseqStart = 23651 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); 23652 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); 23653 23654 // Emit CALLSEQ_END right after the instruction. 23655 // We don't call erase from parent because we want to keep the 23656 // original instruction around. 23657 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); 23658 MachineInstrBuilder CallseqEnd = 23659 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); 23660 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); 23661 23662 return BB; 23663 } 23664 23665 MachineBasicBlock * 23666 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, 23667 MachineBasicBlock *BB) const { 23668 // This is pretty easy. We're taking the value that we received from 23669 // our load from the relocation, sticking it in either RDI (x86-64) 23670 // or EAX and doing an indirect call. The return value will then 23671 // be in the normal return register. 23672 MachineFunction *F = BB->getParent(); 23673 const X86InstrInfo *TII = Subtarget.getInstrInfo(); 23674 DebugLoc DL = MI.getDebugLoc(); 23675 23676 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); 23677 assert(MI.getOperand(3).isGlobal() && "This should be a global"); 23678 23679 // Get a register mask for the lowered call. 23680 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 23681 // proper register mask. 23682 const uint32_t *RegMask = 23683 Subtarget.is64Bit() ? 
23684 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : 23685 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); 23686 if (Subtarget.is64Bit()) { 23687 MachineInstrBuilder MIB = 23688 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) 23689 .addReg(X86::RIP) 23690 .addImm(0) 23691 .addReg(0) 23692 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, 23693 MI.getOperand(3).getTargetFlags()) 23694 .addReg(0); 23695 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 23696 addDirectMem(MIB, X86::RDI); 23697 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 23698 } else if (!isPositionIndependent()) { 23699 MachineInstrBuilder MIB = 23700 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) 23701 .addReg(0) 23702 .addImm(0) 23703 .addReg(0) 23704 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, 23705 MI.getOperand(3).getTargetFlags()) 23706 .addReg(0); 23707 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 23708 addDirectMem(MIB, X86::EAX); 23709 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 23710 } else { 23711 MachineInstrBuilder MIB = 23712 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) 23713 .addReg(TII->getGlobalBaseReg(F)) 23714 .addImm(0) 23715 .addReg(0) 23716 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, 23717 MI.getOperand(3).getTargetFlags()) 23718 .addReg(0); 23719 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 23720 addDirectMem(MIB, X86::EAX); 23721 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 23722 } 23723 23724 MI.eraseFromParent(); // The pseudo instruction is gone now. 23725 return BB; 23726 } 23727 23728 MachineBasicBlock * 23729 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, 23730 MachineBasicBlock *MBB) const { 23731 DebugLoc DL = MI.getDebugLoc(); 23732 MachineFunction *MF = MBB->getParent(); 23733 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23734 MachineRegisterInfo &MRI = MF->getRegInfo(); 23735 23736 const BasicBlock *BB = MBB->getBasicBlock(); 23737 MachineFunction::iterator I = ++MBB->getIterator(); 23738 23739 // Memory Reference 23740 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 23741 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 23742 23743 unsigned DstReg; 23744 unsigned MemOpndSlot = 0; 23745 23746 unsigned CurOp = 0; 23747 23748 DstReg = MI.getOperand(CurOp++).getReg(); 23749 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 23750 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 23751 unsigned mainDstReg = MRI.createVirtualRegister(RC); 23752 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 23753 23754 MemOpndSlot = CurOp; 23755 23756 MVT PVT = getPointerTy(MF->getDataLayout()); 23757 assert((PVT == MVT::i64 || PVT == MVT::i32) && 23758 "Invalid Pointer Size!"); 23759 23760 // For v = setjmp(buf), we generate 23761 // 23762 // thisMBB: 23763 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB 23764 // SjLjSetup restoreMBB 23765 // 23766 // mainMBB: 23767 // v_main = 0 23768 // 23769 // sinkMBB: 23770 // v = phi(main, restore) 23771 // 23772 // restoreMBB: 23773 // if base pointer being used, load it from frame 23774 // v_restore = 1 23775 23776 MachineBasicBlock *thisMBB = MBB; 23777 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 23778 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 23779 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 23780 MF->insert(I, mainMBB); 23781 MF->insert(I, sinkMBB); 23782 
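  // restoreMBB is reached via the longjmp return path (its address is what
  // gets stored into the setjmp buffer below), so it is appended at the end
  // of the function and marked address-taken rather than placed inline.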
MF->push_back(restoreMBB); 23783 restoreMBB->setHasAddressTaken(); 23784 23785 MachineInstrBuilder MIB; 23786 23787 // Transfer the remainder of BB and its successor edges to sinkMBB. 23788 sinkMBB->splice(sinkMBB->begin(), MBB, 23789 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 23790 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 23791 23792 // thisMBB: 23793 unsigned PtrStoreOpc = 0; 23794 unsigned LabelReg = 0; 23795 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 23796 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && 23797 !isPositionIndependent(); 23798 23799 // Prepare IP either in reg or imm. 23800 if (!UseImmLabel) { 23801 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 23802 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 23803 LabelReg = MRI.createVirtualRegister(PtrRC); 23804 if (Subtarget.is64Bit()) { 23805 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 23806 .addReg(X86::RIP) 23807 .addImm(0) 23808 .addReg(0) 23809 .addMBB(restoreMBB) 23810 .addReg(0); 23811 } else { 23812 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 23813 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 23814 .addReg(XII->getGlobalBaseReg(MF)) 23815 .addImm(0) 23816 .addReg(0) 23817 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) 23818 .addReg(0); 23819 } 23820 } else 23821 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; 23822 // Store IP 23823 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 23824 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 23825 if (i == X86::AddrDisp) 23826 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); 23827 else 23828 MIB.addOperand(MI.getOperand(MemOpndSlot + i)); 23829 } 23830 if (!UseImmLabel) 23831 MIB.addReg(LabelReg); 23832 else 23833 MIB.addMBB(restoreMBB); 23834 MIB.setMemRefs(MMOBegin, MMOEnd); 23835 // Setup 23836 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 23837 .addMBB(restoreMBB); 23838 23839 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 23840 MIB.addRegMask(RegInfo->getNoPreservedMask()); 23841 thisMBB->addSuccessor(mainMBB); 23842 thisMBB->addSuccessor(restoreMBB); 23843 23844 // mainMBB: 23845 // EAX = 0 23846 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 23847 mainMBB->addSuccessor(sinkMBB); 23848 23849 // sinkMBB: 23850 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 23851 TII->get(X86::PHI), DstReg) 23852 .addReg(mainDstReg).addMBB(mainMBB) 23853 .addReg(restoreDstReg).addMBB(restoreMBB); 23854 23855 // restoreMBB: 23856 if (RegInfo->hasBasePointer(*MF)) { 23857 const bool Uses64BitFramePtr = 23858 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); 23859 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); 23860 X86FI->setRestoreBasePointer(MF); 23861 unsigned FramePtr = RegInfo->getFrameRegister(*MF); 23862 unsigned BasePtr = RegInfo->getBaseRegister(); 23863 unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64rm : X86::MOV32rm; 23864 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), 23865 FramePtr, true, X86FI->getRestoreBasePointerOffset()) 23866 .setMIFlag(MachineInstr::FrameSetup); 23867 } 23868 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 23869 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); 23870 restoreMBB->addSuccessor(sinkMBB); 23871 23872 MI.eraseFromParent(); 23873 return sinkMBB; 23874 } 23875 23876 MachineBasicBlock * 23877 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, 23878 MachineBasicBlock *MBB) const { 23879 DebugLoc DL = MI.getDebugLoc(); 23880 MachineFunction *MF = MBB->getParent(); 23881 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23882 MachineRegisterInfo &MRI = MF->getRegInfo(); 23883 23884 // Memory Reference 23885 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 23886 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 23887 23888 MVT PVT = getPointerTy(MF->getDataLayout()); 23889 assert((PVT == MVT::i64 || PVT == MVT::i32) && 23890 "Invalid Pointer Size!"); 23891 23892 const TargetRegisterClass *RC = 23893 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; 23894 unsigned Tmp = MRI.createVirtualRegister(RC); 23895 // Since FP is only updated here but NOT referenced, it's treated as GPR. 23896 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 23897 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 23898 unsigned SP = RegInfo->getStackRegister(); 23899 23900 MachineInstrBuilder MIB; 23901 23902 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 23903 const int64_t SPOffset = 2 * PVT.getStoreSize(); 23904 23905 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 23906 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; 23907 23908 // Reload FP 23909 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 23910 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 23911 MIB.addOperand(MI.getOperand(i)); 23912 MIB.setMemRefs(MMOBegin, MMOEnd); 23913 // Reload IP 23914 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 23915 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 23916 if (i == X86::AddrDisp) 23917 MIB.addDisp(MI.getOperand(i), LabelOffset); 23918 else 23919 MIB.addOperand(MI.getOperand(i)); 23920 } 23921 MIB.setMemRefs(MMOBegin, MMOEnd); 23922 // Reload SP 23923 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 23924 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 23925 if (i == X86::AddrDisp) 23926 MIB.addDisp(MI.getOperand(i), SPOffset); 23927 else 23928 MIB.addOperand(MI.getOperand(i)); 23929 } 23930 MIB.setMemRefs(MMOBegin, MMOEnd); 23931 // Jump 23932 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 23933 23934 MI.eraseFromParent(); 23935 return MBB; 23936 } 23937 23938 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 23939 MachineBasicBlock *MBB, 23940 MachineBasicBlock *DispatchBB, 23941 int FI) const { 23942 DebugLoc DL = MI.getDebugLoc(); 23943 MachineFunction *MF = MBB->getParent(); 23944 MachineRegisterInfo *MRI = &MF->getRegInfo(); 23945 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23946 23947 MVT PVT = getPointerTy(MF->getDataLayout()); 23948 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); 23949 23950 unsigned Op = 0; 23951 unsigned VR = 0; 23952 23953 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && 23954 !isPositionIndependent(); 23955 23956 if (UseImmLabel) { 23957 Op = (PVT == MVT::i64) ? 
X86::MOV64mi32 : X86::MOV32mi; 23958 } else { 23959 const TargetRegisterClass *TRC = 23960 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; 23961 VR = MRI->createVirtualRegister(TRC); 23962 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 23963 23964 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */ 23965 23966 if (Subtarget.is64Bit()) 23967 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) 23968 .addReg(X86::RIP) 23969 .addImm(1) 23970 .addReg(0) 23971 .addMBB(DispatchBB) 23972 .addReg(0); 23973 else 23974 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) 23975 .addReg(0) /* XII->getGlobalBaseReg(MF) */ 23976 .addImm(1) 23977 .addReg(0) 23978 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) 23979 .addReg(0); 23980 } 23981 23982 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); 23983 addFrameReference(MIB, FI, 36); 23984 if (UseImmLabel) 23985 MIB.addMBB(DispatchBB); 23986 else 23987 MIB.addReg(VR); 23988 } 23989 23990 MachineBasicBlock * 23991 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 23992 MachineBasicBlock *BB) const { 23993 DebugLoc DL = MI.getDebugLoc(); 23994 MachineFunction *MF = BB->getParent(); 23995 MachineModuleInfo *MMI = &MF->getMMI(); 23996 MachineFrameInfo *MFI = MF->getFrameInfo(); 23997 MachineRegisterInfo *MRI = &MF->getRegInfo(); 23998 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 23999 int FI = MFI->getFunctionContextIndex(); 24000 24001 // Get a mapping of the call site numbers to all of the landing pads they're 24002 // associated with. 24003 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad; 24004 unsigned MaxCSNum = 0; 24005 for (auto &MBB : *MF) { 24006 if (!MBB.isEHPad()) 24007 continue; 24008 24009 MCSymbol *Sym = nullptr; 24010 for (const auto &MI : MBB) { 24011 if (MI.isDebugValue()) 24012 continue; 24013 24014 assert(MI.isEHLabel() && "expected EH_LABEL"); 24015 Sym = MI.getOperand(0).getMCSymbol(); 24016 break; 24017 } 24018 24019 if (!MMI->hasCallSiteLandingPad(Sym)) 24020 continue; 24021 24022 for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) { 24023 CallSiteNumToLPad[CSI].push_back(&MBB); 24024 MaxCSNum = std::max(MaxCSNum, CSI); 24025 } 24026 } 24027 24028 // Get an ordered list of the machine basic blocks for the jump table. 24029 std::vector<MachineBasicBlock *> LPadList; 24030 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs; 24031 LPadList.reserve(CallSiteNumToLPad.size()); 24032 24033 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { 24034 for (auto &LP : CallSiteNumToLPad[CSI]) { 24035 LPadList.push_back(LP); 24036 InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); 24037 } 24038 } 24039 24040 assert(!LPadList.empty() && 24041 "No landing pad destinations for the dispatch jump table!"); 24042 24043 // Create the MBBs for the dispatch code. 24044 24045 // Shove the dispatch's address into the return slot in the function context. 24046 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 24047 DispatchBB->setIsEHPad(true); 24048 24049 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 24050 BuildMI(TrapBB, DL, TII->get(X86::TRAP)); 24051 DispatchBB->addSuccessor(TrapBB); 24052 24053 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 24054 DispatchBB->addSuccessor(DispContBB); 24055 24056 // Insert MBBs. 24057 MF->push_back(DispatchBB); 24058 MF->push_back(DispContBB); 24059 MF->push_back(TrapBB); 24060 24061 // Insert code into the entry block that creates and registers the function 24062 // context. 
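  // The dispatch block itself, filled in further below, behaves roughly like:
  //
  //   call_site = load32 [FI + 4]          ; call-site index from the context
  //   if (call_site > LPadList.size())
  //     goto TrapBB
  //   goto *JumpTable[call_site - 1]       ; branch to the landing pad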
24063 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); 24064 24065 // Create the jump table and associated information 24066 MachineJumpTableInfo *JTI = 24067 MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); 24068 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 24069 24070 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); 24071 const X86RegisterInfo &RI = XII->getRegisterInfo(); 24072 24073 // Add a register mask with no preserved registers. This results in all 24074 // registers being marked as clobbered. 24075 if (RI.hasBasePointer(*MF)) { 24076 const bool FPIs64Bit = 24077 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); 24078 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); 24079 MFI->setRestoreBasePointer(MF); 24080 24081 unsigned FP = RI.getFrameRegister(*MF); 24082 unsigned BP = RI.getBaseRegister(); 24083 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; 24084 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, 24085 MFI->getRestoreBasePointerOffset()) 24086 .addRegMask(RI.getNoPreservedMask()); 24087 } else { 24088 BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) 24089 .addRegMask(RI.getNoPreservedMask()); 24090 } 24091 24092 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass); 24093 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, 24094 4); 24095 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) 24096 .addReg(IReg) 24097 .addImm(LPadList.size()); 24098 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB); 24099 24100 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass); 24101 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg) 24102 .addReg(IReg) 24103 .addImm(1); 24104 BuildMI(DispContBB, DL, 24105 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m)) 24106 .addReg(0) 24107 .addImm(Subtarget.is64Bit() ? 8 : 4) 24108 .addReg(JReg) 24109 .addJumpTableIndex(MJTI) 24110 .addReg(0); 24111 24112 // Add the jump table entries as successors to the MBB. 24113 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs; 24114 for (auto &LP : LPadList) 24115 if (SeenMBBs.insert(LP).second) 24116 DispContBB->addSuccessor(LP); 24117 24118 // N.B. the order the invoke BBs are processed in doesn't matter here. 24119 SmallVector<MachineBasicBlock *, 64> MBBLPads; 24120 const MCPhysReg *SavedRegs = 24121 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF); 24122 for (MachineBasicBlock *MBB : InvokeBBs) { 24123 // Remove the landing pad successor from the invoke block and replace it 24124 // with the new dispatch block. 24125 // Keep a copy of Successors since it's modified inside the loop. 24126 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(), 24127 MBB->succ_rend()); 24128 // FIXME: Avoid quadratic complexity. 24129 for (auto MBBS : Successors) { 24130 if (MBBS->isEHPad()) { 24131 MBB->removeSuccessor(MBBS); 24132 MBBLPads.push_back(MBBS); 24133 } 24134 } 24135 24136 MBB->addSuccessor(DispatchBB); 24137 24138 // Find the invoke call and mark all of the callee-saved registers as 24139 // 'implicit defined' so that they're spilled. This prevents code from 24140 // moving instructions to before the EH block, where they will never be 24141 // executed. 
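  // Concretely: walk each invoke block backwards to its call, collect every
  // register the call already mentions, and give each remaining callee-saved
  // register an implicit dead def on that call.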
24142 for (auto &II : reverse(*MBB)) { 24143 if (!II.isCall()) 24144 continue; 24145 24146 DenseMap<unsigned, bool> DefRegs; 24147 for (auto &MOp : II.operands()) 24148 if (MOp.isReg()) 24149 DefRegs[MOp.getReg()] = true; 24150 24151 MachineInstrBuilder MIB(*MF, &II); 24152 for (unsigned RI = 0; SavedRegs[RI]; ++RI) { 24153 unsigned Reg = SavedRegs[RI]; 24154 if (!DefRegs[Reg]) 24155 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 24156 } 24157 24158 break; 24159 } 24160 } 24161 24162 // Mark all former landing pads as non-landing pads. The dispatch is the only 24163 // landing pad now. 24164 for (auto &LP : MBBLPads) 24165 LP->setIsEHPad(false); 24166 24167 // The instruction is gone now. 24168 MI.eraseFromParent(); 24169 return BB; 24170 } 24171 24172 // Replace 213-type (isel default) FMA3 instructions with 231-type for 24173 // accumulator loops. Writing back to the accumulator allows the coalescer 24174 // to remove extra copies in the loop. 24175 // FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). 24176 MachineBasicBlock * 24177 X86TargetLowering::emitFMA3Instr(MachineInstr &MI, 24178 MachineBasicBlock *MBB) const { 24179 MachineOperand &AddendOp = MI.getOperand(3); 24180 24181 // Bail out early if the addend isn't a register - we can't switch these. 24182 if (!AddendOp.isReg()) 24183 return MBB; 24184 24185 MachineFunction &MF = *MBB->getParent(); 24186 MachineRegisterInfo &MRI = MF.getRegInfo(); 24187 24188 // Check whether the addend is defined by a PHI: 24189 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); 24190 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); 24191 if (!AddendDef.isPHI()) 24192 return MBB; 24193 24194 // Look for the following pattern: 24195 // loop: 24196 // %addend = phi [%entry, 0], [%loop, %result] 24197 // ... 24198 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend 24199 24200 // Replace with: 24201 // loop: 24202 // %addend = phi [%entry, 0], [%loop, %result] 24203 // ... 24204 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 24205 24206 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { 24207 assert(AddendDef.getOperand(i).isReg()); 24208 MachineOperand PHISrcOp = AddendDef.getOperand(i); 24209 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); 24210 if (&PHISrcInst == &MI) { 24211 // Found a matching instruction. 
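      // The switch below only swaps the 213 opcode for its 231 counterpart;
      // the operand reordering itself (the addend becomes the tied first
      // source, matching the FMA231 form shown above) happens when the
      // replacement instruction is built after the switch.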
24212 unsigned NewFMAOpc = 0; 24213 switch (MI.getOpcode()) { 24214 case X86::VFMADDPDr213r: 24215 NewFMAOpc = X86::VFMADDPDr231r; 24216 break; 24217 case X86::VFMADDPSr213r: 24218 NewFMAOpc = X86::VFMADDPSr231r; 24219 break; 24220 case X86::VFMADDSDr213r: 24221 NewFMAOpc = X86::VFMADDSDr231r; 24222 break; 24223 case X86::VFMADDSSr213r: 24224 NewFMAOpc = X86::VFMADDSSr231r; 24225 break; 24226 case X86::VFMSUBPDr213r: 24227 NewFMAOpc = X86::VFMSUBPDr231r; 24228 break; 24229 case X86::VFMSUBPSr213r: 24230 NewFMAOpc = X86::VFMSUBPSr231r; 24231 break; 24232 case X86::VFMSUBSDr213r: 24233 NewFMAOpc = X86::VFMSUBSDr231r; 24234 break; 24235 case X86::VFMSUBSSr213r: 24236 NewFMAOpc = X86::VFMSUBSSr231r; 24237 break; 24238 case X86::VFNMADDPDr213r: 24239 NewFMAOpc = X86::VFNMADDPDr231r; 24240 break; 24241 case X86::VFNMADDPSr213r: 24242 NewFMAOpc = X86::VFNMADDPSr231r; 24243 break; 24244 case X86::VFNMADDSDr213r: 24245 NewFMAOpc = X86::VFNMADDSDr231r; 24246 break; 24247 case X86::VFNMADDSSr213r: 24248 NewFMAOpc = X86::VFNMADDSSr231r; 24249 break; 24250 case X86::VFNMSUBPDr213r: 24251 NewFMAOpc = X86::VFNMSUBPDr231r; 24252 break; 24253 case X86::VFNMSUBPSr213r: 24254 NewFMAOpc = X86::VFNMSUBPSr231r; 24255 break; 24256 case X86::VFNMSUBSDr213r: 24257 NewFMAOpc = X86::VFNMSUBSDr231r; 24258 break; 24259 case X86::VFNMSUBSSr213r: 24260 NewFMAOpc = X86::VFNMSUBSSr231r; 24261 break; 24262 case X86::VFMADDSUBPDr213r: 24263 NewFMAOpc = X86::VFMADDSUBPDr231r; 24264 break; 24265 case X86::VFMADDSUBPSr213r: 24266 NewFMAOpc = X86::VFMADDSUBPSr231r; 24267 break; 24268 case X86::VFMSUBADDPDr213r: 24269 NewFMAOpc = X86::VFMSUBADDPDr231r; 24270 break; 24271 case X86::VFMSUBADDPSr213r: 24272 NewFMAOpc = X86::VFMSUBADDPSr231r; 24273 break; 24274 24275 case X86::VFMADDPDr213rY: 24276 NewFMAOpc = X86::VFMADDPDr231rY; 24277 break; 24278 case X86::VFMADDPSr213rY: 24279 NewFMAOpc = X86::VFMADDPSr231rY; 24280 break; 24281 case X86::VFMSUBPDr213rY: 24282 NewFMAOpc = X86::VFMSUBPDr231rY; 24283 break; 24284 case X86::VFMSUBPSr213rY: 24285 NewFMAOpc = X86::VFMSUBPSr231rY; 24286 break; 24287 case X86::VFNMADDPDr213rY: 24288 NewFMAOpc = X86::VFNMADDPDr231rY; 24289 break; 24290 case X86::VFNMADDPSr213rY: 24291 NewFMAOpc = X86::VFNMADDPSr231rY; 24292 break; 24293 case X86::VFNMSUBPDr213rY: 24294 NewFMAOpc = X86::VFNMSUBPDr231rY; 24295 break; 24296 case X86::VFNMSUBPSr213rY: 24297 NewFMAOpc = X86::VFNMSUBPSr231rY; 24298 break; 24299 case X86::VFMADDSUBPDr213rY: 24300 NewFMAOpc = X86::VFMADDSUBPDr231rY; 24301 break; 24302 case X86::VFMADDSUBPSr213rY: 24303 NewFMAOpc = X86::VFMADDSUBPSr231rY; 24304 break; 24305 case X86::VFMSUBADDPDr213rY: 24306 NewFMAOpc = X86::VFMSUBADDPDr231rY; 24307 break; 24308 case X86::VFMSUBADDPSr213rY: 24309 NewFMAOpc = X86::VFMSUBADDPSr231rY; 24310 break; 24311 default: 24312 llvm_unreachable("Unrecognized FMA variant."); 24313 } 24314 24315 const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); 24316 MachineInstrBuilder MIB = 24317 BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc)) 24318 .addOperand(MI.getOperand(0)) 24319 .addOperand(MI.getOperand(3)) 24320 .addOperand(MI.getOperand(2)) 24321 .addOperand(MI.getOperand(1)); 24322 MBB->insert(MachineBasicBlock::iterator(MI), MIB); 24323 MI.eraseFromParent(); 24324 } 24325 } 24326 24327 return MBB; 24328 } 24329 24330 MachineBasicBlock * 24331 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 24332 MachineBasicBlock *BB) const { 24333 switch (MI.getOpcode()) { 24334 default: llvm_unreachable("Unexpected instr type to insert"); 24335 case 
X86::TAILJMPd64: 24336 case X86::TAILJMPr64: 24337 case X86::TAILJMPm64: 24338 case X86::TAILJMPd64_REX: 24339 case X86::TAILJMPr64_REX: 24340 case X86::TAILJMPm64_REX: 24341 llvm_unreachable("TAILJMP64 would not be touched here."); 24342 case X86::TCRETURNdi64: 24343 case X86::TCRETURNri64: 24344 case X86::TCRETURNmi64: 24345 return BB; 24346 case X86::TLS_addr32: 24347 case X86::TLS_addr64: 24348 case X86::TLS_base_addr32: 24349 case X86::TLS_base_addr64: 24350 return EmitLoweredTLSAddr(MI, BB); 24351 case X86::CATCHRET: 24352 return EmitLoweredCatchRet(MI, BB); 24353 case X86::CATCHPAD: 24354 return EmitLoweredCatchPad(MI, BB); 24355 case X86::SEG_ALLOCA_32: 24356 case X86::SEG_ALLOCA_64: 24357 return EmitLoweredSegAlloca(MI, BB); 24358 case X86::TLSCall_32: 24359 case X86::TLSCall_64: 24360 return EmitLoweredTLSCall(MI, BB); 24361 case X86::CMOV_FR32: 24362 case X86::CMOV_FR64: 24363 case X86::CMOV_FR128: 24364 case X86::CMOV_GR8: 24365 case X86::CMOV_GR16: 24366 case X86::CMOV_GR32: 24367 case X86::CMOV_RFP32: 24368 case X86::CMOV_RFP64: 24369 case X86::CMOV_RFP80: 24370 case X86::CMOV_V2F64: 24371 case X86::CMOV_V2I64: 24372 case X86::CMOV_V4F32: 24373 case X86::CMOV_V4F64: 24374 case X86::CMOV_V4I64: 24375 case X86::CMOV_V16F32: 24376 case X86::CMOV_V8F32: 24377 case X86::CMOV_V8F64: 24378 case X86::CMOV_V8I64: 24379 case X86::CMOV_V8I1: 24380 case X86::CMOV_V16I1: 24381 case X86::CMOV_V32I1: 24382 case X86::CMOV_V64I1: 24383 return EmitLoweredSelect(MI, BB); 24384 24385 case X86::RDFLAGS32: 24386 case X86::RDFLAGS64: { 24387 DebugLoc DL = MI.getDebugLoc(); 24388 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 24389 unsigned PushF = 24390 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; 24391 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; 24392 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); 24393 // Permit reads of the FLAGS register without it being defined. 24394 // This intrinsic exists to read external processor state in flags, such as 24395 // the trap flag, interrupt flag, and direction flag, none of which are 24396 // modeled by the backend. 24397 Push->getOperand(2).setIsUndef(); 24398 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); 24399 24400 MI.eraseFromParent(); // The pseudo is gone now. 24401 return BB; 24402 } 24403 24404 case X86::WRFLAGS32: 24405 case X86::WRFLAGS64: { 24406 DebugLoc DL = MI.getDebugLoc(); 24407 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 24408 unsigned Push = 24409 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; 24410 unsigned PopF = 24411 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; 24412 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); 24413 BuildMI(*BB, MI, DL, TII->get(PopF)); 24414 24415 MI.eraseFromParent(); // The pseudo is gone now. 
24416 return BB; 24417 } 24418 24419 case X86::RELEASE_FADD32mr: 24420 case X86::RELEASE_FADD64mr: 24421 return EmitLoweredAtomicFP(MI, BB); 24422 24423 case X86::FP32_TO_INT16_IN_MEM: 24424 case X86::FP32_TO_INT32_IN_MEM: 24425 case X86::FP32_TO_INT64_IN_MEM: 24426 case X86::FP64_TO_INT16_IN_MEM: 24427 case X86::FP64_TO_INT32_IN_MEM: 24428 case X86::FP64_TO_INT64_IN_MEM: 24429 case X86::FP80_TO_INT16_IN_MEM: 24430 case X86::FP80_TO_INT32_IN_MEM: 24431 case X86::FP80_TO_INT64_IN_MEM: { 24432 MachineFunction *F = BB->getParent(); 24433 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 24434 DebugLoc DL = MI.getDebugLoc(); 24435 24436 // Change the floating point control register to use "round towards zero" 24437 // mode when truncating to an integer value. 24438 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 24439 addFrameReference(BuildMI(*BB, MI, DL, 24440 TII->get(X86::FNSTCW16m)), CWFrameIdx); 24441 24442 // Load the old value of the high byte of the control word... 24443 unsigned OldCW = 24444 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 24445 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 24446 CWFrameIdx); 24447 24448 // Set the high part to be round to zero... 24449 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 24450 .addImm(0xC7F); 24451 24452 // Reload the modified control word now... 24453 addFrameReference(BuildMI(*BB, MI, DL, 24454 TII->get(X86::FLDCW16m)), CWFrameIdx); 24455 24456 // Restore the memory image of control word to original value 24457 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 24458 .addReg(OldCW); 24459 24460 // Get the X86 opcode to use. 24461 unsigned Opc; 24462 switch (MI.getOpcode()) { 24463 default: llvm_unreachable("illegal opcode!"); 24464 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 24465 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 24466 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 24467 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 24468 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 24469 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 24470 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 24471 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 24472 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 24473 } 24474 24475 X86AddressMode AM = getAddressFromInstr(&MI, 0); 24476 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 24477 .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); 24478 24479 // Reload the original control word now. 24480 addFrameReference(BuildMI(*BB, MI, DL, 24481 TII->get(X86::FLDCW16m)), CWFrameIdx); 24482 24483 MI.eraseFromParent(); // The pseudo instruction is gone now. 24484 return BB; 24485 } 24486 // String/text processing lowering. 24487 case X86::PCMPISTRM128REG: 24488 case X86::VPCMPISTRM128REG: 24489 case X86::PCMPISTRM128MEM: 24490 case X86::VPCMPISTRM128MEM: 24491 case X86::PCMPESTRM128REG: 24492 case X86::VPCMPESTRM128REG: 24493 case X86::PCMPESTRM128MEM: 24494 case X86::VPCMPESTRM128MEM: 24495 assert(Subtarget.hasSSE42() && 24496 "Target must have SSE4.2 or AVX features enabled"); 24497 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); 24498 24499 // String/text processing lowering. 
24500 case X86::PCMPISTRIREG: 24501 case X86::VPCMPISTRIREG: 24502 case X86::PCMPISTRIMEM: 24503 case X86::VPCMPISTRIMEM: 24504 case X86::PCMPESTRIREG: 24505 case X86::VPCMPESTRIREG: 24506 case X86::PCMPESTRIMEM: 24507 case X86::VPCMPESTRIMEM: 24508 assert(Subtarget.hasSSE42() && 24509 "Target must have SSE4.2 or AVX features enabled"); 24510 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); 24511 24512 // Thread synchronization. 24513 case X86::MONITOR: 24514 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); 24515 case X86::MONITORX: 24516 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); 24517 // PKU feature 24518 case X86::WRPKRU: 24519 return emitWRPKRU(MI, BB, Subtarget); 24520 case X86::RDPKRU: 24521 return emitRDPKRU(MI, BB, Subtarget); 24522 // xbegin 24523 case X86::XBEGIN: 24524 return emitXBegin(MI, BB, Subtarget.getInstrInfo()); 24525 24526 case X86::VASTART_SAVE_XMM_REGS: 24527 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 24528 24529 case X86::VAARG_64: 24530 return EmitVAARG64WithCustomInserter(MI, BB); 24531 24532 case X86::EH_SjLj_SetJmp32: 24533 case X86::EH_SjLj_SetJmp64: 24534 return emitEHSjLjSetJmp(MI, BB); 24535 24536 case X86::EH_SjLj_LongJmp32: 24537 case X86::EH_SjLj_LongJmp64: 24538 return emitEHSjLjLongJmp(MI, BB); 24539 24540 case X86::Int_eh_sjlj_setup_dispatch: 24541 return EmitSjLjDispatchBlock(MI, BB); 24542 24543 case TargetOpcode::STATEPOINT: 24544 // As an implementation detail, STATEPOINT shares the STACKMAP format at 24545 // this point in the process. We diverge later. 24546 return emitPatchPoint(MI, BB); 24547 24548 case TargetOpcode::STACKMAP: 24549 case TargetOpcode::PATCHPOINT: 24550 return emitPatchPoint(MI, BB); 24551 24552 case X86::VFMADDPDr213r: 24553 case X86::VFMADDPSr213r: 24554 case X86::VFMADDSDr213r: 24555 case X86::VFMADDSSr213r: 24556 case X86::VFMSUBPDr213r: 24557 case X86::VFMSUBPSr213r: 24558 case X86::VFMSUBSDr213r: 24559 case X86::VFMSUBSSr213r: 24560 case X86::VFNMADDPDr213r: 24561 case X86::VFNMADDPSr213r: 24562 case X86::VFNMADDSDr213r: 24563 case X86::VFNMADDSSr213r: 24564 case X86::VFNMSUBPDr213r: 24565 case X86::VFNMSUBPSr213r: 24566 case X86::VFNMSUBSDr213r: 24567 case X86::VFNMSUBSSr213r: 24568 case X86::VFMADDSUBPDr213r: 24569 case X86::VFMADDSUBPSr213r: 24570 case X86::VFMSUBADDPDr213r: 24571 case X86::VFMSUBADDPSr213r: 24572 case X86::VFMADDPDr213rY: 24573 case X86::VFMADDPSr213rY: 24574 case X86::VFMSUBPDr213rY: 24575 case X86::VFMSUBPSr213rY: 24576 case X86::VFNMADDPDr213rY: 24577 case X86::VFNMADDPSr213rY: 24578 case X86::VFNMSUBPDr213rY: 24579 case X86::VFNMSUBPSr213rY: 24580 case X86::VFMADDSUBPDr213rY: 24581 case X86::VFMADDSUBPSr213rY: 24582 case X86::VFMSUBADDPDr213rY: 24583 case X86::VFMSUBADDPSr213rY: 24584 return emitFMA3Instr(MI, BB); 24585 case X86::LCMPXCHG8B_SAVE_EBX: 24586 case X86::LCMPXCHG16B_SAVE_RBX: { 24587 unsigned BasePtr = 24588 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? 
X86::EBX : X86::RBX; 24589 if (!BB->isLiveIn(BasePtr)) 24590 BB->addLiveIn(BasePtr); 24591 return BB; 24592 } 24593 } 24594 } 24595 24596 //===----------------------------------------------------------------------===// 24597 // X86 Optimization Hooks 24598 //===----------------------------------------------------------------------===// 24599 24600 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 24601 APInt &KnownZero, 24602 APInt &KnownOne, 24603 const SelectionDAG &DAG, 24604 unsigned Depth) const { 24605 unsigned BitWidth = KnownZero.getBitWidth(); 24606 unsigned Opc = Op.getOpcode(); 24607 assert((Opc >= ISD::BUILTIN_OP_END || 24608 Opc == ISD::INTRINSIC_WO_CHAIN || 24609 Opc == ISD::INTRINSIC_W_CHAIN || 24610 Opc == ISD::INTRINSIC_VOID) && 24611 "Should use MaskedValueIsZero if you don't know whether Op" 24612 " is a target node!"); 24613 24614 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 24615 switch (Opc) { 24616 default: break; 24617 case X86ISD::ADD: 24618 case X86ISD::SUB: 24619 case X86ISD::ADC: 24620 case X86ISD::SBB: 24621 case X86ISD::SMUL: 24622 case X86ISD::UMUL: 24623 case X86ISD::INC: 24624 case X86ISD::DEC: 24625 case X86ISD::OR: 24626 case X86ISD::XOR: 24627 case X86ISD::AND: 24628 // These nodes' second result is a boolean. 24629 if (Op.getResNo() == 0) 24630 break; 24631 // Fallthrough 24632 case X86ISD::SETCC: 24633 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 24634 break; 24635 case X86ISD::MOVMSK: { 24636 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); 24637 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 24638 break; 24639 } 24640 } 24641 } 24642 24643 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( 24644 SDValue Op, 24645 const SelectionDAG &, 24646 unsigned Depth) const { 24647 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 24648 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 24649 return Op.getValueType().getScalarSizeInBits(); 24650 24651 // Fallback case. 24652 return 1; 24653 } 24654 24655 /// Returns true (and the GlobalValue and the offset) if the node is a 24656 /// GlobalAddress + offset. 24657 bool X86TargetLowering::isGAPlusOffset(SDNode *N, 24658 const GlobalValue* &GA, 24659 int64_t &Offset) const { 24660 if (N->getOpcode() == X86ISD::Wrapper) { 24661 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 24662 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 24663 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 24664 return true; 24665 } 24666 } 24667 return TargetLowering::isGAPlusOffset(N, GA, Offset); 24668 } 24669 24670 /// Performs shuffle combines for 256-bit vectors. 24671 /// FIXME: This could be expanded to support 512 bit vectors as well. 24672 static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG, 24673 TargetLowering::DAGCombinerInfo &DCI, 24674 const X86Subtarget &Subtarget) { 24675 SDLoc dl(N); 24676 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 24677 SDValue V1 = SVOp->getOperand(0); 24678 SDValue V2 = SVOp->getOperand(1); 24679 MVT VT = SVOp->getSimpleValueType(0); 24680 unsigned NumElems = VT.getVectorNumElements(); 24681 24682 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 24683 V2.getOpcode() == ISD::CONCAT_VECTORS) { 24684 // 24685 // 0,0,0,... 
24686 // | 24687 // V UNDEF BUILD_VECTOR UNDEF 24688 // \ / \ / 24689 // CONCAT_VECTOR CONCAT_VECTOR 24690 // \ / 24691 // \ / 24692 // RESULT: V + zero extended 24693 // 24694 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 24695 !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef()) 24696 return SDValue(); 24697 24698 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 24699 return SDValue(); 24700 24701 // To match the shuffle mask, the first half of the mask should 24702 // be exactly the first vector, and all the rest a splat with the 24703 // first element of the second one. 24704 for (unsigned i = 0; i != NumElems/2; ++i) 24705 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 24706 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 24707 return SDValue(); 24708 24709 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 24710 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 24711 if (Ld->hasNUsesOfValue(1, 0)) { 24712 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 24713 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 24714 SDValue ResNode = 24715 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 24716 Ld->getMemoryVT(), 24717 Ld->getPointerInfo(), 24718 Ld->getAlignment(), 24719 false/*isVolatile*/, true/*ReadMem*/, 24720 false/*WriteMem*/); 24721 24722 // Make sure the newly-created LOAD is in the same position as Ld in 24723 // terms of dependency. We create a TokenFactor for Ld and ResNode, 24724 // and update uses of Ld's output chain to use the TokenFactor. 24725 if (Ld->hasAnyUseOfValue(1)) { 24726 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 24727 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); 24728 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 24729 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), 24730 SDValue(ResNode.getNode(), 1)); 24731 } 24732 24733 return DAG.getBitcast(VT, ResNode); 24734 } 24735 } 24736 24737 // Emit a zeroed vector and insert the desired subvector on its 24738 // first half. 24739 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 24740 SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); 24741 return DCI.CombineTo(N, InsV); 24742 } 24743 24744 return SDValue(); 24745 } 24746 24747 // Attempt to match a combined shuffle mask against supported unary shuffle 24748 // instructions. 24749 // TODO: Investigate sharing more of this with shuffle lowering. 24750 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, 24751 const X86Subtarget &Subtarget, 24752 unsigned &Shuffle, MVT &ShuffleVT) { 24753 bool FloatDomain = SrcVT.isFloatingPoint() || 24754 (!Subtarget.hasAVX2() && SrcVT.is256BitVector()); 24755 24756 // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction. 24757 if (!FloatDomain && SrcVT.is128BitVector() && 24758 isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) { 24759 Shuffle = X86ISD::VZEXT_MOVL; 24760 ShuffleVT = MVT::v2i64; 24761 return true; 24762 } 24763 24764 // Check if we have SSE3 which will let us use MOVDDUP etc. The 24765 // instructions are no slower than UNPCKLPD but has the option to 24766 // fold the input operand into even an unaligned memory load. 
24767 if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { 24768 if (isTargetShuffleEquivalent(Mask, {0, 0})) { 24769 Shuffle = X86ISD::MOVDDUP; 24770 ShuffleVT = MVT::v2f64; 24771 return true; 24772 } 24773 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { 24774 Shuffle = X86ISD::MOVSLDUP; 24775 ShuffleVT = MVT::v4f32; 24776 return true; 24777 } 24778 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { 24779 Shuffle = X86ISD::MOVSHDUP; 24780 ShuffleVT = MVT::v4f32; 24781 return true; 24782 } 24783 } 24784 24785 if (SrcVT.is256BitVector() && FloatDomain) { 24786 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); 24787 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { 24788 Shuffle = X86ISD::MOVDDUP; 24789 ShuffleVT = MVT::v4f64; 24790 return true; 24791 } 24792 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { 24793 Shuffle = X86ISD::MOVSLDUP; 24794 ShuffleVT = MVT::v8f32; 24795 return true; 24796 } 24797 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { 24798 Shuffle = X86ISD::MOVSHDUP; 24799 ShuffleVT = MVT::v8f32; 24800 return true; 24801 } 24802 } 24803 24804 if (SrcVT.is512BitVector() && FloatDomain) { 24805 assert(Subtarget.hasAVX512() && 24806 "AVX512 required for 512-bit vector shuffles"); 24807 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { 24808 Shuffle = X86ISD::MOVDDUP; 24809 ShuffleVT = MVT::v8f64; 24810 return true; 24811 } 24812 if (isTargetShuffleEquivalent( 24813 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { 24814 Shuffle = X86ISD::MOVSLDUP; 24815 ShuffleVT = MVT::v16f32; 24816 return true; 24817 } 24818 if (isTargetShuffleEquivalent( 24819 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { 24820 Shuffle = X86ISD::MOVSHDUP; 24821 ShuffleVT = MVT::v16f32; 24822 return true; 24823 } 24824 } 24825 24826 // Attempt to match against broadcast-from-vector. 24827 if (Subtarget.hasAVX2()) { 24828 unsigned NumElts = Mask.size(); 24829 SmallVector<int, 64> BroadcastMask(NumElts, 0); 24830 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { 24831 unsigned EltSize = SrcVT.getSizeInBits() / NumElts; 24832 ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize) 24833 : MVT::getIntegerVT(EltSize); 24834 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts); 24835 Shuffle = X86ISD::VBROADCAST; 24836 return true; 24837 } 24838 } 24839 24840 return false; 24841 } 24842 24843 // Attempt to match a combined shuffle mask against supported unary immediate 24844 // permute instructions. 24845 // TODO: Investigate sharing more of this with shuffle lowering. 24846 static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, 24847 const X86Subtarget &Subtarget, 24848 unsigned &Shuffle, MVT &ShuffleVT, 24849 unsigned &PermuteImm) { 24850 // Ensure we don't contain any zero elements. 24851 for (int M : Mask) { 24852 if (M == SM_SentinelZero) 24853 return false; 24854 assert(SM_SentinelUndef <= M && M < (int)Mask.size() && 24855 "Expected unary shuffle"); 24856 } 24857 24858 unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size(); 24859 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); 24860 24861 // Handle PSHUFLW/PSHUFHW repeated patterns. 24862 if (MaskScalarSizeInBits == 16) { 24863 SmallVector<int, 4> RepeatedMask; 24864 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { 24865 ArrayRef<int> LoMask(Mask.data() + 0, 4); 24866 ArrayRef<int> HiMask(Mask.data() + 4, 4); 24867 24868 // PSHUFLW: permute lower 4 elements only. 
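// For example, a repeated v8i16 mask {3, 2, 1, 0, 4, 5, 6, 7} leaves the
// upper quad in place and is matched as PSHUFLW with immediate 0x1B.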
24869 if (isUndefOrInRange(LoMask, 0, 4) && 24870 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { 24871 Shuffle = X86ISD::PSHUFLW; 24872 ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16); 24873 PermuteImm = getV4X86ShuffleImm(LoMask); 24874 return true; 24875 } 24876 24877 // PSHUFHW: permute upper 4 elements only. 24878 if (isUndefOrInRange(HiMask, 4, 8) && 24879 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { 24880 // Offset the HiMask so that we can create the shuffle immediate. 24881 int OffsetHiMask[4]; 24882 for (int i = 0; i != 4; ++i) 24883 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); 24884 24885 Shuffle = X86ISD::PSHUFHW; 24886 ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16); 24887 PermuteImm = getV4X86ShuffleImm(OffsetHiMask); 24888 return true; 24889 } 24890 24891 return false; 24892 } 24893 return false; 24894 } 24895 24896 // We only support permutation of 32/64 bit elements after this. 24897 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) 24898 return false; 24899 24900 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we 24901 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). 24902 bool FloatDomain = SrcVT.isFloatingPoint(); 24903 if (FloatDomain && !Subtarget.hasAVX()) 24904 return false; 24905 24906 // Pre-AVX2 we must use float shuffles on 256-bit vectors. 24907 if (SrcVT.is256BitVector() && !Subtarget.hasAVX2()) 24908 FloatDomain = true; 24909 24910 // Check for lane crossing permutes. 24911 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { 24912 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). 24913 if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) { 24914 Shuffle = X86ISD::VPERMI; 24915 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); 24916 PermuteImm = getV4X86ShuffleImm(Mask); 24917 return true; 24918 } 24919 if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) { 24920 SmallVector<int, 4> RepeatedMask; 24921 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { 24922 Shuffle = X86ISD::VPERMI; 24923 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); 24924 PermuteImm = getV4X86ShuffleImm(RepeatedMask); 24925 return true; 24926 } 24927 } 24928 return false; 24929 } 24930 24931 // VPERMILPD can permute with a non-repeating shuffle. 24932 if (FloatDomain && MaskScalarSizeInBits == 64) { 24933 Shuffle = X86ISD::VPERMILPI; 24934 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); 24935 PermuteImm = 0; 24936 for (int i = 0, e = Mask.size(); i != e; ++i) { 24937 int M = Mask[i]; 24938 if (M == SM_SentinelUndef) 24939 continue; 24940 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); 24941 PermuteImm |= (M & 1) << i; 24942 } 24943 return true; 24944 } 24945 24946 // We need a repeating shuffle mask for VPERMILPS/PSHUFD. 24947 SmallVector<int, 4> RepeatedMask; 24948 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) 24949 return false; 24950 24951 // Narrow the repeated mask for 32-bit element permutes. 24952 SmallVector<int, 4> WordMask = RepeatedMask; 24953 if (MaskScalarSizeInBits == 64) 24954 scaleShuffleMask(2, RepeatedMask, WordMask); 24955 24956 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); 24957 ShuffleVT = (FloatDomain ? 
MVT::f32 : MVT::i32); 24958 ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32); 24959 PermuteImm = getV4X86ShuffleImm(WordMask); 24960 return true; 24961 } 24962 24963 // Attempt to match a combined unary shuffle mask against supported binary 24964 // shuffle instructions. 24965 // TODO: Investigate sharing more of this with shuffle lowering. 24966 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, 24967 unsigned &Shuffle, MVT &ShuffleVT) { 24968 bool FloatDomain = SrcVT.isFloatingPoint(); 24969 24970 if (SrcVT.is128BitVector()) { 24971 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) { 24972 Shuffle = X86ISD::MOVLHPS; 24973 ShuffleVT = MVT::v4f32; 24974 return true; 24975 } 24976 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) { 24977 Shuffle = X86ISD::MOVHLPS; 24978 ShuffleVT = MVT::v4f32; 24979 return true; 24980 } 24981 if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) { 24982 Shuffle = X86ISD::UNPCKL; 24983 ShuffleVT = MVT::v4f32; 24984 return true; 24985 } 24986 if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) { 24987 Shuffle = X86ISD::UNPCKH; 24988 ShuffleVT = MVT::v4f32; 24989 return true; 24990 } 24991 if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) || 24992 isTargetShuffleEquivalent( 24993 Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) { 24994 Shuffle = X86ISD::UNPCKL; 24995 ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8; 24996 return true; 24997 } 24998 if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) || 24999 isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 25000 13, 14, 14, 15, 15})) { 25001 Shuffle = X86ISD::UNPCKH; 25002 ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8; 25003 return true; 25004 } 25005 } 25006 25007 return false; 25008 } 25009 25010 /// \brief Combine an arbitrary chain of shuffles into a single instruction if 25011 /// possible. 25012 /// 25013 /// This is the leaf of the recursive combine below. When we have found some 25014 /// chain of single-use x86 shuffle instructions and accumulated the combined 25015 /// shuffle mask represented by them, this will try to pattern match that mask 25016 /// into either a single instruction if there is a special purpose instruction 25017 /// for this operation, or into a PSHUFB instruction which is a fully general 25018 /// instruction but should only be used to replace chains over a certain depth. 25019 static bool combineX86ShuffleChain(SDValue Input, SDValue Root, 25020 ArrayRef<int> BaseMask, int Depth, 25021 bool HasVariableMask, SelectionDAG &DAG, 25022 TargetLowering::DAGCombinerInfo &DCI, 25023 const X86Subtarget &Subtarget) { 25024 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); 25025 25026 // Find the operand that enters the chain. Note that multiple uses are OK 25027 // here, we're not going to remove the operand we find. 
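// (peekThroughBitcasts simply skips over any ISD::BITCAST nodes so that the
// pattern matching below sees the underlying vector value.)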
25028 Input = peekThroughBitcasts(Input); 25029 25030 MVT VT = Input.getSimpleValueType(); 25031 MVT RootVT = Root.getSimpleValueType(); 25032 SDLoc DL(Root); 25033 25034 SDValue Res; 25035 25036 unsigned NumBaseMaskElts = BaseMask.size(); 25037 if (NumBaseMaskElts == 1) { 25038 assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); 25039 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), 25040 /*AddTo*/ true); 25041 return true; 25042 } 25043 25044 unsigned RootSizeInBits = RootVT.getSizeInBits(); 25045 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; 25046 25047 // Don't combine if we are a AVX512/EVEX target and the mask element size 25048 // is different from the root element size - this would prevent writemasks 25049 // from being reused. 25050 // TODO - this currently prevents all lane shuffles from occurring. 25051 // TODO - check for writemasks usage instead of always preventing combining. 25052 // TODO - attempt to narrow Mask back to writemask size. 25053 if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits && 25054 (RootSizeInBits == 512 || 25055 (Subtarget.hasVLX() && RootSizeInBits >= 128))) { 25056 return false; 25057 } 25058 25059 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. 25060 25061 // Handle 128-bit lane shuffles of 256-bit vectors. 25062 if (VT.is256BitVector() && NumBaseMaskElts == 2 && 25063 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { 25064 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) 25065 return false; // Nothing to do! 25066 MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64 25067 : MVT::v4i64); 25068 unsigned PermMask = 0; 25069 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); 25070 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); 25071 25072 Res = DAG.getBitcast(ShuffleVT, Input); 25073 DCI.AddToWorklist(Res.getNode()); 25074 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, 25075 DAG.getUNDEF(ShuffleVT), 25076 DAG.getConstant(PermMask, DL, MVT::i8)); 25077 DCI.AddToWorklist(Res.getNode()); 25078 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), 25079 /*AddTo*/ true); 25080 return true; 25081 } 25082 25083 // For masks that have been widened to 128-bit elements or more, 25084 // narrow back down to 64-bit elements. 25085 SmallVector<int, 64> Mask; 25086 if (BaseMaskEltSizeInBits > 64) { 25087 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); 25088 int MaskScale = BaseMaskEltSizeInBits / 64; 25089 scaleShuffleMask(MaskScale, BaseMask, Mask); 25090 } else { 25091 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end()); 25092 } 25093 25094 unsigned NumMaskElts = Mask.size(); 25095 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; 25096 25097 // Determine the effective mask value type. 25098 bool FloatDomain = 25099 (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) && 25100 (32 <= MaskEltSizeInBits); 25101 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) 25102 : MVT::getIntegerVT(MaskEltSizeInBits); 25103 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); 25104 25105 // Attempt to match the mask against known shuffle patterns. 25106 MVT ShuffleVT; 25107 unsigned Shuffle, PermuteImm; 25108 25109 if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) { 25110 if (Depth == 1 && Root.getOpcode() == Shuffle) 25111 return false; // Nothing to do! 
25112 Res = DAG.getBitcast(ShuffleVT, Input); 25113 DCI.AddToWorklist(Res.getNode()); 25114 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); 25115 DCI.AddToWorklist(Res.getNode()); 25116 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), 25117 /*AddTo*/ true); 25118 return true; 25119 } 25120 25121 if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT, 25122 PermuteImm)) { 25123 if (Depth == 1 && Root.getOpcode() == Shuffle) 25124 return false; // Nothing to do! 25125 Res = DAG.getBitcast(ShuffleVT, Input); 25126 DCI.AddToWorklist(Res.getNode()); 25127 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, 25128 DAG.getConstant(PermuteImm, DL, MVT::i8)); 25129 DCI.AddToWorklist(Res.getNode()); 25130 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), 25131 /*AddTo*/ true); 25132 return true; 25133 } 25134 25135 if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) { 25136 if (Depth == 1 && Root.getOpcode() == Shuffle) 25137 return false; // Nothing to do! 25138 Res = DAG.getBitcast(ShuffleVT, Input); 25139 DCI.AddToWorklist(Res.getNode()); 25140 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res); 25141 DCI.AddToWorklist(Res.getNode()); 25142 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), 25143 /*AddTo*/ true); 25144 return true; 25145 } 25146 25147 // Attempt to blend with zero. 25148 if (NumMaskElts <= 8 && 25149 ((Subtarget.hasSSE41() && VT.is128BitVector()) || 25150 (Subtarget.hasAVX() && VT.is256BitVector()))) { 25151 // Convert VT to a type compatible with X86ISD::BLENDI. 25152 // TODO - add 16i16 support (requires lane duplication). 25153 MVT ShuffleVT = MaskVT; 25154 if (Subtarget.hasAVX2()) { 25155 if (ShuffleVT == MVT::v4i64) 25156 ShuffleVT = MVT::v8i32; 25157 else if (ShuffleVT == MVT::v2i64) 25158 ShuffleVT = MVT::v4i32; 25159 } else { 25160 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) 25161 ShuffleVT = MVT::v8i16; 25162 else if (ShuffleVT == MVT::v4i64) 25163 ShuffleVT = MVT::v4f64; 25164 else if (ShuffleVT == MVT::v8i32) 25165 ShuffleVT = MVT::v8f32; 25166 } 25167 25168 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, 25169 /*Low*/ 0) && 25170 NumMaskElts <= ShuffleVT.getVectorNumElements()) { 25171 unsigned BlendMask = 0; 25172 unsigned ShuffleSize = ShuffleVT.getVectorNumElements(); 25173 unsigned MaskRatio = ShuffleSize / NumMaskElts; 25174 25175 if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI) 25176 return false; 25177 25178 for (unsigned i = 0; i != ShuffleSize; ++i) 25179 if (Mask[i / MaskRatio] < 0) 25180 BlendMask |= 1u << i; 25181 25182 SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL); 25183 Res = DAG.getBitcast(ShuffleVT, Input); 25184 DCI.AddToWorklist(Res.getNode()); 25185 Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero, 25186 DAG.getConstant(BlendMask, DL, MVT::i8)); 25187 DCI.AddToWorklist(Res.getNode()); 25188 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), 25189 /*AddTo*/ true); 25190 return true; 25191 } 25192 } 25193 25194 // Attempt to combine to INSERTPS. 
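// (INSERTPS immediate layout: bits [7:6] select the source element, bits
// [5:4] select the destination element, and bits [3:0] are the zero mask.)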
25195 if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
25196 (VT == MVT::v2f64 || VT == MVT::v4f32)) {
25197 SmallBitVector Zeroable(4, false);
25198 for (unsigned i = 0; i != NumMaskElts; ++i)
25199 if (Mask[i] < 0)
25200 Zeroable[i] = true;
25201 
25202 unsigned InsertPSMask;
25203 SDValue V1 = Input, V2 = Input;
25204 if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
25205 Zeroable, Mask, DAG)) {
25206 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
25207 return false; // Nothing to do!
25208 V1 = DAG.getBitcast(MVT::v4f32, V1);
25209 DCI.AddToWorklist(V1.getNode());
25210 V2 = DAG.getBitcast(MVT::v4f32, V2);
25211 DCI.AddToWorklist(V2.getNode());
25212 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
25213 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25214 DCI.AddToWorklist(Res.getNode());
25215 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25216 /*AddTo*/ true);
25217 return true;
25218 }
25219 }
25220 
25221 // Don't try to re-form single instruction chains under any circumstances now
25222 // that we've done encoding canonicalization for them.
25223 if (Depth < 2)
25224 return false;
25225 
25226 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
25227 return false;
25228 
25229 bool MaskContainsZeros =
25230 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
25231 
25232 // If we have a single input shuffle with different shuffle patterns in the
25233 // 128-bit lanes, use a variable mask with VPERMILPS.
25234 // TODO: Combine other mask types at higher depths.
25235 if (HasVariableMask && !MaskContainsZeros &&
25236 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
25237 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
25238 SmallVector<SDValue, 16> VPermIdx;
25239 for (int M : Mask) {
25240 SDValue Idx =
25241 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
25242 VPermIdx.push_back(Idx);
25243 }
25244 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
25245 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
25246 DCI.AddToWorklist(VPermMask.getNode());
25247 Res = DAG.getBitcast(MaskVT, Input);
25248 DCI.AddToWorklist(Res.getNode());
25249 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
25250 DCI.AddToWorklist(Res.getNode());
25251 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25252 /*AddTo*/ true);
25253 return true;
25254 }
25255 
25256 // If we have 3 or more shuffle instructions or a chain involving a variable
25257 // mask, we can replace them with a single PSHUFB instruction profitably.
25258 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
25259 // instructions, but in practice PSHUFB tends to be *very* fast so we're
25260 // more aggressive.
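// (PSHUFB selects each destination byte by index from the source vector; a
// mask byte with its high bit set, such as the 255 constant used below for
// SM_SentinelZero, zeroes that destination byte instead.)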
25261 if ((Depth >= 3 || HasVariableMask) &&
25262 ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
25263 (VT.is256BitVector() && Subtarget.hasAVX2()) ||
25264 (VT.is512BitVector() && Subtarget.hasBWI()))) {
25265 SmallVector<SDValue, 16> PSHUFBMask;
25266 int NumBytes = VT.getSizeInBits() / 8;
25267 int Ratio = NumBytes / NumMaskElts;
25268 for (int i = 0; i < NumBytes; ++i) {
25269 int M = Mask[i / Ratio];
25270 if (M == SM_SentinelUndef) {
25271 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
25272 continue;
25273 }
25274 if (M == SM_SentinelZero) {
25275 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
25276 continue;
25277 }
25278 M = Ratio * M + i % Ratio;
25279 assert((M / 16) == (i / 16) && "Lane crossing detected");
25280 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
25281 }
25282 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
25283 Res = DAG.getBitcast(ByteVT, Input);
25284 DCI.AddToWorklist(Res.getNode());
25285 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
25286 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
25287 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
25288 DCI.AddToWorklist(Res.getNode());
25289 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25290 /*AddTo*/ true);
25291 return true;
25292 }
25293 
25294 // Failed to find any combines.
25295 return false;
25296 }
25297 
25298 /// \brief Fully generic combining of x86 shuffle instructions.
25299 ///
25300 /// This should be the last combine run over the x86 shuffle instructions. Once
25301 /// they have been fully optimized, this will recursively consider all chains
25302 /// of single-use shuffle instructions, build a generic model of the cumulative
25303 /// shuffle operation, and check for simpler instructions which implement this
25304 /// operation. We use this primarily for two purposes:
25305 ///
25306 /// 1) Collapse generic shuffles to specialized single instructions when
25307 /// equivalent. In most cases, this is just an encoding size win, but
25308 /// sometimes we will collapse multiple generic shuffles into a single
25309 /// special-purpose shuffle.
25310 /// 2) Look for sequences of shuffle instructions with 3 or more total
25311 /// instructions, and replace them with the slightly more expensive SSSE3
25312 /// PSHUFB instruction if available. We do this as the last combining step
25313 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
25314 /// a suitable short sequence of other instructions. The PSHUFB will either
25315 /// use a register or have to read from memory and so is slightly (but only
25316 /// slightly) more expensive than the other shuffle instructions.
25317 ///
25318 /// Because this is inherently a quadratic operation (for each shuffle in
25319 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
25320 /// This should never be an issue in practice as the shuffle lowering doesn't
25321 /// produce sequences of more than 8 instructions.
25322 ///
25323 /// FIXME: We will currently miss some cases where the redundant shuffling
25324 /// would simplify under the threshold for PSHUFB formation because of
25325 /// combine-ordering. To fix this, we should do the redundant instruction
25326 /// combining in this recursive walk.
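/// Illustrative example of the accumulated mask: an inner shuffle with mask
/// {1, 0} feeding an outer shuffle with mask {0, 0} accumulates to the single
/// mask {1, 1}, which combineX86ShuffleChain can then try to match to one
/// instruction.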
25327 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, 25328 ArrayRef<int> RootMask, 25329 int Depth, bool HasVariableMask, 25330 SelectionDAG &DAG, 25331 TargetLowering::DAGCombinerInfo &DCI, 25332 const X86Subtarget &Subtarget) { 25333 // Bound the depth of our recursive combine because this is ultimately 25334 // quadratic in nature. 25335 if (Depth > 8) 25336 return false; 25337 25338 // Directly rip through bitcasts to find the underlying operand. 25339 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) 25340 Op = Op.getOperand(0); 25341 25342 MVT VT = Op.getSimpleValueType(); 25343 if (!VT.isVector()) 25344 return false; // Bail if we hit a non-vector. 25345 25346 assert(Root.getSimpleValueType().isVector() && 25347 "Shuffles operate on vector types!"); 25348 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && 25349 "Can only combine shuffles of the same vector register size."); 25350 25351 // Extract target shuffle mask and resolve sentinels and inputs. 25352 SDValue Input0, Input1; 25353 SmallVector<int, 16> OpMask; 25354 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) 25355 return false; 25356 25357 assert(VT.getVectorNumElements() == OpMask.size() && 25358 "Different mask size from vector size!"); 25359 assert(((RootMask.size() > OpMask.size() && 25360 RootMask.size() % OpMask.size() == 0) || 25361 (OpMask.size() > RootMask.size() && 25362 OpMask.size() % RootMask.size() == 0) || 25363 OpMask.size() == RootMask.size()) && 25364 "The smaller number of elements must divide the larger."); 25365 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size()); 25366 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); 25367 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); 25368 assert(((RootRatio == 1 && OpRatio == 1) || 25369 (RootRatio == 1) != (OpRatio == 1)) && 25370 "Must not have a ratio for both incoming and op masks!"); 25371 25372 SmallVector<int, 16> Mask; 25373 Mask.reserve(MaskWidth); 25374 25375 // Merge this shuffle operation's mask into our accumulated mask. Note that 25376 // this shuffle's mask will be the first applied to the input, followed by the 25377 // root mask to get us all the way to the root value arrangement. The reason 25378 // for this order is that we are recursing up the operation chain. 25379 for (int i = 0; i < MaskWidth; ++i) { 25380 int RootIdx = i / RootRatio; 25381 if (RootMask[RootIdx] < 0) { 25382 // This is a zero or undef lane, we're done. 25383 Mask.push_back(RootMask[RootIdx]); 25384 continue; 25385 } 25386 25387 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; 25388 int OpIdx = RootMaskedIdx / OpRatio; 25389 if (OpMask[OpIdx] < 0) { 25390 // The incoming lanes are zero or undef, it doesn't matter which ones we 25391 // are using. 25392 Mask.push_back(OpMask[OpIdx]); 25393 continue; 25394 } 25395 25396 // Ok, we have non-zero lanes, map them through. 25397 Mask.push_back(OpMask[OpIdx] * OpRatio + 25398 RootMaskedIdx % OpRatio); 25399 } 25400 25401 // Handle the all undef/zero cases early. 25402 if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) { 25403 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())); 25404 return true; 25405 } 25406 if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) { 25407 // TODO - should we handle the mixed zero/undef case as well? 
Just returning 25408 // a zero mask will lose information on undef elements possibly reducing 25409 // future combine possibilities. 25410 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(), 25411 Subtarget, DAG, SDLoc(Root))); 25412 return true; 25413 } 25414 25415 int MaskSize = Mask.size(); 25416 bool UseInput0 = std::any_of(Mask.begin(), Mask.end(), 25417 [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; }); 25418 bool UseInput1 = std::any_of(Mask.begin(), Mask.end(), 25419 [MaskSize](int Idx) { return MaskSize <= Idx; }); 25420 25421 // At the moment we can only combine unary shuffle mask cases. 25422 if (UseInput0 && UseInput1) 25423 return false; 25424 else if (UseInput1) { 25425 std::swap(Input0, Input1); 25426 ShuffleVectorSDNode::commuteMask(Mask); 25427 } 25428 25429 assert(Input0 && "Shuffle with no inputs detected"); 25430 25431 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); 25432 25433 // See if we can recurse into Input0 (if it's a target shuffle). 25434 if (Op->isOnlyUserOf(Input0.getNode()) && 25435 combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, 25436 HasVariableMask, DAG, DCI, Subtarget)) 25437 return true; 25438 25439 // Minor canonicalization of the accumulated shuffle mask to make it easier 25440 // to match below. All this does is detect masks with sequential pairs of 25441 // elements, and shrink them to the half-width mask. It does this in a loop 25442 // so it will reduce the size of the mask to the minimal width mask which 25443 // performs an equivalent shuffle. 25444 SmallVector<int, 16> WidenedMask; 25445 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { 25446 Mask = std::move(WidenedMask); 25447 } 25448 25449 return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG, 25450 DCI, Subtarget); 25451 } 25452 25453 /// \brief Get the PSHUF-style mask from PSHUF node. 25454 /// 25455 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 25456 /// PSHUF-style masks that can be reused with such instructions. 25457 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { 25458 MVT VT = N.getSimpleValueType(); 25459 SmallVector<int, 4> Mask; 25460 SmallVector<SDValue, 2> Ops; 25461 bool IsUnary; 25462 bool HaveMask = 25463 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); 25464 (void)HaveMask; 25465 assert(HaveMask); 25466 25467 // If we have more than 128-bits, only the low 128-bits of shuffle mask 25468 // matter. Check that the upper masks are repeats and remove them. 25469 if (VT.getSizeInBits() > 128) { 25470 int LaneElts = 128 / VT.getScalarSizeInBits(); 25471 #ifndef NDEBUG 25472 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) 25473 for (int j = 0; j < LaneElts; ++j) 25474 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && 25475 "Mask doesn't repeat in high 128-bit lanes!"); 25476 #endif 25477 Mask.resize(LaneElts); 25478 } 25479 25480 switch (N.getOpcode()) { 25481 case X86ISD::PSHUFD: 25482 return Mask; 25483 case X86ISD::PSHUFLW: 25484 Mask.resize(4); 25485 return Mask; 25486 case X86ISD::PSHUFHW: 25487 Mask.erase(Mask.begin(), Mask.begin() + 4); 25488 for (int &M : Mask) 25489 M -= 4; 25490 return Mask; 25491 default: 25492 llvm_unreachable("No valid shuffle instruction found!"); 25493 } 25494 } 25495 25496 /// \brief Search for a combinable shuffle across a chain ending in pshufd. 
25497 /// 25498 /// We walk up the chain and look for a combinable shuffle, skipping over 25499 /// shuffles that we could hoist this shuffle's transformation past without 25500 /// altering anything. 25501 static SDValue 25502 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, 25503 SelectionDAG &DAG, 25504 TargetLowering::DAGCombinerInfo &DCI) { 25505 assert(N.getOpcode() == X86ISD::PSHUFD && 25506 "Called with something other than an x86 128-bit half shuffle!"); 25507 SDLoc DL(N); 25508 25509 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack 25510 // of the shuffles in the chain so that we can form a fresh chain to replace 25511 // this one. 25512 SmallVector<SDValue, 8> Chain; 25513 SDValue V = N.getOperand(0); 25514 for (; V.hasOneUse(); V = V.getOperand(0)) { 25515 switch (V.getOpcode()) { 25516 default: 25517 return SDValue(); // Nothing combined! 25518 25519 case ISD::BITCAST: 25520 // Skip bitcasts as we always know the type for the target specific 25521 // instructions. 25522 continue; 25523 25524 case X86ISD::PSHUFD: 25525 // Found another dword shuffle. 25526 break; 25527 25528 case X86ISD::PSHUFLW: 25529 // Check that the low words (being shuffled) are the identity in the 25530 // dword shuffle, and the high words are self-contained. 25531 if (Mask[0] != 0 || Mask[1] != 1 || 25532 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) 25533 return SDValue(); 25534 25535 Chain.push_back(V); 25536 continue; 25537 25538 case X86ISD::PSHUFHW: 25539 // Check that the high words (being shuffled) are the identity in the 25540 // dword shuffle, and the low words are self-contained. 25541 if (Mask[2] != 2 || Mask[3] != 3 || 25542 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) 25543 return SDValue(); 25544 25545 Chain.push_back(V); 25546 continue; 25547 25548 case X86ISD::UNPCKL: 25549 case X86ISD::UNPCKH: 25550 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword 25551 // shuffle into a preceding word shuffle. 25552 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && 25553 V.getSimpleValueType().getVectorElementType() != MVT::i16) 25554 return SDValue(); 25555 25556 // Search for a half-shuffle which we can combine with. 25557 unsigned CombineOp = 25558 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; 25559 if (V.getOperand(0) != V.getOperand(1) || 25560 !V->isOnlyUserOf(V.getOperand(0).getNode())) 25561 return SDValue(); 25562 Chain.push_back(V); 25563 V = V.getOperand(0); 25564 do { 25565 switch (V.getOpcode()) { 25566 default: 25567 return SDValue(); // Nothing to combine. 25568 25569 case X86ISD::PSHUFLW: 25570 case X86ISD::PSHUFHW: 25571 if (V.getOpcode() == CombineOp) 25572 break; 25573 25574 Chain.push_back(V); 25575 25576 // Fallthrough! 25577 case ISD::BITCAST: 25578 V = V.getOperand(0); 25579 continue; 25580 } 25581 break; 25582 } while (V.hasOneUse()); 25583 break; 25584 } 25585 // Break out of the loop if we break out of the switch. 25586 break; 25587 } 25588 25589 if (!V.hasOneUse()) 25590 // We fell out of the loop without finding a viable combining instruction. 25591 return SDValue(); 25592 25593 // Merge this node's mask and our incoming mask. 25594 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 25595 for (int &M : Mask) 25596 M = VMask[M]; 25597 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), 25598 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 25599 25600 // Rebuild the chain around this new shuffle. 
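// (Chain holds the PSHUFLW/PSHUFHW/UNPCK nodes we skipped while walking up;
// they are re-created here on top of the merged dword shuffle before the
// result replaces N.)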
25601 while (!Chain.empty()) { 25602 SDValue W = Chain.pop_back_val(); 25603 25604 if (V.getValueType() != W.getOperand(0).getValueType()) 25605 V = DAG.getBitcast(W.getOperand(0).getValueType(), V); 25606 25607 switch (W.getOpcode()) { 25608 default: 25609 llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); 25610 25611 case X86ISD::UNPCKL: 25612 case X86ISD::UNPCKH: 25613 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); 25614 break; 25615 25616 case X86ISD::PSHUFD: 25617 case X86ISD::PSHUFLW: 25618 case X86ISD::PSHUFHW: 25619 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); 25620 break; 25621 } 25622 } 25623 if (V.getValueType() != N.getValueType()) 25624 V = DAG.getBitcast(N.getValueType(), V); 25625 25626 // Return the new chain to replace N. 25627 return V; 25628 } 25629 25630 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or 25631 /// pshufhw. 25632 /// 25633 /// We walk up the chain, skipping shuffles of the other half and looking 25634 /// through shuffles which switch halves trying to find a shuffle of the same 25635 /// pair of dwords. 25636 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, 25637 SelectionDAG &DAG, 25638 TargetLowering::DAGCombinerInfo &DCI) { 25639 assert( 25640 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && 25641 "Called with something other than an x86 128-bit half shuffle!"); 25642 SDLoc DL(N); 25643 unsigned CombineOpcode = N.getOpcode(); 25644 25645 // Walk up a single-use chain looking for a combinable shuffle. 25646 SDValue V = N.getOperand(0); 25647 for (; V.hasOneUse(); V = V.getOperand(0)) { 25648 switch (V.getOpcode()) { 25649 default: 25650 return false; // Nothing combined! 25651 25652 case ISD::BITCAST: 25653 // Skip bitcasts as we always know the type for the target specific 25654 // instructions. 25655 continue; 25656 25657 case X86ISD::PSHUFLW: 25658 case X86ISD::PSHUFHW: 25659 if (V.getOpcode() == CombineOpcode) 25660 break; 25661 25662 // Other-half shuffles are no-ops. 25663 continue; 25664 } 25665 // Break out of the loop if we break out of the switch. 25666 break; 25667 } 25668 25669 if (!V.hasOneUse()) 25670 // We fell out of the loop without finding a viable combining instruction. 25671 return false; 25672 25673 // Combine away the bottom node as its shuffle will be accumulated into 25674 // a preceding shuffle. 25675 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); 25676 25677 // Record the old value. 25678 SDValue Old = V; 25679 25680 // Merge this node's mask and our incoming mask (adjusted to account for all 25681 // the pshufd instructions encountered). 25682 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 25683 for (int &M : Mask) 25684 M = VMask[M]; 25685 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), 25686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); 25687 25688 // Check that the shuffles didn't cancel each other out. If not, we need to 25689 // combine to the new one. 25690 if (Old != V) 25691 // Replace the combinable shuffle with the combined one, updating all users 25692 // so that we re-evaluate the chain here. 25693 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); 25694 25695 return true; 25696 } 25697 25698 /// \brief Try to combine x86 target specific shuffles. 
25699 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, 25700 TargetLowering::DAGCombinerInfo &DCI, 25701 const X86Subtarget &Subtarget) { 25702 SDLoc DL(N); 25703 MVT VT = N.getSimpleValueType(); 25704 SmallVector<int, 4> Mask; 25705 25706 switch (N.getOpcode()) { 25707 case X86ISD::PSHUFD: 25708 case X86ISD::PSHUFLW: 25709 case X86ISD::PSHUFHW: 25710 Mask = getPSHUFShuffleMask(N); 25711 assert(Mask.size() == 4); 25712 break; 25713 case X86ISD::UNPCKL: { 25714 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in 25715 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE 25716 // moves upper half elements into the lower half part. For example: 25717 // 25718 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, 25719 // undef:v16i8 25720 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 25721 // 25722 // will be combined to: 25723 // 25724 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 25725 25726 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not 25727 // happen due to advanced instructions. 25728 if (!VT.is128BitVector()) 25729 return SDValue(); 25730 25731 auto Op0 = N.getOperand(0); 25732 auto Op1 = N.getOperand(1); 25733 if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { 25734 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); 25735 25736 unsigned NumElts = VT.getVectorNumElements(); 25737 SmallVector<int, 8> ExpectedMask(NumElts, -1); 25738 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, 25739 NumElts / 2); 25740 25741 auto ShufOp = Op1.getOperand(0); 25742 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) 25743 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); 25744 } 25745 return SDValue(); 25746 } 25747 case X86ISD::BLENDI: { 25748 SDValue V0 = N->getOperand(0); 25749 SDValue V1 = N->getOperand(1); 25750 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && 25751 "Unexpected input vector types"); 25752 25753 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector 25754 // operands and changing the mask to 1. This saves us a bunch of 25755 // pattern-matching possibilities related to scalar math ops in SSE/AVX. 25756 // x86InstrInfo knows how to commute this back after instruction selection 25757 // if it would help register allocation. 25758 25759 // TODO: If optimizing for size or a processor that doesn't suffer from 25760 // partial register update stalls, this should be transformed into a MOVSD 25761 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. 25762 25763 if (VT == MVT::v2f64) 25764 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) 25765 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { 25766 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); 25767 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); 25768 } 25769 25770 // Attempt to merge blend(insertps(x,y),zero). 25771 if (V0.getOpcode() == X86ISD::INSERTPS || 25772 V1.getOpcode() == X86ISD::INSERTPS) { 25773 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); 25774 25775 // Determine which elements are known to be zero. 25776 SmallVector<int, 8> TargetMask; 25777 SmallVector<SDValue, 2> BlendOps; 25778 if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps)) 25779 return SDValue(); 25780 25781 // Helper function to take inner insertps node and attempt to 25782 // merge the blend with zero into its zero mask. 
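// (Offset is 0 when V is the first blend operand and 4 when it is the second,
// matching how the blend's combined target mask indexes its two inputs.)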
25783 auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) { 25784 if (V.getOpcode() != X86ISD::INSERTPS) 25785 return SDValue(); 25786 SDValue Op0 = V.getOperand(0); 25787 SDValue Op1 = V.getOperand(1); 25788 SDValue Op2 = V.getOperand(2); 25789 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); 25790 25791 // Check each element of the blend node's target mask - must either 25792 // be zeroable (and update the zero mask) or selects the element from 25793 // the inner insertps node. 25794 for (int i = 0; i != 4; ++i) 25795 if (TargetMask[i] < 0) 25796 InsertPSMask |= (1u << i); 25797 else if (TargetMask[i] != (i + Offset)) 25798 return SDValue(); 25799 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1, 25800 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 25801 }; 25802 25803 if (SDValue V = MergeInsertPSAndBlend(V0, 0)) 25804 return V; 25805 if (SDValue V = MergeInsertPSAndBlend(V1, 4)) 25806 return V; 25807 } 25808 return SDValue(); 25809 } 25810 case X86ISD::INSERTPS: { 25811 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); 25812 SDValue Op0 = N.getOperand(0); 25813 SDValue Op1 = N.getOperand(1); 25814 SDValue Op2 = N.getOperand(2); 25815 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); 25816 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; 25817 unsigned DstIdx = (InsertPSMask >> 4) & 0x3; 25818 unsigned ZeroMask = InsertPSMask & 0xF; 25819 25820 // If we zero out all elements from Op0 then we don't need to reference it. 25821 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) 25822 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, 25823 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 25824 25825 // If we zero out the element from Op1 then we don't need to reference it. 25826 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) 25827 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), 25828 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 25829 25830 // Attempt to merge insertps Op1 with an inner target shuffle node. 25831 SmallVector<int, 8> TargetMask1; 25832 SmallVector<SDValue, 2> Ops1; 25833 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { 25834 int M = TargetMask1[SrcIdx]; 25835 if (isUndefOrZero(M)) { 25836 // Zero/UNDEF insertion - zero out element and remove dependency. 25837 InsertPSMask |= (1u << DstIdx); 25838 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), 25839 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 25840 } 25841 // Update insertps mask srcidx and reference the source input directly. 25842 assert(0 <= M && M < 8 && "Shuffle index out of range"); 25843 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); 25844 Op1 = Ops1[M < 4 ? 0 : 1]; 25845 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, 25846 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 25847 } 25848 25849 // Attempt to merge insertps Op0 with an inner target shuffle node. 25850 SmallVector<int, 8> TargetMask0; 25851 SmallVector<SDValue, 2> Ops0; 25852 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) 25853 return SDValue(); 25854 25855 bool Updated = false; 25856 bool UseInput00 = false; 25857 bool UseInput01 = false; 25858 for (int i = 0; i != 4; ++i) { 25859 int M = TargetMask0[i]; 25860 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { 25861 // No change if element is already zero or the inserted element. 25862 continue; 25863 } else if (isUndefOrZero(M)) { 25864 // If the target mask is undef/zero then we must zero the element. 
25865 InsertPSMask |= (1u << i); 25866 Updated = true; 25867 continue; 25868 } 25869 25870 // The input vector element must be inline. 25871 if (M != i && M != (i + 4)) 25872 return SDValue(); 25873 25874 // Determine which inputs of the target shuffle we're using. 25875 UseInput00 |= (0 <= M && M < 4); 25876 UseInput01 |= (4 <= M); 25877 } 25878 25879 // If we're not using both inputs of the target shuffle then use the 25880 // referenced input directly. 25881 if (UseInput00 && !UseInput01) { 25882 Updated = true; 25883 Op0 = Ops0[0]; 25884 } else if (!UseInput00 && UseInput01) { 25885 Updated = true; 25886 Op0 = Ops0[1]; 25887 } 25888 25889 if (Updated) 25890 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, 25891 DAG.getConstant(InsertPSMask, DL, MVT::i8)); 25892 25893 return SDValue(); 25894 } 25895 default: 25896 return SDValue(); 25897 } 25898 25899 // Nuke no-op shuffles that show up after combining. 25900 if (isNoopShuffleMask(Mask)) 25901 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); 25902 25903 // Look for simplifications involving one or two shuffle instructions. 25904 SDValue V = N.getOperand(0); 25905 switch (N.getOpcode()) { 25906 default: 25907 break; 25908 case X86ISD::PSHUFLW: 25909 case X86ISD::PSHUFHW: 25910 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); 25911 25912 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) 25913 return SDValue(); // We combined away this shuffle, so we're done. 25914 25915 // See if this reduces to a PSHUFD which is no more expensive and can 25916 // combine with more operations. Note that it has to at least flip the 25917 // dwords as otherwise it would have been removed as a no-op. 25918 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { 25919 int DMask[] = {0, 1, 2, 3}; 25920 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; 25921 DMask[DOffset + 0] = DOffset + 1; 25922 DMask[DOffset + 1] = DOffset + 0; 25923 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); 25924 V = DAG.getBitcast(DVT, V); 25925 DCI.AddToWorklist(V.getNode()); 25926 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, 25927 getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); 25928 DCI.AddToWorklist(V.getNode()); 25929 return DAG.getBitcast(VT, V); 25930 } 25931 25932 // Look for shuffle patterns which can be implemented as a single unpack. 25933 // FIXME: This doesn't handle the location of the PSHUFD generically, and 25934 // only works when we have a PSHUFD followed by two half-shuffles. 25935 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && 25936 (V.getOpcode() == X86ISD::PSHUFLW || 25937 V.getOpcode() == X86ISD::PSHUFHW) && 25938 V.getOpcode() != N.getOpcode() && 25939 V.hasOneUse()) { 25940 SDValue D = V.getOperand(0); 25941 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) 25942 D = D.getOperand(0); 25943 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { 25944 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 25945 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); 25946 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; 25947 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; 25948 int WordMask[8]; 25949 for (int i = 0; i < 4; ++i) { 25950 WordMask[i + NOffset] = Mask[i] + NOffset; 25951 WordMask[i + VOffset] = VMask[i] + VOffset; 25952 } 25953 // Map the word mask through the DWord mask. 
25954 int MappedMask[8]; 25955 for (int i = 0; i < 8; ++i) 25956 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; 25957 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || 25958 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { 25959 // We can replace all three shuffles with an unpack. 25960 V = DAG.getBitcast(VT, D.getOperand(0)); 25961 DCI.AddToWorklist(V.getNode()); 25962 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL 25963 : X86ISD::UNPCKH, 25964 DL, VT, V, V); 25965 } 25966 } 25967 } 25968 25969 break; 25970 25971 case X86ISD::PSHUFD: 25972 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) 25973 return NewN; 25974 25975 break; 25976 } 25977 25978 return SDValue(); 25979 } 25980 25981 /// \brief Try to combine a shuffle into a target-specific add-sub node. 25982 /// 25983 /// We combine this directly on the abstract vector shuffle nodes so it is 25984 /// easier to generically match. We also insert dummy vector shuffle nodes for 25985 /// the operands which explicitly discard the lanes which are unused by this 25986 /// operation to try to flow through the rest of the combiner the fact that 25987 /// they're unused. 25988 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget, 25989 SelectionDAG &DAG) { 25990 SDLoc DL(N); 25991 EVT VT = N->getValueType(0); 25992 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && 25993 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) 25994 return SDValue(); 25995 25996 // We only handle target-independent shuffles. 25997 // FIXME: It would be easy and harmless to use the target shuffle mask 25998 // extraction tool to support more. 25999 if (N->getOpcode() != ISD::VECTOR_SHUFFLE) 26000 return SDValue(); 26001 26002 auto *SVN = cast<ShuffleVectorSDNode>(N); 26003 SmallVector<int, 8> Mask; 26004 for (int M : SVN->getMask()) 26005 Mask.push_back(M); 26006 26007 SDValue V1 = N->getOperand(0); 26008 SDValue V2 = N->getOperand(1); 26009 26010 // We require the first shuffle operand to be the FSUB node, and the second to 26011 // be the FADD node. 26012 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { 26013 ShuffleVectorSDNode::commuteMask(Mask); 26014 std::swap(V1, V2); 26015 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) 26016 return SDValue(); 26017 26018 // If there are other uses of these operations we can't fold them. 26019 if (!V1->hasOneUse() || !V2->hasOneUse()) 26020 return SDValue(); 26021 26022 // Ensure that both operations have the same operands. Note that we can 26023 // commute the FADD operands. 26024 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); 26025 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && 26026 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) 26027 return SDValue(); 26028 26029 // We're looking for blends between FADD and FSUB nodes. We insist on these 26030 // nodes being lined up in a specific expected pattern. 
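// For example, for v2f64 the ADDSUB node computes
// { LHS[0] - RHS[0], LHS[1] + RHS[1] }, which is exactly the {0, 3} blend of
// the FSUB and FADD nodes checked for below.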
26031 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || 26032 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || 26033 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) 26034 return SDValue(); 26035 26036 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); 26037 } 26038 26039 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, 26040 TargetLowering::DAGCombinerInfo &DCI, 26041 const X86Subtarget &Subtarget) { 26042 SDLoc dl(N); 26043 EVT VT = N->getValueType(0); 26044 26045 // Don't create instructions with illegal types after legalize types has run. 26046 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26047 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 26048 return SDValue(); 26049 26050 // If we have legalized the vector types, look for blends of FADD and FSUB 26051 // nodes that we can fuse into an ADDSUB node. 26052 if (TLI.isTypeLegal(VT)) 26053 if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) 26054 return AddSub; 26055 26056 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 26057 if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() && 26058 N->getOpcode() == ISD::VECTOR_SHUFFLE) 26059 return combineShuffle256(N, DAG, DCI, Subtarget); 26060 26061 // During Type Legalization, when promoting illegal vector types, 26062 // the backend might introduce new shuffle dag nodes and bitcasts. 26063 // 26064 // This code performs the following transformation: 26065 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> 26066 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) 26067 // 26068 // We do this only if both the bitcast and the BINOP dag nodes have 26069 // one use. Also, perform this transformation only if the new binary 26070 // operation is legal. This is to avoid introducing dag nodes that 26071 // potentially need to be further expanded (or custom lowered) into a 26072 // less optimal sequence of dag nodes. 
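// For example, with VT = v8i16 and a bitcast from a v4i32 ADD:
//   (v8i16 (shuffle (bitcast (add (v4i32 A), (v4i32 B))), undef,
//                   <0, 2, 4, 6, -1, -1, -1, -1>))
// is rebuilt as
//   (v8i16 (shuffle (add (bitcast A), (bitcast B)), undef, <0, 2, 4, 6, ...>))
// so the ADD is performed directly in the shuffle's type.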
26073 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && 26074 N->getOpcode() == ISD::VECTOR_SHUFFLE && 26075 N->getOperand(0).getOpcode() == ISD::BITCAST && 26076 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { 26077 SDValue N0 = N->getOperand(0); 26078 SDValue N1 = N->getOperand(1); 26079 26080 SDValue BC0 = N0.getOperand(0); 26081 EVT SVT = BC0.getValueType(); 26082 unsigned Opcode = BC0.getOpcode(); 26083 unsigned NumElts = VT.getVectorNumElements(); 26084 26085 if (BC0.hasOneUse() && SVT.isVector() && 26086 SVT.getVectorNumElements() * 2 == NumElts && 26087 TLI.isOperationLegal(Opcode, VT)) { 26088 bool CanFold = false; 26089 switch (Opcode) { 26090 default : break; 26091 case ISD::ADD : 26092 case ISD::FADD : 26093 case ISD::SUB : 26094 case ISD::FSUB : 26095 case ISD::MUL : 26096 case ISD::FMUL : 26097 CanFold = true; 26098 } 26099 26100 unsigned SVTNumElts = SVT.getVectorNumElements(); 26101 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 26102 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) 26103 CanFold = SVOp->getMaskElt(i) == (int)(i * 2); 26104 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) 26105 CanFold = SVOp->getMaskElt(i) < 0; 26106 26107 if (CanFold) { 26108 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); 26109 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); 26110 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); 26111 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); 26112 } 26113 } 26114 } 26115 26116 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 26117 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 26118 // consecutive, non-overlapping, and in the right order. 26119 SmallVector<SDValue, 16> Elts; 26120 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 26121 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 26122 26123 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) 26124 return LD; 26125 26126 if (isTargetShuffle(N->getOpcode())) { 26127 if (SDValue Shuffle = 26128 combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget)) 26129 return Shuffle; 26130 26131 // Try recursively combining arbitrary sequences of x86 shuffle 26132 // instructions into higher-order shuffles. We do this after combining 26133 // specific PSHUF instruction sequences into their minimal form so that we 26134 // can evaluate how many specialized shuffle instructions are involved in 26135 // a particular chain. 26136 SmallVector<int, 1> NonceMask; // Just a placeholder. 26137 NonceMask.push_back(0); 26138 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, 26139 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, 26140 DCI, Subtarget)) 26141 return SDValue(); // This routine will use CombineTo to replace N. 26142 } 26143 26144 return SDValue(); 26145 } 26146 26147 /// Check if a vector extract from a target-specific shuffle of a load can be 26148 /// folded into a single element load. 26149 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but 26150 /// shuffles have been custom lowered so we need to handle those here. 
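/// For example, extracting element 0 from (X86ISD::PSHUFD V, imm) where the
/// immediate routes element 3 of a loaded vector into lane 0 can be rewritten
/// so the generic combiner folds it to a single scalar load of element 3.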
26151 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, 26152 TargetLowering::DAGCombinerInfo &DCI) { 26153 if (DCI.isBeforeLegalizeOps()) 26154 return SDValue(); 26155 26156 SDValue InVec = N->getOperand(0); 26157 SDValue EltNo = N->getOperand(1); 26158 EVT EltVT = N->getValueType(0); 26159 26160 if (!isa<ConstantSDNode>(EltNo)) 26161 return SDValue(); 26162 26163 EVT OriginalVT = InVec.getValueType(); 26164 26165 if (InVec.getOpcode() == ISD::BITCAST) { 26166 // Don't duplicate a load with other uses. 26167 if (!InVec.hasOneUse()) 26168 return SDValue(); 26169 EVT BCVT = InVec.getOperand(0).getValueType(); 26170 if (!BCVT.isVector() || 26171 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) 26172 return SDValue(); 26173 InVec = InVec.getOperand(0); 26174 } 26175 26176 EVT CurrentVT = InVec.getValueType(); 26177 26178 if (!isTargetShuffle(InVec.getOpcode())) 26179 return SDValue(); 26180 26181 // Don't duplicate a load with other uses. 26182 if (!InVec.hasOneUse()) 26183 return SDValue(); 26184 26185 SmallVector<int, 16> ShuffleMask; 26186 SmallVector<SDValue, 2> ShuffleOps; 26187 bool UnaryShuffle; 26188 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, 26189 ShuffleOps, ShuffleMask, UnaryShuffle)) 26190 return SDValue(); 26191 26192 // Select the input vector, guarding against out of range extract vector. 26193 unsigned NumElems = CurrentVT.getVectorNumElements(); 26194 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 26195 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; 26196 26197 if (Idx == SM_SentinelZero) 26198 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) 26199 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); 26200 if (Idx == SM_SentinelUndef) 26201 return DAG.getUNDEF(EltVT); 26202 26203 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); 26204 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] 26205 : ShuffleOps[1]; 26206 26207 // If inputs to shuffle are the same for both ops, then allow 2 uses 26208 unsigned AllowedUses = 26209 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; 26210 26211 if (LdNode.getOpcode() == ISD::BITCAST) { 26212 // Don't duplicate a load with other uses. 26213 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) 26214 return SDValue(); 26215 26216 AllowedUses = 1; // only allow 1 load use if we have a bitcast 26217 LdNode = LdNode.getOperand(0); 26218 } 26219 26220 if (!ISD::isNormalLoad(LdNode.getNode())) 26221 return SDValue(); 26222 26223 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); 26224 26225 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) 26226 return SDValue(); 26227 26228 // If there's a bitcast before the shuffle, check if the load type and 26229 // alignment is valid. 26230 unsigned Align = LN0->getAlignment(); 26231 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26232 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( 26233 EltVT.getTypeForEVT(*DAG.getContext())); 26234 26235 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) 26236 return SDValue(); 26237 26238 // All checks match so transform back to vector_shuffle so that DAG combiner 26239 // can finish the job 26240 SDLoc dl(N); 26241 26242 // Create shuffle node taking into account the case that its a unary shuffle 26243 SDValue Shuffle = (UnaryShuffle) ? 
DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; 26244 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, 26245 ShuffleMask); 26246 Shuffle = DAG.getBitcast(OriginalVT, Shuffle); 26247 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 26248 EltNo); 26249 } 26250 26251 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, 26252 const X86Subtarget &Subtarget) { 26253 SDValue N0 = N->getOperand(0); 26254 EVT VT = N->getValueType(0); 26255 26256 // Detect bitcasts between i32 to x86mmx low word. Since MMX types are 26257 // special and don't usually play with other vector types, it's better to 26258 // handle them early to be sure we emit efficient code by avoiding 26259 // store-load conversions. 26260 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && 26261 N0.getValueType() == MVT::v2i32 && 26262 isNullConstant(N0.getOperand(1))) { 26263 SDValue N00 = N0->getOperand(0); 26264 if (N00.getValueType() == MVT::i32) 26265 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); 26266 } 26267 26268 // Convert a bitcasted integer logic operation that has one bitcasted 26269 // floating-point operand and one constant operand into a floating-point 26270 // logic operation. This may create a load of the constant, but that is 26271 // cheaper than materializing the constant in an integer register and 26272 // transferring it to an SSE register or transferring the SSE operand to 26273 // integer register and back. 26274 unsigned FPOpcode; 26275 switch (N0.getOpcode()) { 26276 case ISD::AND: FPOpcode = X86ISD::FAND; break; 26277 case ISD::OR: FPOpcode = X86ISD::FOR; break; 26278 case ISD::XOR: FPOpcode = X86ISD::FXOR; break; 26279 default: return SDValue(); 26280 } 26281 if (((Subtarget.hasSSE1() && VT == MVT::f32) || 26282 (Subtarget.hasSSE2() && VT == MVT::f64)) && 26283 isa<ConstantSDNode>(N0.getOperand(1)) && 26284 N0.getOperand(0).getOpcode() == ISD::BITCAST && 26285 N0.getOperand(0).getOperand(0).getValueType() == VT) { 26286 SDValue N000 = N0.getOperand(0).getOperand(0); 26287 SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); 26288 return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); 26289 } 26290 26291 return SDValue(); 26292 } 26293 26294 /// Detect vector gather/scatter index generation and convert it from being a 26295 /// bunch of shuffles and extracts into a somewhat faster sequence. 26296 /// For i686, the best sequence is apparently storing the value and loading 26297 /// scalars back, while for x64 we should use 64-bit extracts and shifts. 26298 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, 26299 TargetLowering::DAGCombinerInfo &DCI) { 26300 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) 26301 return NewOp; 26302 26303 SDValue InputVector = N->getOperand(0); 26304 SDLoc dl(InputVector); 26305 // Detect mmx to i32 conversion through a v2i32 elt extract. 26306 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && 26307 N->getValueType(0) == MVT::i32 && 26308 InputVector.getValueType() == MVT::v2i32 && 26309 isa<ConstantSDNode>(N->getOperand(1)) && 26310 N->getConstantOperandVal(1) == 0) { 26311 SDValue MMXSrc = InputVector.getNode()->getOperand(0); 26312 26313 // The bitcast source is a direct mmx result. 
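// e.g. (i32 (extract_vector_elt (v2i32 (bitcast (x86mmx X))), 0)) is combined
// to (X86ISD::MMX_MOVD2W X) instead of going through memory.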
26314 if (MMXSrc.getValueType() == MVT::x86mmx) 26315 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); 26316 } 26317 26318 EVT VT = N->getValueType(0); 26319 26320 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && 26321 InputVector.getOpcode() == ISD::BITCAST && 26322 isa<ConstantSDNode>(InputVector.getOperand(0))) { 26323 uint64_t ExtractedElt = 26324 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 26325 uint64_t InputValue = 26326 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); 26327 uint64_t Res = (InputValue >> ExtractedElt) & 1; 26328 return DAG.getConstant(Res, dl, MVT::i1); 26329 } 26330 // Only operate on vectors of 4 elements, where the alternative shuffling 26331 // gets to be more expensive. 26332 if (InputVector.getValueType() != MVT::v4i32) 26333 return SDValue(); 26334 26335 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 26336 // single use which is a sign-extend or zero-extend, and all elements are 26337 // used. 26338 SmallVector<SDNode *, 4> Uses; 26339 unsigned ExtractedElements = 0; 26340 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 26341 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 26342 if (UI.getUse().getResNo() != InputVector.getResNo()) 26343 return SDValue(); 26344 26345 SDNode *Extract = *UI; 26346 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 26347 return SDValue(); 26348 26349 if (Extract->getValueType(0) != MVT::i32) 26350 return SDValue(); 26351 if (!Extract->hasOneUse()) 26352 return SDValue(); 26353 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 26354 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 26355 return SDValue(); 26356 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 26357 return SDValue(); 26358 26359 // Record which element was extracted. 26360 ExtractedElements |= 26361 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 26362 26363 Uses.push_back(Extract); 26364 } 26365 26366 // If not all the elements were used, this may not be worthwhile. 26367 if (ExtractedElements != 15) 26368 return SDValue(); 26369 26370 // Ok, we've now decided to do the transformation. 26371 // If 64-bit shifts are legal, use the extract-shift sequence, 26372 // otherwise bounce the vector off the cache. 26373 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26374 SDValue Vals[4]; 26375 26376 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { 26377 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); 26378 auto &DL = DAG.getDataLayout(); 26379 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); 26380 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, 26381 DAG.getConstant(0, dl, VecIdxTy)); 26382 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, 26383 DAG.getConstant(1, dl, VecIdxTy)); 26384 26385 SDValue ShAmt = DAG.getConstant( 26386 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); 26387 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); 26388 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, 26389 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); 26390 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); 26391 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, 26392 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); 26393 } else { 26394 // Store the value to a temporary stack slot. 
26395 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 26396 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 26397 MachinePointerInfo(), false, false, 0); 26398 26399 EVT ElementType = InputVector.getValueType().getVectorElementType(); 26400 unsigned EltSize = ElementType.getSizeInBits() / 8; 26401 26402 // Replace each use (extract) with a load of the appropriate element. 26403 for (unsigned i = 0; i < 4; ++i) { 26404 uint64_t Offset = EltSize * i; 26405 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); 26406 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); 26407 26408 SDValue ScalarAddr = 26409 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); 26410 26411 // Load the scalar. 26412 Vals[i] = DAG.getLoad(ElementType, dl, Ch, 26413 ScalarAddr, MachinePointerInfo(), 26414 false, false, false, 0); 26415 26416 } 26417 } 26418 26419 // Replace the extracts 26420 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 26421 UE = Uses.end(); UI != UE; ++UI) { 26422 SDNode *Extract = *UI; 26423 26424 SDValue Idx = Extract->getOperand(1); 26425 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 26426 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); 26427 } 26428 26429 // The replacement was made in place; don't return anything. 26430 return SDValue(); 26431 } 26432 26433 /// Do target-specific dag combines on SELECT and VSELECT nodes. 26434 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, 26435 TargetLowering::DAGCombinerInfo &DCI, 26436 const X86Subtarget &Subtarget) { 26437 SDLoc DL(N); 26438 SDValue Cond = N->getOperand(0); 26439 // Get the LHS/RHS of the select. 26440 SDValue LHS = N->getOperand(1); 26441 SDValue RHS = N->getOperand(2); 26442 EVT VT = LHS.getValueType(); 26443 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 26444 26445 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 26446 // instructions match the semantics of the common C idiom x<y?x:y but not 26447 // x<=y?x:y, because of how they handle negative zero (which can be 26448 // ignored in unsafe-math mode). 26449 // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 26450 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 26451 VT != MVT::f80 && VT != MVT::f128 && 26452 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && 26453 (Subtarget.hasSSE2() || 26454 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { 26455 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 26456 26457 unsigned Opcode = 0; 26458 // Check for x CC y ? x : y. 26459 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 26460 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 26461 switch (CC) { 26462 default: break; 26463 case ISD::SETULT: 26464 // Converting this to a min would handle NaNs incorrectly, and swapping 26465 // the operands would cause it to handle comparisons between positive 26466 // and negative zero incorrectly. 26467 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 26468 if (!DAG.getTarget().Options.UnsafeFPMath && 26469 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 26470 break; 26471 std::swap(LHS, RHS); 26472 } 26473 Opcode = X86ISD::FMIN; 26474 break; 26475 case ISD::SETOLE: 26476 // Converting this to a min would handle comparisons between positive 26477 // and negative zero incorrectly. 
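// e.g. with LHS = -0.0 and RHS = +0.0 the select 'x <= y ? x : y' yields
// -0.0, but MINPS/MINSD return the second operand when the inputs compare
// equal, so FMIN(LHS, RHS) would yield +0.0.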
26478 if (!DAG.getTarget().Options.UnsafeFPMath && 26479 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 26480 break; 26481 Opcode = X86ISD::FMIN; 26482 break; 26483 case ISD::SETULE: 26484 // Converting this to a min would handle both negative zeros and NaNs 26485 // incorrectly, but we can swap the operands to fix both. 26486 std::swap(LHS, RHS); 26487 case ISD::SETOLT: 26488 case ISD::SETLT: 26489 case ISD::SETLE: 26490 Opcode = X86ISD::FMIN; 26491 break; 26492 26493 case ISD::SETOGE: 26494 // Converting this to a max would handle comparisons between positive 26495 // and negative zero incorrectly. 26496 if (!DAG.getTarget().Options.UnsafeFPMath && 26497 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 26498 break; 26499 Opcode = X86ISD::FMAX; 26500 break; 26501 case ISD::SETUGT: 26502 // Converting this to a max would handle NaNs incorrectly, and swapping 26503 // the operands would cause it to handle comparisons between positive 26504 // and negative zero incorrectly. 26505 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 26506 if (!DAG.getTarget().Options.UnsafeFPMath && 26507 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 26508 break; 26509 std::swap(LHS, RHS); 26510 } 26511 Opcode = X86ISD::FMAX; 26512 break; 26513 case ISD::SETUGE: 26514 // Converting this to a max would handle both negative zeros and NaNs 26515 // incorrectly, but we can swap the operands to fix both. 26516 std::swap(LHS, RHS); 26517 case ISD::SETOGT: 26518 case ISD::SETGT: 26519 case ISD::SETGE: 26520 Opcode = X86ISD::FMAX; 26521 break; 26522 } 26523 // Check for x CC y ? y : x -- a min/max with reversed arms. 26524 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 26525 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 26526 switch (CC) { 26527 default: break; 26528 case ISD::SETOGE: 26529 // Converting this to a min would handle comparisons between positive 26530 // and negative zero incorrectly, and swapping the operands would 26531 // cause it to handle NaNs incorrectly. 26532 if (!DAG.getTarget().Options.UnsafeFPMath && 26533 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 26534 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 26535 break; 26536 std::swap(LHS, RHS); 26537 } 26538 Opcode = X86ISD::FMIN; 26539 break; 26540 case ISD::SETUGT: 26541 // Converting this to a min would handle NaNs incorrectly. 26542 if (!DAG.getTarget().Options.UnsafeFPMath && 26543 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 26544 break; 26545 Opcode = X86ISD::FMIN; 26546 break; 26547 case ISD::SETUGE: 26548 // Converting this to a min would handle both negative zeros and NaNs 26549 // incorrectly, but we can swap the operands to fix both. 26550 std::swap(LHS, RHS); 26551 case ISD::SETOGT: 26552 case ISD::SETGT: 26553 case ISD::SETGE: 26554 Opcode = X86ISD::FMIN; 26555 break; 26556 26557 case ISD::SETULT: 26558 // Converting this to a max would handle NaNs incorrectly. 26559 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 26560 break; 26561 Opcode = X86ISD::FMAX; 26562 break; 26563 case ISD::SETOLE: 26564 // Converting this to a max would handle comparisons between positive 26565 // and negative zero incorrectly, and swapping the operands would 26566 // cause it to handle NaNs incorrectly. 
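// e.g. for 'x <= y ? y : x': with x = +0.0, y = -0.0 the select yields -0.0
// but FMAX(y, x) yields +0.0; swapping the operands fixes the zero case, yet
// with x = NaN, y = 1.0 the swapped FMAX(x, y) would yield 1.0 instead of NaN.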
26567 if (!DAG.getTarget().Options.UnsafeFPMath && 26568 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 26569 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 26570 break; 26571 std::swap(LHS, RHS); 26572 } 26573 Opcode = X86ISD::FMAX; 26574 break; 26575 case ISD::SETULE: 26576 // Converting this to a max would handle both negative zeros and NaNs 26577 // incorrectly, but we can swap the operands to fix both. 26578 std::swap(LHS, RHS); 26579 case ISD::SETOLT: 26580 case ISD::SETLT: 26581 case ISD::SETLE: 26582 Opcode = X86ISD::FMAX; 26583 break; 26584 } 26585 } 26586 26587 if (Opcode) 26588 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 26589 } 26590 26591 EVT CondVT = Cond.getValueType(); 26592 if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() && 26593 CondVT.getVectorElementType() == MVT::i1) { 26594 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper 26595 // lowering on KNL. In this case we convert it to 26596 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 26597 // The same situation for all 128 and 256-bit vectors of i8 and i16. 26598 // Since SKX these selects have a proper lowering. 26599 EVT OpVT = LHS.getValueType(); 26600 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && 26601 (OpVT.getVectorElementType() == MVT::i8 || 26602 OpVT.getVectorElementType() == MVT::i16) && 26603 !(Subtarget.hasBWI() && Subtarget.hasVLX())) { 26604 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); 26605 DCI.AddToWorklist(Cond.getNode()); 26606 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); 26607 } 26608 } 26609 // If this is a select between two integer constants, try to do some 26610 // optimizations. 26611 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 26612 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 26613 // Don't do this for crazy integer types. 26614 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 26615 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 26616 // so that TrueC (the true value) is larger than FalseC. 26617 bool NeedsCondInvert = false; 26618 26619 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 26620 // Efficiently invertible. 26621 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 26622 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 26623 isa<ConstantSDNode>(Cond.getOperand(1))))) { 26624 NeedsCondInvert = true; 26625 std::swap(TrueC, FalseC); 26626 } 26627 26628 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 26629 if (FalseC->getAPIntValue() == 0 && 26630 TrueC->getAPIntValue().isPowerOf2()) { 26631 if (NeedsCondInvert) // Invert the condition if needed. 26632 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 26633 DAG.getConstant(1, DL, Cond.getValueType())); 26634 26635 // Zero extend the condition if needed. 26636 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 26637 26638 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 26639 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 26640 DAG.getConstant(ShAmt, DL, MVT::i8)); 26641 } 26642 26643 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 26644 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 26645 if (NeedsCondInvert) // Invert the condition if needed. 26646 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 26647 DAG.getConstant(1, DL, Cond.getValueType())); 26648 26649 // Zero extend the condition if needed. 
26650 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
26651 FalseC->getValueType(0), Cond);
26652 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26653 SDValue(FalseC, 0));
26654 }
26655
26656 // Optimize cases that will turn into an LEA instruction. This requires
26657 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
26658 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
26659 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
26660 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
26661
26662 bool isFastMultiplier = false;
26663 if (Diff < 10) {
26664 switch ((unsigned char)Diff) {
26665 default: break;
26666 case 1: // result = add base, cond
26667 case 2: // result = lea base( , cond*2)
26668 case 3: // result = lea base(cond, cond*2)
26669 case 4: // result = lea base( , cond*4)
26670 case 5: // result = lea base(cond, cond*4)
26671 case 8: // result = lea base( , cond*8)
26672 case 9: // result = lea base(cond, cond*8)
26673 isFastMultiplier = true;
26674 break;
26675 }
26676 }
26677
26678 if (isFastMultiplier) {
26679 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
26680 if (NeedsCondInvert) // Invert the condition if needed.
26681 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26682 DAG.getConstant(1, DL, Cond.getValueType()));
26683
26684 // Zero extend the condition if needed.
26685 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
26686 Cond);
26687 // Scale the condition by the difference.
26688 if (Diff != 1)
26689 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
26690 DAG.getConstant(Diff, DL,
26691 Cond.getValueType()));
26692
26693 // Add the base if non-zero.
26694 if (FalseC->getAPIntValue() != 0)
26695 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26696 SDValue(FalseC, 0));
26697 return Cond;
26698 }
26699 }
26700 }
26701 }
26702
26703 // Canonicalize max and min:
26704 // (x > y) ? x : y -> (x >= y) ? x : y
26705 // (x < y) ? x : y -> (x <= y) ? x : y
26706 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
26707 // the need for an extra compare
26708 // against zero. e.g.
26709 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
26710 // subl %esi, %edi
26711 // testl %edi, %edi
26712 // movl $0, %eax
26713 // cmovgl %edi, %eax
26714 // =>
26715 // xorl %eax, %eax
26716 // subl %esi, %edi
26717 // cmovsl %eax, %edi
26718 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
26719 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26720 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26721 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26722 switch (CC) {
26723 default: break;
26724 case ISD::SETLT:
26725 case ISD::SETGT: {
26726 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
26727 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
26728 Cond.getOperand(0), Cond.getOperand(1), NewCC);
26729 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
26730 }
26731 }
26732 }
26733
26734 // Early exit check
26735 if (!TLI.isTypeLegal(VT))
26736 return SDValue();
26737
26738 // Match VSELECTs into subs with unsigned saturation.
26739 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
26740 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
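// e.g. for v16i8, (vselect (setugt x, y), (sub x, y), zero) matches the
// "x > y ? x - y : 0" pattern below and becomes (X86ISD::SUBUS x, y), i.e. a
// single PSUBUSB.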
26741 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || 26742 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { 26743 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 26744 26745 // Check if one of the arms of the VSELECT is a zero vector. If it's on the 26746 // left side invert the predicate to simplify logic below. 26747 SDValue Other; 26748 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 26749 Other = RHS; 26750 CC = ISD::getSetCCInverse(CC, true); 26751 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { 26752 Other = LHS; 26753 } 26754 26755 if (Other.getNode() && Other->getNumOperands() == 2 && 26756 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { 26757 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); 26758 SDValue CondRHS = Cond->getOperand(1); 26759 26760 // Look for a general sub with unsigned saturation first. 26761 // x >= y ? x-y : 0 --> subus x, y 26762 // x > y ? x-y : 0 --> subus x, y 26763 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && 26764 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) 26765 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 26766 26767 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) 26768 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { 26769 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) 26770 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) 26771 // If the RHS is a constant we have to reverse the const 26772 // canonicalization. 26773 // x > C-1 ? x+-C : 0 --> subus x, C 26774 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && 26775 CondRHSConst->getAPIntValue() == 26776 (-OpRHSConst->getAPIntValue() - 1)) 26777 return DAG.getNode( 26778 X86ISD::SUBUS, DL, VT, OpLHS, 26779 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); 26780 26781 // Another special case: If C was a sign bit, the sub has been 26782 // canonicalized into a xor. 26783 // FIXME: Would it be better to use computeKnownBits to determine 26784 // whether it's safe to decanonicalize the xor? 26785 // x s< 0 ? x^C : 0 --> subus x, C 26786 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && 26787 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 26788 OpRHSConst->getAPIntValue().isSignBit()) 26789 // Note that we have to rebuild the RHS constant here to ensure we 26790 // don't rely on particular values of undef lanes. 26791 return DAG.getNode( 26792 X86ISD::SUBUS, DL, VT, OpLHS, 26793 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); 26794 } 26795 } 26796 } 26797 26798 // Simplify vector selection if condition value type matches vselect 26799 // operand type 26800 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { 26801 assert(Cond.getValueType().isVector() && 26802 "vector select expects a vector selector!"); 26803 26804 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); 26805 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); 26806 26807 // Try invert the condition if true value is not all 1s and false value 26808 // is not all 0s. 
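// The end goal is one of the bitwise forms handled below, e.g.
//   (vselect C, -1, X) -> (or C, X)   and   (vselect C, X, 0) -> (and C, X),
// so inverting the condition and swapping the operands can expose them.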
26809 if (!TValIsAllOnes && !FValIsAllZeros && 26810 // Check if the selector will be produced by CMPP*/PCMP* 26811 Cond.getOpcode() == ISD::SETCC && 26812 // Check if SETCC has already been promoted 26813 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == 26814 CondVT) { 26815 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); 26816 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); 26817 26818 if (TValIsAllZeros || FValIsAllOnes) { 26819 SDValue CC = Cond.getOperand(2); 26820 ISD::CondCode NewCC = 26821 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), 26822 Cond.getOperand(0).getValueType().isInteger()); 26823 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); 26824 std::swap(LHS, RHS); 26825 TValIsAllOnes = FValIsAllOnes; 26826 FValIsAllZeros = TValIsAllZeros; 26827 } 26828 } 26829 26830 if (TValIsAllOnes || FValIsAllZeros) { 26831 SDValue Ret; 26832 26833 if (TValIsAllOnes && FValIsAllZeros) 26834 Ret = Cond; 26835 else if (TValIsAllOnes) 26836 Ret = 26837 DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS)); 26838 else if (FValIsAllZeros) 26839 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, 26840 DAG.getBitcast(CondVT, LHS)); 26841 26842 return DAG.getBitcast(VT, Ret); 26843 } 26844 } 26845 26846 // If this is a *dynamic* select (non-constant condition) and we can match 26847 // this node with one of the variable blend instructions, restructure the 26848 // condition so that the blends can use the high bit of each element and use 26849 // SimplifyDemandedBits to simplify the condition operand. 26850 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 26851 !DCI.isBeforeLegalize() && 26852 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { 26853 unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); 26854 26855 // Don't optimize vector selects that map to mask-registers. 26856 if (BitWidth == 1) 26857 return SDValue(); 26858 26859 // We can only handle the cases where VSELECT is directly legal on the 26860 // subtarget. We custom lower VSELECT nodes with constant conditions and 26861 // this makes it hard to see whether a dynamic VSELECT will correctly 26862 // lower, so we both check the operation's status and explicitly handle the 26863 // cases where a *dynamic* blend will fail even though a constant-condition 26864 // blend could be custom lowered. 26865 // FIXME: We should find a better way to handle this class of problems. 26866 // Potentially, we should combine constant-condition vselect nodes 26867 // pre-legalization into shuffles and not mark as many types as custom 26868 // lowered. 26869 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) 26870 return SDValue(); 26871 // FIXME: We don't support i16-element blends currently. We could and 26872 // should support them by making *all* the bits in the condition be set 26873 // rather than just the high bit and using an i8-element blend. 26874 if (VT.getVectorElementType() == MVT::i16) 26875 return SDValue(); 26876 // Dynamic blending was only available from SSE4.1 onward. 
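// The variable blend instructions (BLENDVPS/BLENDVPD/PBLENDVB) only test the
// top bit of each condition element, which is why a single demanded bit per
// element is enough below.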
26877 if (VT.is128BitVector() && !Subtarget.hasSSE41()) 26878 return SDValue(); 26879 // Byte blends are only available in AVX2 26880 if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) 26881 return SDValue(); 26882 26883 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); 26884 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); 26885 26886 APInt KnownZero, KnownOne; 26887 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 26888 DCI.isBeforeLegalizeOps()); 26889 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || 26890 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, 26891 TLO)) { 26892 // If we changed the computation somewhere in the DAG, this change 26893 // will affect all users of Cond. 26894 // Make sure it is fine and update all the nodes so that we do not 26895 // use the generic VSELECT anymore. Otherwise, we may perform 26896 // wrong optimizations as we messed up with the actual expectation 26897 // for the vector boolean values. 26898 if (Cond != TLO.Old) { 26899 // Check all uses of that condition operand to check whether it will be 26900 // consumed by non-BLEND instructions, which may depend on all bits are 26901 // set properly. 26902 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); 26903 I != E; ++I) 26904 if (I->getOpcode() != ISD::VSELECT) 26905 // TODO: Add other opcodes eventually lowered into BLEND. 26906 return SDValue(); 26907 26908 // Update all the users of the condition, before committing the change, 26909 // so that the VSELECT optimizations that expect the correct vector 26910 // boolean value will not be triggered. 26911 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); 26912 I != E; ++I) 26913 DAG.ReplaceAllUsesOfValueWith( 26914 SDValue(*I, 0), 26915 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), 26916 Cond, I->getOperand(1), I->getOperand(2))); 26917 DCI.CommitTargetLoweringOpt(TLO); 26918 return SDValue(); 26919 } 26920 // At this point, only Cond is changed. Change the condition 26921 // just for N to keep the opportunity to optimize all other 26922 // users their own way. 26923 DAG.ReplaceAllUsesOfValueWith( 26924 SDValue(N, 0), 26925 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), 26926 TLO.New, N->getOperand(1), N->getOperand(2))); 26927 return SDValue(); 26928 } 26929 } 26930 26931 return SDValue(); 26932 } 26933 26934 /// Combine: 26935 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) 26936 /// to: 26937 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) 26938 /// i.e., reusing the EFLAGS produced by the LOCKed instruction. 26939 /// Note that this is only legal for some op/cc combinations. 26940 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, 26941 SelectionDAG &DAG) { 26942 // This combine only operates on CMP-like nodes. 26943 if (!(Cmp.getOpcode() == X86ISD::CMP || 26944 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) 26945 return SDValue(); 26946 26947 // This only applies to variations of the common case: 26948 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) 26949 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) 26950 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) 26951 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) 26952 // Using the proper condcodes (see below), overflow is checked for. 26953 26954 // FIXME: We can generalize both constraints: 26955 // - XOR/OR/AND (if they were made to survive AtomicExpand) 26956 // - LHS != 1 26957 // if the result is compared. 
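// For the Addend == 1 cases the idea is that "x < 0" is the same as
// "x + 1 <= 0" when the latter is a true signed comparison (COND_LE/COND_G
// look at OF, so the INT_MAX + 1 wrap-around is still handled correctly).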
26958 26959 SDValue CmpLHS = Cmp.getOperand(0); 26960 SDValue CmpRHS = Cmp.getOperand(1); 26961 26962 if (!CmpLHS.hasOneUse()) 26963 return SDValue(); 26964 26965 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); 26966 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) 26967 return SDValue(); 26968 26969 const unsigned Opc = CmpLHS.getOpcode(); 26970 26971 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) 26972 return SDValue(); 26973 26974 SDValue OpRHS = CmpLHS.getOperand(2); 26975 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS); 26976 if (!OpRHSC) 26977 return SDValue(); 26978 26979 APInt Addend = OpRHSC->getAPIntValue(); 26980 if (Opc == ISD::ATOMIC_LOAD_SUB) 26981 Addend = -Addend; 26982 26983 if (CC == X86::COND_S && Addend == 1) 26984 CC = X86::COND_LE; 26985 else if (CC == X86::COND_NS && Addend == 1) 26986 CC = X86::COND_G; 26987 else if (CC == X86::COND_G && Addend == -1) 26988 CC = X86::COND_GE; 26989 else if (CC == X86::COND_LE && Addend == -1) 26990 CC = X86::COND_L; 26991 else 26992 return SDValue(); 26993 26994 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); 26995 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), 26996 DAG.getUNDEF(CmpLHS.getValueType())); 26997 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); 26998 return LockOp; 26999 } 27000 27001 // Check whether a boolean test is testing a boolean value generated by 27002 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition 27003 // code. 27004 // 27005 // Simplify the following patterns: 27006 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or 27007 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) 27008 // to (Op EFLAGS Cond) 27009 // 27010 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or 27011 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) 27012 // to (Op EFLAGS !Cond) 27013 // 27014 // where Op could be BRCOND or CMOV. 27015 // 27016 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { 27017 // This combine only operates on CMP-like nodes. 27018 if (!(Cmp.getOpcode() == X86ISD::CMP || 27019 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) 27020 return SDValue(); 27021 27022 // Quit if not used as a boolean value. 27023 if (CC != X86::COND_E && CC != X86::COND_NE) 27024 return SDValue(); 27025 27026 // Check CMP operands. One of them should be 0 or 1 and the other should be 27027 // an SetCC or extended from it. 27028 SDValue Op1 = Cmp.getOperand(0); 27029 SDValue Op2 = Cmp.getOperand(1); 27030 27031 SDValue SetCC; 27032 const ConstantSDNode* C = nullptr; 27033 bool needOppositeCond = (CC == X86::COND_E); 27034 bool checkAgainstTrue = false; // Is it a comparison against 1? 27035 27036 if ((C = dyn_cast<ConstantSDNode>(Op1))) 27037 SetCC = Op2; 27038 else if ((C = dyn_cast<ConstantSDNode>(Op2))) 27039 SetCC = Op1; 27040 else // Quit if all operands are not constants. 27041 return SDValue(); 27042 27043 if (C->getZExtValue() == 1) { 27044 needOppositeCond = !needOppositeCond; 27045 checkAgainstTrue = true; 27046 } else if (C->getZExtValue() != 0) 27047 // Quit if the constant is neither 0 or 1. 27048 return SDValue(); 27049 27050 bool truncatedToBoolWithAnd = false; 27051 // Skip (zext $x), (trunc $x), or (and $x, 1) node. 
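// (AssertZext is peeled off as well; like zext/trunc it does not change the
// boolean value being tested.)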
27052 while (SetCC.getOpcode() == ISD::ZERO_EXTEND || 27053 SetCC.getOpcode() == ISD::TRUNCATE || 27054 SetCC.getOpcode() == ISD::AssertZext || 27055 SetCC.getOpcode() == ISD::AND) { 27056 if (SetCC.getOpcode() == ISD::AND) { 27057 int OpIdx = -1; 27058 if (isOneConstant(SetCC.getOperand(0))) 27059 OpIdx = 1; 27060 if (isOneConstant(SetCC.getOperand(1))) 27061 OpIdx = 0; 27062 if (OpIdx < 0) 27063 break; 27064 SetCC = SetCC.getOperand(OpIdx); 27065 truncatedToBoolWithAnd = true; 27066 } else 27067 SetCC = SetCC.getOperand(0); 27068 } 27069 27070 switch (SetCC.getOpcode()) { 27071 case X86ISD::SETCC_CARRY: 27072 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to 27073 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, 27074 // i.e. it's a comparison against true but the result of SETCC_CARRY is not 27075 // truncated to i1 using 'and'. 27076 if (checkAgainstTrue && !truncatedToBoolWithAnd) 27077 break; 27078 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && 27079 "Invalid use of SETCC_CARRY!"); 27080 // FALL THROUGH 27081 case X86ISD::SETCC: 27082 // Set the condition code or opposite one if necessary. 27083 CC = X86::CondCode(SetCC.getConstantOperandVal(0)); 27084 if (needOppositeCond) 27085 CC = X86::GetOppositeBranchCondition(CC); 27086 return SetCC.getOperand(1); 27087 case X86ISD::CMOV: { 27088 // Check whether false/true value has canonical one, i.e. 0 or 1. 27089 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); 27090 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); 27091 // Quit if true value is not a constant. 27092 if (!TVal) 27093 return SDValue(); 27094 // Quit if false value is not a constant. 27095 if (!FVal) { 27096 SDValue Op = SetCC.getOperand(0); 27097 // Skip 'zext' or 'trunc' node. 27098 if (Op.getOpcode() == ISD::ZERO_EXTEND || 27099 Op.getOpcode() == ISD::TRUNCATE) 27100 Op = Op.getOperand(0); 27101 // A special case for rdrand/rdseed, where 0 is set if false cond is 27102 // found. 27103 if ((Op.getOpcode() != X86ISD::RDRAND && 27104 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) 27105 return SDValue(); 27106 } 27107 // Quit if false value is not the constant 0 or 1. 27108 bool FValIsFalse = true; 27109 if (FVal && FVal->getZExtValue() != 0) { 27110 if (FVal->getZExtValue() != 1) 27111 return SDValue(); 27112 // If FVal is 1, opposite cond is needed. 27113 needOppositeCond = !needOppositeCond; 27114 FValIsFalse = false; 27115 } 27116 // Quit if TVal is not the constant opposite of FVal. 27117 if (FValIsFalse && TVal->getZExtValue() != 1) 27118 return SDValue(); 27119 if (!FValIsFalse && TVal->getZExtValue() != 0) 27120 return SDValue(); 27121 CC = X86::CondCode(SetCC.getConstantOperandVal(2)); 27122 if (needOppositeCond) 27123 CC = X86::GetOppositeBranchCondition(CC); 27124 return SetCC.getOperand(3); 27125 } 27126 } 27127 27128 return SDValue(); 27129 } 27130 27131 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. 
27132 /// Match: 27133 /// (X86or (X86setcc) (X86setcc)) 27134 /// (X86cmp (and (X86setcc) (X86setcc)), 0) 27135 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, 27136 X86::CondCode &CC1, SDValue &Flags, 27137 bool &isAnd) { 27138 if (Cond->getOpcode() == X86ISD::CMP) { 27139 if (!isNullConstant(Cond->getOperand(1))) 27140 return false; 27141 27142 Cond = Cond->getOperand(0); 27143 } 27144 27145 isAnd = false; 27146 27147 SDValue SetCC0, SetCC1; 27148 switch (Cond->getOpcode()) { 27149 default: return false; 27150 case ISD::AND: 27151 case X86ISD::AND: 27152 isAnd = true; 27153 // fallthru 27154 case ISD::OR: 27155 case X86ISD::OR: 27156 SetCC0 = Cond->getOperand(0); 27157 SetCC1 = Cond->getOperand(1); 27158 break; 27159 }; 27160 27161 // Make sure we have SETCC nodes, using the same flags value. 27162 if (SetCC0.getOpcode() != X86ISD::SETCC || 27163 SetCC1.getOpcode() != X86ISD::SETCC || 27164 SetCC0->getOperand(1) != SetCC1->getOperand(1)) 27165 return false; 27166 27167 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); 27168 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); 27169 Flags = SetCC0->getOperand(1); 27170 return true; 27171 } 27172 27173 /// Optimize an EFLAGS definition used according to the condition code \p CC 27174 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing 27175 /// uses of chain values. 27176 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, 27177 SelectionDAG &DAG) { 27178 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) 27179 return R; 27180 return combineSetCCAtomicArith(EFLAGS, CC, DAG); 27181 } 27182 27183 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 27184 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, 27185 TargetLowering::DAGCombinerInfo &DCI, 27186 const X86Subtarget &Subtarget) { 27187 SDLoc DL(N); 27188 27189 // If the flag operand isn't dead, don't touch this CMOV. 27190 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 27191 return SDValue(); 27192 27193 SDValue FalseOp = N->getOperand(0); 27194 SDValue TrueOp = N->getOperand(1); 27195 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 27196 SDValue Cond = N->getOperand(3); 27197 27198 if (CC == X86::COND_E || CC == X86::COND_NE) { 27199 switch (Cond.getOpcode()) { 27200 default: break; 27201 case X86ISD::BSR: 27202 case X86ISD::BSF: 27203 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 27204 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 27205 return (CC == X86::COND_E) ? FalseOp : TrueOp; 27206 } 27207 } 27208 27209 // Try to simplify the EFLAGS and condition code operands. 27210 // We can't always do this as FCMOV only supports a subset of X86 cond. 27211 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) { 27212 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { 27213 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), 27214 Flags}; 27215 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); 27216 } 27217 } 27218 27219 // If this is a select between two integer constants, try to do some 27220 // optimizations. Note that the operands are ordered the opposite of SELECT 27221 // operands. 27222 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 27223 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 27224 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 27225 // larger than FalseC (the false value). 
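// e.g. (CMOV 8, 0, COND_E) is treated as (CMOV 0, 8, COND_NE) so that the
// power-of-two fold below sees TrueC = 8 and FalseC = 0.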
27226 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 27227 CC = X86::GetOppositeBranchCondition(CC); 27228 std::swap(TrueC, FalseC); 27229 std::swap(TrueOp, FalseOp); 27230 } 27231 27232 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 27233 // This is efficient for any integer data type (including i8/i16) and 27234 // shift amount. 27235 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 27236 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 27237 DAG.getConstant(CC, DL, MVT::i8), Cond); 27238 27239 // Zero extend the condition if needed. 27240 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 27241 27242 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 27243 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 27244 DAG.getConstant(ShAmt, DL, MVT::i8)); 27245 if (N->getNumValues() == 2) // Dead flag value? 27246 return DCI.CombineTo(N, Cond, SDValue()); 27247 return Cond; 27248 } 27249 27250 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 27251 // for any integer data type, including i8/i16. 27252 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 27253 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 27254 DAG.getConstant(CC, DL, MVT::i8), Cond); 27255 27256 // Zero extend the condition if needed. 27257 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 27258 FalseC->getValueType(0), Cond); 27259 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 27260 SDValue(FalseC, 0)); 27261 27262 if (N->getNumValues() == 2) // Dead flag value? 27263 return DCI.CombineTo(N, Cond, SDValue()); 27264 return Cond; 27265 } 27266 27267 // Optimize cases that will turn into an LEA instruction. This requires 27268 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 27269 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 27270 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 27271 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 27272 27273 bool isFastMultiplier = false; 27274 if (Diff < 10) { 27275 switch ((unsigned char)Diff) { 27276 default: break; 27277 case 1: // result = add base, cond 27278 case 2: // result = lea base( , cond*2) 27279 case 3: // result = lea base(cond, cond*2) 27280 case 4: // result = lea base( , cond*4) 27281 case 5: // result = lea base(cond, cond*4) 27282 case 8: // result = lea base( , cond*8) 27283 case 9: // result = lea base(cond, cond*8) 27284 isFastMultiplier = true; 27285 break; 27286 } 27287 } 27288 27289 if (isFastMultiplier) { 27290 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 27291 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 27292 DAG.getConstant(CC, DL, MVT::i8), Cond); 27293 // Zero extend the condition if needed. 27294 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 27295 Cond); 27296 // Scale the condition by the difference. 27297 if (Diff != 1) 27298 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 27299 DAG.getConstant(Diff, DL, Cond.getValueType())); 27300 27301 // Add the base if non-zero. 27302 if (FalseC->getAPIntValue() != 0) 27303 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 27304 SDValue(FalseC, 0)); 27305 if (N->getNumValues() == 2) // Dead flag value? 
27306 return DCI.CombineTo(N, Cond, SDValue()); 27307 return Cond; 27308 } 27309 } 27310 } 27311 } 27312 27313 // Handle these cases: 27314 // (select (x != c), e, c) -> select (x != c), e, x), 27315 // (select (x == c), c, e) -> select (x == c), x, e) 27316 // where the c is an integer constant, and the "select" is the combination 27317 // of CMOV and CMP. 27318 // 27319 // The rationale for this change is that the conditional-move from a constant 27320 // needs two instructions, however, conditional-move from a register needs 27321 // only one instruction. 27322 // 27323 // CAVEAT: By replacing a constant with a symbolic value, it may obscure 27324 // some instruction-combining opportunities. This opt needs to be 27325 // postponed as late as possible. 27326 // 27327 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { 27328 // the DCI.xxxx conditions are provided to postpone the optimization as 27329 // late as possible. 27330 27331 ConstantSDNode *CmpAgainst = nullptr; 27332 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && 27333 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && 27334 !isa<ConstantSDNode>(Cond.getOperand(0))) { 27335 27336 if (CC == X86::COND_NE && 27337 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { 27338 CC = X86::GetOppositeBranchCondition(CC); 27339 std::swap(TrueOp, FalseOp); 27340 } 27341 27342 if (CC == X86::COND_E && 27343 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { 27344 SDValue Ops[] = { FalseOp, Cond.getOperand(0), 27345 DAG.getConstant(CC, DL, MVT::i8), Cond }; 27346 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); 27347 } 27348 } 27349 } 27350 27351 // Fold and/or of setcc's to double CMOV: 27352 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) 27353 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) 27354 // 27355 // This combine lets us generate: 27356 // cmovcc1 (jcc1 if we don't have CMOV) 27357 // cmovcc2 (same) 27358 // instead of: 27359 // setcc1 27360 // setcc2 27361 // and/or 27362 // cmovne (jne if we don't have CMOV) 27363 // When we can't use the CMOV instruction, it might increase branch 27364 // mispredicts. 27365 // When we can use CMOV, or when there is no mispredict, this improves 27366 // throughput and reduces register pressure. 27367 // 27368 if (CC == X86::COND_NE) { 27369 SDValue Flags; 27370 X86::CondCode CC0, CC1; 27371 bool isAndSetCC; 27372 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { 27373 if (isAndSetCC) { 27374 std::swap(FalseOp, TrueOp); 27375 CC0 = X86::GetOppositeBranchCondition(CC0); 27376 CC1 = X86::GetOppositeBranchCondition(CC1); 27377 } 27378 27379 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), 27380 Flags}; 27381 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); 27382 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; 27383 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); 27384 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); 27385 return CMOV; 27386 } 27387 } 27388 27389 return SDValue(); 27390 } 27391 27392 /// Different mul shrinking modes. 
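/// MULS8/MULU8: both operands fit in a signed/unsigned 8-bit value, so only
/// the low half of a 16-bit multiply is needed. MULS16/MULU16: both operands
/// fit in 16 bits, so PMULLW plus PMULHW/PMULHUW reconstruct the 32-bit
/// result.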
27393 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; 27394 27395 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { 27396 EVT VT = N->getOperand(0).getValueType(); 27397 if (VT.getScalarSizeInBits() != 32) 27398 return false; 27399 27400 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); 27401 unsigned SignBits[2] = {1, 1}; 27402 bool IsPositive[2] = {false, false}; 27403 for (unsigned i = 0; i < 2; i++) { 27404 SDValue Opd = N->getOperand(i); 27405 27406 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to 27407 // compute signbits for it separately. 27408 if (Opd.getOpcode() == ISD::ANY_EXTEND) { 27409 // For anyextend, it is safe to assume an appropriate number of leading 27410 // sign/zero bits. 27411 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8) 27412 SignBits[i] = 25; 27413 else if (Opd.getOperand(0).getValueType().getVectorElementType() == 27414 MVT::i16) 27415 SignBits[i] = 17; 27416 else 27417 return false; 27418 IsPositive[i] = true; 27419 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) { 27420 // All the operands of BUILD_VECTOR need to be int constant. 27421 // Find the smallest value range which all the operands belong to. 27422 SignBits[i] = 32; 27423 IsPositive[i] = true; 27424 for (const SDValue &SubOp : Opd.getNode()->op_values()) { 27425 if (SubOp.isUndef()) 27426 continue; 27427 auto *CN = dyn_cast<ConstantSDNode>(SubOp); 27428 if (!CN) 27429 return false; 27430 APInt IntVal = CN->getAPIntValue(); 27431 if (IntVal.isNegative()) 27432 IsPositive[i] = false; 27433 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits()); 27434 } 27435 } else { 27436 SignBits[i] = DAG.ComputeNumSignBits(Opd); 27437 if (Opd.getOpcode() == ISD::ZERO_EXTEND) 27438 IsPositive[i] = true; 27439 } 27440 } 27441 27442 bool AllPositive = IsPositive[0] && IsPositive[1]; 27443 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); 27444 // When ranges are from -128 ~ 127, use MULS8 mode. 27445 if (MinSignBits >= 25) 27446 Mode = MULS8; 27447 // When ranges are from 0 ~ 255, use MULU8 mode. 27448 else if (AllPositive && MinSignBits >= 24) 27449 Mode = MULU8; 27450 // When ranges are from -32768 ~ 32767, use MULS16 mode. 27451 else if (MinSignBits >= 17) 27452 Mode = MULS16; 27453 // When ranges are from 0 ~ 65535, use MULU16 mode. 27454 else if (AllPositive && MinSignBits >= 16) 27455 Mode = MULU16; 27456 else 27457 return false; 27458 return true; 27459 } 27460 27461 /// When the operands of vector mul are extended from smaller size values, 27462 /// like i8 and i16, the type of mul may be shrinked to generate more 27463 /// efficient code. Two typical patterns are handled: 27464 /// Pattern1: 27465 /// %2 = sext/zext <N x i8> %1 to <N x i32> 27466 /// %4 = sext/zext <N x i8> %3 to <N x i32> 27467 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) 27468 /// %5 = mul <N x i32> %2, %4 27469 /// 27470 /// Pattern2: 27471 /// %2 = zext/sext <N x i16> %1 to <N x i32> 27472 /// %4 = zext/sext <N x i16> %3 to <N x i32> 27473 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) 27474 /// %5 = mul <N x i32> %2, %4 27475 /// 27476 /// There are four mul shrinking modes: 27477 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is 27478 /// -128 to 128, and the scalar value range of %4 is also -128 to 128, 27479 /// generate pmullw+sext32 for it (MULS8 mode). 
27480 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is 27481 /// 0 to 255, and the scalar value range of %4 is also 0 to 255, 27482 /// generate pmullw+zext32 for it (MULU8 mode). 27483 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is 27484 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, 27485 /// generate pmullw+pmulhw for it (MULS16 mode). 27486 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is 27487 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, 27488 /// generate pmullw+pmulhuw for it (MULU16 mode). 27489 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, 27490 const X86Subtarget &Subtarget) { 27491 // pmulld is supported since SSE41. It is better to use pmulld 27492 // instead of pmullw+pmulhw. 27493 if (Subtarget.hasSSE41()) 27494 return SDValue(); 27495 27496 ShrinkMode Mode; 27497 if (!canReduceVMulWidth(N, DAG, Mode)) 27498 return SDValue(); 27499 27500 SDLoc DL(N); 27501 SDValue N0 = N->getOperand(0); 27502 SDValue N1 = N->getOperand(1); 27503 EVT VT = N->getOperand(0).getValueType(); 27504 unsigned RegSize = 128; 27505 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); 27506 EVT ReducedVT = 27507 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); 27508 // Shrink the operands of mul. 27509 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); 27510 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); 27511 27512 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) { 27513 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the 27514 // lower part is needed. 27515 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); 27516 if (Mode == MULU8 || Mode == MULS8) { 27517 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, 27518 DL, VT, MulLo); 27519 } else { 27520 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); 27521 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, 27522 // the higher part is also needed. 27523 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, 27524 ReducedVT, NewN0, NewN1); 27525 27526 // Repack the lower part and higher part result of mul into a wider 27527 // result. 27528 // Generate shuffle functioning as punpcklwd. 27529 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); 27530 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { 27531 ShuffleMask[2 * i] = i; 27532 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements(); 27533 } 27534 SDValue ResLo = 27535 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); 27536 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo); 27537 // Generate shuffle functioning as punpckhwd. 
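      // Illustrative mask values, assuming an 8-element result: the low-half
      // interleave above uses <0,8,1,9,2,10,3,11>, and the mask built below is
      // <4,12,5,13,6,14,7,15>, pairing each pmullw lane with its pmulhw lane.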
27538 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { 27539 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2; 27540 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2; 27541 } 27542 SDValue ResHi = 27543 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); 27544 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi); 27545 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); 27546 } 27547 } else { 27548 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want 27549 // to legalize the mul explicitly because implicit legalization for type 27550 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack 27551 // instructions which will not exist when we explicitly legalize it by 27552 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with 27553 // <4 x i16> undef). 27554 // 27555 // Legalize the operands of mul. 27556 SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(), 27557 DAG.getUNDEF(ReducedVT)); 27558 Ops[0] = NewN0; 27559 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); 27560 Ops[0] = NewN1; 27561 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); 27562 27563 if (Mode == MULU8 || Mode == MULS8) { 27564 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower 27565 // part is needed. 27566 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); 27567 27568 // convert the type of mul result to VT. 27569 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); 27570 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG 27571 : ISD::SIGN_EXTEND_VECTOR_INREG, 27572 DL, ResVT, Mul); 27573 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, 27574 DAG.getIntPtrConstant(0, DL)); 27575 } else { 27576 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For 27577 // MULU16/MULS16, both parts are needed. 27578 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); 27579 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, 27580 OpsVT, NewN0, NewN1); 27581 27582 // Repack the lower part and higher part result of mul into a wider 27583 // result. Make sure the type of mul result is VT. 27584 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); 27585 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi); 27586 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res); 27587 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, 27588 DAG.getIntPtrConstant(0, DL)); 27589 } 27590 } 27591 } 27592 27593 /// Optimize a single multiply with constant into two operations in order to 27594 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. 27595 static SDValue combineMul(SDNode *N, SelectionDAG &DAG, 27596 TargetLowering::DAGCombinerInfo &DCI, 27597 const X86Subtarget &Subtarget) { 27598 EVT VT = N->getValueType(0); 27599 if (DCI.isBeforeLegalize() && VT.isVector()) 27600 return reduceVMULWidth(N, DAG, Subtarget); 27601 27602 // An imul is usually smaller than the alternative sequence. 
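  // For example (illustrative): x * 45 can be rebuilt below as MUL_IMM 9
  // followed by MUL_IMM 5 (two LEAs), but two LEAs are larger than one IMUL,
  // so the decomposition is skipped when optimizing for minimum size.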
27603 if (DAG.getMachineFunction().getFunction()->optForMinSize()) 27604 return SDValue(); 27605 27606 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 27607 return SDValue(); 27608 27609 if (VT != MVT::i64 && VT != MVT::i32) 27610 return SDValue(); 27611 27612 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 27613 if (!C) 27614 return SDValue(); 27615 uint64_t MulAmt = C->getZExtValue(); 27616 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 27617 return SDValue(); 27618 27619 uint64_t MulAmt1 = 0; 27620 uint64_t MulAmt2 = 0; 27621 if ((MulAmt % 9) == 0) { 27622 MulAmt1 = 9; 27623 MulAmt2 = MulAmt / 9; 27624 } else if ((MulAmt % 5) == 0) { 27625 MulAmt1 = 5; 27626 MulAmt2 = MulAmt / 5; 27627 } else if ((MulAmt % 3) == 0) { 27628 MulAmt1 = 3; 27629 MulAmt2 = MulAmt / 3; 27630 } 27631 27632 SDLoc DL(N); 27633 SDValue NewMul; 27634 if (MulAmt2 && 27635 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 27636 27637 if (isPowerOf2_64(MulAmt2) && 27638 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 27639 // If the second multiplier is pow2, issue it first. We want the multiply 27640 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use 27641 // is an add. 27642 std::swap(MulAmt1, MulAmt2); 27643 27644 if (isPowerOf2_64(MulAmt1)) 27645 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 27646 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); 27647 else 27648 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 27649 DAG.getConstant(MulAmt1, DL, VT)); 27650 27651 if (isPowerOf2_64(MulAmt2)) 27652 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 27653 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); 27654 else 27655 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 27656 DAG.getConstant(MulAmt2, DL, VT)); 27657 } 27658 27659 if (!NewMul) { 27660 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) 27661 && "Both cases that could cause potential overflows should have " 27662 "already been handled."); 27663 if (isPowerOf2_64(MulAmt - 1)) 27664 // (mul x, 2^N + 1) => (add (shl x, N), x) 27665 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), 27666 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 27667 DAG.getConstant(Log2_64(MulAmt - 1), DL, 27668 MVT::i8))); 27669 27670 else if (isPowerOf2_64(MulAmt + 1)) 27671 // (mul x, 2^N - 1) => (sub (shl x, N), x) 27672 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, 27673 N->getOperand(0), 27674 DAG.getConstant(Log2_64(MulAmt + 1), 27675 DL, MVT::i8)), N->getOperand(0)); 27676 } 27677 27678 if (NewMul) 27679 // Do not add new nodes to the DAG combiner worklist. 27680 DCI.CombineTo(N, NewMul, false); 27681 27682 return SDValue(); 27683 } 27684 27685 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { 27686 SDValue N0 = N->getOperand(0); 27687 SDValue N1 = N->getOperand(1); 27688 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 27689 EVT VT = N0.getValueType(); 27690 27691 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 27692 // since the result of setcc_c is all zeros or all ones.
27693 if (VT.isInteger() && !VT.isVector() && 27694 N1C && N0.getOpcode() == ISD::AND && 27695 N0.getOperand(1).getOpcode() == ISD::Constant) { 27696 SDValue N00 = N0.getOperand(0); 27697 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 27698 const APInt &ShAmt = N1C->getAPIntValue(); 27699 Mask = Mask.shl(ShAmt); 27700 bool MaskOK = false; 27701 // We can handle cases concerning bit-widening nodes containing setcc_c if 27702 // we carefully interrogate the mask to make sure we are semantics 27703 // preserving. 27704 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth 27705 // of the underlying setcc_c operation if the setcc_c was zero extended. 27706 // Consider the following example: 27707 // zext(setcc_c) -> i32 0x0000FFFF 27708 // c1 -> i32 0x0000FFFF 27709 // c2 -> i32 0x00000001 27710 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE 27711 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE 27712 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 27713 MaskOK = true; 27714 } else if (N00.getOpcode() == ISD::SIGN_EXTEND && 27715 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 27716 MaskOK = true; 27717 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || 27718 N00.getOpcode() == ISD::ANY_EXTEND) && 27719 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 27720 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); 27721 } 27722 if (MaskOK && Mask != 0) { 27723 SDLoc DL(N); 27724 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); 27725 } 27726 } 27727 27728 // Hardware support for vector shifts is sparse which makes us scalarize the 27729 // vector operations in many cases. Also, on sandybridge ADD is faster than 27730 // shl. 27731 // (shl V, 1) -> add V,V 27732 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) 27733 if (auto *N1SplatC = N1BV->getConstantSplatNode()) { 27734 assert(N0.getValueType().isVector() && "Invalid vector shift type"); 27735 // We shift all of the values by one. In many cases we do not have 27736 // hardware support for this operation. This is better expressed as an ADD 27737 // of two values. 27738 if (N1SplatC->getAPIntValue() == 1) 27739 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); 27740 } 27741 27742 return SDValue(); 27743 } 27744 27745 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) { 27746 SDValue N0 = N->getOperand(0); 27747 SDValue N1 = N->getOperand(1); 27748 EVT VT = N0.getValueType(); 27749 unsigned Size = VT.getSizeInBits(); 27750 27751 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) 27752 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or 27753 // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) 27754 // depending on sign of (SarConst - [56,48,32,24,16]) 27755 27756 // sexts in X86 are MOVs. The MOVs have the same code size 27757 // as above SHIFTs (only SHIFT on 1 has lower code size). 27758 // However the MOVs have 2 advantages to a SHIFT: 27759 // 1. MOVs can write to a register that differs from source 27760 // 2. 
MOVs accept memory operands 27761 27762 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || 27763 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || 27764 N0.getOperand(1).getOpcode() != ISD::Constant) 27765 return SDValue(); 27766 27767 SDValue N00 = N0.getOperand(0); 27768 SDValue N01 = N0.getOperand(1); 27769 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); 27770 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); 27771 EVT CVT = N1.getValueType(); 27772 27773 if (SarConst.isNegative()) 27774 return SDValue(); 27775 27776 for (MVT SVT : MVT::integer_valuetypes()) { 27777 unsigned ShiftSize = SVT.getSizeInBits(); 27778 // skipping types without corresponding sext/zext and 27779 // ShlConst that is not one of [56,48,32,24,16] 27780 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) 27781 continue; 27782 SDLoc DL(N); 27783 SDValue NN = 27784 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); 27785 SarConst = SarConst - (Size - ShiftSize); 27786 if (SarConst == 0) 27787 return NN; 27788 else if (SarConst.isNegative()) 27789 return DAG.getNode(ISD::SHL, DL, VT, NN, 27790 DAG.getConstant(-SarConst, DL, CVT)); 27791 else 27792 return DAG.getNode(ISD::SRA, DL, VT, NN, 27793 DAG.getConstant(SarConst, DL, CVT)); 27794 } 27795 return SDValue(); 27796 } 27797 27798 /// \brief Returns a vector of 0s if the node in input is a vector logical 27799 /// shift by a constant amount which is known to be bigger than or equal 27800 /// to the vector element size in bits. 27801 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, 27802 const X86Subtarget &Subtarget) { 27803 EVT VT = N->getValueType(0); 27804 27805 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 27806 (!Subtarget.hasInt256() || 27807 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 27808 return SDValue(); 27809 27810 SDValue Amt = N->getOperand(1); 27811 SDLoc DL(N); 27812 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) 27813 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { 27814 const APInt &ShiftAmt = AmtSplat->getAPIntValue(); 27815 unsigned MaxAmount = 27816 VT.getSimpleVT().getVectorElementType().getSizeInBits(); 27817 27818 // SSE2/AVX2 logical shifts always return a vector of 0s 27819 // if the shift amount is bigger than or equal to 27820 // the element size. The constant shift amount will be 27821 // encoded as a 8-bit immediate. 27822 if (ShiftAmt.trunc(8).uge(MaxAmount)) 27823 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); 27824 } 27825 27826 return SDValue(); 27827 } 27828 27829 static SDValue combineShift(SDNode* N, SelectionDAG &DAG, 27830 TargetLowering::DAGCombinerInfo &DCI, 27831 const X86Subtarget &Subtarget) { 27832 if (N->getOpcode() == ISD::SHL) 27833 if (SDValue V = combineShiftLeft(N, DAG)) 27834 return V; 27835 27836 if (N->getOpcode() == ISD::SRA) 27837 if (SDValue V = combineShiftRightAlgebraic(N, DAG)) 27838 return V; 27839 27840 // Try to fold this logical shift into a zero vector. 27841 if (N->getOpcode() != ISD::SRA) 27842 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) 27843 return V; 27844 27845 return SDValue(); 27846 } 27847 27848 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs 27849 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for 27850 /// OR -> CMPNEQSS. 
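/// A rough sketch of one matched shape (operands are placeholders):
///   (and (setcc E, (CMP %a, %b)), (setcc NP, (CMP %a, %b))) over f32
/// is rewritten to extract the low bit of a single CMPEQSS-style FSETCC
/// result instead of combining two flag-based setcc values.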
27851 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, 27852 TargetLowering::DAGCombinerInfo &DCI, 27853 const X86Subtarget &Subtarget) { 27854 unsigned opcode; 27855 27856 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 27857 // we're requiring SSE2 for both. 27858 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 27859 SDValue N0 = N->getOperand(0); 27860 SDValue N1 = N->getOperand(1); 27861 SDValue CMP0 = N0->getOperand(1); 27862 SDValue CMP1 = N1->getOperand(1); 27863 SDLoc DL(N); 27864 27865 // The SETCCs should both refer to the same CMP. 27866 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 27867 return SDValue(); 27868 27869 SDValue CMP00 = CMP0->getOperand(0); 27870 SDValue CMP01 = CMP0->getOperand(1); 27871 EVT VT = CMP00.getValueType(); 27872 27873 if (VT == MVT::f32 || VT == MVT::f64) { 27874 bool ExpectingFlags = false; 27875 // Check for any users that want flags: 27876 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 27877 !ExpectingFlags && UI != UE; ++UI) 27878 switch (UI->getOpcode()) { 27879 default: 27880 case ISD::BR_CC: 27881 case ISD::BRCOND: 27882 case ISD::SELECT: 27883 ExpectingFlags = true; 27884 break; 27885 case ISD::CopyToReg: 27886 case ISD::SIGN_EXTEND: 27887 case ISD::ZERO_EXTEND: 27888 case ISD::ANY_EXTEND: 27889 break; 27890 } 27891 27892 if (!ExpectingFlags) { 27893 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 27894 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 27895 27896 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 27897 X86::CondCode tmp = cc0; 27898 cc0 = cc1; 27899 cc1 = tmp; 27900 } 27901 27902 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 27903 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 27904 // FIXME: need symbolic constants for these magic numbers. 27905 // See X86ATTInstPrinter.cpp:printSSECC(). 27906 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 27907 if (Subtarget.hasAVX512()) { 27908 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, 27909 CMP01, 27910 DAG.getConstant(x86cc, DL, MVT::i8)); 27911 if (N->getValueType(0) != MVT::i1) 27912 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), 27913 FSetCC); 27914 return FSetCC; 27915 } 27916 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, 27917 CMP00.getValueType(), CMP00, CMP01, 27918 DAG.getConstant(x86cc, DL, 27919 MVT::i8)); 27920 27921 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 27922 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; 27923 27924 if (is64BitFP && !Subtarget.is64Bit()) { 27925 // On a 32-bit target, we cannot bitcast the 64-bit float to a 27926 // 64-bit integer, since that's not a legal type. Since 27927 // OnesOrZeroesF is all ones of all zeroes, we don't need all the 27928 // bits, but can do this little dance to extract the lowest 32 bits 27929 // and work with those going forward. 
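          // The dance, sketched by type only: f64 -> v2f64 (scalar_to_vector)
          // -> v4f32 (bitcast) -> f32 (extract element 0), which is then
          // bitcast to i32 below.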
27930 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, 27931 OnesOrZeroesF); 27932 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); 27933 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, 27934 Vector32, DAG.getIntPtrConstant(0, DL)); 27935 IntVT = MVT::i32; 27936 } 27937 27938 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); 27939 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, 27940 DAG.getConstant(1, DL, IntVT)); 27941 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 27942 ANDed); 27943 return OneBitOfTruth; 27944 } 27945 } 27946 } 27947 } 27948 return SDValue(); 27949 } 27950 27951 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). 27952 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { 27953 assert(N->getOpcode() == ISD::AND); 27954 27955 EVT VT = N->getValueType(0); 27956 SDValue N0 = N->getOperand(0); 27957 SDValue N1 = N->getOperand(1); 27958 SDLoc DL(N); 27959 27960 if (VT != MVT::v2i64 && VT != MVT::v4i64 && 27961 VT != MVT::v8i64 && VT != MVT::v16i32 && 27962 VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX 27963 return SDValue(); 27964 27965 // Canonicalize XOR to the left. 27966 if (N1.getOpcode() == ISD::XOR) 27967 std::swap(N0, N1); 27968 27969 if (N0.getOpcode() != ISD::XOR) 27970 return SDValue(); 27971 27972 SDValue N00 = N0->getOperand(0); 27973 SDValue N01 = N0->getOperand(1); 27974 27975 N01 = peekThroughBitcasts(N01); 27976 27977 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an 27978 // insert_subvector building a 256-bit AllOnes vector. 27979 if (!ISD::isBuildVectorAllOnes(N01.getNode())) { 27980 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR) 27981 return SDValue(); 27982 27983 SDValue V1 = N01->getOperand(0); 27984 SDValue V2 = N01->getOperand(1); 27985 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR || 27986 !V1.getOperand(0).isUndef() || 27987 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) || 27988 !ISD::isBuildVectorAllOnes(V2.getNode())) 27989 return SDValue(); 27990 } 27991 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1); 27992 } 27993 27994 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized 27995 // register. In most cases we actually compare or select YMM-sized registers 27996 // and mixing the two types creates horrible code. This method optimizes 27997 // some of the transition sequences. 27998 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, 27999 TargetLowering::DAGCombinerInfo &DCI, 28000 const X86Subtarget &Subtarget) { 28001 EVT VT = N->getValueType(0); 28002 if (!VT.is256BitVector()) 28003 return SDValue(); 28004 28005 assert((N->getOpcode() == ISD::ANY_EXTEND || 28006 N->getOpcode() == ISD::ZERO_EXTEND || 28007 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); 28008 28009 SDValue Narrow = N->getOperand(0); 28010 EVT NarrowVT = Narrow->getValueType(0); 28011 if (!NarrowVT.is128BitVector()) 28012 return SDValue(); 28013 28014 if (Narrow->getOpcode() != ISD::XOR && 28015 Narrow->getOpcode() != ISD::AND && 28016 Narrow->getOpcode() != ISD::OR) 28017 return SDValue(); 28018 28019 SDValue N0 = Narrow->getOperand(0); 28020 SDValue N1 = Narrow->getOperand(1); 28021 SDLoc DL(Narrow); 28022 28023 // The Left side has to be a trunc. 28024 if (N0.getOpcode() != ISD::TRUNCATE) 28025 return SDValue(); 28026 28027 // The type of the truncated inputs. 
28028 EVT WideVT = N0->getOperand(0)->getValueType(0); 28029 if (WideVT != VT) 28030 return SDValue(); 28031 28032 // The right side has to be a 'trunc' or a constant vector. 28033 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; 28034 ConstantSDNode *RHSConstSplat = nullptr; 28035 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) 28036 RHSConstSplat = RHSBV->getConstantSplatNode(); 28037 if (!RHSTrunc && !RHSConstSplat) 28038 return SDValue(); 28039 28040 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 28041 28042 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) 28043 return SDValue(); 28044 28045 // Set N0 and N1 to hold the inputs to the new wide operation. 28046 N0 = N0->getOperand(0); 28047 if (RHSConstSplat) { 28048 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), 28049 SDValue(RHSConstSplat, 0)); 28050 N1 = DAG.getSplatBuildVector(WideVT, DL, N1); 28051 } else if (RHSTrunc) { 28052 N1 = N1->getOperand(0); 28053 } 28054 28055 // Generate the wide operation. 28056 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); 28057 unsigned Opcode = N->getOpcode(); 28058 switch (Opcode) { 28059 case ISD::ANY_EXTEND: 28060 return Op; 28061 case ISD::ZERO_EXTEND: { 28062 unsigned InBits = NarrowVT.getScalarSizeInBits(); 28063 APInt Mask = APInt::getAllOnesValue(InBits); 28064 Mask = Mask.zext(VT.getScalarSizeInBits()); 28065 return DAG.getNode(ISD::AND, DL, VT, 28066 Op, DAG.getConstant(Mask, DL, VT)); 28067 } 28068 case ISD::SIGN_EXTEND: 28069 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, 28070 Op, DAG.getValueType(NarrowVT)); 28071 default: 28072 llvm_unreachable("Unexpected opcode"); 28073 } 28074 } 28075 28076 static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG, 28077 TargetLowering::DAGCombinerInfo &DCI, 28078 const X86Subtarget &Subtarget) { 28079 SDValue N0 = N->getOperand(0); 28080 SDValue N1 = N->getOperand(1); 28081 SDLoc DL(N); 28082 28083 // A vector zext_in_reg may be represented as a shuffle, 28084 // feeding into a bitcast (this represents anyext) feeding into 28085 // an and with a mask. 28086 // We'd like to try to combine that into a shuffle with zero 28087 // plus a bitcast, removing the and. 28088 if (N0.getOpcode() != ISD::BITCAST || 28089 N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) 28090 return SDValue(); 28091 28092 // The other side of the AND should be a splat of 2^C, where C 28093 // is the number of bits in the source type. 
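  // Concretely (illustrative): for an i8 source each mask lane is 0xFF,
  // i.e. 2^8 - 1, which is what the SplatValue + 1 power-of-two check below
  // verifies.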
28094 N1 = peekThroughBitcasts(N1); 28095 if (N1.getOpcode() != ISD::BUILD_VECTOR) 28096 return SDValue(); 28097 BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); 28098 28099 ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); 28100 EVT SrcType = Shuffle->getValueType(0); 28101 28102 // We expect a single-source shuffle 28103 if (!Shuffle->getOperand(1)->isUndef()) 28104 return SDValue(); 28105 28106 unsigned SrcSize = SrcType.getScalarSizeInBits(); 28107 unsigned NumElems = SrcType.getVectorNumElements(); 28108 28109 APInt SplatValue, SplatUndef; 28110 unsigned SplatBitSize; 28111 bool HasAnyUndefs; 28112 if (!Vector->isConstantSplat(SplatValue, SplatUndef, 28113 SplatBitSize, HasAnyUndefs)) 28114 return SDValue(); 28115 28116 unsigned ResSize = N1.getValueType().getScalarSizeInBits(); 28117 // Make sure the splat matches the mask we expect 28118 if (SplatBitSize > ResSize || 28119 (SplatValue + 1).exactLogBase2() != (int)SrcSize) 28120 return SDValue(); 28121 28122 // Make sure the input and output size make sense 28123 if (SrcSize >= ResSize || ResSize % SrcSize) 28124 return SDValue(); 28125 28126 // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> 28127 // The number of u's between each two values depends on the ratio between 28128 // the source and dest type. 28129 unsigned ZextRatio = ResSize / SrcSize; 28130 bool IsZext = true; 28131 for (unsigned i = 0; i != NumElems; ++i) { 28132 if (i % ZextRatio) { 28133 if (Shuffle->getMaskElt(i) > 0) { 28134 // Expected undef 28135 IsZext = false; 28136 break; 28137 } 28138 } else { 28139 if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { 28140 // Expected element number 28141 IsZext = false; 28142 break; 28143 } 28144 } 28145 } 28146 28147 if (!IsZext) 28148 return SDValue(); 28149 28150 // Ok, perform the transformation - replace the shuffle with 28151 // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero 28152 // (instead of undef) where the k elements come from the zero vector. 28153 SmallVector<int, 8> Mask; 28154 for (unsigned i = 0; i != NumElems; ++i) 28155 if (i % ZextRatio) 28156 Mask.push_back(NumElems); 28157 else 28158 Mask.push_back(i / ZextRatio); 28159 28160 SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, 28161 Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); 28162 return DAG.getBitcast(N0.getValueType(), NewShuffle); 28163 } 28164 28165 /// If both input operands of a logic op are being cast from floating point 28166 /// types, try to convert this into a floating point logic node to avoid 28167 /// unnecessary moves from SSE to integer registers. 
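/// A minimal illustrative case (names are placeholders):
///   (and (bitcast float %a to i32), (bitcast float %b to i32))
/// becomes (bitcast (FAND %a, %b) to i32), keeping the value in the SSE
/// domain when SSE1 (for i32) or SSE2 (for i64) provides the FP logic op.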
28168 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, 28169 const X86Subtarget &Subtarget) { 28170 unsigned FPOpcode = ISD::DELETED_NODE; 28171 if (N->getOpcode() == ISD::AND) 28172 FPOpcode = X86ISD::FAND; 28173 else if (N->getOpcode() == ISD::OR) 28174 FPOpcode = X86ISD::FOR; 28175 else if (N->getOpcode() == ISD::XOR) 28176 FPOpcode = X86ISD::FXOR; 28177 28178 assert(FPOpcode != ISD::DELETED_NODE && 28179 "Unexpected input node for FP logic conversion"); 28180 28181 EVT VT = N->getValueType(0); 28182 SDValue N0 = N->getOperand(0); 28183 SDValue N1 = N->getOperand(1); 28184 SDLoc DL(N); 28185 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && 28186 ((Subtarget.hasSSE1() && VT == MVT::i32) || 28187 (Subtarget.hasSSE2() && VT == MVT::i64))) { 28188 SDValue N00 = N0.getOperand(0); 28189 SDValue N10 = N1.getOperand(0); 28190 EVT N00Type = N00.getValueType(); 28191 EVT N10Type = N10.getValueType(); 28192 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { 28193 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); 28194 return DAG.getBitcast(VT, FPLogic); 28195 } 28196 } 28197 return SDValue(); 28198 } 28199 28200 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is 28201 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to 28202 /// eliminate loading the vector constant mask value. This relies on the fact 28203 /// that a PCMP always creates an all-ones or all-zeros bitmask per element. 28204 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) { 28205 SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); 28206 SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); 28207 28208 // TODO: Use AssertSext to mark any nodes that have the property of producing 28209 // all-ones or all-zeros. Then check for that node rather than particular 28210 // opcodes. 28211 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT) 28212 return SDValue(); 28213 28214 // The existence of the PCMP node guarantees that we have the required SSE2 or 28215 // AVX2 for a shift of this vector type, but there is no vector shift by 28216 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the 28217 // masked compare nodes, so they should not make it here. 
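  // Illustrative rewrite for v4i32 (values are placeholders):
  //   (and (pcmpgt %x, %y), <1,1,1,1>)  ->  (psrld (pcmpgt %x, %y), 31)
  // since each compare lane is already all-ones or all-zeros.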
28218 EVT VT0 = Op0.getValueType(); 28219 EVT VT1 = Op1.getValueType(); 28220 unsigned EltBitWidth = VT0.getScalarType().getSizeInBits(); 28221 if (VT0 != VT1 || EltBitWidth == 8) 28222 return SDValue(); 28223 28224 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256); 28225 28226 APInt SplatVal; 28227 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1) 28228 return SDValue(); 28229 28230 SDLoc DL(N); 28231 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8); 28232 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); 28233 return DAG.getBitcast(N->getValueType(0), Shift); 28234 } 28235 28236 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, 28237 TargetLowering::DAGCombinerInfo &DCI, 28238 const X86Subtarget &Subtarget) { 28239 if (DCI.isBeforeLegalizeOps()) 28240 return SDValue(); 28241 28242 if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget)) 28243 return Zext; 28244 28245 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) 28246 return R; 28247 28248 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) 28249 return FPLogic; 28250 28251 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) 28252 return R; 28253 28254 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG)) 28255 return ShiftRight; 28256 28257 EVT VT = N->getValueType(0); 28258 SDValue N0 = N->getOperand(0); 28259 SDValue N1 = N->getOperand(1); 28260 SDLoc DL(N); 28261 28262 // Create BEXTR instructions 28263 // BEXTR is ((X >> imm) & (2**size-1)) 28264 if (VT != MVT::i32 && VT != MVT::i64) 28265 return SDValue(); 28266 28267 if (!Subtarget.hasBMI() && !Subtarget.hasTBM()) 28268 return SDValue(); 28269 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL) 28270 return SDValue(); 28271 28272 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); 28273 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 28274 if (MaskNode && ShiftNode) { 28275 uint64_t Mask = MaskNode->getZExtValue(); 28276 uint64_t Shift = ShiftNode->getZExtValue(); 28277 if (isMask_64(Mask)) { 28278 uint64_t MaskSize = countPopulation(Mask); 28279 if (Shift + MaskSize <= VT.getSizeInBits()) 28280 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), 28281 DAG.getConstant(Shift | (MaskSize << 8), DL, 28282 VT)); 28283 } 28284 } 28285 return SDValue(); 28286 } 28287 28288 // Try to fold: 28289 // (or (and (m, y), (pandn m, x))) 28290 // into: 28291 // (vselect m, x, y) 28292 // As a special case, try to fold: 28293 // (or (and (m, (sub 0, x)), (pandn m, x))) 28294 // into: 28295 // (sub (xor X, M), M) 28296 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, 28297 const X86Subtarget &Subtarget) { 28298 assert(N->getOpcode() == ISD::OR); 28299 28300 SDValue N0 = N->getOperand(0); 28301 SDValue N1 = N->getOperand(1); 28302 EVT VT = N->getValueType(0); 28303 28304 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256()))) 28305 return SDValue(); 28306 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!"); 28307 28308 // Canonicalize pandn to RHS 28309 if (N0.getOpcode() == X86ISD::ANDNP) 28310 std::swap(N0, N1); 28311 28312 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) 28313 return SDValue(); 28314 28315 SDValue Mask = N1.getOperand(0); 28316 SDValue X = N1.getOperand(1); 28317 SDValue Y; 28318 if (N0.getOperand(0) == Mask) 28319 Y = N0.getOperand(1); 28320 if (N0.getOperand(1) == Mask) 28321 Y = N0.getOperand(0); 28322 28323 // Check to see if the mask 
appeared in both the AND and ANDNP. 28324 if (!Y.getNode()) 28325 return SDValue(); 28326 28327 // Validate that X, Y, and Mask are bitcasts, and see through them. 28328 Mask = peekThroughBitcasts(Mask); 28329 X = peekThroughBitcasts(X); 28330 Y = peekThroughBitcasts(Y); 28331 28332 EVT MaskVT = Mask.getValueType(); 28333 28334 // Validate that the Mask operand is a vector sra node. 28335 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 28336 // there is no psrai.b 28337 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 28338 unsigned SraAmt = ~0; 28339 if (Mask.getOpcode() == ISD::SRA) { 28340 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) 28341 if (auto *AmtConst = AmtBV->getConstantSplatNode()) 28342 SraAmt = AmtConst->getZExtValue(); 28343 } else if (Mask.getOpcode() == X86ISD::VSRAI) { 28344 SDValue SraC = Mask.getOperand(1); 28345 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 28346 } 28347 if ((SraAmt + 1) != EltBits) 28348 return SDValue(); 28349 28350 SDLoc DL(N); 28351 28352 // Try to match: 28353 // (or (and (M, (sub 0, X)), (pandn M, X))) 28354 // which is a special case of vselect: 28355 // (vselect M, (sub 0, X), X) 28356 // Per: 28357 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate 28358 // We know that, if fNegate is 0 or 1: 28359 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) 28360 // 28361 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: 28362 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) 28363 // ( M ? -X : X) == ((X ^ M ) + (M & 1)) 28364 // This lets us transform our vselect to: 28365 // (add (xor X, M), (and M, 1)) 28366 // And further to: 28367 // (sub (xor X, M), M) 28368 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 28369 auto IsNegV = [](SDNode *N, SDValue V) { 28370 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && 28371 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); 28372 }; 28373 SDValue V; 28374 if (IsNegV(Y.getNode(), X)) 28375 V = X; 28376 else if (IsNegV(X.getNode(), Y)) 28377 V = Y; 28378 28379 if (V) { 28380 assert(EltBits == 8 || EltBits == 16 || EltBits == 32); 28381 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); 28382 SDValue SubOp2 = Mask; 28383 28384 // If the negate was on the false side of the select, then 28385 // the operands of the SUB need to be swapped. PR 27251. 28386 // This is because the pattern being matched above is 28387 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) 28388 // but if the pattern matched was 28389 // (vselect M, X, (sub (0, X))), that is really negation of the pattern 28390 // above, -(vselect M, (sub 0, X), X), and therefore the replacement 28391 // pattern also needs to be a negation of the replacement pattern above. 28392 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the 28393 // sub accomplishes the negation of the replacement pattern. 28394 if (V == Y) 28395 std::swap(SubOp1, SubOp2); 28396 28397 return DAG.getBitcast(VT, 28398 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); 28399 } 28400 } 28401 28402 // PBLENDVB is only available on SSE 4.1. 28403 if (!Subtarget.hasSSE41()) 28404 return SDValue(); 28405 28406 MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; 28407 28408 X = DAG.getBitcast(BlendVT, X); 28409 Y = DAG.getBitcast(BlendVT, Y); 28410 Mask = DAG.getBitcast(BlendVT, Mask); 28411 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 28412 return DAG.getBitcast(VT, Mask); 28413 } 28414 28415 static SDValue combineOr(SDNode *N, SelectionDAG &DAG, 28416 TargetLowering::DAGCombinerInfo &DCI, 28417 const X86Subtarget &Subtarget) { 28418 if (DCI.isBeforeLegalizeOps()) 28419 return SDValue(); 28420 28421 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) 28422 return R; 28423 28424 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) 28425 return FPLogic; 28426 28427 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) 28428 return R; 28429 28430 SDValue N0 = N->getOperand(0); 28431 SDValue N1 = N->getOperand(1); 28432 EVT VT = N->getValueType(0); 28433 28434 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 28435 return SDValue(); 28436 28437 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 28438 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); 28439 28440 // SHLD/SHRD instructions have lower register pressure, but on some 28441 // platforms they have higher latency than the equivalent 28442 // series of shifts/or that would otherwise be generated. 28443 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions 28444 // have higher latencies and we are not optimizing for size. 28445 if (!OptForSize && Subtarget.isSHLDSlow()) 28446 return SDValue(); 28447 28448 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 28449 std::swap(N0, N1); 28450 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 28451 return SDValue(); 28452 if (!N0.hasOneUse() || !N1.hasOneUse()) 28453 return SDValue(); 28454 28455 SDValue ShAmt0 = N0.getOperand(1); 28456 if (ShAmt0.getValueType() != MVT::i8) 28457 return SDValue(); 28458 SDValue ShAmt1 = N1.getOperand(1); 28459 if (ShAmt1.getValueType() != MVT::i8) 28460 return SDValue(); 28461 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 28462 ShAmt0 = ShAmt0.getOperand(0); 28463 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 28464 ShAmt1 = ShAmt1.getOperand(0); 28465 28466 SDLoc DL(N); 28467 unsigned Opc = X86ISD::SHLD; 28468 SDValue Op0 = N0.getOperand(0); 28469 SDValue Op1 = N1.getOperand(0); 28470 if (ShAmt0.getOpcode() == ISD::SUB) { 28471 Opc = X86ISD::SHRD; 28472 std::swap(Op0, Op1); 28473 std::swap(ShAmt0, ShAmt1); 28474 } 28475 28476 unsigned Bits = VT.getSizeInBits(); 28477 if (ShAmt1.getOpcode() == ISD::SUB) { 28478 SDValue Sum = ShAmt1.getOperand(0); 28479 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 28480 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 28481 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 28482 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 28483 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 28484 return DAG.getNode(Opc, DL, VT, 28485 Op0, Op1, 28486 DAG.getNode(ISD::TRUNCATE, DL, 28487 MVT::i8, ShAmt0)); 28488 } 28489 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 28490 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 28491 if (ShAmt0C && 28492 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 28493 return DAG.getNode(Opc, DL, VT, 28494 N0.getOperand(0), N1.getOperand(0), 28495 DAG.getNode(ISD::TRUNCATE, DL, 28496 MVT::i8, ShAmt0)); 28497 } 28498 28499 return SDValue(); 28500 } 28501 28502 // Generate NEG and CMOV for integer abs. 
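// The pattern matched below is the canonical abs expansion, roughly
// Y = sra(X, bitwidth-1); abs = xor(add(X, Y), Y), and it is rewritten into
// a flag-setting SUB (0 - X) plus a CMOV on those flags that selects between
// X and its negation.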
28503 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) { 28504 EVT VT = N->getValueType(0); 28505 28506 // Since X86 does not have CMOV for 8-bit integer, we don't convert 28507 // 8-bit integer abs to NEG and CMOV. 28508 if (VT.isInteger() && VT.getSizeInBits() == 8) 28509 return SDValue(); 28510 28511 SDValue N0 = N->getOperand(0); 28512 SDValue N1 = N->getOperand(1); 28513 SDLoc DL(N); 28514 28515 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 28516 // and change it to SUB and CMOV. 28517 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 28518 N0.getOpcode() == ISD::ADD && 28519 N0.getOperand(1) == N1 && 28520 N1.getOpcode() == ISD::SRA && 28521 N1.getOperand(0) == N0.getOperand(0)) 28522 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 28523 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 28524 // Generate SUB & CMOV. 28525 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 28526 DAG.getConstant(0, DL, VT), N0.getOperand(0)); 28527 28528 SDValue Ops[] = { N0.getOperand(0), Neg, 28529 DAG.getConstant(X86::COND_GE, DL, MVT::i8), 28530 SDValue(Neg.getNode(), 1) }; 28531 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); 28532 } 28533 return SDValue(); 28534 } 28535 28536 /// Try to turn tests against the signbit in the form of: 28537 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) 28538 /// into: 28539 /// SETGT(X, -1) 28540 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { 28541 // This is only worth doing if the output type is i8 or i1. 28542 EVT ResultType = N->getValueType(0); 28543 if (ResultType != MVT::i8 && ResultType != MVT::i1) 28544 return SDValue(); 28545 28546 SDValue N0 = N->getOperand(0); 28547 SDValue N1 = N->getOperand(1); 28548 28549 // We should be performing an xor against a truncated shift. 28550 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) 28551 return SDValue(); 28552 28553 // Make sure we are performing an xor against one. 28554 if (!isOneConstant(N1)) 28555 return SDValue(); 28556 28557 // SetCC on x86 zero extends so only act on this if it's a logical shift. 28558 SDValue Shift = N0.getOperand(0); 28559 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) 28560 return SDValue(); 28561 28562 // Make sure we are truncating from one of i16, i32 or i64. 28563 EVT ShiftTy = Shift.getValueType(); 28564 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) 28565 return SDValue(); 28566 28567 // Make sure the shift amount extracts the sign bit. 28568 if (!isa<ConstantSDNode>(Shift.getOperand(1)) || 28569 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) 28570 return SDValue(); 28571 28572 // Create a greater-than comparison against -1. 28573 // N.B. Using SETGE against 0 works but we want a canonical looking 28574 // comparison, using SETGT matches up with what TranslateX86CC. 
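  // Illustrative rewrite for an i32 source (names are placeholders):
  //   xor (trunc (srl %x, 31)), 1  ->  setcc %x, -1, setgt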
28575 SDLoc DL(N); 28576 SDValue ShiftOp = Shift.getOperand(0); 28577 EVT ShiftOpTy = ShiftOp.getValueType(); 28578 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 28579 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), 28580 *DAG.getContext(), ResultType); 28581 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, 28582 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); 28583 if (SetCCResultType != ResultType) 28584 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); 28585 return Cond; 28586 } 28587 28588 /// Turn vector tests of the signbit in the form of: 28589 /// xor (sra X, elt_size(X)-1), -1 28590 /// into: 28591 /// pcmpgt X, -1 28592 /// 28593 /// This should be called before type legalization because the pattern may not 28594 /// persist after that. 28595 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, 28596 const X86Subtarget &Subtarget) { 28597 EVT VT = N->getValueType(0); 28598 if (!VT.isSimple()) 28599 return SDValue(); 28600 28601 switch (VT.getSimpleVT().SimpleTy) { 28602 default: return SDValue(); 28603 case MVT::v16i8: 28604 case MVT::v8i16: 28605 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; 28606 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; 28607 case MVT::v32i8: 28608 case MVT::v16i16: 28609 case MVT::v8i32: 28610 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; 28611 } 28612 28613 // There must be a shift right algebraic before the xor, and the xor must be a 28614 // 'not' operation. 28615 SDValue Shift = N->getOperand(0); 28616 SDValue Ones = N->getOperand(1); 28617 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || 28618 !ISD::isBuildVectorAllOnes(Ones.getNode())) 28619 return SDValue(); 28620 28621 // The shift should be smearing the sign bit across each vector element. 28622 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1)); 28623 if (!ShiftBV) 28624 return SDValue(); 28625 28626 EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); 28627 auto *ShiftAmt = ShiftBV->getConstantSplatNode(); 28628 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) 28629 return SDValue(); 28630 28631 // Create a greater-than comparison against -1. We don't use the more obvious 28632 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. 28633 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); 28634 } 28635 28636 static SDValue combineXor(SDNode *N, SelectionDAG &DAG, 28637 TargetLowering::DAGCombinerInfo &DCI, 28638 const X86Subtarget &Subtarget) { 28639 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) 28640 return Cmp; 28641 28642 if (DCI.isBeforeLegalizeOps()) 28643 return SDValue(); 28644 28645 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) 28646 return RV; 28647 28648 if (Subtarget.hasCMov()) 28649 if (SDValue RV = combineIntegerAbs(N, DAG)) 28650 return RV; 28651 28652 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) 28653 return FPLogic; 28654 28655 return SDValue(); 28656 } 28657 28658 /// This function detects the AVG pattern between vectors of unsigned i8/i16, 28659 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient 28660 /// X86ISD::AVG instruction. 
28661 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, 28662 const X86Subtarget &Subtarget, 28663 const SDLoc &DL) { 28664 if (!VT.isVector() || !VT.isSimple()) 28665 return SDValue(); 28666 EVT InVT = In.getValueType(); 28667 unsigned NumElems = VT.getVectorNumElements(); 28668 28669 EVT ScalarVT = VT.getVectorElementType(); 28670 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && 28671 isPowerOf2_32(NumElems))) 28672 return SDValue(); 28673 28674 // InScalarVT is the intermediate type in AVG pattern and it should be greater 28675 // than the original input type (i8/i16). 28676 EVT InScalarVT = InVT.getVectorElementType(); 28677 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) 28678 return SDValue(); 28679 28680 if (!Subtarget.hasSSE2()) 28681 return SDValue(); 28682 if (Subtarget.hasAVX512()) { 28683 if (VT.getSizeInBits() > 512) 28684 return SDValue(); 28685 } else if (Subtarget.hasAVX2()) { 28686 if (VT.getSizeInBits() > 256) 28687 return SDValue(); 28688 } else { 28689 if (VT.getSizeInBits() > 128) 28690 return SDValue(); 28691 } 28692 28693 // Detect the following pattern: 28694 // 28695 // %1 = zext <N x i8> %a to <N x i32> 28696 // %2 = zext <N x i8> %b to <N x i32> 28697 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> 28698 // %4 = add nuw nsw <N x i32> %3, %2 28699 // %5 = lshr <N x i32> %N, <i32 1 x N> 28700 // %6 = trunc <N x i32> %5 to <N x i8> 28701 // 28702 // In AVX512, the last instruction can also be a trunc store. 28703 28704 if (In.getOpcode() != ISD::SRL) 28705 return SDValue(); 28706 28707 // A lambda checking the given SDValue is a constant vector and each element 28708 // is in the range [Min, Max]. 28709 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { 28710 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); 28711 if (!BV || !BV->isConstant()) 28712 return false; 28713 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { 28714 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); 28715 if (!C) 28716 return false; 28717 uint64_t Val = C->getZExtValue(); 28718 if (Val < Min || Val > Max) 28719 return false; 28720 } 28721 return true; 28722 }; 28723 28724 // Check if each element of the vector is left-shifted by one. 28725 auto LHS = In.getOperand(0); 28726 auto RHS = In.getOperand(1); 28727 if (!IsConstVectorInRange(RHS, 1, 1)) 28728 return SDValue(); 28729 if (LHS.getOpcode() != ISD::ADD) 28730 return SDValue(); 28731 28732 // Detect a pattern of a + b + 1 where the order doesn't matter. 28733 SDValue Operands[3]; 28734 Operands[0] = LHS.getOperand(0); 28735 Operands[1] = LHS.getOperand(1); 28736 28737 // Take care of the case when one of the operands is a constant vector whose 28738 // element is in the range [1, 256]. 28739 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && 28740 Operands[0].getOpcode() == ISD::ZERO_EXTEND && 28741 Operands[0].getOperand(0).getValueType() == VT) { 28742 // The pattern is detected. Subtract one from the constant vector, then 28743 // demote it and emit X86ISD::AVG instruction. 
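    // Why subtracting one is sound (illustrative): AVG(a, c) computes
    // (a + c + 1) >> 1, so (a + C) >> 1 with a constant C in [1, 256] equals
    // AVG(a, C - 1) once C - 1 is truncated back to the element type.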
28744 SDValue VecOnes = DAG.getConstant(1, DL, InVT); 28745 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); 28746 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); 28747 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), 28748 Operands[1]); 28749 } 28750 28751 if (Operands[0].getOpcode() == ISD::ADD) 28752 std::swap(Operands[0], Operands[1]); 28753 else if (Operands[1].getOpcode() != ISD::ADD) 28754 return SDValue(); 28755 Operands[2] = Operands[1].getOperand(0); 28756 Operands[1] = Operands[1].getOperand(1); 28757 28758 // Now we have three operands of two additions. Check that one of them is a 28759 // constant vector with ones, and the other two are promoted from i8/i16. 28760 for (int i = 0; i < 3; ++i) { 28761 if (!IsConstVectorInRange(Operands[i], 1, 1)) 28762 continue; 28763 std::swap(Operands[i], Operands[2]); 28764 28765 // Check if Operands[0] and Operands[1] are results of type promotion. 28766 for (int j = 0; j < 2; ++j) 28767 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || 28768 Operands[j].getOperand(0).getValueType() != VT) 28769 return SDValue(); 28770 28771 // The pattern is detected, emit X86ISD::AVG instruction. 28772 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), 28773 Operands[1].getOperand(0)); 28774 } 28775 28776 return SDValue(); 28777 } 28778 28779 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, 28780 TargetLowering::DAGCombinerInfo &DCI, 28781 const X86Subtarget &Subtarget) { 28782 LoadSDNode *Ld = cast<LoadSDNode>(N); 28783 EVT RegVT = Ld->getValueType(0); 28784 EVT MemVT = Ld->getMemoryVT(); 28785 SDLoc dl(Ld); 28786 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 28787 28788 // For chips with slow 32-byte unaligned loads, break the 32-byte operation 28789 // into two 16-byte operations. 28790 ISD::LoadExtType Ext = Ld->getExtensionType(); 28791 bool Fast; 28792 unsigned AddressSpace = Ld->getAddressSpace(); 28793 unsigned Alignment = Ld->getAlignment(); 28794 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && 28795 Ext == ISD::NON_EXTLOAD && 28796 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, 28797 AddressSpace, Alignment, &Fast) && !Fast) { 28798 unsigned NumElems = RegVT.getVectorNumElements(); 28799 if (NumElems < 2) 28800 return SDValue(); 28801 28802 SDValue Ptr = Ld->getBasePtr(); 28803 28804 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 28805 NumElems/2); 28806 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 28807 Ld->getPointerInfo(), Ld->isVolatile(), 28808 Ld->isNonTemporal(), Ld->isInvariant(), 28809 Alignment); 28810 28811 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); 28812 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 28813 Ld->getPointerInfo(), Ld->isVolatile(), 28814 Ld->isNonTemporal(), Ld->isInvariant(), 28815 std::min(16U, Alignment)); 28816 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 28817 Load1.getValue(1), 28818 Load2.getValue(1)); 28819 28820 SDValue NewVec = DAG.getUNDEF(RegVT); 28821 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl); 28822 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl); 28823 return DCI.CombineTo(N, NewVec, TF, true); 28824 } 28825 28826 return SDValue(); 28827 } 28828 28829 /// If V is a build vector of boolean constants and exactly one of those 28830 /// constants is true, return the operand index of that true element. 28831 /// Otherwise, return -1. 
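/// For example (illustrative): a mask of <i1 0, i1 0, i1 1, i1 0> yields 2,
/// while an all-false mask or a mask with two true elements yields -1.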
28832 static int getOneTrueElt(SDValue V) { 28833 // This needs to be a build vector of booleans. 28834 // TODO: Checking for the i1 type matches the IR definition for the mask, 28835 // but the mask check could be loosened to i8 or other types. That might 28836 // also require checking more than 'allOnesValue'; eg, the x86 HW 28837 // instructions only require that the MSB is set for each mask element. 28838 // The ISD::MSTORE comments/definition do not specify how the mask operand 28839 // is formatted. 28840 auto *BV = dyn_cast<BuildVectorSDNode>(V); 28841 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) 28842 return -1; 28843 28844 int TrueIndex = -1; 28845 unsigned NumElts = BV->getValueType(0).getVectorNumElements(); 28846 for (unsigned i = 0; i < NumElts; ++i) { 28847 const SDValue &Op = BV->getOperand(i); 28848 if (Op.isUndef()) 28849 continue; 28850 auto *ConstNode = dyn_cast<ConstantSDNode>(Op); 28851 if (!ConstNode) 28852 return -1; 28853 if (ConstNode->getAPIntValue().isAllOnesValue()) { 28854 // If we already found a one, this is too many. 28855 if (TrueIndex >= 0) 28856 return -1; 28857 TrueIndex = i; 28858 } 28859 } 28860 return TrueIndex; 28861 } 28862 28863 /// Given a masked memory load/store operation, return true if it has one mask 28864 /// bit set. If it has one mask bit set, then also return the memory address of 28865 /// the scalar element to load/store, the vector index to insert/extract that 28866 /// scalar element, and the alignment for the scalar memory access. 28867 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, 28868 SelectionDAG &DAG, SDValue &Addr, 28869 SDValue &Index, unsigned &Alignment) { 28870 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); 28871 if (TrueMaskElt < 0) 28872 return false; 28873 28874 // Get the address of the one scalar element that is specified by the mask 28875 // using the appropriate offset from the base pointer. 28876 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); 28877 Addr = MaskedOp->getBasePtr(); 28878 if (TrueMaskElt != 0) { 28879 unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); 28880 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp)); 28881 } 28882 28883 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); 28884 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); 28885 return true; 28886 } 28887 28888 /// If exactly one element of the mask is set for a non-extending masked load, 28889 /// it is a scalar load and vector insert. 28890 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones 28891 /// mask have already been optimized in IR, so we don't bother with those here. 28892 static SDValue 28893 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, 28894 TargetLowering::DAGCombinerInfo &DCI) { 28895 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. 28896 // However, some target hooks may need to be added to know when the transform 28897 // is profitable. Endianness would also have to be considered. 28898 28899 SDValue Addr, VecIndex; 28900 unsigned Alignment; 28901 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) 28902 return SDValue(); 28903 28904 // Load the one scalar element that is specified by the mask using the 28905 // appropriate offset from the base pointer. 
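  // Illustrative shape of the rewrite (names are placeholders):
  //   masked.load(%p, mask <0,0,1,0>, passthru %v)
  //     -> %s = load(%p + 2 * sizeof(elt)); insertelement(%v, %s, 2)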
28906 SDLoc DL(ML); 28907 EVT VT = ML->getValueType(0); 28908 EVT EltVT = VT.getVectorElementType(); 28909 SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr, 28910 ML->getPointerInfo(), ML->isVolatile(), 28911 ML->isNonTemporal(), ML->isInvariant(), Alignment); 28912 28913 // Insert the loaded element into the appropriate place in the vector. 28914 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(), 28915 Load, VecIndex); 28916 return DCI.CombineTo(ML, Insert, Load.getValue(1), true); 28917 } 28918 28919 static SDValue 28920 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, 28921 TargetLowering::DAGCombinerInfo &DCI) { 28922 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) 28923 return SDValue(); 28924 28925 SDLoc DL(ML); 28926 EVT VT = ML->getValueType(0); 28927 28928 // If we are loading the first and last elements of a vector, it is safe and 28929 // always faster to load the whole vector. Replace the masked load with a 28930 // vector load and select. 28931 unsigned NumElts = VT.getVectorNumElements(); 28932 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask()); 28933 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); 28934 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); 28935 if (LoadFirstElt && LoadLastElt) { 28936 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), 28937 ML->getMemOperand()); 28938 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0()); 28939 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); 28940 } 28941 28942 // Convert a masked load with a constant mask into a masked load and a select. 28943 // This allows the select operation to use a faster kind of select instruction 28944 // (for example, vblendvps -> vblendps). 28945 28946 // Don't try this if the pass-through operand is already undefined. That would 28947 // cause an infinite loop because that's what we're about to create. 28948 if (ML->getSrc0().isUndef()) 28949 return SDValue(); 28950 28951 // The new masked load has an undef pass-through operand. The select uses the 28952 // original pass-through operand. 28953 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), 28954 ML->getMask(), DAG.getUNDEF(VT), 28955 ML->getMemoryVT(), ML->getMemOperand(), 28956 ML->getExtensionType()); 28957 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0()); 28958 28959 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); 28960 } 28961 28962 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, 28963 TargetLowering::DAGCombinerInfo &DCI, 28964 const X86Subtarget &Subtarget) { 28965 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); 28966 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { 28967 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) 28968 return ScalarLoad; 28969 // TODO: Do some AVX512 subsets benefit from this transform? 28970 if (!Subtarget.hasAVX512()) 28971 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) 28972 return Blend; 28973 } 28974 28975 if (Mld->getExtensionType() != ISD::SEXTLOAD) 28976 return SDValue(); 28977 28978 // Resolve extending loads. 
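  // Illustrative shape of the expansion below (types are one example):
  //   a masked SEXTLOAD of <8 x i16> producing <8 x i32> becomes a
  //   non-extending masked load of a widened <16 x i16> value, with the
  //   pass-through and mask shuffled to match, followed by X86ISD::VSEXT.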
28979 EVT VT = Mld->getValueType(0); 28980 unsigned NumElems = VT.getVectorNumElements(); 28981 EVT LdVT = Mld->getMemoryVT(); 28982 SDLoc dl(Mld); 28983 28984 assert(LdVT != VT && "Cannot extend to the same type"); 28985 unsigned ToSz = VT.getVectorElementType().getSizeInBits(); 28986 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); 28987 // From/To sizes and ElemCount must be pow of two. 28988 assert (isPowerOf2_32(NumElems * FromSz * ToSz) && 28989 "Unexpected size for extending masked load"); 28990 28991 unsigned SizeRatio = ToSz / FromSz; 28992 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); 28993 28994 // Create a type on which we perform the shuffle. 28995 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 28996 LdVT.getScalarType(), NumElems*SizeRatio); 28997 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 28998 28999 // Convert Src0 value. 29000 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); 29001 if (!Mld->getSrc0().isUndef()) { 29002 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 29003 for (unsigned i = 0; i != NumElems; ++i) 29004 ShuffleVec[i] = i * SizeRatio; 29005 29006 // Can't shuffle using an illegal type. 29007 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && 29008 "WideVecVT should be legal"); 29009 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, 29010 DAG.getUNDEF(WideVecVT), ShuffleVec); 29011 } 29012 // Prepare the new mask. 29013 SDValue NewMask; 29014 SDValue Mask = Mld->getMask(); 29015 if (Mask.getValueType() == VT) { 29016 // Mask and original value have the same type. 29017 NewMask = DAG.getBitcast(WideVecVT, Mask); 29018 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 29019 for (unsigned i = 0; i != NumElems; ++i) 29020 ShuffleVec[i] = i * SizeRatio; 29021 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) 29022 ShuffleVec[i] = NumElems * SizeRatio; 29023 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, 29024 DAG.getConstant(0, dl, WideVecVT), 29025 ShuffleVec); 29026 } else { 29027 assert(Mask.getValueType().getVectorElementType() == MVT::i1); 29028 unsigned WidenNumElts = NumElems*SizeRatio; 29029 unsigned MaskNumElts = VT.getVectorNumElements(); 29030 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 29031 WidenNumElts); 29032 29033 unsigned NumConcat = WidenNumElts / MaskNumElts; 29034 SmallVector<SDValue, 16> Ops(NumConcat); 29035 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); 29036 Ops[0] = Mask; 29037 for (unsigned i = 1; i != NumConcat; ++i) 29038 Ops[i] = ZeroVal; 29039 29040 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); 29041 } 29042 29043 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), 29044 Mld->getBasePtr(), NewMask, WideSrc0, 29045 Mld->getMemoryVT(), Mld->getMemOperand(), 29046 ISD::NON_EXTLOAD); 29047 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); 29048 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); 29049 } 29050 29051 /// If exactly one element of the mask is set for a non-truncating masked store, 29052 /// it is a vector extract and scalar store. 29053 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones 29054 /// mask have already been optimized in IR, so we don't bother with those here. 29055 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, 29056 SelectionDAG &DAG) { 29057 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. 
29058 // However, some target hooks may need to be added to know when the transform 29059 // is profitable. Endianness would also have to be considered. 29060 29061 SDValue Addr, VecIndex; 29062 unsigned Alignment; 29063 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) 29064 return SDValue(); 29065 29066 // Extract the one scalar element that is actually being stored. 29067 SDLoc DL(MS); 29068 EVT VT = MS->getValue().getValueType(); 29069 EVT EltVT = VT.getVectorElementType(); 29070 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 29071 MS->getValue(), VecIndex); 29072 29073 // Store that element at the appropriate offset from the base pointer. 29074 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), 29075 MS->isVolatile(), MS->isNonTemporal(), Alignment); 29076 } 29077 29078 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, 29079 const X86Subtarget &Subtarget) { 29080 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); 29081 if (!Mst->isTruncatingStore()) 29082 return reduceMaskedStoreToScalarStore(Mst, DAG); 29083 29084 // Resolve truncating stores. 29085 EVT VT = Mst->getValue().getValueType(); 29086 unsigned NumElems = VT.getVectorNumElements(); 29087 EVT StVT = Mst->getMemoryVT(); 29088 SDLoc dl(Mst); 29089 29090 assert(StVT != VT && "Cannot truncate to the same type"); 29091 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 29092 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 29093 29094 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 29095 29096 // The truncating store is legal in some cases. For example 29097 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw 29098 // are designated for truncate store. 29099 // In this case we don't need any further transformations. 29100 if (TLI.isTruncStoreLegal(VT, StVT)) 29101 return SDValue(); 29102 29103 // From/To sizes and ElemCount must be pow of two. 29104 assert (isPowerOf2_32(NumElems * FromSz * ToSz) && 29105 "Unexpected size for truncating masked store"); 29106 // We are going to use the original vector elt for storing. 29107 // Accumulated smaller vector elements must be a multiple of the store size. 29108 assert (((NumElems * FromSz) % ToSz) == 0 && 29109 "Unexpected ratio for truncating masked store"); 29110 29111 unsigned SizeRatio = FromSz / ToSz; 29112 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 29113 29114 // Create a type on which we perform the shuffle. 29115 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 29116 StVT.getScalarType(), NumElems*SizeRatio); 29117 29118 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 29119 29120 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); 29121 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 29122 for (unsigned i = 0; i != NumElems; ++i) 29123 ShuffleVec[i] = i * SizeRatio; 29124 29125 // Can't shuffle using an illegal type. 29126 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && 29127 "WideVecVT should be legal"); 29128 29129 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 29130 DAG.getUNDEF(WideVecVT), 29131 ShuffleVec); 29132 29133 SDValue NewMask; 29134 SDValue Mask = Mst->getMask(); 29135 if (Mask.getValueType() == VT) { 29136 // Mask and original value have the same type. 
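// Rough example: storing a v4i64 value as v4i32 in memory gives SizeRatio 2,
// so the shuffle below selects lanes <0, 2, 4, 6> of the bitcast mask and
// takes the remaining four lanes from the zero vector, leaving them disabled
// in the widened mask.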
29137 NewMask = DAG.getBitcast(WideVecVT, Mask); 29138 for (unsigned i = 0; i != NumElems; ++i) 29139 ShuffleVec[i] = i * SizeRatio; 29140 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) 29141 ShuffleVec[i] = NumElems*SizeRatio; 29142 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, 29143 DAG.getConstant(0, dl, WideVecVT), 29144 ShuffleVec); 29145 } else { 29146 assert(Mask.getValueType().getVectorElementType() == MVT::i1); 29147 unsigned WidenNumElts = NumElems*SizeRatio; 29148 unsigned MaskNumElts = VT.getVectorNumElements(); 29149 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 29150 WidenNumElts); 29151 29152 unsigned NumConcat = WidenNumElts / MaskNumElts; 29153 SmallVector<SDValue, 16> Ops(NumConcat); 29154 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); 29155 Ops[0] = Mask; 29156 for (unsigned i = 1; i != NumConcat; ++i) 29157 Ops[i] = ZeroVal; 29158 29159 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); 29160 } 29161 29162 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, 29163 Mst->getBasePtr(), NewMask, StVT, 29164 Mst->getMemOperand(), false); 29165 } 29166 29167 static SDValue combineStore(SDNode *N, SelectionDAG &DAG, 29168 const X86Subtarget &Subtarget) { 29169 StoreSDNode *St = cast<StoreSDNode>(N); 29170 EVT VT = St->getValue().getValueType(); 29171 EVT StVT = St->getMemoryVT(); 29172 SDLoc dl(St); 29173 SDValue StoredVal = St->getOperand(1); 29174 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 29175 29176 // If we are saving a concatenation of two XMM registers and 32-byte stores 29177 // are slow, such as on Sandy Bridge, perform two 16-byte stores. 29178 bool Fast; 29179 unsigned AddressSpace = St->getAddressSpace(); 29180 unsigned Alignment = St->getAlignment(); 29181 if (VT.is256BitVector() && StVT == VT && 29182 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 29183 AddressSpace, Alignment, &Fast) && 29184 !Fast) { 29185 unsigned NumElems = VT.getVectorNumElements(); 29186 if (NumElems < 2) 29187 return SDValue(); 29188 29189 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); 29190 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); 29191 29192 SDValue Ptr0 = St->getBasePtr(); 29193 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); 29194 29195 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 29196 St->getPointerInfo(), St->isVolatile(), 29197 St->isNonTemporal(), Alignment); 29198 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 29199 St->getPointerInfo(), St->isVolatile(), 29200 St->isNonTemporal(), 29201 std::min(16U, Alignment)); 29202 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 29203 } 29204 29205 // Optimize trunc store (of multiple scalars) to shuffle and store. 29206 // First, pack all of the elements in one place. Next, store to memory 29207 // in fewer chunks. 29208 if (St->isTruncatingStore() && VT.isVector()) { 29209 // Check if we can detect an AVG pattern from the truncation. If yes, 29210 // replace the trunc store by a normal store with the result of X86ISD::AVG 29211 // instruction. 
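// A typical shape of that pattern (sketch only): truncating
//   ((zext v16i8 %a to v16i16) + (zext v16i8 %b to v16i16) + 1) >> 1
// back to v16i8 can usually be matched to a single X86ISD::AVG (PAVGB-style)
// node, and the store is then performed on the AVG result directly.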
29212 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, 29213 Subtarget, dl)) 29214 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), 29215 St->getPointerInfo(), St->isVolatile(), 29216 St->isNonTemporal(), St->getAlignment()); 29217 29218 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 29219 unsigned NumElems = VT.getVectorNumElements(); 29220 assert(StVT != VT && "Cannot truncate to the same type"); 29221 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 29222 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 29223 29224 // The truncating store is legal in some cases. For example 29225 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw 29226 // are designated for truncate store. 29227 // In this case we don't need any further transformations. 29228 if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) 29229 return SDValue(); 29230 29231 // From, To sizes and ElemCount must be pow of two 29232 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 29233 // We are going to use the original vector elt for storing. 29234 // Accumulated smaller vector elements must be a multiple of the store size. 29235 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 29236 29237 unsigned SizeRatio = FromSz / ToSz; 29238 29239 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 29240 29241 // Create a type on which we perform the shuffle 29242 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 29243 StVT.getScalarType(), NumElems*SizeRatio); 29244 29245 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 29246 29247 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); 29248 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 29249 for (unsigned i = 0; i != NumElems; ++i) 29250 ShuffleVec[i] = i * SizeRatio; 29251 29252 // Can't shuffle using an illegal type. 29253 if (!TLI.isTypeLegal(WideVecVT)) 29254 return SDValue(); 29255 29256 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 29257 DAG.getUNDEF(WideVecVT), 29258 ShuffleVec); 29259 // At this point all of the data is stored at the bottom of the 29260 // register. We now need to save it to mem. 29261 29262 // Find the largest store unit 29263 MVT StoreType = MVT::i8; 29264 for (MVT Tp : MVT::integer_valuetypes()) { 29265 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 29266 StoreType = Tp; 29267 } 29268 29269 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 29270 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 29271 (64 <= NumElems * ToSz)) 29272 StoreType = MVT::f64; 29273 29274 // Bitcast the original vector into a vector of store-size units 29275 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 29276 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 29277 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 29278 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); 29279 SmallVector<SDValue, 8> Chains; 29280 SDValue Ptr = St->getBasePtr(); 29281 29282 // Perform one or more big stores into memory. 
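// For instance, a v4i32 -> v4i16 truncating store leaves 64 bits of packed
// data after the shuffle above; on x86-64 StoreType becomes i64 and the loop
// below emits a single i64 store, while a 32-bit SSE2 target would use one
// f64 store instead.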
29283 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 29284 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 29285 StoreType, ShuffWide, 29286 DAG.getIntPtrConstant(i, dl)); 29287 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 29288 St->getPointerInfo(), St->isVolatile(), 29289 St->isNonTemporal(), St->getAlignment()); 29290 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); 29291 Chains.push_back(Ch); 29292 } 29293 29294 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 29295 } 29296 29297 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 29298 // the FP state in cases where an emms may be missing. 29299 // A preferable solution to the general problem is to figure out the right 29300 // places to insert EMMS. This qualifies as a quick hack. 29301 29302 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 29303 if (VT.getSizeInBits() != 64) 29304 return SDValue(); 29305 29306 const Function *F = DAG.getMachineFunction().getFunction(); 29307 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); 29308 bool F64IsLegal = 29309 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); 29310 if ((VT.isVector() || 29311 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && 29312 isa<LoadSDNode>(St->getValue()) && 29313 !cast<LoadSDNode>(St->getValue())->isVolatile() && 29314 St->getChain().hasOneUse() && !St->isVolatile()) { 29315 SDNode* LdVal = St->getValue().getNode(); 29316 LoadSDNode *Ld = nullptr; 29317 int TokenFactorIndex = -1; 29318 SmallVector<SDValue, 8> Ops; 29319 SDNode* ChainVal = St->getChain().getNode(); 29320 // Must be a store of a load. We currently handle two cases: the load 29321 // is a direct child, and it's under an intervening TokenFactor. It is 29322 // possible to dig deeper under nested TokenFactors. 29323 if (ChainVal == LdVal) 29324 Ld = cast<LoadSDNode>(St->getChain()); 29325 else if (St->getValue().hasOneUse() && 29326 ChainVal->getOpcode() == ISD::TokenFactor) { 29327 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 29328 if (ChainVal->getOperand(i).getNode() == LdVal) { 29329 TokenFactorIndex = i; 29330 Ld = cast<LoadSDNode>(St->getValue()); 29331 } else 29332 Ops.push_back(ChainVal->getOperand(i)); 29333 } 29334 } 29335 29336 if (!Ld || !ISD::isNormalLoad(Ld)) 29337 return SDValue(); 29338 29339 // If this is not the MMX case, i.e. we are just turning i64 load/store 29340 // into f64 load/store, avoid the transformation if there are multiple 29341 // uses of the loaded value. 29342 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 29343 return SDValue(); 29344 29345 SDLoc LdDL(Ld); 29346 SDLoc StDL(N); 29347 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 29348 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 29349 // pair instead. 29350 if (Subtarget.is64Bit() || F64IsLegal) { 29351 MVT LdVT = Subtarget.is64Bit() ? 
MVT::i64 : MVT::f64; 29352 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 29353 Ld->getPointerInfo(), Ld->isVolatile(), 29354 Ld->isNonTemporal(), Ld->isInvariant(), 29355 Ld->getAlignment()); 29356 SDValue NewChain = NewLd.getValue(1); 29357 if (TokenFactorIndex >= 0) { 29358 Ops.push_back(NewChain); 29359 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 29360 } 29361 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 29362 St->getPointerInfo(), 29363 St->isVolatile(), St->isNonTemporal(), 29364 St->getAlignment()); 29365 } 29366 29367 // Otherwise, lower to two pairs of 32-bit loads / stores. 29368 SDValue LoAddr = Ld->getBasePtr(); 29369 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); 29370 29371 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 29372 Ld->getPointerInfo(), 29373 Ld->isVolatile(), Ld->isNonTemporal(), 29374 Ld->isInvariant(), Ld->getAlignment()); 29375 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 29376 Ld->getPointerInfo().getWithOffset(4), 29377 Ld->isVolatile(), Ld->isNonTemporal(), 29378 Ld->isInvariant(), 29379 MinAlign(Ld->getAlignment(), 4)); 29380 29381 SDValue NewChain = LoLd.getValue(1); 29382 if (TokenFactorIndex >= 0) { 29383 Ops.push_back(LoLd); 29384 Ops.push_back(HiLd); 29385 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 29386 } 29387 29388 LoAddr = St->getBasePtr(); 29389 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); 29390 29391 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 29392 St->getPointerInfo(), 29393 St->isVolatile(), St->isNonTemporal(), 29394 St->getAlignment()); 29395 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 29396 St->getPointerInfo().getWithOffset(4), 29397 St->isVolatile(), 29398 St->isNonTemporal(), 29399 MinAlign(St->getAlignment(), 4)); 29400 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 29401 } 29402 29403 // This is similar to the above case, but here we handle a scalar 64-bit 29404 // integer store that is extracted from a vector on a 32-bit target. 29405 // If we have SSE2, then we can treat it like a floating-point double 29406 // to get past legalization. The execution dependencies fixup pass will 29407 // choose the optimal machine instruction for the store if this really is 29408 // an integer or v2f32 rather than an f64. 29409 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && 29410 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 29411 SDValue OldExtract = St->getOperand(1); 29412 SDValue ExtOp0 = OldExtract.getOperand(0); 29413 unsigned VecSize = ExtOp0.getValueSizeInBits(); 29414 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); 29415 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); 29416 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 29417 BitCast, OldExtract.getOperand(1)); 29418 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), 29419 St->getPointerInfo(), St->isVolatile(), 29420 St->isNonTemporal(), St->getAlignment()); 29421 } 29422 29423 return SDValue(); 29424 } 29425 29426 /// Return 'true' if this vector operation is "horizontal" 29427 /// and return the operands for the horizontal operation in LHS and RHS. A 29428 /// horizontal operation performs the binary operation on successive elements 29429 /// of its first operand, then on successive elements of its second operand, 29430 /// returning the resulting values in a vector. 
For example, if 29431 /// A = < float a0, float a1, float a2, float a3 > 29432 /// and 29433 /// B = < float b0, float b1, float b2, float b3 > 29434 /// then the result of doing a horizontal operation on A and B is 29435 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 29436 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 29437 /// A horizontal-op B, for some already available A and B, and if so then LHS is 29438 /// set to A, RHS to B, and the routine returns 'true'. 29439 /// Note that the binary operation should have the property that if one of the 29440 /// operands is UNDEF then the result is UNDEF. 29441 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 29442 // Look for the following pattern: if 29443 // A = < float a0, float a1, float a2, float a3 > 29444 // B = < float b0, float b1, float b2, float b3 > 29445 // and 29446 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 29447 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 29448 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 29449 // which is A horizontal-op B. 29450 29451 // At least one of the operands should be a vector shuffle. 29452 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 29453 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 29454 return false; 29455 29456 MVT VT = LHS.getSimpleValueType(); 29457 29458 assert((VT.is128BitVector() || VT.is256BitVector()) && 29459 "Unsupported vector type for horizontal add/sub"); 29460 29461 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 29462 // operate independently on 128-bit lanes. 29463 unsigned NumElts = VT.getVectorNumElements(); 29464 unsigned NumLanes = VT.getSizeInBits()/128; 29465 unsigned NumLaneElts = NumElts / NumLanes; 29466 assert((NumLaneElts % 2 == 0) && 29467 "Vector type should have an even number of elements in each lane"); 29468 unsigned HalfLaneElts = NumLaneElts/2; 29469 29470 // View LHS in the form 29471 // LHS = VECTOR_SHUFFLE A, B, LMask 29472 // If LHS is not a shuffle then pretend it is the shuffle 29473 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 29474 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 29475 // type VT. 29476 SDValue A, B; 29477 SmallVector<int, 16> LMask(NumElts); 29478 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 29479 if (!LHS.getOperand(0).isUndef()) 29480 A = LHS.getOperand(0); 29481 if (!LHS.getOperand(1).isUndef()) 29482 B = LHS.getOperand(1); 29483 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 29484 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 29485 } else { 29486 if (!LHS.isUndef()) 29487 A = LHS; 29488 for (unsigned i = 0; i != NumElts; ++i) 29489 LMask[i] = i; 29490 } 29491 29492 // Likewise, view RHS in the form 29493 // RHS = VECTOR_SHUFFLE C, D, RMask 29494 SDValue C, D; 29495 SmallVector<int, 16> RMask(NumElts); 29496 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 29497 if (!RHS.getOperand(0).isUndef()) 29498 C = RHS.getOperand(0); 29499 if (!RHS.getOperand(1).isUndef()) 29500 D = RHS.getOperand(1); 29501 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 29502 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 29503 } else { 29504 if (!RHS.isUndef()) 29505 C = RHS; 29506 for (unsigned i = 0; i != NumElts; ++i) 29507 RMask[i] = i; 29508 } 29509 29510 // Check that the shuffles are both shuffling the same vectors. 
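// As a concrete illustration of the masks being matched here: a v8f32
// horizontal add is expected to use (per 128-bit lane)
//   LMask = <0, 2, 8, 10, 4, 6, 12, 14>
//   RMask = <1, 3, 9, 11, 5, 7, 13, 15>
// i.e. even/odd element pairs drawn from A and B within each lane.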
29511 if (!(A == C && B == D) && !(A == D && B == C)) 29512 return false; 29513 29514 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 29515 if (!A.getNode() && !B.getNode()) 29516 return false; 29517 29518 // If A and B occur in reverse order in RHS, then "swap" them (which means 29519 // rewriting the mask). 29520 if (A != C) 29521 ShuffleVectorSDNode::commuteMask(RMask); 29522 29523 // At this point LHS and RHS are equivalent to 29524 // LHS = VECTOR_SHUFFLE A, B, LMask 29525 // RHS = VECTOR_SHUFFLE A, B, RMask 29526 // Check that the masks correspond to performing a horizontal operation. 29527 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 29528 for (unsigned i = 0; i != NumLaneElts; ++i) { 29529 int LIdx = LMask[i+l], RIdx = RMask[i+l]; 29530 29531 // Ignore any UNDEF components. 29532 if (LIdx < 0 || RIdx < 0 || 29533 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 29534 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 29535 continue; 29536 29537 // Check that successive elements are being operated on. If not, this is 29538 // not a horizontal operation. 29539 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs 29540 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; 29541 if (!(LIdx == Index && RIdx == Index + 1) && 29542 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 29543 return false; 29544 } 29545 } 29546 29547 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 29548 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 29549 return true; 29550 } 29551 29552 /// Do target-specific dag combines on floating-point adds/subs. 29553 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, 29554 const X86Subtarget &Subtarget) { 29555 EVT VT = N->getValueType(0); 29556 SDValue LHS = N->getOperand(0); 29557 SDValue RHS = N->getOperand(1); 29558 bool IsFadd = N->getOpcode() == ISD::FADD; 29559 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); 29560 29561 // Try to synthesize horizontal add/sub from adds/subs of shuffles. 29562 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 29563 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 29564 isHorizontalBinOp(LHS, RHS, IsFadd)) { 29565 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; 29566 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); 29567 } 29568 return SDValue(); 29569 } 29570 29571 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. 29572 static SDValue 29573 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, 29574 SmallVector<SDValue, 8> &Regs) { 29575 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || 29576 Regs[0].getValueType() == MVT::v2i64)); 29577 EVT OutVT = N->getValueType(0); 29578 EVT OutSVT = OutVT.getVectorElementType(); 29579 EVT InVT = Regs[0].getValueType(); 29580 EVT InSVT = InVT.getVectorElementType(); 29581 SDLoc DL(N); 29582 29583 // First, use mask to unset all bits that won't appear in the result. 
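// E.g. when truncating i32 elements to i16, every lane is first ANDed with
// 0xFFFF so that the unsigned-saturating pack below simply selects the low
// halves instead of clamping values that happen to exceed the i16 range.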
29584 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
29585 "OutSVT can only be either i8 or i16.");
29586 APInt Mask =
29587 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
29588 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
29589 for (auto &Reg : Regs)
29590 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
29591
29592 MVT UnpackedVT, PackedVT;
29593 if (OutSVT == MVT::i8) {
29594 UnpackedVT = MVT::v8i16;
29595 PackedVT = MVT::v16i8;
29596 } else {
29597 UnpackedVT = MVT::v4i32;
29598 PackedVT = MVT::v8i16;
29599 }
29600
29601 // In each iteration, truncate the type by a half size.
29602 auto RegNum = Regs.size();
29603 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
29604 j < e; j *= 2, RegNum /= 2) {
29605 for (unsigned i = 0; i < RegNum; i++)
29606 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
29607 for (unsigned i = 0; i < RegNum / 2; i++)
29608 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
29609 Regs[i * 2 + 1]);
29610 }
29611
29612 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
29613 // then extract a subvector as the result since v8i8 is not a legal type.
29614 if (OutVT == MVT::v8i8) {
29615 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
29616 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
29617 DAG.getIntPtrConstant(0, DL));
29618 return Regs[0];
29619 } else if (RegNum > 1) {
29620 Regs.resize(RegNum);
29621 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29622 } else
29623 return Regs[0];
29624 }
29625
29626 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
29627 static SDValue
29628 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
29629 SmallVector<SDValue, 8> &Regs) {
29630 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
29631 EVT OutVT = N->getValueType(0);
29632 SDLoc DL(N);
29633
29634 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
29635 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
29636 for (auto &Reg : Regs) {
29637 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29638 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29639 }
29640
29641 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
29642 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
29643 Regs[i * 2 + 1]);
29644
29645 if (Regs.size() > 2) {
29646 Regs.resize(Regs.size() / 2);
29647 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29648 } else
29649 return Regs[0];
29650 }
29651
29652 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
29653 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
29654 /// legalization the truncation will be translated into a BUILD_VECTOR with each
29655 /// element that is extracted from a vector and then truncated, and it is
29656 /// difficult to do this optimization based on them.
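/// For example, on a target with SSE2 but not SSSE3, a v8i32 -> v8i16
/// truncation is split into two v4i32 halves; each half is shifted left and
/// arithmetically right by 16 bits and the halves are then combined with a
/// single PACKSS (packssdw-style) node. This is only a sketch of one path;
/// the exact sequence depends on the subtarget checks below.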
29657 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, 29658 const X86Subtarget &Subtarget) { 29659 EVT OutVT = N->getValueType(0); 29660 if (!OutVT.isVector()) 29661 return SDValue(); 29662 29663 SDValue In = N->getOperand(0); 29664 if (!In.getValueType().isSimple()) 29665 return SDValue(); 29666 29667 EVT InVT = In.getValueType(); 29668 unsigned NumElems = OutVT.getVectorNumElements(); 29669 29670 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on 29671 // SSE2, and we need to take care of it specially. 29672 // AVX512 provides vpmovdb. 29673 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2()) 29674 return SDValue(); 29675 29676 EVT OutSVT = OutVT.getVectorElementType(); 29677 EVT InSVT = InVT.getVectorElementType(); 29678 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && 29679 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && 29680 NumElems >= 8)) 29681 return SDValue(); 29682 29683 // SSSE3's pshufb results in less instructions in the cases below. 29684 if (Subtarget.hasSSSE3() && NumElems == 8 && 29685 ((OutSVT == MVT::i8 && InSVT != MVT::i64) || 29686 (InSVT == MVT::i32 && OutSVT == MVT::i16))) 29687 return SDValue(); 29688 29689 SDLoc DL(N); 29690 29691 // Split a long vector into vectors of legal type. 29692 unsigned RegNum = InVT.getSizeInBits() / 128; 29693 SmallVector<SDValue, 8> SubVec(RegNum); 29694 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); 29695 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); 29696 29697 for (unsigned i = 0; i < RegNum; i++) 29698 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, 29699 DAG.getIntPtrConstant(i * NumSubRegElts, DL)); 29700 29701 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS 29702 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to 29703 // truncate 2 x v4i32 to v8i16. 29704 if (Subtarget.hasSSE41() || OutSVT == MVT::i8) 29705 return combineVectorTruncationWithPACKUS(N, DAG, SubVec); 29706 else if (InSVT == MVT::i32) 29707 return combineVectorTruncationWithPACKSS(N, DAG, SubVec); 29708 else 29709 return SDValue(); 29710 } 29711 29712 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, 29713 const X86Subtarget &Subtarget) { 29714 EVT VT = N->getValueType(0); 29715 SDValue Src = N->getOperand(0); 29716 SDLoc DL(N); 29717 29718 // Try to detect AVG pattern first. 29719 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) 29720 return Avg; 29721 29722 // The bitcast source is a direct mmx result. 29723 // Detect bitcasts between i32 to x86mmx 29724 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { 29725 SDValue BCSrc = Src.getOperand(0); 29726 if (BCSrc.getValueType() == MVT::x86mmx) 29727 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); 29728 } 29729 29730 return combineVectorTruncation(N, DAG, Subtarget); 29731 } 29732 29733 /// Do target-specific dag combines on floating point negations. 29734 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, 29735 const X86Subtarget &Subtarget) { 29736 EVT VT = N->getValueType(0); 29737 EVT SVT = VT.getScalarType(); 29738 SDValue Arg = N->getOperand(0); 29739 SDLoc DL(N); 29740 29741 // Let legalize expand this if it isn't a legal type yet. 29742 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 29743 return SDValue(); 29744 29745 // If we're negating a FMUL node on a target with FMA, then we can avoid the 29746 // use of a constant by performing (-0 - A*B) instead. 
29747 // FIXME: Check rounding control flags as well once they become available.
29748 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
29749 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
29750 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
29751 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29752 Arg.getOperand(1), Zero);
29753 }
29754
29755 // If we're negating an FMA node, then we can adjust the
29756 // instruction to include the extra negation.
29757 if (Arg.hasOneUse()) {
29758 switch (Arg.getOpcode()) {
29759 case X86ISD::FMADD:
29760 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29761 Arg.getOperand(1), Arg.getOperand(2));
29762 case X86ISD::FMSUB:
29763 return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
29764 Arg.getOperand(1), Arg.getOperand(2));
29765 case X86ISD::FNMADD:
29766 return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
29767 Arg.getOperand(1), Arg.getOperand(2));
29768 case X86ISD::FNMSUB:
29769 return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
29770 Arg.getOperand(1), Arg.getOperand(2));
29771 }
29772 }
29773 return SDValue();
29774 }
29775
29776 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
29777 const X86Subtarget &Subtarget) {
29778 EVT VT = N->getValueType(0);
29779 if (VT.is512BitVector() && !Subtarget.hasDQI()) {
29780 // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
29781 // These logic operations may be executed in the integer domain.
29782 SDLoc dl(N);
29783 MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
29784 MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
29785
29786 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
29787 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
29788 unsigned IntOpcode = 0;
29789 switch (N->getOpcode()) {
29790 default: llvm_unreachable("Unexpected FP logic op");
29791 case X86ISD::FOR: IntOpcode = ISD::OR; break;
29792 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
29793 case X86ISD::FAND: IntOpcode = ISD::AND; break;
29794 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
29795 }
29796 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
29797 return DAG.getBitcast(VT, IntOp);
29798 }
29799 return SDValue();
29800 }
29801 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
29802 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
29803 const X86Subtarget &Subtarget) {
29804 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
29805
29806 // F[X]OR(0.0, x) -> x
29807 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29808 if (C->getValueAPF().isPosZero())
29809 return N->getOperand(1);
29810
29811 // F[X]OR(x, 0.0) -> x
29812 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29813 if (C->getValueAPF().isPosZero())
29814 return N->getOperand(0);
29815
29816 return lowerX86FPLogicOp(N, DAG, Subtarget);
29817 }
29818
29819 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
29820 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
29821 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
29822
29823 // Only perform optimizations if UnsafeMath is used.
29824 if (!DAG.getTarget().Options.UnsafeFPMath)
29825 return SDValue();
29826
29827 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
29828 // into FMINC and FMAXC, which are commutative operations.
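// (FMIN/FMAX follow the SSE min/max semantics: for example FMIN(NaN, x)
// yields x while FMIN(x, NaN) yields NaN, so the operands can only be
// treated as interchangeable once NaNs may be ignored.)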
29829 unsigned NewOp = 0; 29830 switch (N->getOpcode()) { 29831 default: llvm_unreachable("unknown opcode"); 29832 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 29833 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 29834 } 29835 29836 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 29837 N->getOperand(0), N->getOperand(1)); 29838 } 29839 29840 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, 29841 const X86Subtarget &Subtarget) { 29842 if (Subtarget.useSoftFloat()) 29843 return SDValue(); 29844 29845 // TODO: Check for global or instruction-level "nnan". In that case, we 29846 // should be able to lower to FMAX/FMIN alone. 29847 // TODO: If an operand is already known to be a NaN or not a NaN, this 29848 // should be an optional swap and FMAX/FMIN. 29849 29850 EVT VT = N->getValueType(0); 29851 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || 29852 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || 29853 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) 29854 return SDValue(); 29855 29856 // This takes at least 3 instructions, so favor a library call when operating 29857 // on a scalar and minimizing code size. 29858 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) 29859 return SDValue(); 29860 29861 SDValue Op0 = N->getOperand(0); 29862 SDValue Op1 = N->getOperand(1); 29863 SDLoc DL(N); 29864 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( 29865 DAG.getDataLayout(), *DAG.getContext(), VT); 29866 29867 // There are 4 possibilities involving NaN inputs, and these are the required 29868 // outputs: 29869 // Op1 29870 // Num NaN 29871 // ---------------- 29872 // Num | Max | Op0 | 29873 // Op0 ---------------- 29874 // NaN | Op1 | NaN | 29875 // ---------------- 29876 // 29877 // The SSE FP max/min instructions were not designed for this case, but rather 29878 // to implement: 29879 // Min = Op1 < Op0 ? Op1 : Op0 29880 // Max = Op1 > Op0 ? Op1 : Op0 29881 // 29882 // So they always return Op0 if either input is a NaN. However, we can still 29883 // use those instructions for fmaxnum by selecting away a NaN input. 29884 29885 // If either operand is NaN, the 2nd source operand (Op0) is passed through. 29886 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; 29887 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); 29888 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); 29889 29890 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands 29891 // are NaN, the NaN value of Op1 is the result. 29892 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; 29893 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); 29894 } 29895 29896 /// Do target-specific dag combines on X86ISD::FAND nodes. 
29897 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, 29898 const X86Subtarget &Subtarget) { 29899 // FAND(0.0, x) -> 0.0 29900 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 29901 if (C->getValueAPF().isPosZero()) 29902 return N->getOperand(0); 29903 29904 // FAND(x, 0.0) -> 0.0 29905 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 29906 if (C->getValueAPF().isPosZero()) 29907 return N->getOperand(1); 29908 29909 return lowerX86FPLogicOp(N, DAG, Subtarget); 29910 } 29911 29912 /// Do target-specific dag combines on X86ISD::FANDN nodes 29913 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, 29914 const X86Subtarget &Subtarget) { 29915 // FANDN(0.0, x) -> x 29916 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 29917 if (C->getValueAPF().isPosZero()) 29918 return N->getOperand(1); 29919 29920 // FANDN(x, 0.0) -> 0.0 29921 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 29922 if (C->getValueAPF().isPosZero()) 29923 return N->getOperand(1); 29924 29925 return lowerX86FPLogicOp(N, DAG, Subtarget); 29926 } 29927 29928 static SDValue combineBT(SDNode *N, SelectionDAG &DAG, 29929 TargetLowering::DAGCombinerInfo &DCI) { 29930 // BT ignores high bits in the bit index operand. 29931 SDValue Op1 = N->getOperand(1); 29932 if (Op1.hasOneUse()) { 29933 unsigned BitWidth = Op1.getValueSizeInBits(); 29934 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 29935 APInt KnownZero, KnownOne; 29936 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 29937 !DCI.isBeforeLegalizeOps()); 29938 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 29939 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 29940 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 29941 DCI.CommitTargetLoweringOpt(TLO); 29942 } 29943 return SDValue(); 29944 } 29945 29946 static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) { 29947 SDValue Op = peekThroughBitcasts(N->getOperand(0)); 29948 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 29949 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 29950 VT.getVectorElementType().getSizeInBits() == 29951 OpVT.getVectorElementType().getSizeInBits()) { 29952 return DAG.getBitcast(VT, Op); 29953 } 29954 return SDValue(); 29955 } 29956 29957 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, 29958 const X86Subtarget &Subtarget) { 29959 EVT VT = N->getValueType(0); 29960 if (!VT.isVector()) 29961 return SDValue(); 29962 29963 SDValue N0 = N->getOperand(0); 29964 SDValue N1 = N->getOperand(1); 29965 EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); 29966 SDLoc dl(N); 29967 29968 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the 29969 // both SSE and AVX2 since there is no sign-extended shift right 29970 // operation on a vector with 64-bit elements. 29971 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> 29972 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) 29973 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || 29974 N0.getOpcode() == ISD::SIGN_EXTEND)) { 29975 SDValue N00 = N0.getOperand(0); 29976 29977 // EXTLOAD has a better solution on AVX2, 29978 // it may be replaced with X86ISD::VSEXT node. 
29979 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) 29980 if (!ISD::isNormalLoad(N00.getNode())) 29981 return SDValue(); 29982 29983 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { 29984 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 29985 N00, N1); 29986 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); 29987 } 29988 } 29989 return SDValue(); 29990 } 29991 29992 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) 29993 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities 29994 /// to combine math ops, use an LEA, or use a complex addressing mode. This can 29995 /// eliminate extend, add, and shift instructions. 29996 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, 29997 const X86Subtarget &Subtarget) { 29998 // TODO: This should be valid for other integer types. 29999 EVT VT = Sext->getValueType(0); 30000 if (VT != MVT::i64) 30001 return SDValue(); 30002 30003 // We need an 'add nsw' feeding into the 'sext'. 30004 SDValue Add = Sext->getOperand(0); 30005 if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) 30006 return SDValue(); 30007 30008 // Having a constant operand to the 'add' ensures that we are not increasing 30009 // the instruction count because the constant is extended for free below. 30010 // A constant operand can also become the displacement field of an LEA. 30011 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); 30012 if (!AddOp1) 30013 return SDValue(); 30014 30015 // Don't make the 'add' bigger if there's no hope of combining it with some 30016 // other 'add' or 'shl' instruction. 30017 // TODO: It may be profitable to generate simpler LEA instructions in place 30018 // of single 'add' instructions, but the cost model for selecting an LEA 30019 // currently has a high threshold. 30020 bool HasLEAPotential = false; 30021 for (auto *User : Sext->uses()) { 30022 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { 30023 HasLEAPotential = true; 30024 break; 30025 } 30026 } 30027 if (!HasLEAPotential) 30028 return SDValue(); 30029 30030 // Everything looks good, so pull the 'sext' ahead of the 'add'. 30031 int64_t AddConstant = AddOp1->getSExtValue(); 30032 SDValue AddOp0 = Add.getOperand(0); 30033 SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); 30034 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); 30035 30036 // The wider add is guaranteed to not wrap because both operands are 30037 // sign-extended. 30038 SDNodeFlags Flags; 30039 Flags.setNoSignedWrap(true); 30040 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); 30041 } 30042 30043 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> 30044 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) 30045 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly 30046 /// extends from AH (which we otherwise need to do contortions to access). 
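/// For instance (sketch): (i32 (sext (i8 (srem i8 %x, i8 %y)))) becomes the
/// second result of an (i8, i32) SDIVREM8_SEXT_HREG node, so the remainder is
/// produced from AH already sign-extended instead of being extracted and
/// extended by separate instructions afterwards.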
30047 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { 30048 SDValue N0 = N->getOperand(0); 30049 auto OpcodeN = N->getOpcode(); 30050 auto OpcodeN0 = N0.getOpcode(); 30051 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || 30052 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) 30053 return SDValue(); 30054 30055 EVT VT = N->getValueType(0); 30056 EVT InVT = N0.getValueType(); 30057 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) 30058 return SDValue(); 30059 30060 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); 30061 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG 30062 : X86ISD::UDIVREM8_ZEXT_HREG; 30063 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), 30064 N0.getOperand(1)); 30065 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); 30066 return R.getValue(1); 30067 } 30068 30069 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or 30070 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating 30071 /// with UNDEFs) of the input to vectors of the same size as the target type 30072 /// which then extends the lowest elements. 30073 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, 30074 TargetLowering::DAGCombinerInfo &DCI, 30075 const X86Subtarget &Subtarget) { 30076 unsigned Opcode = N->getOpcode(); 30077 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) 30078 return SDValue(); 30079 if (!DCI.isBeforeLegalizeOps()) 30080 return SDValue(); 30081 if (!Subtarget.hasSSE2()) 30082 return SDValue(); 30083 30084 SDValue N0 = N->getOperand(0); 30085 EVT VT = N->getValueType(0); 30086 EVT SVT = VT.getScalarType(); 30087 EVT InVT = N0.getValueType(); 30088 EVT InSVT = InVT.getScalarType(); 30089 30090 // Input type must be a vector and we must be extending legal integer types. 30091 if (!VT.isVector()) 30092 return SDValue(); 30093 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) 30094 return SDValue(); 30095 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) 30096 return SDValue(); 30097 30098 // On AVX2+ targets, if the input/output types are both legal then we will be 30099 // able to use SIGN_EXTEND/ZERO_EXTEND directly. 30100 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 30101 DAG.getTargetLoweringInfo().isTypeLegal(InVT)) 30102 return SDValue(); 30103 30104 SDLoc DL(N); 30105 30106 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { 30107 EVT InVT = N.getValueType(); 30108 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), 30109 Size / InVT.getScalarSizeInBits()); 30110 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), 30111 DAG.getUNDEF(InVT)); 30112 Opnds[0] = N; 30113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); 30114 }; 30115 30116 // If target-size is less than 128-bits, extend to a type that would extend 30117 // to 128 bits, extend that and extract the original target vector. 
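// Sketch of that case: for v4i16 = sign_extend v4i8, the v4i8 source is
// widened to v8i8 by concatenation with undef, sign-extended to v8i16, and
// the original v4i16 result is taken as the low subvector of that v8i16.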
30118 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { 30119 unsigned Scale = 128 / VT.getSizeInBits(); 30120 EVT ExVT = 30121 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); 30122 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); 30123 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); 30124 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, 30125 DAG.getIntPtrConstant(0, DL)); 30126 } 30127 30128 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to 30129 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. 30130 // Also use this if we don't have SSE41 to allow the legalizer do its job. 30131 if (!Subtarget.hasSSE41() || VT.is128BitVector() || 30132 (VT.is256BitVector() && Subtarget.hasInt256())) { 30133 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); 30134 return Opcode == ISD::SIGN_EXTEND 30135 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) 30136 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); 30137 } 30138 30139 // On pre-AVX2 targets, split into 128-bit nodes of 30140 // ISD::*_EXTEND_VECTOR_INREG. 30141 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { 30142 unsigned NumVecs = VT.getSizeInBits() / 128; 30143 unsigned NumSubElts = 128 / SVT.getSizeInBits(); 30144 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); 30145 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); 30146 30147 SmallVector<SDValue, 8> Opnds; 30148 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { 30149 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, 30150 DAG.getIntPtrConstant(Offset, DL)); 30151 SrcVec = ExtendVecSize(DL, SrcVec, 128); 30152 SrcVec = Opcode == ISD::SIGN_EXTEND 30153 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) 30154 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); 30155 Opnds.push_back(SrcVec); 30156 } 30157 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); 30158 } 30159 30160 return SDValue(); 30161 } 30162 30163 static SDValue combineSext(SDNode *N, SelectionDAG &DAG, 30164 TargetLowering::DAGCombinerInfo &DCI, 30165 const X86Subtarget &Subtarget) { 30166 SDValue N0 = N->getOperand(0); 30167 EVT VT = N->getValueType(0); 30168 EVT InVT = N0.getValueType(); 30169 SDLoc DL(N); 30170 30171 if (SDValue DivRem8 = getDivRem8(N, DAG)) 30172 return DivRem8; 30173 30174 if (!DCI.isBeforeLegalizeOps()) { 30175 if (InVT == MVT::i1) { 30176 SDValue Zero = DAG.getConstant(0, DL, VT); 30177 SDValue AllOnes = 30178 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); 30179 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); 30180 } 30181 return SDValue(); 30182 } 30183 30184 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) 30185 return V; 30186 30187 if (Subtarget.hasAVX() && VT.is256BitVector()) 30188 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) 30189 return R; 30190 30191 if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) 30192 return NewAdd; 30193 30194 return SDValue(); 30195 } 30196 30197 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, 30198 const X86Subtarget &Subtarget) { 30199 SDLoc dl(N); 30200 EVT VT = N->getValueType(0); 30201 30202 // Let legalize expand this if it isn't a legal type yet. 
30203 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 30204 return SDValue(); 30205 30206 EVT ScalarVT = VT.getScalarType(); 30207 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) 30208 return SDValue(); 30209 30210 SDValue A = N->getOperand(0); 30211 SDValue B = N->getOperand(1); 30212 SDValue C = N->getOperand(2); 30213 30214 bool NegA = (A.getOpcode() == ISD::FNEG); 30215 bool NegB = (B.getOpcode() == ISD::FNEG); 30216 bool NegC = (C.getOpcode() == ISD::FNEG); 30217 30218 // Negative multiplication when NegA xor NegB 30219 bool NegMul = (NegA != NegB); 30220 if (NegA) 30221 A = A.getOperand(0); 30222 if (NegB) 30223 B = B.getOperand(0); 30224 if (NegC) 30225 C = C.getOperand(0); 30226 30227 unsigned Opcode; 30228 if (!NegMul) 30229 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 30230 else 30231 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 30232 30233 return DAG.getNode(Opcode, dl, VT, A, B, C); 30234 } 30235 30236 static SDValue combineZext(SDNode *N, SelectionDAG &DAG, 30237 TargetLowering::DAGCombinerInfo &DCI, 30238 const X86Subtarget &Subtarget) { 30239 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 30240 // (and (i32 x86isd::setcc_carry), 1) 30241 // This eliminates the zext. This transformation is necessary because 30242 // ISD::SETCC is always legalized to i8. 30243 SDLoc dl(N); 30244 SDValue N0 = N->getOperand(0); 30245 EVT VT = N->getValueType(0); 30246 30247 if (N0.getOpcode() == ISD::AND && 30248 N0.hasOneUse() && 30249 N0.getOperand(0).hasOneUse()) { 30250 SDValue N00 = N0.getOperand(0); 30251 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 30252 if (!isOneConstant(N0.getOperand(1))) 30253 return SDValue(); 30254 return DAG.getNode(ISD::AND, dl, VT, 30255 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 30256 N00.getOperand(0), N00.getOperand(1)), 30257 DAG.getConstant(1, dl, VT)); 30258 } 30259 } 30260 30261 if (N0.getOpcode() == ISD::TRUNCATE && 30262 N0.hasOneUse() && 30263 N0.getOperand(0).hasOneUse()) { 30264 SDValue N00 = N0.getOperand(0); 30265 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 30266 return DAG.getNode(ISD::AND, dl, VT, 30267 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 30268 N00.getOperand(0), N00.getOperand(1)), 30269 DAG.getConstant(1, dl, VT)); 30270 } 30271 } 30272 30273 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) 30274 return V; 30275 30276 if (VT.is256BitVector()) 30277 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) 30278 return R; 30279 30280 if (SDValue DivRem8 = getDivRem8(N, DAG)) 30281 return DivRem8; 30282 30283 return SDValue(); 30284 } 30285 30286 /// Optimize x == -y --> x+y == 0 30287 /// x != -y --> x+y != 0 30288 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, 30289 const X86Subtarget &Subtarget) { 30290 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 30291 SDValue LHS = N->getOperand(0); 30292 SDValue RHS = N->getOperand(1); 30293 EVT VT = N->getValueType(0); 30294 SDLoc DL(N); 30295 30296 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 30297 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { 30298 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, 30299 LHS.getOperand(1)); 30300 return DAG.getSetCC(DL, N->getValueType(0), addV, 30301 DAG.getConstant(0, DL, addV.getValueType()), CC); 30302 } 30303 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 30304 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { 30305 SDValue addV = DAG.getNode(ISD::ADD, DL, 
RHS.getValueType(), LHS,
30306 RHS.getOperand(1));
30307 return DAG.getSetCC(DL, N->getValueType(0), addV,
30308 DAG.getConstant(0, DL, addV.getValueType()), CC);
30309 }
30310
30311 if (VT.getScalarType() == MVT::i1 &&
30312 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
30313 bool IsSEXT0 =
30314 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30315 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30316 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30317
30318 if (!IsSEXT0 || !IsVZero1) {
30319 // Swap the operands and update the condition code.
30320 std::swap(LHS, RHS);
30321 CC = ISD::getSetCCSwappedOperands(CC);
30322
30323 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30324 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30325 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30326 }
30327
30328 if (IsSEXT0 && IsVZero1) {
30329 assert(VT == LHS.getOperand(0).getValueType() &&
30330 "Unexpected operand type");
30331 if (CC == ISD::SETGT)
30332 return DAG.getConstant(0, DL, VT);
30333 if (CC == ISD::SETLE)
30334 return DAG.getConstant(1, DL, VT);
30335 if (CC == ISD::SETEQ || CC == ISD::SETGE)
30336 return DAG.getNOT(DL, LHS.getOperand(0), VT);
30337
30338 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
30339 "Unexpected condition code!");
30340 return LHS.getOperand(0);
30341 }
30342 }
30343
30344 // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
30345 // via legalization because v4i32 is not a legal type.
30346 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
30347 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
30348
30349 return SDValue();
30350 }
30351
30352 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
30353 SDLoc DL(N);
30354 // Gather and Scatter instructions use k-registers for masks. The type of
30355 // the masks is v*i1. So the mask will be truncated anyway.
30356 // The SIGN_EXTEND_INREG may be dropped.
30357 SDValue Mask = N->getOperand(2);
30358 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
30359 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
30360 NewOps[2] = Mask.getOperand(0);
30361 DAG.UpdateNodeOperands(N, NewOps);
30362 }
30363 return SDValue();
30364 }
30365
30366 // Helper function of combineX86SetCC. It materializes "setb reg"
30367 // as "sbb reg,reg", since it can be extended without zext and produces
30368 // an all-ones bit which is more useful than 0/1 in some cases.
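// For example, "sbb %eax, %eax" yields all-ones (-1) when CF is set and 0
// otherwise, so ANDing with 1 recovers the usual 0/1 setb value while the
// full-width -1/0 form is already available without a zero extension.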
30369 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS, 30370 SelectionDAG &DAG, MVT VT) { 30371 if (VT == MVT::i8) 30372 return DAG.getNode(ISD::AND, DL, VT, 30373 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 30374 DAG.getConstant(X86::COND_B, DL, MVT::i8), 30375 EFLAGS), 30376 DAG.getConstant(1, DL, VT)); 30377 assert (VT == MVT::i1 && "Unexpected type for SECCC node"); 30378 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, 30379 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 30380 DAG.getConstant(X86::COND_B, DL, MVT::i8), 30381 EFLAGS)); 30382 } 30383 30384 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 30385 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, 30386 TargetLowering::DAGCombinerInfo &DCI, 30387 const X86Subtarget &Subtarget) { 30388 SDLoc DL(N); 30389 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 30390 SDValue EFLAGS = N->getOperand(1); 30391 30392 if (CC == X86::COND_A) { 30393 // Try to convert COND_A into COND_B in an attempt to facilitate 30394 // materializing "setb reg". 30395 // 30396 // Do not flip "e > c", where "c" is a constant, because Cmp instruction 30397 // cannot take an immediate as its first operand. 30398 // 30399 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && 30400 EFLAGS.getValueType().isInteger() && 30401 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { 30402 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), 30403 EFLAGS.getNode()->getVTList(), 30404 EFLAGS.getOperand(1), EFLAGS.getOperand(0)); 30405 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); 30406 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); 30407 } 30408 } 30409 30410 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 30411 // a zext and produces an all-ones bit which is more useful than 0/1 in some 30412 // cases. 30413 if (CC == X86::COND_B) 30414 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); 30415 30416 // Try to simplify the EFLAGS and condition code operands. 30417 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { 30418 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); 30419 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 30420 } 30421 30422 return SDValue(); 30423 } 30424 30425 /// Optimize branch condition evaluation. 30426 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, 30427 TargetLowering::DAGCombinerInfo &DCI, 30428 const X86Subtarget &Subtarget) { 30429 SDLoc DL(N); 30430 SDValue EFLAGS = N->getOperand(3); 30431 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 30432 30433 // Try to simplify the EFLAGS and condition code operands. 30434 // Make sure to not keep references to operands, as combineSetCCEFLAGS can 30435 // RAUW them under us. 30436 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { 30437 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); 30438 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), 30439 N->getOperand(1), Cond, Flags); 30440 } 30441 30442 return SDValue(); 30443 } 30444 30445 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, 30446 SelectionDAG &DAG) { 30447 // Take advantage of vector comparisons producing 0 or -1 in each lane to 30448 // optimize away operation when it's from a constant. 
30449 // 30450 // The general transformation is: 30451 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 30452 // AND(VECTOR_CMP(x,y), constant2) 30453 // constant2 = UNARYOP(constant) 30454 30455 // Early exit if this isn't a vector operation, the operand of the 30456 // unary operation isn't a bitwise AND, or if the sizes of the operations 30457 // aren't the same. 30458 EVT VT = N->getValueType(0); 30459 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 30460 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 30461 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 30462 return SDValue(); 30463 30464 // Now check that the other operand of the AND is a constant. We could 30465 // make the transformation for non-constant splats as well, but it's unclear 30466 // that would be a benefit as it would not eliminate any operations, just 30467 // perform one more step in scalar code before moving to the vector unit. 30468 if (BuildVectorSDNode *BV = 30469 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 30470 // Bail out if the vector isn't a constant. 30471 if (!BV->isConstant()) 30472 return SDValue(); 30473 30474 // Everything checks out. Build up the new and improved node. 30475 SDLoc DL(N); 30476 EVT IntVT = BV->getValueType(0); 30477 // Create a new constant of the appropriate type for the transformed 30478 // DAG. 30479 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 30480 // The AND node needs bitcasts to/from an integer vector type around it. 30481 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); 30482 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 30483 N->getOperand(0)->getOperand(0), MaskConst); 30484 SDValue Res = DAG.getBitcast(VT, NewAnd); 30485 return Res; 30486 } 30487 30488 return SDValue(); 30489 } 30490 30491 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, 30492 const X86Subtarget &Subtarget) { 30493 SDValue Op0 = N->getOperand(0); 30494 EVT VT = N->getValueType(0); 30495 EVT InVT = Op0.getValueType(); 30496 EVT InSVT = InVT.getScalarType(); 30497 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 30498 30499 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) 30500 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) 30501 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { 30502 SDLoc dl(N); 30503 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 30504 InVT.getVectorNumElements()); 30505 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); 30506 30507 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) 30508 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); 30509 30510 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); 30511 } 30512 30513 return SDValue(); 30514 } 30515 30516 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, 30517 const X86Subtarget &Subtarget) { 30518 // First try to optimize away the conversion entirely when it's 30519 // conditionally from a constant. Vectors only. 30520 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) 30521 return Res; 30522 30523 // Now move on to more general possibilities. 
30524 SDValue Op0 = N->getOperand(0); 30525 EVT VT = N->getValueType(0); 30526 EVT InVT = Op0.getValueType(); 30527 EVT InSVT = InVT.getScalarType(); 30528 30529 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) 30530 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) 30531 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { 30532 SDLoc dl(N); 30533 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 30534 InVT.getVectorNumElements()); 30535 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 30536 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); 30537 } 30538 30539 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 30540 // a 32-bit target where SSE doesn't support i64->FP operations. 30541 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { 30542 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 30543 EVT LdVT = Ld->getValueType(0); 30544 30545 // This transformation is not supported if the result type is f16 or f128. 30546 if (VT == MVT::f16 || VT == MVT::f128) 30547 return SDValue(); 30548 30549 if (!Ld->isVolatile() && !VT.isVector() && 30550 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 30551 !Subtarget.is64Bit() && LdVT == MVT::i64) { 30552 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( 30553 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); 30554 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 30555 return FILDChain; 30556 } 30557 } 30558 return SDValue(); 30559 } 30560 30561 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 30562 static SDValue combineADC(SDNode *N, SelectionDAG &DAG, 30563 X86TargetLowering::DAGCombinerInfo &DCI) { 30564 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 30565 // the result is either zero or one (depending on the input carry bit). 30566 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 30567 if (X86::isZeroNode(N->getOperand(0)) && 30568 X86::isZeroNode(N->getOperand(1)) && 30569 // We don't have a good way to replace an EFLAGS use, so only do this when 30570 // dead right now. 30571 SDValue(N, 1).use_empty()) { 30572 SDLoc DL(N); 30573 EVT VT = N->getValueType(0); 30574 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); 30575 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 30576 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 30577 DAG.getConstant(X86::COND_B, DL, 30578 MVT::i8), 30579 N->getOperand(2)), 30580 DAG.getConstant(1, DL, VT)); 30581 return DCI.CombineTo(N, Res1, CarryOut); 30582 } 30583 30584 return SDValue(); 30585 } 30586 30587 /// fold (add Y, (sete X, 0)) -> adc 0, Y 30588 /// (add Y, (setne X, 0)) -> sbb -1, Y 30589 /// (sub (sete X, 0), Y) -> sbb 0, Y 30590 /// (sub (setne X, 0), Y) -> adc -1, Y 30591 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 30592 SDLoc DL(N); 30593 30594 // Look through ZExts. 30595 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); 30596 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 30597 return SDValue(); 30598 30599 SDValue SetCC = Ext.getOperand(0); 30600 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 30601 return SDValue(); 30602 30603 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 30604 if (CC != X86::COND_E && CC != X86::COND_NE) 30605 return SDValue(); 30606 30607 SDValue Cmp = SetCC.getOperand(1); 30608 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 30609 !X86::isZeroNode(Cmp.getOperand(1)) || 30610 !Cmp.getOperand(0).getValueType().isInteger()) 30611 return SDValue(); 30612 30613 SDValue CmpOp0 = Cmp.getOperand(0); 30614 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 30615 DAG.getConstant(1, DL, CmpOp0.getValueType())); 30616 30617 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 30618 if (CC == X86::COND_NE) 30619 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 30620 DL, OtherVal.getValueType(), OtherVal, 30621 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), 30622 NewCmp); 30623 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 30624 DL, OtherVal.getValueType(), OtherVal, 30625 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); 30626 } 30627 30628 static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, 30629 const X86Subtarget &Subtarget) { 30630 SDLoc DL(N); 30631 EVT VT = N->getValueType(0); 30632 SDValue Op0 = N->getOperand(0); 30633 SDValue Op1 = N->getOperand(1); 30634 30635 if (!VT.isVector() || !VT.isSimple() || 30636 !(VT.getVectorElementType() == MVT::i32)) 30637 return SDValue(); 30638 30639 unsigned RegSize = 128; 30640 if (Subtarget.hasBWI()) 30641 RegSize = 512; 30642 else if (Subtarget.hasAVX2()) 30643 RegSize = 256; 30644 30645 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. 30646 if (VT.getSizeInBits() / 4 > RegSize) 30647 return SDValue(); 30648 30649 // Detect the following pattern: 30650 // 30651 // 1: %2 = zext <N x i8> %0 to <N x i32> 30652 // 2: %3 = zext <N x i8> %1 to <N x i32> 30653 // 3: %4 = sub nsw <N x i32> %2, %3 30654 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] 30655 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4 30656 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 30657 // 7: %8 = add nsw <N x i32> %7, %vec.phi 30658 // 30659 // The last instruction must be a reduction add. The instructions 3-6 forms an 30660 // ABSDIFF pattern. 30661 30662 // The two operands of reduction add are from PHI and a select-op as in line 7 30663 // above. 30664 SDValue SelectOp, Phi; 30665 if (Op0.getOpcode() == ISD::VSELECT) { 30666 SelectOp = Op0; 30667 Phi = Op1; 30668 } else if (Op1.getOpcode() == ISD::VSELECT) { 30669 SelectOp = Op1; 30670 Phi = Op0; 30671 } else 30672 return SDValue(); 30673 30674 // Check the condition of the select instruction is greater-than. 30675 SDValue SetCC = SelectOp->getOperand(0); 30676 if (SetCC.getOpcode() != ISD::SETCC) 30677 return SDValue(); 30678 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); 30679 if (CC != ISD::SETGT) 30680 return SDValue(); 30681 30682 Op0 = SelectOp->getOperand(1); 30683 Op1 = SelectOp->getOperand(2); 30684 30685 // The second operand of SelectOp Op1 is the negation of the first operand 30686 // Op0, which is implemented as 0 - Op0. 
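// In DAG form the expected shape is roughly:
//   Op0: (sub (zext <N x i8> X to <N x i32>), (zext <N x i8> Y to <N x i32>))
//   Op1: (sub (build_vector 0, 0, ...), Op0)
// which is what the checks below verify step by step.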
30687 if (!(Op1.getOpcode() == ISD::SUB && 30688 ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) && 30689 Op1.getOperand(1) == Op0)) 30690 return SDValue(); 30691 30692 // The first operand of SetCC is the first operand of SelectOp, which is the 30693 // difference between two input vectors. 30694 if (SetCC.getOperand(0) != Op0) 30695 return SDValue(); 30696 30697 // The second operand of the > comparison can be either -1 or 0. 30698 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || 30699 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) 30700 return SDValue(); 30701 30702 // The first operand of SelectOp is the difference between two input vectors. 30703 if (Op0.getOpcode() != ISD::SUB) 30704 return SDValue(); 30705 30706 Op1 = Op0.getOperand(1); 30707 Op0 = Op0.getOperand(0); 30708 30709 // Check if the operands of the diff are zero-extended from vectors of i8. 30710 if (Op0.getOpcode() != ISD::ZERO_EXTEND || 30711 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || 30712 Op1.getOpcode() != ISD::ZERO_EXTEND || 30713 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) 30714 return SDValue(); 30715 30716 // SAD pattern detected. Now build a SAD instruction and an addition for 30717 // reduction. Note that the number of elements of the result of SAD is less 30718 // than the number of elements of its input. Therefore, we can only update 30719 // part of the elements in the reduction vector. 30720 30721 // Legalize the type of the inputs of PSADBW. 30722 EVT InVT = Op0.getOperand(0).getValueType(); 30723 if (InVT.getSizeInBits() <= 128) 30724 RegSize = 128; 30725 else if (InVT.getSizeInBits() <= 256) 30726 RegSize = 256; 30727 30728 unsigned NumConcat = RegSize / InVT.getSizeInBits(); 30729 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT)); 30730 Ops[0] = Op0.getOperand(0); 30731 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); 30732 Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); 30733 Ops[0] = Op1.getOperand(0); 30734 Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); 30735 30736 // The output of PSADBW is a vector of i64. 30737 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); 30738 SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1); 30739 30740 // We need to turn the vector of i64 into a vector of i32. 30741 // If the reduction vector is at least as wide as the psadbw result, just 30742 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero 30743 // anyway. 30744 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); 30745 if (VT.getSizeInBits() >= ResVT.getSizeInBits()) 30746 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); 30747 else 30748 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); 30749 30750 if (VT.getSizeInBits() > ResVT.getSizeInBits()) { 30751 // Update part of the elements of the reduction vector. This is done by first 30752 // extracting a sub-vector from it, updating this sub-vector, and inserting 30753 // it back.
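// For example, under SSE2 (RegSize == 128) with VT == v16i32, ResVT is v4i32:
// the low v4i32 lanes of the accumulator Phi are extracted, the PSADBW result
// is added to them, and the sum is inserted back at element index 0.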
30754 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi, 30755 DAG.getIntPtrConstant(0, DL)); 30756 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); 30757 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, 30758 DAG.getIntPtrConstant(0, DL)); 30759 } else 30760 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); 30761 } 30762 30763 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, 30764 const X86Subtarget &Subtarget) { 30765 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; 30766 if (Flags->hasVectorReduction()) { 30767 if (SDValue Sad = detectSADPattern(N, DAG, Subtarget)) 30768 return Sad; 30769 } 30770 EVT VT = N->getValueType(0); 30771 SDValue Op0 = N->getOperand(0); 30772 SDValue Op1 = N->getOperand(1); 30773 30774 // Try to synthesize horizontal adds from adds of shuffles. 30775 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 30776 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 30777 isHorizontalBinOp(Op0, Op1, true)) 30778 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); 30779 30780 return OptimizeConditionalInDecrement(N, DAG); 30781 } 30782 30783 static SDValue combineSub(SDNode *N, SelectionDAG &DAG, 30784 const X86Subtarget &Subtarget) { 30785 SDValue Op0 = N->getOperand(0); 30786 SDValue Op1 = N->getOperand(1); 30787 30788 // X86 can't encode an immediate LHS of a sub. See if we can push the 30789 // negation into a preceding instruction. 30790 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 30791 // If the RHS of the sub is a XOR with one use and a constant, invert the 30792 // immediate. Then add one to the LHS of the sub so we can turn 30793 // X-Y -> X+~Y+1, saving one register. 30794 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 30795 isa<ConstantSDNode>(Op1.getOperand(1))) { 30796 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); 30797 EVT VT = Op0.getValueType(); 30798 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, 30799 Op1.getOperand(0), 30800 DAG.getConstant(~XorC, SDLoc(Op1), VT)); 30801 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, 30802 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); 30803 } 30804 } 30805 30806 // Try to synthesize horizontal adds from adds of shuffles. 30807 EVT VT = N->getValueType(0); 30808 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 30809 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 30810 isHorizontalBinOp(Op0, Op1, true)) 30811 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); 30812 30813 return OptimizeConditionalInDecrement(N, DAG); 30814 } 30815 30816 static SDValue combineVZext(SDNode *N, SelectionDAG &DAG, 30817 TargetLowering::DAGCombinerInfo &DCI, 30818 const X86Subtarget &Subtarget) { 30819 SDLoc DL(N); 30820 MVT VT = N->getSimpleValueType(0); 30821 MVT SVT = VT.getVectorElementType(); 30822 SDValue Op = N->getOperand(0); 30823 MVT OpVT = Op.getSimpleValueType(); 30824 MVT OpEltVT = OpVT.getVectorElementType(); 30825 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); 30826 30827 // Perform any constant folding. 
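// For example (a sketch): (v4i32 vzext (v16i8 build_vector C0, C1, ..., C15))
// folds to (v4i32 build_vector zext(C0), zext(C1), zext(C2), zext(C3)); only
// the first VT.getVectorNumElements() input elements are consumed.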
30828 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { 30829 SmallVector<SDValue, 4> Vals; 30830 for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { 30831 SDValue OpElt = Op.getOperand(i); 30832 if (OpElt.getOpcode() == ISD::UNDEF) { 30833 Vals.push_back(DAG.getUNDEF(SVT)); 30834 continue; 30835 } 30836 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue(); 30837 assert(Cst.getBitWidth() == OpEltVT.getSizeInBits()); 30838 Cst = Cst.zextOrTrunc(SVT.getSizeInBits()); 30839 Vals.push_back(DAG.getConstant(Cst, DL, SVT)); 30840 } 30841 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals); 30842 } 30843 30844 // (vzext (bitcast (vzext (x)) -> (vzext x) 30845 SDValue V = peekThroughBitcasts(Op); 30846 if (V != Op && V.getOpcode() == X86ISD::VZEXT) { 30847 MVT InnerVT = V.getSimpleValueType(); 30848 MVT InnerEltVT = InnerVT.getVectorElementType(); 30849 30850 // If the element sizes match exactly, we can just do one larger vzext. This 30851 // is always an exact type match as vzext operates on integer types. 30852 if (OpEltVT == InnerEltVT) { 30853 assert(OpVT == InnerVT && "Types must match for vzext!"); 30854 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); 30855 } 30856 30857 // The only other way we can combine them is if only a single element of the 30858 // inner vzext is used in the input to the outer vzext. 30859 if (InnerEltVT.getSizeInBits() < InputBits) 30860 return SDValue(); 30861 30862 // In this case, the inner vzext is completely dead because we're going to 30863 // only look at bits inside of the low element. Just do the outer vzext on 30864 // a bitcast of the input to the inner. 30865 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); 30866 } 30867 30868 // Check if we can bypass extracting and re-inserting an element of an input 30869 // vector. Essentially: 30870 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) 30871 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && 30872 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 30873 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { 30874 SDValue ExtractedV = V.getOperand(0); 30875 SDValue OrigV = ExtractedV.getOperand(0); 30876 if (isNullConstant(ExtractedV.getOperand(1))) { 30877 MVT OrigVT = OrigV.getSimpleValueType(); 30878 // Extract a subvector if necessary... 30879 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { 30880 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); 30881 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), 30882 OrigVT.getVectorNumElements() / Ratio); 30883 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, 30884 DAG.getIntPtrConstant(0, DL)); 30885 } 30886 Op = DAG.getBitcast(OpVT, OrigV); 30887 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); 30888 } 30889 } 30890 30891 return SDValue(); 30892 } 30893 30894 /// Canonicalize (LSUB p, 1) -> (LADD p, -1). 
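/// Subtracting 1 is the same as adding -1, so the two forms are
/// interchangeable; the rewrite below is only performed when the subtrahend
/// is the constant 1.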
30895 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG, 30896 const X86Subtarget &Subtarget) { 30897 SDValue Chain = N->getOperand(0); 30898 SDValue LHS = N->getOperand(1); 30899 SDValue RHS = N->getOperand(2); 30900 MVT VT = RHS.getSimpleValueType(); 30901 SDLoc DL(N); 30902 30903 auto *C = dyn_cast<ConstantSDNode>(RHS); 30904 if (!C || C->getZExtValue() != 1) 30905 return SDValue(); 30906 30907 RHS = DAG.getConstant(-1, DL, VT); 30908 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); 30909 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL, 30910 DAG.getVTList(MVT::i32, MVT::Other), 30911 {Chain, LHS, RHS}, VT, MMO); 30912 } 30913 30914 // TEST (AND a, b) ,(AND a, b) -> TEST a, b 30915 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) { 30916 SDValue Op0 = N->getOperand(0); 30917 SDValue Op1 = N->getOperand(1); 30918 30919 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND) 30920 return SDValue(); 30921 30922 EVT VT = N->getValueType(0); 30923 SDLoc DL(N); 30924 30925 return DAG.getNode(X86ISD::TESTM, DL, VT, 30926 Op0->getOperand(0), Op0->getOperand(1)); 30927 } 30928 30929 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, 30930 const X86Subtarget &Subtarget) { 30931 MVT VT = N->getSimpleValueType(0); 30932 SDLoc DL(N); 30933 30934 if (N->getOperand(0) == N->getOperand(1)) { 30935 if (N->getOpcode() == X86ISD::PCMPEQ) 30936 return getOnesVector(VT, Subtarget, DAG, DL); 30937 if (N->getOpcode() == X86ISD::PCMPGT) 30938 return getZeroVector(VT, Subtarget, DAG, DL); 30939 } 30940 30941 return SDValue(); 30942 } 30943 30944 30945 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 30946 DAGCombinerInfo &DCI) const { 30947 SelectionDAG &DAG = DCI.DAG; 30948 switch (N->getOpcode()) { 30949 default: break; 30950 case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI); 30951 case ISD::VSELECT: 30952 case ISD::SELECT: 30953 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); 30954 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); 30955 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); 30956 case ISD::ADD: return combineAdd(N, DAG, Subtarget); 30957 case ISD::SUB: return combineSub(N, DAG, Subtarget); 30958 case X86ISD::ADC: return combineADC(N, DAG, DCI); 30959 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); 30960 case ISD::SHL: 30961 case ISD::SRA: 30962 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); 30963 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); 30964 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); 30965 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); 30966 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); 30967 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); 30968 case ISD::STORE: return combineStore(N, DAG, Subtarget); 30969 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget); 30970 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); 30971 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); 30972 case ISD::FADD: 30973 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); 30974 case ISD::FNEG: return combineFneg(N, DAG, Subtarget); 30975 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); 30976 case X86ISD::FXOR: 30977 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); 30978 case X86ISD::FMIN: 30979 case X86ISD::FMAX: return combineFMinFMax(N, DAG); 30980 case ISD::FMINNUM: 30981 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); 
30982 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); 30983 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); 30984 case X86ISD::BT: return combineBT(N, DAG, DCI); 30985 case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG); 30986 case ISD::ANY_EXTEND: 30987 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); 30988 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); 30989 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); 30990 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); 30991 case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); 30992 case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); 30993 case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget); 30994 case X86ISD::SHUFP: // Handle all target specific shuffles 30995 case X86ISD::INSERTPS: 30996 case X86ISD::PALIGNR: 30997 case X86ISD::VSHLDQ: 30998 case X86ISD::VSRLDQ: 30999 case X86ISD::BLENDI: 31000 case X86ISD::UNPCKH: 31001 case X86ISD::UNPCKL: 31002 case X86ISD::MOVHLPS: 31003 case X86ISD::MOVLHPS: 31004 case X86ISD::PSHUFB: 31005 case X86ISD::PSHUFD: 31006 case X86ISD::PSHUFHW: 31007 case X86ISD::PSHUFLW: 31008 case X86ISD::MOVSHDUP: 31009 case X86ISD::MOVSLDUP: 31010 case X86ISD::MOVDDUP: 31011 case X86ISD::MOVSS: 31012 case X86ISD::MOVSD: 31013 case X86ISD::VPPERM: 31014 case X86ISD::VPERMV: 31015 case X86ISD::VPERMV3: 31016 case X86ISD::VPERMIL2: 31017 case X86ISD::VPERMILPI: 31018 case X86ISD::VPERMILPV: 31019 case X86ISD::VPERM2X128: 31020 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); 31021 case ISD::FMA: return combineFMA(N, DAG, Subtarget); 31022 case ISD::MGATHER: 31023 case ISD::MSCATTER: return combineGatherScatter(N, DAG); 31024 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget); 31025 case X86ISD::TESTM: return combineTestM(N, DAG); 31026 case X86ISD::PCMPEQ: 31027 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); 31028 } 31029 31030 return SDValue(); 31031 } 31032 31033 /// Return true if the target has native support for the specified value type 31034 /// and it is 'desirable' to use the type for the given node type. e.g. On x86 31035 /// i16 is legal, but undesirable since i16 instruction encodings are longer and 31036 /// some i16 instructions are slow. 31037 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 31038 if (!isTypeLegal(VT)) 31039 return false; 31040 if (VT != MVT::i16) 31041 return true; 31042 31043 switch (Opc) { 31044 default: 31045 return true; 31046 case ISD::LOAD: 31047 case ISD::SIGN_EXTEND: 31048 case ISD::ZERO_EXTEND: 31049 case ISD::ANY_EXTEND: 31050 case ISD::SHL: 31051 case ISD::SRL: 31052 case ISD::SUB: 31053 case ISD::ADD: 31054 case ISD::MUL: 31055 case ISD::AND: 31056 case ISD::OR: 31057 case ISD::XOR: 31058 return false; 31059 } 31060 } 31061 31062 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We 31063 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if 31064 /// we don't adjust the stack we clobber the first frame index. 31065 /// See X86InstrInfo::copyPhysReg. 
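/// The implementation simply scans every instruction that references EFLAGS
/// in the function and reports whether any of them is a COPY.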
31066 bool X86TargetLowering::hasCopyImplyingStackAdjustment( 31067 MachineFunction *MF) const { 31068 const MachineRegisterInfo &MRI = MF->getRegInfo(); 31069 31070 return any_of(MRI.reg_instructions(X86::EFLAGS), 31071 [](const MachineInstr &RI) { return RI.isCopy(); }); 31072 } 31073 31074 /// This method query the target whether it is beneficial for dag combiner to 31075 /// promote the specified node. If true, it should return the desired promotion 31076 /// type by reference. 31077 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 31078 EVT VT = Op.getValueType(); 31079 if (VT != MVT::i16) 31080 return false; 31081 31082 bool Promote = false; 31083 bool Commute = false; 31084 switch (Op.getOpcode()) { 31085 default: break; 31086 case ISD::SIGN_EXTEND: 31087 case ISD::ZERO_EXTEND: 31088 case ISD::ANY_EXTEND: 31089 Promote = true; 31090 break; 31091 case ISD::SHL: 31092 case ISD::SRL: { 31093 SDValue N0 = Op.getOperand(0); 31094 // Look out for (store (shl (load), x)). 31095 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 31096 return false; 31097 Promote = true; 31098 break; 31099 } 31100 case ISD::ADD: 31101 case ISD::MUL: 31102 case ISD::AND: 31103 case ISD::OR: 31104 case ISD::XOR: 31105 Commute = true; 31106 // fallthrough 31107 case ISD::SUB: { 31108 SDValue N0 = Op.getOperand(0); 31109 SDValue N1 = Op.getOperand(1); 31110 if (!Commute && MayFoldLoad(N1)) 31111 return false; 31112 // Avoid disabling potential load folding opportunities. 31113 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 31114 return false; 31115 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 31116 return false; 31117 Promote = true; 31118 } 31119 } 31120 31121 PVT = MVT::i32; 31122 return Promote; 31123 } 31124 31125 //===----------------------------------------------------------------------===// 31126 // X86 Inline Assembly Support 31127 //===----------------------------------------------------------------------===// 31128 31129 // Helper to match a string separated by whitespace. 31130 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { 31131 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. 31132 31133 for (StringRef Piece : Pieces) { 31134 if (!S.startswith(Piece)) // Check if the piece matches. 31135 return false; 31136 31137 S = S.substr(Piece.size()); 31138 StringRef::size_type Pos = S.find_first_not_of(" \t"); 31139 if (Pos == 0) // We matched a prefix. 
31140 return false; 31141 31142 S = S.substr(Pos); 31143 } 31144 31145 return S.empty(); 31146 } 31147 31148 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { 31149 31150 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { 31151 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && 31152 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && 31153 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { 31154 31155 if (AsmPieces.size() == 3) 31156 return true; 31157 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) 31158 return true; 31159 } 31160 } 31161 return false; 31162 } 31163 31164 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 31165 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 31166 31167 const std::string &AsmStr = IA->getAsmString(); 31168 31169 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 31170 if (!Ty || Ty->getBitWidth() % 16 != 0) 31171 return false; 31172 31173 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 31174 SmallVector<StringRef, 4> AsmPieces; 31175 SplitString(AsmStr, AsmPieces, ";\n"); 31176 31177 switch (AsmPieces.size()) { 31178 default: return false; 31179 case 1: 31180 // FIXME: this should verify that we are targeting a 486 or better. If not, 31181 // we will turn this bswap into something that will be lowered to logical 31182 // ops instead of emitting the bswap asm. For now, we don't support 486 or 31183 // lower so don't worry about this. 31184 // bswap $0 31185 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || 31186 matchAsm(AsmPieces[0], {"bswapl", "$0"}) || 31187 matchAsm(AsmPieces[0], {"bswapq", "$0"}) || 31188 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || 31189 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || 31190 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { 31191 // No need to check constraints, nothing other than the equivalent of 31192 // "=r,0" would be valid here. 
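// For example (a sketch of the source form): asm("bswap $0" : "=r"(x) : "0"(x))
// on an i32 value is replaced with a call to llvm.bswap.i32, which the backend
// can then select to a single BSWAP instruction (on 486 and later, per the
// FIXME above).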
31193 return IntrinsicLowering::LowerToByteSwap(CI); 31194 } 31195 31196 // rorw $$8, ${0:w} --> llvm.bswap.i16 31197 if (CI->getType()->isIntegerTy(16) && 31198 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 31199 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || 31200 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { 31201 AsmPieces.clear(); 31202 StringRef ConstraintsStr = IA->getConstraintString(); 31203 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 31204 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 31205 if (clobbersFlagRegisters(AsmPieces)) 31206 return IntrinsicLowering::LowerToByteSwap(CI); 31207 } 31208 break; 31209 case 3: 31210 if (CI->getType()->isIntegerTy(32) && 31211 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 31212 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && 31213 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && 31214 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { 31215 AsmPieces.clear(); 31216 StringRef ConstraintsStr = IA->getConstraintString(); 31217 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 31218 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 31219 if (clobbersFlagRegisters(AsmPieces)) 31220 return IntrinsicLowering::LowerToByteSwap(CI); 31221 } 31222 31223 if (CI->getType()->isIntegerTy(64)) { 31224 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 31225 if (Constraints.size() >= 2 && 31226 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 31227 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 31228 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 31229 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && 31230 matchAsm(AsmPieces[1], {"bswap", "%edx"}) && 31231 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) 31232 return IntrinsicLowering::LowerToByteSwap(CI); 31233 } 31234 } 31235 break; 31236 } 31237 return false; 31238 } 31239 31240 /// Given a constraint letter, return the type of constraint for this target. 31241 X86TargetLowering::ConstraintType 31242 X86TargetLowering::getConstraintType(StringRef Constraint) const { 31243 if (Constraint.size() == 1) { 31244 switch (Constraint[0]) { 31245 case 'R': 31246 case 'q': 31247 case 'Q': 31248 case 'f': 31249 case 't': 31250 case 'u': 31251 case 'y': 31252 case 'x': 31253 case 'Y': 31254 case 'l': 31255 return C_RegisterClass; 31256 case 'a': 31257 case 'b': 31258 case 'c': 31259 case 'd': 31260 case 'S': 31261 case 'D': 31262 case 'A': 31263 return C_Register; 31264 case 'I': 31265 case 'J': 31266 case 'K': 31267 case 'L': 31268 case 'M': 31269 case 'N': 31270 case 'G': 31271 case 'C': 31272 case 'e': 31273 case 'Z': 31274 return C_Other; 31275 default: 31276 break; 31277 } 31278 } 31279 return TargetLowering::getConstraintType(Constraint); 31280 } 31281 31282 /// Examine constraint type and operand type and determine a weight value. 31283 /// This object must already have been set up with the operand type 31284 /// and the current alternative constraint selected. 31285 TargetLowering::ConstraintWeight 31286 X86TargetLowering::getSingleConstraintMatchWeight( 31287 AsmOperandInfo &info, const char *constraint) const { 31288 ConstraintWeight weight = CW_Invalid; 31289 Value *CallOperandVal = info.CallOperandVal; 31290 // If we don't have a value, we can't do a match, 31291 // but allow it at the lowest weight. 
31292 if (!CallOperandVal) 31293 return CW_Default; 31294 Type *type = CallOperandVal->getType(); 31295 // Look at the constraint type. 31296 switch (*constraint) { 31297 default: 31298 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 31299 case 'R': 31300 case 'q': 31301 case 'Q': 31302 case 'a': 31303 case 'b': 31304 case 'c': 31305 case 'd': 31306 case 'S': 31307 case 'D': 31308 case 'A': 31309 if (CallOperandVal->getType()->isIntegerTy()) 31310 weight = CW_SpecificReg; 31311 break; 31312 case 'f': 31313 case 't': 31314 case 'u': 31315 if (type->isFloatingPointTy()) 31316 weight = CW_SpecificReg; 31317 break; 31318 case 'y': 31319 if (type->isX86_MMXTy() && Subtarget.hasMMX()) 31320 weight = CW_SpecificReg; 31321 break; 31322 case 'x': 31323 case 'Y': 31324 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || 31325 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256())) 31326 weight = CW_Register; 31327 break; 31328 case 'I': 31329 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 31330 if (C->getZExtValue() <= 31) 31331 weight = CW_Constant; 31332 } 31333 break; 31334 case 'J': 31335 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31336 if (C->getZExtValue() <= 63) 31337 weight = CW_Constant; 31338 } 31339 break; 31340 case 'K': 31341 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31342 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 31343 weight = CW_Constant; 31344 } 31345 break; 31346 case 'L': 31347 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31348 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 31349 weight = CW_Constant; 31350 } 31351 break; 31352 case 'M': 31353 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31354 if (C->getZExtValue() <= 3) 31355 weight = CW_Constant; 31356 } 31357 break; 31358 case 'N': 31359 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31360 if (C->getZExtValue() <= 0xff) 31361 weight = CW_Constant; 31362 } 31363 break; 31364 case 'G': 31365 case 'C': 31366 if (isa<ConstantFP>(CallOperandVal)) { 31367 weight = CW_Constant; 31368 } 31369 break; 31370 case 'e': 31371 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31372 if ((C->getSExtValue() >= -0x80000000LL) && 31373 (C->getSExtValue() <= 0x7fffffffLL)) 31374 weight = CW_Constant; 31375 } 31376 break; 31377 case 'Z': 31378 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 31379 if (C->getZExtValue() <= 0xffffffff) 31380 weight = CW_Constant; 31381 } 31382 break; 31383 } 31384 return weight; 31385 } 31386 31387 /// Try to replace an X constraint, which matches anything, with another that 31388 /// has more specific requirements based on the type of the corresponding 31389 /// operand. 31390 const char *X86TargetLowering:: 31391 LowerXConstraint(EVT ConstraintVT) const { 31392 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 31393 // 'f' like normal targets. 31394 if (ConstraintVT.isFloatingPoint()) { 31395 if (Subtarget.hasSSE2()) 31396 return "Y"; 31397 if (Subtarget.hasSSE1()) 31398 return "x"; 31399 } 31400 31401 return TargetLowering::LowerXConstraint(ConstraintVT); 31402 } 31403 31404 /// Lower the specified operand into the Ops vector. 31405 /// If it is invalid, don't add anything to Ops. 
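/// For example, the 'I' constraint below only accepts immediates in the range
/// [0, 31], so a constant operand of 42 is rejected and nothing is added to
/// Ops.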
31406 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 31407 std::string &Constraint, 31408 std::vector<SDValue>&Ops, 31409 SelectionDAG &DAG) const { 31410 SDValue Result; 31411 31412 // Only support length 1 constraints for now. 31413 if (Constraint.length() > 1) return; 31414 31415 char ConstraintLetter = Constraint[0]; 31416 switch (ConstraintLetter) { 31417 default: break; 31418 case 'I': 31419 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31420 if (C->getZExtValue() <= 31) { 31421 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31422 Op.getValueType()); 31423 break; 31424 } 31425 } 31426 return; 31427 case 'J': 31428 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31429 if (C->getZExtValue() <= 63) { 31430 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31431 Op.getValueType()); 31432 break; 31433 } 31434 } 31435 return; 31436 case 'K': 31437 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31438 if (isInt<8>(C->getSExtValue())) { 31439 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31440 Op.getValueType()); 31441 break; 31442 } 31443 } 31444 return; 31445 case 'L': 31446 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31447 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || 31448 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { 31449 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), 31450 Op.getValueType()); 31451 break; 31452 } 31453 } 31454 return; 31455 case 'M': 31456 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31457 if (C->getZExtValue() <= 3) { 31458 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31459 Op.getValueType()); 31460 break; 31461 } 31462 } 31463 return; 31464 case 'N': 31465 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31466 if (C->getZExtValue() <= 255) { 31467 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31468 Op.getValueType()); 31469 break; 31470 } 31471 } 31472 return; 31473 case 'O': 31474 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31475 if (C->getZExtValue() <= 127) { 31476 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31477 Op.getValueType()); 31478 break; 31479 } 31480 } 31481 return; 31482 case 'e': { 31483 // 32-bit signed value 31484 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31485 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 31486 C->getSExtValue())) { 31487 // Widen to 64 bits here to get it sign extended. 31488 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); 31489 break; 31490 } 31491 // FIXME gcc accepts some relocatable values here too, but only in certain 31492 // memory models; it's complicated. 31493 } 31494 return; 31495 } 31496 case 'Z': { 31497 // 32-bit unsigned value 31498 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 31499 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 31500 C->getZExtValue())) { 31501 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), 31502 Op.getValueType()); 31503 break; 31504 } 31505 } 31506 // FIXME gcc accepts some relocatable values here too, but only in certain 31507 // memory models; it's complicated. 31508 return; 31509 } 31510 case 'i': { 31511 // Literal immediates are always ok. 31512 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 31513 // Widen to 64 bits here to get it sign extended. 
31514 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); 31515 break; 31516 } 31517 31518 // In any sort of PIC mode addresses need to be computed at runtime by 31519 // adding in a register or some sort of table lookup. These can't 31520 // be used as immediates. 31521 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) 31522 return; 31523 31524 // If we are in non-pic codegen mode, we allow the address of a global (with 31525 // an optional displacement) to be used with 'i'. 31526 GlobalAddressSDNode *GA = nullptr; 31527 int64_t Offset = 0; 31528 31529 // Match either (GA), (GA+C), (GA+C1+C2), etc. 31530 while (1) { 31531 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 31532 Offset += GA->getOffset(); 31533 break; 31534 } else if (Op.getOpcode() == ISD::ADD) { 31535 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 31536 Offset += C->getZExtValue(); 31537 Op = Op.getOperand(0); 31538 continue; 31539 } 31540 } else if (Op.getOpcode() == ISD::SUB) { 31541 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 31542 Offset += -C->getZExtValue(); 31543 Op = Op.getOperand(0); 31544 continue; 31545 } 31546 } 31547 31548 // Otherwise, this isn't something we can handle, reject it. 31549 return; 31550 } 31551 31552 const GlobalValue *GV = GA->getGlobal(); 31553 // If we require an extra load to get this address, as in PIC mode, we 31554 // can't accept it. 31555 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) 31556 return; 31557 31558 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), 31559 GA->getValueType(0), Offset); 31560 break; 31561 } 31562 } 31563 31564 if (Result.getNode()) { 31565 Ops.push_back(Result); 31566 return; 31567 } 31568 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 31569 } 31570 31571 /// Check if \p RC is a general purpose register class. 31572 /// I.e., GR* or one of their variant. 31573 static bool isGRClass(const TargetRegisterClass &RC) { 31574 switch (RC.getID()) { 31575 case X86::GR8RegClassID: 31576 case X86::GR8_ABCD_LRegClassID: 31577 case X86::GR8_ABCD_HRegClassID: 31578 case X86::GR8_NOREXRegClassID: 31579 case X86::GR16RegClassID: 31580 case X86::GR16_ABCDRegClassID: 31581 case X86::GR16_NOREXRegClassID: 31582 case X86::GR32RegClassID: 31583 case X86::GR32_ABCDRegClassID: 31584 case X86::GR32_TCRegClassID: 31585 case X86::GR32_NOREXRegClassID: 31586 case X86::GR32_NOAXRegClassID: 31587 case X86::GR32_NOSPRegClassID: 31588 case X86::GR32_NOREX_NOSPRegClassID: 31589 case X86::GR32_ADRegClassID: 31590 case X86::GR64RegClassID: 31591 case X86::GR64_ABCDRegClassID: 31592 case X86::GR64_TCRegClassID: 31593 case X86::GR64_TCW64RegClassID: 31594 case X86::GR64_NOREXRegClassID: 31595 case X86::GR64_NOSPRegClassID: 31596 case X86::GR64_NOREX_NOSPRegClassID: 31597 case X86::LOW32_ADDR_ACCESSRegClassID: 31598 case X86::LOW32_ADDR_ACCESS_RBPRegClassID: 31599 return true; 31600 default: 31601 return false; 31602 } 31603 } 31604 31605 /// Check if \p RC is a vector register class. 31606 /// I.e., FR* / VR* or one of their variant. 
31607 static bool isFRClass(const TargetRegisterClass &RC) { 31608 switch (RC.getID()) { 31609 case X86::FR32RegClassID: 31610 case X86::FR32XRegClassID: 31611 case X86::FR64RegClassID: 31612 case X86::FR64XRegClassID: 31613 case X86::FR128RegClassID: 31614 case X86::VR64RegClassID: 31615 case X86::VR128RegClassID: 31616 case X86::VR128LRegClassID: 31617 case X86::VR128HRegClassID: 31618 case X86::VR128XRegClassID: 31619 case X86::VR256RegClassID: 31620 case X86::VR256LRegClassID: 31621 case X86::VR256HRegClassID: 31622 case X86::VR256XRegClassID: 31623 case X86::VR512RegClassID: 31624 return true; 31625 default: 31626 return false; 31627 } 31628 } 31629 31630 std::pair<unsigned, const TargetRegisterClass *> 31631 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 31632 StringRef Constraint, 31633 MVT VT) const { 31634 // First, see if this is a constraint that directly corresponds to an LLVM 31635 // register class. 31636 if (Constraint.size() == 1) { 31637 // GCC Constraint Letters 31638 switch (Constraint[0]) { 31639 default: break; 31640 // TODO: Slight differences here in allocation order and leaving 31641 // RIP in the class. Do they matter any more here than they do 31642 // in the normal allocation? 31643 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 31644 if (Subtarget.is64Bit()) { 31645 if (VT == MVT::i32 || VT == MVT::f32) 31646 return std::make_pair(0U, &X86::GR32RegClass); 31647 if (VT == MVT::i16) 31648 return std::make_pair(0U, &X86::GR16RegClass); 31649 if (VT == MVT::i8 || VT == MVT::i1) 31650 return std::make_pair(0U, &X86::GR8RegClass); 31651 if (VT == MVT::i64 || VT == MVT::f64) 31652 return std::make_pair(0U, &X86::GR64RegClass); 31653 break; 31654 } 31655 // 32-bit fallthrough 31656 case 'Q': // Q_REGS 31657 if (VT == MVT::i32 || VT == MVT::f32) 31658 return std::make_pair(0U, &X86::GR32_ABCDRegClass); 31659 if (VT == MVT::i16) 31660 return std::make_pair(0U, &X86::GR16_ABCDRegClass); 31661 if (VT == MVT::i8 || VT == MVT::i1) 31662 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); 31663 if (VT == MVT::i64) 31664 return std::make_pair(0U, &X86::GR64_ABCDRegClass); 31665 break; 31666 case 'r': // GENERAL_REGS 31667 case 'l': // INDEX_REGS 31668 if (VT == MVT::i8 || VT == MVT::i1) 31669 return std::make_pair(0U, &X86::GR8RegClass); 31670 if (VT == MVT::i16) 31671 return std::make_pair(0U, &X86::GR16RegClass); 31672 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) 31673 return std::make_pair(0U, &X86::GR32RegClass); 31674 return std::make_pair(0U, &X86::GR64RegClass); 31675 case 'R': // LEGACY_REGS 31676 if (VT == MVT::i8 || VT == MVT::i1) 31677 return std::make_pair(0U, &X86::GR8_NOREXRegClass); 31678 if (VT == MVT::i16) 31679 return std::make_pair(0U, &X86::GR16_NOREXRegClass); 31680 if (VT == MVT::i32 || !Subtarget.is64Bit()) 31681 return std::make_pair(0U, &X86::GR32_NOREXRegClass); 31682 return std::make_pair(0U, &X86::GR64_NOREXRegClass); 31683 case 'f': // FP Stack registers. 31684 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 31685 // value to the correct fpstack register class. 31686 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 31687 return std::make_pair(0U, &X86::RFP32RegClass); 31688 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 31689 return std::make_pair(0U, &X86::RFP64RegClass); 31690 return std::make_pair(0U, &X86::RFP80RegClass); 31691 case 'y': // MMX_REGS if MMX allowed. 
31692 if (!Subtarget.hasMMX()) break; 31693 return std::make_pair(0U, &X86::VR64RegClass); 31694 case 'Y': // SSE_REGS if SSE2 allowed 31695 if (!Subtarget.hasSSE2()) break; 31696 // FALL THROUGH. 31697 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed 31698 if (!Subtarget.hasSSE1()) break; 31699 31700 switch (VT.SimpleTy) { 31701 default: break; 31702 // Scalar SSE types. 31703 case MVT::f32: 31704 case MVT::i32: 31705 return std::make_pair(0U, &X86::FR32RegClass); 31706 case MVT::f64: 31707 case MVT::i64: 31708 return std::make_pair(0U, &X86::FR64RegClass); 31709 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. 31710 // Vector types. 31711 case MVT::v16i8: 31712 case MVT::v8i16: 31713 case MVT::v4i32: 31714 case MVT::v2i64: 31715 case MVT::v4f32: 31716 case MVT::v2f64: 31717 return std::make_pair(0U, &X86::VR128RegClass); 31718 // AVX types. 31719 case MVT::v32i8: 31720 case MVT::v16i16: 31721 case MVT::v8i32: 31722 case MVT::v4i64: 31723 case MVT::v8f32: 31724 case MVT::v4f64: 31725 return std::make_pair(0U, &X86::VR256RegClass); 31726 case MVT::v8f64: 31727 case MVT::v16f32: 31728 case MVT::v16i32: 31729 case MVT::v8i64: 31730 return std::make_pair(0U, &X86::VR512RegClass); 31731 } 31732 break; 31733 } 31734 } 31735 31736 // Use the default implementation in TargetLowering to convert the register 31737 // constraint into a member of a register class. 31738 std::pair<unsigned, const TargetRegisterClass*> Res; 31739 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 31740 31741 // Not found as a standard register? 31742 if (!Res.second) { 31743 // Map st(0) -> st(7) -> ST0 31744 if (Constraint.size() == 7 && Constraint[0] == '{' && 31745 tolower(Constraint[1]) == 's' && 31746 tolower(Constraint[2]) == 't' && 31747 Constraint[3] == '(' && 31748 (Constraint[4] >= '0' && Constraint[4] <= '7') && 31749 Constraint[5] == ')' && 31750 Constraint[6] == '}') { 31751 31752 Res.first = X86::FP0+Constraint[4]-'0'; 31753 Res.second = &X86::RFP80RegClass; 31754 return Res; 31755 } 31756 31757 // GCC allows "st(0)" to be called just plain "st". 31758 if (StringRef("{st}").equals_lower(Constraint)) { 31759 Res.first = X86::FP0; 31760 Res.second = &X86::RFP80RegClass; 31761 return Res; 31762 } 31763 31764 // flags -> EFLAGS 31765 if (StringRef("{flags}").equals_lower(Constraint)) { 31766 Res.first = X86::EFLAGS; 31767 Res.second = &X86::CCRRegClass; 31768 return Res; 31769 } 31770 31771 // 'A' means EAX + EDX. 31772 if (Constraint == "A") { 31773 Res.first = X86::EAX; 31774 Res.second = &X86::GR32_ADRegClass; 31775 return Res; 31776 } 31777 return Res; 31778 } 31779 31780 // Otherwise, check to see if this is a register class of the wrong value 31781 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 31782 // turn into {ax},{dx}. 31783 // MVT::Other is used to specify clobber names. 31784 if (Res.second->hasType(VT) || VT == MVT::Other) 31785 return Res; // Correct type already, nothing to do. 31786 31787 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should 31788 // return "eax". This should even work for things like getting 64bit integer 31789 // registers when given an f64 type. 31790 const TargetRegisterClass *Class = Res.second; 31791 // The generic code will match the first register class that contains the 31792 // given register. Thus, based on the ordering of the tablegened file, 31793 // the "plain" GR classes might not come first. 31794 // Therefore, use a helper method. 
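// For example, a "{ax}" constraint paired with an i32 operand should resolve
// to EAX in GR32 rather than to AX in the first 16-bit class that happens to
// contain AX.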
31795 if (isGRClass(*Class)) { 31796 unsigned Size = VT.getSizeInBits(); 31797 if (Size == 1) Size = 8; 31798 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); 31799 if (DestReg > 0) { 31800 Res.first = DestReg; 31801 Res.second = Size == 8 ? &X86::GR8RegClass 31802 : Size == 16 ? &X86::GR16RegClass 31803 : Size == 32 ? &X86::GR32RegClass 31804 : &X86::GR64RegClass; 31805 assert(Res.second->contains(Res.first) && "Register in register class"); 31806 } else { 31807 // No register found/type mismatch. 31808 Res.first = 0; 31809 Res.second = nullptr; 31810 } 31811 } else if (isFRClass(*Class)) { 31812 // Handle references to XMM physical registers that got mapped into the 31813 // wrong class. This can happen with constraints like {xmm0} where the 31814 // target independent register mapper will just pick the first match it can 31815 // find, ignoring the required type. 31816 31817 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. 31818 if (VT == MVT::f32 || VT == MVT::i32) 31819 Res.second = &X86::FR32RegClass; 31820 else if (VT == MVT::f64 || VT == MVT::i64) 31821 Res.second = &X86::FR64RegClass; 31822 else if (X86::VR128RegClass.hasType(VT)) 31823 Res.second = &X86::VR128RegClass; 31824 else if (X86::VR256RegClass.hasType(VT)) 31825 Res.second = &X86::VR256RegClass; 31826 else if (X86::VR512RegClass.hasType(VT)) 31827 Res.second = &X86::VR512RegClass; 31828 else { 31829 // Type mismatch and not a clobber: Return an error; 31830 Res.first = 0; 31831 Res.second = nullptr; 31832 } 31833 } 31834 31835 return Res; 31836 } 31837 31838 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, 31839 const AddrMode &AM, Type *Ty, 31840 unsigned AS) const { 31841 // Scaling factors are not free at all. 31842 // An indexed folded instruction, i.e., inst (reg1, reg2, scale), 31843 // will take 2 allocations in the out of order engine instead of 1 31844 // for plain addressing mode, i.e. inst (reg1). 31845 // E.g., 31846 // vaddps (%rsi,%drx), %ymm0, %ymm1 31847 // Requires two allocations (one for the load, one for the computation) 31848 // whereas: 31849 // vaddps (%rsi), %ymm0, %ymm1 31850 // Requires just 1 allocation, i.e., freeing allocations for other operations 31851 // and having less micro operations to execute. 31852 // 31853 // For some X86 architectures, this is even worse because for instance for 31854 // stores, the complex addressing mode forces the instruction to use the 31855 // "load" ports instead of the dedicated "store" port. 31856 // E.g., on Haswell: 31857 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. 31858 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. 31859 if (isLegalAddressingMode(DL, AM, Ty, AS)) 31860 // Scale represents reg2 * scale, thus account for 1 31861 // as soon as we use a second register. 31862 return AM.Scale != 0; 31863 return -1; 31864 } 31865 31866 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { 31867 // Integer division on x86 is expensive. However, when aggressively optimizing 31868 // for code size, we prefer to use a div instruction, as it is usually smaller 31869 // than the alternative sequence. 31870 // The exception to this is vector division. Since x86 doesn't have vector 31871 // integer division, leaving the division as-is is a loss even in terms of 31872 // size, because it will have to be scalarized, while the alternative code 31873 // sequence can be performed in vector form. 
31874 bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, 31875 Attribute::MinSize); 31876 return OptSize && !VT.isVector(); 31877 } 31878 31879 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 31880 if (!Subtarget.is64Bit()) 31881 return; 31882 31883 // Update IsSplitCSR in X86MachineFunctionInfo. 31884 X86MachineFunctionInfo *AFI = 31885 Entry->getParent()->getInfo<X86MachineFunctionInfo>(); 31886 AFI->setIsSplitCSR(true); 31887 } 31888 31889 void X86TargetLowering::insertCopiesSplitCSR( 31890 MachineBasicBlock *Entry, 31891 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 31892 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 31893 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 31894 if (!IStart) 31895 return; 31896 31897 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 31898 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 31899 MachineBasicBlock::iterator MBBI = Entry->begin(); 31900 for (const MCPhysReg *I = IStart; *I; ++I) { 31901 const TargetRegisterClass *RC = nullptr; 31902 if (X86::GR64RegClass.contains(*I)) 31903 RC = &X86::GR64RegClass; 31904 else 31905 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 31906 31907 unsigned NewVR = MRI->createVirtualRegister(RC); 31908 // Create copy from CSR to a virtual register. 31909 // FIXME: this currently does not emit CFI pseudo-instructions, it works 31910 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 31911 // nounwind. If we want to generalize this later, we may need to emit 31912 // CFI pseudo-instructions. 31913 assert(Entry->getParent()->getFunction()->hasFnAttribute( 31914 Attribute::NoUnwind) && 31915 "Function should be nounwind in insertCopiesSplitCSR!"); 31916 Entry->addLiveIn(*I); 31917 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 31918 .addReg(*I); 31919 31920 // Insert the copy-back instructions right before the terminator. 31921 for (auto *Exit : Exits) 31922 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 31923 TII->get(TargetOpcode::COPY), *I) 31924 .addReg(NewVR); 31925 } 31926 } 31927