//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
#include <cctype>
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

/// Generate a DAG to grab 128 bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.is256BitVector() && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
  // we can match to VEXTRACTF128.
  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

  // This is the index of the first element of the 128-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                               * ElemsPerChunk);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
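  // Worked example of the normalization above: extracting element 5 from a
  // v8i32 gives ElemsPerChunk = 128/32 = 4 and
  // NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4, i.e. the subvector starts
  // at element 4, the first element of the upper 128-bit half.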
  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                               VecIdx);

  return Result;
}

/// Generate a DAG to put 128 bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  DebugLoc dl) {
  // Inserting UNDEF is Result.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;

  EVT VT = Vec.getValueType();
  assert(VT.is128BitVector() && "Unexpected vector size!");

  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant 128 bits.
  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

  // This is the index of the first element of the 128-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}

/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
/// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORs.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   DebugLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetLinux())
    return new X86LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();

  RegInfo = TM.getRegisterInfo();
  TD = getDataLayout();

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
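  // For example, a vector compare such as PCMPGTD writes all-ones (-1) into
  // each lane that tests true and zero otherwise, which is exactly the
  // ZeroOrNegativeOneBooleanContent convention set above.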

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code, use the register-pressure-specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    addBypassSlowDiv(32, 8);
    if (Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
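  // (Illustrative: UCOMISS sets ZF=1 for both "equal" and "unordered", so an
  // ordered-equal test must also verify PF==0; the Expand entries above split
  // each of these predicates into two checks.)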

  // Promote all UINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
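  // For example, with SDIV and SREM both Expanded below, 'x / y' and 'x % y'
  // each legalize to an ISD::SDIVREM node; CSE then merges the two nodes so a
  // single IDIV yields both the quotient and the remainder.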
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32, which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
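  // (Custom SELECT lowering emits X86ISD::CMOV nodes keyed off EFLAGS instead
  // of relying on the generic select expansion.)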
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented; please don't build your own
  // exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
  setShouldFoldAtomicFences(true);
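  // For example, a seq_cst fence on either side of an atomicrmw add can be
  // folded away, leaving a single LOCK ADD that already carries the required
  // ordering semantics.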

  // Expand certain atomics.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
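  // (Operations a subtarget can actually handle are re-marked Legal or Custom
  // further down; e.g. FADD on v4f32 becomes Legal once SSE1 is known to be
  // available.)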
  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
       i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction(VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
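    // (Bitwise ops and full-width loads are lane-width agnostic, so e.g. an
    // AND of two v16i8 values is performed as a single PAND in the v2i64
    // domain.)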
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    // i8 and i16 vectors are custom, because the source register and memory
    // operand types are not the same width. f32 vectors are custom since the
    // immediate controlling the insert encodes additional information.
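    // (Illustrative: PINSRB/PINSRW take a 32-bit GPR source that is wider
    // than the i8/i16 element being inserted, and INSERTPS's immediate
    // selects source and destination lanes plus a zero mask, hence the
    // Custom lowering below.)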
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::SRL, MVT::v2i64, Legal);
      setOperationAction(ISD::SRL, MVT::v4i32, Legal);

      setOperationAction(ISD::SHL, MVT::v2i64, Legal);
      setOperationAction(ISD::SHL, MVT::v4i32, Legal);

      setOperationAction(ISD::SRA, MVT::v4i32, Legal);
    } else {
      setOperationAction(ISD::SRL, MVT::v2i64, Custom);
      setOperationAction(ISD::SRL, MVT::v4i32, Custom);

      setOperationAction(ISD::SHL, MVT::v2i64, Custom);
      setOperationAction(ISD::SHL, MVT::v4i32, Custom);

      setOperationAction(ISD::SRA, MVT::v4i32, Custom);
    }
    setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);

    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SDIV, MVT::v16i16, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8, Legal);

      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8, Legal);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

      setOperationAction(ISD::SRL, MVT::v4i64, Legal);
      setOperationAction(ISD::SRL, MVT::v8i32, Legal);

      setOperationAction(ISD::SHL, MVT::v4i64, Legal);
      setOperationAction(ISD::SHL, MVT::v8i32, Legal);

      setOperationAction(ISD::SRA, MVT::v8i32, Legal);

      setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
    } else {
      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
      setOperationAction(ISD::ADD, MVT::v32i8, Custom);

      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
      setOperationAction(ISD::SUB, MVT::v32i8, Custom);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::SRL, MVT::v4i64, Custom);
      setOperationAction(ISD::SRL, MVT::v8i32, Custom);

      setOperationAction(ISD::SHL, MVT::v4i64, Custom);
      setOperationAction(ISD::SHL, MVT::v8i32, Custom);

      setOperationAction(ISD::SRA, MVT::v8i32, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
         i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
  // of this type with custom code.
  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
       VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                       Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much
  // better than generic legalization for 64-bit multiplication-with-overflow,
  // though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions.
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget->hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget->isTargetDarwin()) {
      // For MacOSX, we don't want the normal expansion of a libcall to
      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
      // traffic.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance,
  // so do not reduce the limit.
  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
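  // (Under these limits, e.g. a 64-byte @llvm.memset can still be emitted
  // inline as four 16-byte vector stores rather than a libcall.)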
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.
  BenefitFromCodePlacementOpt = true;

  // Predictable cmovs don't hurt on Atom because it's in-order.
  PredictableSelectIsExpensive = !Subtarget->isAtom();

  setPrefFunctionAlignment(4); // 2^4 bytes.
}

EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i8;
  return VT.changeVectorElementTypeToInteger();
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, we are expanding a memset; if
/// 'ZeroMemset' is also true, it is a memset of zero. 'MemcpyStrSrc'
/// indicates whether the memcpy source is constant so it does not need to
/// be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
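/// For example, a 32-byte-or-larger copy on an AVX2 machine with fast
/// unaligned accesses comes back as v8i32 (v8f32 with AVX only), while a
/// 16-byte memset of zero on a plain SSE2 target comes back as v4i32.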
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if ((!IsMemset || ZeroMemset) &&
      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      if (Size >= 32) {
        if (Subtarget->hasInt256())
          return MVT::v8i32;
        if (Subtarget->hasFp256())
          return MVT::v8f32;
      }
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
  if (Fast)
    *Fast = Subtarget->isUnalignedMemAccessFast();
  return true;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
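  // (i.e. each entry is a label-minus-jump-table-label difference, so the
  // table's own label serves as the relocation base and no separate PIC
  // base register is needed.)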
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

// FIXME: Why is this routine here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(MVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget->is64Bit() ?
      (const TargetRegisterClass*)&X86::GR64RegClass :
      (const TargetRegisterClass*)&X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = &X86::VR128RegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
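  // Each location was assigned by AnalyzeReturn above; e.g. an i32 result
  // lands in EAX, while on x86-64 a float result lands in XMM0.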
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only. gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
      report_fatal_error("SSE2 register return with SSE2 disabled");

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for
    // v1i64 which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the
          // generated register is legal.
          if (!Subtarget->hasSSE2())
            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
        }
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // The x86-64 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    assert(Reg &&
           "SRetReturnReg should have been set in LowerFormalArguments().");
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

MVT
X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
                                            ISD::NodeType ExtendKind) const {
  MVT ReturnMVT;
  // TODO: Is this also valid on 32-bit?
  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
    ReturnMVT = MVT::i8;
  else
    ReturnMVT = MVT::i32;

  MVT MinVT = getRegisterType(ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    SDValue Val;

    // If this is a call to a function that returns an fp value on the
    // floating point stack, we must guarantee the value is popped from the
    // stack, so a CopyFromReg is not good enough - the copy instruction may
    // be eliminated if the return value is not used. We use the
    // FpPOP_RETVAL instruction instead.
    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
      // If we prefer to use the value in xmm registers, copy it out as f80
      // and use a truncate to move it from fp stack reg to xmm reg.
      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
      SDValue Ops[] = { Chain, InFlag };
      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
                                         MVT::Other, MVT::Glue, Ops, 2), 1);
      Val = Chain.getValue(0);

      // Round the f80 to the right size, which also moves it to the
      // appropriate xmm register.
      if (CopyVT != VA.getValVT())
        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                          // This truncation won't change the value.
                          DAG.getIntPtrConstant(1));
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);
    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller, and symbols should
//  also be decorated in some fancy way :) It doesn't support any vector
//  arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg())
    return RegStructReturn;
  return StackStructReturn;
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg())
    return RegStructReturn;
  return StackStructReturn;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
/// specified by "Src" to address "Dst" with size and alignment information
/// specified by the specific parameter attribute. The copy will be passed
/// as a byval function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// IsTailCallConvention - Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::HiPE);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
    return false;

  CallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
    return false;

  return true;
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
                                   bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
                              getTargetMachine().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If the value is passed by pointer, we have the address passed instead
  // of the value itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This
  // can be changed with more analysis. In the case of tail call
  // optimization, mark all arguments mutable, since they could be
  // overwritten by the lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    return DAG.getFrameIndex(FI, getPointerTy());
  } else {
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    return DAG.getLoad(ValVT, dl, Chain, FIN,
                       MachinePointerInfo::getFixedStack(FI),
                       false, false, false, 0);
  }
}

SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                        DebugLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWindows = Subtarget->isTargetWindows();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64) {
    CCInfo.AllocateStack(32, 8);
  }

  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);

  unsigned LastVal = ~0U;
  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip
    // later places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    (void)LastVal;
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      const TargetRegisterClass *RC;
      if (RegVT == MVT::i32)
        RC = &X86::GR32RegClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = &X86::GR64RegClass;
      else if (RegVT == MVT::f32)
        RC = &X86::FR32RegClass;
      else if (RegVT == MVT::f64)
        RC = &X86::FR64RegClass;
      else if (RegVT.is256BitVector())
        RC = &X86::VR256RegClass;
      else if (RegVT.is128BitVector())
        RC = &X86::VR128RegClass;
      else if (RegVT == MVT::x86mmx)
        RC = &X86::VR64RegClass;
      else
        llvm_unreachable("Unknown argument type!");

      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector())
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
                             MachinePointerInfo(), false, false, false, 0);

    InVals.push_back(ArgValue);
  }

  // The x86-64 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // Save the argument into a virtual register so that we can access it
  // from the return points.
  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      MVT PtrTy = getPointerTy();
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (FuncIsMadeTailCallSafe(CallConv,
                             MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes a variable number of arguments, make a frame
  // index for the start of the first vararg value... for expansion of
  // llvm.va_start.
  if (isVarArg) {
    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const uint16_t GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8, X86::R9
      };
      static const uint16_t GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const uint16_t XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const uint16_t *GPR64ArgRegs;
      unsigned NumXMMRegs = 0;

      if (IsWin64) {
        // The XMM registers which might contain var arg parameters are
        // shadowed in their paired GPR. So we only need to save the GPRs to
        // their home slots.
        TotalNumIntRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;

        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
                                                TotalNumXMMRegs);
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);

      bool NoImplicitFloatOps = Fn->getAttributes().
        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
               NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
          !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      if (IsWin64) {
        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
        // Get to the caller-allocated home save location. Add 8 to account
        // for the return address.
        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
        FuncInfo->setRegSaveFrameIndex(
            MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
        // Fixup to set vararg frame on shadow area (4 x i64).
        if (NumIntRegs < 4)
          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
      } else {
        // For X86-64, if there are vararg parameters that are passed via
        // registers, then we must store them to their spots on the stack so
        // they may be loaded by dereferencing the result of va_next.
        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
        FuncInfo->setRegSaveFrameIndex(
            MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16,
                                   16, false));
      }

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                        getPointerTy());
      unsigned Offset = FuncInfo->getVarArgsGPOffset();
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     &X86::GR64RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           FuncInfo->getRegSaveFrameIndex(), Offset),
                       false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                                 FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                                 FuncInfo->getVarArgsFPOffset()));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
                                       &X86::VR128RegClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
        argsAreStructReturn(Ins) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  return Chain;
}

SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal())
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);

  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      MachinePointerInfo::getStack(LocMemOffset),
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, DebugLoc dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
                           false, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
                         unsigned SlotSize, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
                       false, false, 0);
  return Chain;
}

SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  DebugLoc &dl                          = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  CallingConv::ID CallConv              = CLI.CallConv;
  bool &isTailCall                      = CLI.IsTailCall;
  bool isVarArg                         = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();
  bool IsWindows = Subtarget->isTargetWindows();
  StructReturnType SR = callIsStructReturn(Outs);
  bool IsSibcall = false;

  if (MF.getTarget().Options.DisableTailCalls)
    isTailCall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64) {
    CCInfo.AllocateStack(32, 8);
  }

  CCInfo.AnalyzeCallOperands(Outs, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own caller's stack.
    NumBytes = 0;
  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
           IsTailCallConvention(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the
  // case of tail call optimization, arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           MachinePointerInfo::getFixedStack(FI),
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // The Win64 ABI requires argument XMM regs to be copied to the
        // corresponding shadow reg if the callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires the GOT to be in the EBX register before function
    // calls via the PLT GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
    } else {
      // If we are tail calling and generating PIC/GOT style code, load the
      // address of the callee into ECX. The value in ecx is used as target
      // of the tail jump. This is done to circumvent the ebx/callee-saved
      // problem for tail calls on PIC/GOT architectures. Normally we would
      // just put the address of GOT into ebx and then call target@PLT. But
      // for tail calls ebx would be restored (since ebx is callee saved)
      // before jumping to the target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...)
    // in the declaration) %al is used as a hidden argument to specify the
    // number of SSE registers used. The contents of %al do not need to
    // match exactly the number of registers, but must be an upper bound on
    // the number of SSE registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const uint16_t XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
  }

  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl,
                                          RegInfo->getStackRegister(),
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         MachinePointerInfo::getFixedStack(FI),
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(), RegInfo->getSlotSize(),
                                     FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.

    // We should use an extra load for direct calls to dllimported functions
    // in non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;
      bool ExtraLoad = false;
      unsigned WrapperKind = ISD::DELETED_NODE;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local,
      // then we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 (!Subtarget->getTargetTriple().isMacOSX() ||
                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
        // PC-relative references to external symbols should go through
        // $stub, unless we're building with the Leopard linker or later,
        // which automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      } else if (Subtarget->isPICStyleRIPRel() &&
                 isa<Function>(GV) &&
                 cast<Function>(GV)->getAttributes().
                   hasAttribute(AttributeSet::FunctionIndex,
                                Attribute::NonLazyBind)) {
        // If the function is marked as non-lazy, generate an indirect call
        // which loads from the GOT directly. This avoids runtime overhead
        // at the cost of eager binding (and one extra byte of encoding).
        OpFlags = X86II::MO_GOTPCREL;
        WrapperKind = X86ISD::WrapperRIP;
        ExtraLoad = true;
      }

      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                          G->getOffset(), OpFlags);

      // Add a wrapper if needed.
      if (WrapperKind != ISD::DELETED_NODE)
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
      // Add extra indirection if needed.
      if (ExtraLoad)
        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
                             MachinePointerInfo::getGOT(),
                             false, false, false, 0);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               (!Subtarget->getTargetTriple().isMacOSX() ||
                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the Leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known
  // live into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
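  // E.g. an x86-32 stdcall callee pops its entire argument area itself
  // (via "ret $N"), while on most non-Windows targets an sret callee pops
  // just the 4-byte hidden struct pointer.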
  unsigned NumBytesForCalleeToPush;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       getTargetMachine().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPush = NumBytes; // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0; // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}

//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like StdCall, it is a callee-cleanup convention, except that ECX is
//  reserved for storing the address of the tail-called function. Only 2
//  registers are free for argument passing (inreg). Tail call optimization
//  is performed provided:
//    * tailcallopt is enabled
//    * caller/callee are fastcc
//  On the X86_64 architecture with GOT-style position independent code, only
//  local (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example)
//  If a tail-called function (the callee) has more arguments than the
//  caller, the caller needs to make sure that there is room to move the
//  RETADDR to. This is achieved by reserving an area the size of the
//  argument delta right after the original RETADDR, but before the saved
//  framepointer or the spilled registers,
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for
/// a 16-byte alignment requirement.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameLowering &TFI = *TM.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  unsigned SlotSize = RegInfo->getSlotSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out the lower bits, then add the stack alignment once plus the
    // 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same (relative) position of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // A ByVal argument is passed in as a pointer but it's now being
      // dereferenced, e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  if (!IsTailCallConvention(CalleeCC) &&
      CalleeCC != CallingConv::C)
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = DAG.getMachineFunction().getFunction();

  // If the function return type is x86_fp80 and the callee return type is
  // not, then the FP_EXTEND of the call result is not a nop. It's not safe
  // to perform a tailcall optimization here.
2843 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 2844 return false; 2845 2846 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2847 bool CCMatch = CallerCC == CalleeCC; 2848 2849 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2850 if (IsTailCallConvention(CalleeCC) && CCMatch) 2851 return true; 2852 return false; 2853 } 2854 2855 // Look for obvious safe cases to perform tail call optimization that do not 2856 // require ABI changes. This is what gcc calls sibcall. 2857 2858 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2859 // emit a special epilogue. 2860 if (RegInfo->needsStackRealignment(MF)) 2861 return false; 2862 2863 // Also avoid sibcall optimization if either caller or callee uses struct 2864 // return semantics. 2865 if (isCalleeStructRet || isCallerStructRet) 2866 return false; 2867 2868 // An stdcall caller is expected to clean up its arguments; the callee 2869 // isn't going to do that. 2870 if (!CCMatch && CallerCC == CallingConv::X86_StdCall) 2871 return false; 2872 2873 // Do not sibcall optimize vararg calls unless all arguments are passed via 2874 // registers. 2875 if (isVarArg && !Outs.empty()) { 2876 2877 // Optimizing for varargs on Win64 is unlikely to be safe without 2878 // additional testing. 2879 if (Subtarget->isTargetWin64()) 2880 return false; 2881 2882 SmallVector<CCValAssign, 16> ArgLocs; 2883 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2884 getTargetMachine(), ArgLocs, *DAG.getContext()); 2885 2886 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2887 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2888 if (!ArgLocs[i].isRegLoc()) 2889 return false; 2890 } 2891 2892 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2893 // stack. Therefore, if it's not used by the call it is not safe to optimize 2894 // this into a sibcall. 2895 bool Unused = false; 2896 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2897 if (!Ins[i].Used) { 2898 Unused = true; 2899 break; 2900 } 2901 } 2902 if (Unused) { 2903 SmallVector<CCValAssign, 16> RVLocs; 2904 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2905 getTargetMachine(), RVLocs, *DAG.getContext()); 2906 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2907 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2908 CCValAssign &VA = RVLocs[i]; 2909 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2910 return false; 2911 } 2912 } 2913 2914 // If the calling conventions do not match, then we'd better make sure the 2915 // results are returned in the same way as what the caller expects. 
2916 if (!CCMatch) { 2917 SmallVector<CCValAssign, 16> RVLocs1; 2918 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2919 getTargetMachine(), RVLocs1, *DAG.getContext()); 2920 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2921 2922 SmallVector<CCValAssign, 16> RVLocs2; 2923 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2924 getTargetMachine(), RVLocs2, *DAG.getContext()); 2925 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2926 2927 if (RVLocs1.size() != RVLocs2.size()) 2928 return false; 2929 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2930 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2931 return false; 2932 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2933 return false; 2934 if (RVLocs1[i].isRegLoc()) { 2935 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2936 return false; 2937 } else { 2938 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2939 return false; 2940 } 2941 } 2942 } 2943 2944 // If the callee takes no arguments then go on to check the results of the 2945 // call. 2946 if (!Outs.empty()) { 2947 // Check if stack adjustment is needed. For now, do not do this if any 2948 // argument is passed on the stack. 2949 SmallVector<CCValAssign, 16> ArgLocs; 2950 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2951 getTargetMachine(), ArgLocs, *DAG.getContext()); 2952 2953 // Allocate shadow area for Win64 2954 if (Subtarget->isTargetWin64()) { 2955 CCInfo.AllocateStack(32, 8); 2956 } 2957 2958 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2959 if (CCInfo.getNextStackOffset()) { 2960 MachineFunction &MF = DAG.getMachineFunction(); 2961 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2962 return false; 2963 2964 // Check if the arguments are already laid out in the right way as 2965 // the caller's fixed stack objects. 2966 MachineFrameInfo *MFI = MF.getFrameInfo(); 2967 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2968 const X86InstrInfo *TII = 2969 ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2970 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2971 CCValAssign &VA = ArgLocs[i]; 2972 SDValue Arg = OutVals[i]; 2973 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2974 if (VA.getLocInfo() == CCValAssign::Indirect) 2975 return false; 2976 if (!VA.isRegLoc()) { 2977 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2978 MFI, MRI, TII)) 2979 return false; 2980 } 2981 } 2982 } 2983 2984 // If the tailcall address may be in a register, then make sure it's 2985 // possible to register allocate for it. In 32-bit, the call address can 2986 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2987 // callee-saved registers are restored. These happen to be the same 2988 // registers used to pass 'inreg' arguments so watch out for those. 2989 if (!Subtarget->is64Bit() && 2990 ((!isa<GlobalAddressSDNode>(Callee) && 2991 !isa<ExternalSymbolSDNode>(Callee)) || 2992 getTargetMachine().getRelocationModel() == Reloc::PIC_)) { 2993 unsigned NumInRegs = 0; 2994 // In PIC we need an extra register to formulate the address computation 2995 // for the callee. 2996 unsigned MaxInRegs = 2997 (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; 2998 2999 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3000 CCValAssign &VA = ArgLocs[i]; 3001 if (!VA.isRegLoc()) 3002 continue; 3003 unsigned Reg = VA.getLocReg(); 3004 switch (Reg) { 3005 default: break; 3006 case X86::EAX: case X86::EDX: case X86::ECX: 3007 if (++NumInRegs == MaxInRegs) 3008 return false; 3009 break; 3010 } 3011 } 3012 } 3013 } 3014 3015 return true; 3016 } 3017 3018 FastISel * 3019 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3020 const TargetLibraryInfo *libInfo) const { 3021 return X86::createFastISel(funcInfo, libInfo); 3022 } 3023 3024 //===----------------------------------------------------------------------===// 3025 // Other Lowering Hooks 3026 //===----------------------------------------------------------------------===// 3027 3028 static bool MayFoldLoad(SDValue Op) { 3029 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3030 } 3031 3032 static bool MayFoldIntoStore(SDValue Op) { 3033 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3034 } 3035 3036 static bool isTargetShuffle(unsigned Opcode) { 3037 switch(Opcode) { 3038 default: return false; 3039 case X86ISD::PSHUFD: 3040 case X86ISD::PSHUFHW: 3041 case X86ISD::PSHUFLW: 3042 case X86ISD::SHUFP: 3043 case X86ISD::PALIGNR: 3044 case X86ISD::MOVLHPS: 3045 case X86ISD::MOVLHPD: 3046 case X86ISD::MOVHLPS: 3047 case X86ISD::MOVLPS: 3048 case X86ISD::MOVLPD: 3049 case X86ISD::MOVSHDUP: 3050 case X86ISD::MOVSLDUP: 3051 case X86ISD::MOVDDUP: 3052 case X86ISD::MOVSS: 3053 case X86ISD::MOVSD: 3054 case X86ISD::UNPCKL: 3055 case X86ISD::UNPCKH: 3056 case X86ISD::VPERMILP: 3057 case X86ISD::VPERM2X128: 3058 case X86ISD::VPERMI: 3059 return true; 3060 } 3061 } 3062 3063 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3064 SDValue V1, SelectionDAG &DAG) { 3065 switch(Opc) { 3066 default: llvm_unreachable("Unknown x86 shuffle node"); 3067 case X86ISD::MOVSHDUP: 3068 case X86ISD::MOVSLDUP: 3069 case X86ISD::MOVDDUP: 3070 return DAG.getNode(Opc, dl, VT, V1); 3071 } 3072 } 3073 3074 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3075 SDValue V1, unsigned TargetMask, 3076 SelectionDAG &DAG) { 3077 switch(Opc) { 3078 default: llvm_unreachable("Unknown x86 shuffle node"); 3079 case X86ISD::PSHUFD: 3080 case X86ISD::PSHUFHW: 3081 case X86ISD::PSHUFLW: 3082 case X86ISD::VPERMILP: 3083 case X86ISD::VPERMI: 3084 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 3085 } 3086 } 3087 3088 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3089 SDValue V1, SDValue V2, unsigned TargetMask, 3090 SelectionDAG &DAG) { 3091 switch(Opc) { 3092 default: llvm_unreachable("Unknown x86 shuffle node"); 3093 case X86ISD::PALIGNR: 3094 case X86ISD::SHUFP: 3095 case X86ISD::VPERM2X128: 3096 return DAG.getNode(Opc, dl, VT, V1, V2, 3097 DAG.getConstant(TargetMask, MVT::i8)); 3098 } 3099 } 3100 3101 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3102 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3103 switch(Opc) { 3104 default: llvm_unreachable("Unknown x86 shuffle node"); 3105 case X86ISD::MOVLHPS: 3106 case X86ISD::MOVLHPD: 3107 case X86ISD::MOVHLPS: 3108 case X86ISD::MOVLPS: 3109 case X86ISD::MOVLPD: 3110 case X86ISD::MOVSS: 3111 case X86ISD::MOVSD: 3112 case X86ISD::UNPCKL: 3113 case X86ISD::UNPCKH: 3114 return DAG.getNode(Opc, dl, VT, V1, V2); 3115 } 3116 } 3117 3118 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3119 
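  // Note: the return address occupies the slot immediately below the incoming
  // stack pointer, which is why the fixed frame object below is created at
  // offset -SlotSize.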
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}

bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model, we assume that the last object is 16MB below
  // the end of the 31-bit address boundary. We may also accept pretty large
  // negative constants, knowing that all objects are in the positive half of
  // the address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model, we know that all objects reside in the
  // negative half of the 32-bit address space. We must not accept negative
  // offsets, since they may point just past an object, but we may accept
  // pretty large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// isCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
    return !is64Bit;
  case CallingConv::Fast:
  case CallingConv::GHC:
  case CallingConv::HiPE:
    return TailCallOpt;
  }
}

/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
/// code? Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return (Val < 0 || Val == CmpVal);
}

/// isSequentialOrUndefInRange - Return true if every element in Mask,
/// beginning at position Pos and extending for Size elements, is undef or
/// falls within the specified sequential range [Low, Low+Size).
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
    return false;

  // Lower quadword copied in order or undef.
  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
    return false;

  // Upper quadword shuffled.
  for (unsigned i = 4; i != 8; ++i)
    if (!isUndefOrInRange(Mask[i], 4, 8))
      return false;

  if (VT == MVT::v16i16) {
    // Lower quadword copied in order or undef.
    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
      return false;

    // Upper quadword shuffled.
    for (unsigned i = 12; i != 16; ++i)
      if (!isUndefOrInRange(Mask[i], 12, 16))
        return false;
  }

  return true;
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
    return false;

  // Upper quadword copied in order.
  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
    return false;

  // Lower quadword shuffled.
  for (unsigned i = 0; i != 4; ++i)
    if (!isUndefOrInRange(Mask[i], 0, 4))
      return false;

  if (VT == MVT::v16i16) {
    // Upper quadword copied in order.
    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
      return false;

    // Lower quadword shuffled.
    for (unsigned i = 8; i != 12; ++i)
      if (!isUndefOrInRange(Mask[i], 8, 12))
        return false;
  }

  return true;
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PALIGNR.
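/// For example (an illustrative v16i8 mask, assuming SSSE3): the mask
/// <5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20> takes bytes
/// 5..15 from the first source followed by bytes 0..4 of the second, which
/// PALIGNR performs with a byte-shift immediate of 5.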
static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
                          const X86Subtarget *Subtarget) {
  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget->hasInt256()))
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane.
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure it's in this lane in one of the sources.
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0.
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source.
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure it's in this lane.
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0.
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
                                     unsigned NumElems) {
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are
/// in the reverse order of what x86 shuffles want.
static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
                        bool Commuted = false) {
  if (!HasFp256 && VT.is256BitVector())
    return false;

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElems = NumElems/NumLanes;

  if (NumLaneElems != 2 && NumLaneElems != 4)
    return false;

  // VSHUFPSY divides the resulting vector into 4 chunks.
  // The sources are also split into 4 chunks, and each destination
  // chunk must come from a different source chunk.
  //
  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
  //
  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
  //
  // VSHUFPDY divides the resulting vector into 4 chunks.
  // The sources are also split into 4 chunks, and each destination
  // chunk must come from a different source chunk.
  //
  //  SRC1 =>      X3       X2       X1       X0
  //  SRC2 =>      Y3       Y2       Y1       Y0
  //
  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
  //
  unsigned HalfLaneElems = NumLaneElems/2;
  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      int Idx = Mask[i+l];
      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
        return false;
      // For VSHUFPSY, the mask of the second half must be the same as the
      // first but with the appropriate offsets. This works in the same way as
      // VPERMILPS works with masks.
      if (NumElems != 8 || l == 0 || Mask[i] < 0)
        continue;
      if (!isUndefOrEqual(Idx, Mask[i]+l))
        return false;
    }
  }

  return true;
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
  if (!VT.is128BitVector())
    return false;

  unsigned NumElems = VT.getVectorNumElements();

  if (NumElems != 4)
    return false;

  // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3.
  return isUndefOrEqual(Mask[0], 6) &&
         isUndefOrEqual(Mask[1], 7) &&
         isUndefOrEqual(Mask[2], 2) &&
         isUndefOrEqual(Mask[3], 3);
}

/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
  if (!VT.is128BitVector())
    return false;

  unsigned NumElems = VT.getVectorNumElements();

  if (NumElems != 4)
    return false;

  return isUndefOrEqual(Mask[0], 2) &&
         isUndefOrEqual(Mask[1], 3) &&
         isUndefOrEqual(Mask[2], 2) &&
         isUndefOrEqual(Mask[3], 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
  if (!VT.is128BitVector())
    return false;

  unsigned NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i + NumElems))
      return false;

  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
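/// For example (illustrative): the expected pattern is <0, 1, 4, 5> for
/// 4-element types or <0, 2> for 2-element types, with undef allowed in any
/// position.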
static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
  if (!VT.is128BitVector())
    return false;

  unsigned NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
      return false;

  return true;
}

//
// Some special combinations that can be optimized.
//
static
SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
                               SelectionDAG &DAG) {
  MVT VT = SVOp->getValueType(0).getSimpleVT();
  DebugLoc dl = SVOp->getDebugLoc();

  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return SDValue();

  ArrayRef<int> Mask = SVOp->getMask();

  // These are the special masks that may be optimized.
  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
  bool MatchEvenMask = true;
  bool MatchOddMask  = true;
  for (int i=0; i<8; ++i) {
    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
      MatchEvenMask = false;
    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
      MatchOddMask = false;
  }

  if (!MatchEvenMask && !MatchOddMask)
    return SDValue();

  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);

  SDValue Op0 = SVOp->getOperand(0);
  SDValue Op1 = SVOp->getOperand(1);

  if (MatchEvenMask) {
    // Shift the second operand right by 32 bits.
    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
  } else {
    // Shift the first operand left by 32 bits.
    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
  }
  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
                         bool HasInt256, bool V2IsSplat = false) {
  unsigned NumElts = VT.getVectorNumElements();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for unpckl");

  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    return false;

  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
  // independently on 128-bit lanes.
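  // For example (illustrative, v8i32 with AVX): lane 0 must match
  // <0, 8, 1, 9> and lane 1 must match <4, 12, 5, 13>, with undef allowed in
  // any slot.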
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  for (unsigned l = 0; l != NumLanes; ++l) {
    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
         i != (l+1)*NumLaneElts;
         i += 2, ++j) {
      int BitI  = Mask[i];
      int BitI1 = Mask[i+1];
      if (!isUndefOrEqual(BitI, j))
        return false;
      if (V2IsSplat) {
        if (!isUndefOrEqual(BitI1, NumElts))
          return false;
      } else {
        if (!isUndefOrEqual(BitI1, j + NumElts))
          return false;
      }
    }
  }

  return true;
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
                         bool HasInt256, bool V2IsSplat = false) {
  unsigned NumElts = VT.getVectorNumElements();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for unpckh");

  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    return false;

  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
  // independently on 128-bit lanes.
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  for (unsigned l = 0; l != NumLanes; ++l) {
    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
         i != (l+1)*NumLaneElts; i += 2, ++j) {
      int BitI  = Mask[i];
      int BitI1 = Mask[i+1];
      if (!isUndefOrEqual(BitI, j))
        return false;
      if (V2IsSplat) {
        if (!isUndefOrEqual(BitI1, NumElts))
          return false;
      } else {
        if (!isUndefOrEqual(BitI1, j+NumElts))
          return false;
      }
    }
  }
  return true;
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
  unsigned NumElts = VT.getVectorNumElements();
  bool Is256BitVec = VT.is256BitVector();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for unpckl");

  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    return false;

  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
  // FIXME: Need a better way to get rid of this, there's no latency difference
  // between UNPCKLPD and MOVDDUP, the latter should always be checked first
  // and the former later. We should also remove the "_undef" special mask.
  if (NumElts == 4 && Is256BitVec)
    return false;

  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
  // independently on 128-bit lanes.
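  // For example (illustrative): the v8i32 form of this pattern is
  // <0, 0, 1, 1, 4, 4, 5, 5>, again with undef allowed in any slot.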
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  for (unsigned l = 0; l != NumLanes; ++l) {
    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
         i != (l+1)*NumLaneElts;
         i += 2, ++j) {
      int BitI  = Mask[i];
      int BitI1 = Mask[i+1];

      if (!isUndefOrEqual(BitI, j))
        return false;
      if (!isUndefOrEqual(BitI1, j))
        return false;
    }
  }

  return true;
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
  unsigned NumElts = VT.getVectorNumElements();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for unpckh");

  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    return false;

  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
  // independently on 128-bit lanes.
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  for (unsigned l = 0; l != NumLanes; ++l) {
    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
         i != (l+1)*NumLaneElts; i += 2, ++j) {
      int BitI  = Mask[i];
      int BitI1 = Mask[i+1];
      if (!isUndefOrEqual(BitI, j))
        return false;
      if (!isUndefOrEqual(BitI1, j))
        return false;
    }
  }
  return true;
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;
  if (!VT.is128BitVector())
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (unsigned i = 1; i != NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
/// as permutations between 128-bit chunks or halves. As an example, the
/// shuffle below:
///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// takes its first half from the second half of V1 and its second half from
/// the second half of V2.
static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
  if (!HasFp256 || !VT.is256BitVector())
    return false;

  // The shuffle result is divided into half A and half B. In total the two
  // sources have 4 halves, namely: C, D, E, F. The final values of A and
  // B must come from C, D, E or F.
  unsigned HalfSize = VT.getVectorNumElements()/2;
  bool MatchA = false, MatchB = false;

  // Check if A comes from one of C, D, E, F.
  for (unsigned Half = 0; Half != 4; ++Half) {
    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
      MatchA = true;
      break;
    }
  }

  // Check if B comes from one of C, D, E, F.
  for (unsigned Half = 0; Half != 4; ++Half) {
    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
      MatchB = true;
      break;
    }
  }

  return MatchA && MatchB;
}

/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  MVT VT = SVOp->getValueType(0).getSimpleVT();

  unsigned HalfSize = VT.getVectorNumElements()/2;

  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i < HalfSize; ++i) {
    if (SVOp->getMaskElt(i) > 0) {
      FstHalf = SVOp->getMaskElt(i)/HalfSize;
      break;
    }
  }
  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
    if (SVOp->getMaskElt(i) > 0) {
      SndHalf = SVOp->getMaskElt(i)/HalfSize;
      break;
    }
  }

  return (FstHalf | (SndHalf << 4));
}

/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching differs depending on whether the underlying
/// element type is 32 or 64 bits. For VPERMILPS, the high half of the mask
/// should select the same elements as the low half, but from the higher half
/// of the source. For VPERMILPD, the two lanes can be shuffled independently
/// of each other, with the same restriction that lanes can't be crossed.
/// Also handles PSHUFDY.
static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
  if (!HasFp256)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  // Only match 256-bit with 32/64-bit types
  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
    return false;

  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned LaneSize = NumElts/NumLanes;
  for (unsigned l = 0; l != NumElts; l += LaneSize) {
    for (unsigned i = 0; i != LaneSize; ++i) {
      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
        return false;
      if (NumElts != 8 || l == 0)
        continue;
      // VPERMILPS handling
      if (Mask[i] < 0)
        continue;
      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
        return false;
    }
  }

  return true;
}

/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the
/// reverse of what x86 MOVSS/MOVSD wants: those require the lowest element to
/// be the lowest element of vector 2 and the other elements to come from
/// vector 1 in order.
static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  if (!VT.is128BitVector())
    return false;

  unsigned NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (unsigned i = 1; i != NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3961 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3962 static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, 3963 const X86Subtarget *Subtarget) { 3964 if (!Subtarget->hasSSE3()) 3965 return false; 3966 3967 unsigned NumElems = VT.getVectorNumElements(); 3968 3969 if ((VT.is128BitVector() && NumElems != 4) || 3970 (VT.is256BitVector() && NumElems != 8)) 3971 return false; 3972 3973 // "i+1" is the value the indexed mask element must have 3974 for (unsigned i = 0; i != NumElems; i += 2) 3975 if (!isUndefOrEqual(Mask[i], i+1) || 3976 !isUndefOrEqual(Mask[i+1], i+1)) 3977 return false; 3978 3979 return true; 3980 } 3981 3982 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3983 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3984 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3985 static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, 3986 const X86Subtarget *Subtarget) { 3987 if (!Subtarget->hasSSE3()) 3988 return false; 3989 3990 unsigned NumElems = VT.getVectorNumElements(); 3991 3992 if ((VT.is128BitVector() && NumElems != 4) || 3993 (VT.is256BitVector() && NumElems != 8)) 3994 return false; 3995 3996 // "i" is the value the indexed mask element must have 3997 for (unsigned i = 0; i != NumElems; i += 2) 3998 if (!isUndefOrEqual(Mask[i], i) || 3999 !isUndefOrEqual(Mask[i+1], i)) 4000 return false; 4001 4002 return true; 4003 } 4004 4005 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 4006 /// specifies a shuffle of elements that is suitable for input to 256-bit 4007 /// version of MOVDDUP. 4008 static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { 4009 if (!HasFp256 || !VT.is256BitVector()) 4010 return false; 4011 4012 unsigned NumElts = VT.getVectorNumElements(); 4013 if (NumElts != 4) 4014 return false; 4015 4016 for (unsigned i = 0; i != NumElts/2; ++i) 4017 if (!isUndefOrEqual(Mask[i], 0)) 4018 return false; 4019 for (unsigned i = NumElts/2; i != NumElts; ++i) 4020 if (!isUndefOrEqual(Mask[i], NumElts/2)) 4021 return false; 4022 return true; 4023 } 4024 4025 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4026 /// specifies a shuffle of elements that is suitable for input to 128-bit 4027 /// version of MOVDDUP. 4028 static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { 4029 if (!VT.is128BitVector()) 4030 return false; 4031 4032 unsigned e = VT.getVectorNumElements() / 2; 4033 for (unsigned i = 0; i != e; ++i) 4034 if (!isUndefOrEqual(Mask[i], i)) 4035 return false; 4036 for (unsigned i = 0; i != e; ++i) 4037 if (!isUndefOrEqual(Mask[e+i], i)) 4038 return false; 4039 return true; 4040 } 4041 4042 /// isVEXTRACTF128Index - Return true if the specified 4043 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is 4044 /// suitable for input to VEXTRACTF128. 4045 bool X86::isVEXTRACTF128Index(SDNode *N) { 4046 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4047 return false; 4048 4049 // The index should be aligned on a 128-bit boundary. 4050 uint64_t Index = 4051 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4052 4053 MVT VT = N->getValueType(0).getSimpleVT(); 4054 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 4055 bool Result = (Index * ElSize) % 128 == 0; 4056 4057 return Result; 4058 } 4059 4060 /// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 4061 /// operand specifies a subvector insert that is suitable for input to 4062 /// VINSERTF128. 
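/// For example (illustrative): inserting a 128-bit subvector into a v8i32 at
/// element index 4 is suitable (4 * 32 == 128), while index 2 is not
/// (2 * 32 == 64, which is not 128-bit aligned).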
bool X86::isVINSERTF128Index(SDNode *N) {
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a 128-bit boundary.
  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
  bool Result = (Index * ElSize) % 128 == 0;

  return Result;
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
  MVT VT = N->getValueType(0).getSimpleVT();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for PSHUF/SHUFP");

  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
  // independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
         "Only supports 2 or 4 elements per lane");

  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
  unsigned Mask = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt < 0) continue;
    Elt &= NumLaneElts - 1;
    unsigned ShAmt = (i << Shift) % 8;
    Mask |= Elt << ShAmt;
  }

  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
  MVT VT = N->getValueType(0).getSimpleVT();

  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
         "Unsupported vector type for PSHUFHW");

  unsigned NumElts = VT.getVectorNumElements();

  unsigned Mask = 0;
  for (unsigned l = 0; l != NumElts; l += 8) {
    // 8 elements per lane, but we only care about the last 4.
    for (unsigned i = 0; i < 4; ++i) {
      int Elt = N->getMaskElt(l+i+4);
      if (Elt < 0) continue;
      Elt &= 0x3; // only 2 bits.
      Mask |= Elt << (i * 2);
    }
  }

  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
  MVT VT = N->getValueType(0).getSimpleVT();

  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
         "Unsupported vector type for PSHUFLW");

  unsigned NumElts = VT.getVectorNumElements();

  unsigned Mask = 0;
  for (unsigned l = 0; l != NumElts; l += 8) {
    // 8 elements per lane, but we only care about the first 4.
    for (unsigned i = 0; i < 4; ++i) {
      int Elt = N->getMaskElt(l+i);
      if (Elt < 0) continue;
      Elt &= 0x3; // only 2 bits
      Mask |= Elt << (i * 2);
    }
  }

  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
4159 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4160 MVT VT = SVOp->getValueType(0).getSimpleVT(); 4161 unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; 4162 4163 unsigned NumElts = VT.getVectorNumElements(); 4164 unsigned NumLanes = VT.getSizeInBits()/128; 4165 unsigned NumLaneElts = NumElts/NumLanes; 4166 4167 int Val = 0; 4168 unsigned i; 4169 for (i = 0; i != NumElts; ++i) { 4170 Val = SVOp->getMaskElt(i); 4171 if (Val >= 0) 4172 break; 4173 } 4174 if (Val >= (int)NumElts) 4175 Val -= NumElts - NumLaneElts; 4176 4177 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4178 return (Val - i) * EltSize; 4179 } 4180 4181 /// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 4182 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4183 /// instructions. 4184 unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 4185 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4186 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 4187 4188 uint64_t Index = 4189 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4190 4191 MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); 4192 MVT ElVT = VecVT.getVectorElementType(); 4193 4194 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4195 return Index / NumElemsPerChunk; 4196 } 4197 4198 /// getInsertVINSERTF128Immediate - Return the appropriate immediate 4199 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4200 /// instructions. 4201 unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 4202 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4203 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 4204 4205 uint64_t Index = 4206 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4207 4208 MVT VecVT = N->getValueType(0).getSimpleVT(); 4209 MVT ElVT = VecVT.getVectorElementType(); 4210 4211 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4212 return Index / NumElemsPerChunk; 4213 } 4214 4215 /// getShuffleCLImmediate - Return the appropriate immediate to shuffle 4216 /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. 4217 /// Handles 256-bit. 4218 static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { 4219 MVT VT = N->getValueType(0).getSimpleVT(); 4220 4221 unsigned NumElts = VT.getVectorNumElements(); 4222 4223 assert((VT.is256BitVector() && NumElts == 4) && 4224 "Unsupported vector type for VPERMQ/VPERMPD"); 4225 4226 unsigned Mask = 0; 4227 for (unsigned i = 0; i != NumElts; ++i) { 4228 int Elt = N->getMaskElt(i); 4229 if (Elt < 0) 4230 continue; 4231 Mask |= Elt << (i*2); 4232 } 4233 4234 return Mask; 4235 } 4236 /// isZeroNode - Returns true if Elt is a constant zero or a floating point 4237 /// constant +0.0. 4238 bool X86::isZeroNode(SDValue Elt) { 4239 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt)) 4240 return CN->isNullValue(); 4241 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) 4242 return CFP->getValueAPF().isPosZero(); 4243 return false; 4244 } 4245 4246 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4247 /// their permute mask. 
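/// For example (illustrative): shuffle(V1, V2, <0, 5, 2, 7>) becomes
/// shuffle(V2, V1, <4, 1, 6, 3>).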
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  MVT VT = SVOp->getValueType(0).getSimpleVT();
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = SVOp->getMaskElt(i);
    if (Idx >= 0) {
      if (Idx < (int)NumElems)
        Idx += NumElems;
      else
        Idx -= NumElems;
    }
    MaskVec.push_back(Idx);
  }
  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
  if (!VT.is128BitVector())
    return false;
  if (VT.getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Mask[i], i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

// Test whether the given value is a vector value which will be legalized
// into a load.
static bool WillBeConstantPoolLoad(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // Check for any non-constant elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
    switch (N->getOperand(i).getNode()->getOpcode()) {
    case ISD::UNDEF:
    case ISD::ConstantFP:
    case ISD::Constant:
      break;
    default:
      return false;
    }

  // Vectors of all-zeros and all-ones are materialized with special
  // instructions rather than being loaded.
  return !ISD::isBuildVectorAllZeros(N) &&
         !ISD::isBuildVectorAllOnes(N);
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ArrayRef<int> Mask, EVT VT) {
  if (!VT.is128BitVector())
    return false;

  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // a load-folding shufps op instead.
  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
    return false;

  unsigned NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;
  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build SSE zero vectors as <4 x i32> bitcasted
  // to their dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.is128BitVector()) {  // SSE
    if (Subtarget->hasSSE2()) {  // SSE2
      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
    } else { // SSE1
      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
    }
  } else if (VT.is256BitVector()) { // AVX
    if (Subtarget->hasInt256()) { // AVX2
      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
    } else {
      // 256-bit logic and arithmetic instructions in AVX are all
      // floating-point, no support for integer ops. Emit fp zeroed vectors.
      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
    }
  } else
    llvm_unreachable("Unexpected vector type");

  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}

/// getOnesVector - Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  if (VT.is256BitVector()) {
    if (HasInt256) { // AVX2
      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
    } else { // AVX
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
    }
  } else if (VT.is128BitVector()) {
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  } else
    llvm_unreachable("Unexpected vector type");

  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}

/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Mask[i] > (int)NumElems) {
      Mask[i] = NumElems;
    }
  }
}

/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
// a generic shuffle instruction because the target has no such instructions.
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target supported shuffles.
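// For example (illustrative): to splat element 5 of a v8i16, one unpckh of
// the vector with itself yields <4, 4, 5, 5, 6, 6, 7, 7>; the splat value now
// occupies 32-bit element 1, so the vector can be handled as a v4f32 splat.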
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
  EVT VT = V.getValueType();
  int NumElems = VT.getVectorNumElements();
  DebugLoc dl = V.getDebugLoc();

  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V = getUnpackl(DAG, dl, VT, V, V);
    } else {
      V = getUnpackh(DAG, dl, VT, V, V);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }
  return V;
}

/// getLegalSplat - Generate a legal splat with supported x86 shuffles.
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
  EVT VT = V.getValueType();
  DebugLoc dl = V.getDebugLoc();

  if (VT.is128BitVector()) {
    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
                             &SplatMask[0]);
  } else if (VT.is256BitVector()) {
    // To use VPERMILPS to splat scalars, the second half of the indices must
    // refer to the higher part, which is a duplication of the lower one,
    // because VPERMILPS can only handle in-lane permutations.
    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };

    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
                             &SplatMask[0]);
  } else
    llvm_unreachable("Vector size not supported");

  return DAG.getNode(ISD::BITCAST, dl, VT, V);
}

/// PromoteSplat - Splat is promoted to target supported vector shuffles.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  EVT SrcVT = SV->getValueType(0);
  SDValue V1 = SV->getOperand(0);
  DebugLoc dl = SV->getDebugLoc();

  int EltNo = SV->getSplatIndex();
  int NumElems = SrcVT.getVectorNumElements();
  bool Is256BitVec = SrcVT.is256BitVector();

  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
         "Unknown how to promote splat for type");

  // Extract the 128-bit part containing the splat element and update
  // the splat element index when it refers to the higher register.
  if (Is256BitVec) {
    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
    if (EltNo >= NumElems/2)
      EltNo -= NumElems/2;
  }

  // i16 and i8 vector types can't be used directly by a generic shuffle
  // instruction because the target has no such instruction. Generate shuffles
  // which repeat i16 and i8 several times until they fit in i32, and then can
  // be manipulated by target supported shuffles.
  EVT EltVT = SrcVT.getVectorElementType();
  if (EltVT == MVT::i8 || EltVT == MVT::i16)
    V1 = PromoteSplati8i16(V1, DAG, EltNo);

  // Recreate the 256-bit vector and place the same 128-bit vector
  // into the low and high part. This is necessary because we want
  // to use VPERM* to shuffle the vectors.
  if (Is256BitVec) {
    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
  }

  return getLegalSplat(DAG, V1, EltNo);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector and a zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4591 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4592 bool IsZero, 4593 const X86Subtarget *Subtarget, 4594 SelectionDAG &DAG) { 4595 EVT VT = V2.getValueType(); 4596 SDValue V1 = IsZero 4597 ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4598 unsigned NumElems = VT.getVectorNumElements(); 4599 SmallVector<int, 16> MaskVec; 4600 for (unsigned i = 0; i != NumElems; ++i) 4601 // If this is the insertion idx, put the low elt of V2 here. 4602 MaskVec.push_back(i == Idx ? NumElems : i); 4603 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4604 } 4605 4606 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the 4607 /// target specific opcode. Returns true if the Mask could be calculated. 4608 /// Sets IsUnary to true if only uses one source. 4609 static bool getTargetShuffleMask(SDNode *N, MVT VT, 4610 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4611 unsigned NumElems = VT.getVectorNumElements(); 4612 SDValue ImmN; 4613 4614 IsUnary = false; 4615 switch(N->getOpcode()) { 4616 case X86ISD::SHUFP: 4617 ImmN = N->getOperand(N->getNumOperands()-1); 4618 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4619 break; 4620 case X86ISD::UNPCKH: 4621 DecodeUNPCKHMask(VT, Mask); 4622 break; 4623 case X86ISD::UNPCKL: 4624 DecodeUNPCKLMask(VT, Mask); 4625 break; 4626 case X86ISD::MOVHLPS: 4627 DecodeMOVHLPSMask(NumElems, Mask); 4628 break; 4629 case X86ISD::MOVLHPS: 4630 DecodeMOVLHPSMask(NumElems, Mask); 4631 break; 4632 case X86ISD::PALIGNR: 4633 ImmN = N->getOperand(N->getNumOperands()-1); 4634 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4635 break; 4636 case X86ISD::PSHUFD: 4637 case X86ISD::VPERMILP: 4638 ImmN = N->getOperand(N->getNumOperands()-1); 4639 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4640 IsUnary = true; 4641 break; 4642 case X86ISD::PSHUFHW: 4643 ImmN = N->getOperand(N->getNumOperands()-1); 4644 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4645 IsUnary = true; 4646 break; 4647 case X86ISD::PSHUFLW: 4648 ImmN = N->getOperand(N->getNumOperands()-1); 4649 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4650 IsUnary = true; 4651 break; 4652 case X86ISD::VPERMI: 4653 ImmN = N->getOperand(N->getNumOperands()-1); 4654 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4655 IsUnary = true; 4656 break; 4657 case X86ISD::MOVSS: 4658 case X86ISD::MOVSD: { 4659 // The index 0 always comes from the first element of the second source, 4660 // this is why MOVSS and MOVSD are used in the first place. The other 4661 // elements come from the other positions of the first source vector 4662 Mask.push_back(NumElems); 4663 for (unsigned i = 1; i != NumElems; ++i) { 4664 Mask.push_back(i); 4665 } 4666 break; 4667 } 4668 case X86ISD::VPERM2X128: 4669 ImmN = N->getOperand(N->getNumOperands()-1); 4670 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4671 if (Mask.empty()) return false; 4672 break; 4673 case X86ISD::MOVDDUP: 4674 case X86ISD::MOVLHPD: 4675 case X86ISD::MOVLPD: 4676 case X86ISD::MOVLPS: 4677 case X86ISD::MOVSHDUP: 4678 case X86ISD::MOVSLDUP: 4679 // Not yet implemented 4680 return false; 4681 default: llvm_unreachable("unknown target shuffle node"); 4682 } 4683 4684 return true; 4685 } 4686 4687 /// getShuffleScalarElt - Returns the scalar element that will make up the ith 4688 /// element of the result of the vector shuffle. 
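/// For example, querying element 0 of
///   (vector_shuffle (build_vector a,b,c,d), undef, <2,2,3,3>)
/// follows mask entry 2 into the first operand and returns the scalar c.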
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue(); // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getValueType().getSimpleVT();
    unsigned NumElems = ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt < 0)
      return DAG.getUNDEF(ShufVT.getVectorElementType());

    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
                                         : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
/// vector shuffle operation that resolve to zero. The search can start in two
/// different directions, from left or right.
static
unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  unsigned i;
  for (i = 0; i != NumElems; ++i) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
/// correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also set OpNum to the source vector operand
/// they come from.
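/// For example, with NumElems == 4 the mask <u,2,3,u> checked over [1,4) with
/// OpIdx == 2 matches: indices 2 and 3 come consecutively from V1 (the undef
/// entry is ignored), so OpNum is set to 0.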
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
                              unsigned NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < (int)NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                     V1 = {X, A, B, C}     0
  //                         \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                 // Mask Start Index
            NumElems-NumZeros, // Mask End Index (exclusive)
            NumZeros,          // Where to start looking in the src vector
            NumElems,          // Number of elements in vector
            OpSrc))            // Which source operand?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              true /* check zeros from left */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //   0    { A, B, X, X } = V2
  //  / \     /  /
  //   vector_shuffle V1, V2 <X, X, 4, 5>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            NumZeros,  // Mask Start Index
            NumElems,  // Mask End Index (exclusive)
            0,         // Where to start looking in the src vector
            NumElems,  // Number of elements in vector
            OpSrc))    // Which source operand?
    return false;

  isLeft = true;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  // Although the logic below supports any bitwidth size, there are no
  // shift instructions which handle more than 128-bit vectors.
4871 if (!SVOp->getValueType(0).is128BitVector()) 4872 return false; 4873 4874 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4875 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4876 return true; 4877 4878 return false; 4879 } 4880 4881 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 4882 /// 4883 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4884 unsigned NumNonZero, unsigned NumZero, 4885 SelectionDAG &DAG, 4886 const X86Subtarget* Subtarget, 4887 const TargetLowering &TLI) { 4888 if (NumNonZero > 8) 4889 return SDValue(); 4890 4891 DebugLoc dl = Op.getDebugLoc(); 4892 SDValue V(0, 0); 4893 bool First = true; 4894 for (unsigned i = 0; i < 16; ++i) { 4895 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4896 if (ThisIsNonZero && First) { 4897 if (NumZero) 4898 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4899 else 4900 V = DAG.getUNDEF(MVT::v8i16); 4901 First = false; 4902 } 4903 4904 if ((i & 1) != 0) { 4905 SDValue ThisElt(0, 0), LastElt(0, 0); 4906 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4907 if (LastIsNonZero) { 4908 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4909 MVT::i16, Op.getOperand(i-1)); 4910 } 4911 if (ThisIsNonZero) { 4912 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4913 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4914 ThisElt, DAG.getConstant(8, MVT::i8)); 4915 if (LastIsNonZero) 4916 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4917 } else 4918 ThisElt = LastElt; 4919 4920 if (ThisElt.getNode()) 4921 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4922 DAG.getIntPtrConstant(i/2)); 4923 } 4924 } 4925 4926 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4927 } 4928 4929 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4930 /// 4931 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4932 unsigned NumNonZero, unsigned NumZero, 4933 SelectionDAG &DAG, 4934 const X86Subtarget* Subtarget, 4935 const TargetLowering &TLI) { 4936 if (NumNonZero > 4) 4937 return SDValue(); 4938 4939 DebugLoc dl = Op.getDebugLoc(); 4940 SDValue V(0, 0); 4941 bool First = true; 4942 for (unsigned i = 0; i < 8; ++i) { 4943 bool isNonZero = (NonZeros & (1 << i)) != 0; 4944 if (isNonZero) { 4945 if (First) { 4946 if (NumZero) 4947 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4948 else 4949 V = DAG.getUNDEF(MVT::v8i16); 4950 First = false; 4951 } 4952 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4953 MVT::v8i16, V, Op.getOperand(i), 4954 DAG.getIntPtrConstant(i)); 4955 } 4956 } 4957 4958 return V; 4959 } 4960 4961 /// getVShift - Return a vector logical shift node. 4962 /// 4963 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4964 unsigned NumBits, SelectionDAG &DAG, 4965 const TargetLowering &TLI, DebugLoc dl) { 4966 assert(VT.is128BitVector() && "Unknown type for VShift"); 4967 EVT ShVT = MVT::v2i64; 4968 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 4969 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4970 return DAG.getNode(ISD::BITCAST, dl, VT, 4971 DAG.getNode(Opc, dl, ShVT, SrcOp, 4972 DAG.getConstant(NumBits, 4973 TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); 4974 } 4975 4976 SDValue 4977 X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4978 SelectionDAG &DAG) const { 4979 4980 // Check if the scalar load can be widened into a vector load. And if 4981 // the address is "base + cst" see if the cst can be "absorbed" into 4982 // the shuffle mask. 
4983 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4984 SDValue Ptr = LD->getBasePtr(); 4985 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4986 return SDValue(); 4987 EVT PVT = LD->getValueType(0); 4988 if (PVT != MVT::i32 && PVT != MVT::f32) 4989 return SDValue(); 4990 4991 int FI = -1; 4992 int64_t Offset = 0; 4993 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4994 FI = FINode->getIndex(); 4995 Offset = 0; 4996 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4997 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4998 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4999 Offset = Ptr.getConstantOperandVal(1); 5000 Ptr = Ptr.getOperand(0); 5001 } else { 5002 return SDValue(); 5003 } 5004 5005 // FIXME: 256-bit vector instructions don't require a strict alignment, 5006 // improve this code to support it better. 5007 unsigned RequiredAlign = VT.getSizeInBits()/8; 5008 SDValue Chain = LD->getChain(); 5009 // Make sure the stack object alignment is at least 16 or 32. 5010 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5011 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 5012 if (MFI->isFixedObjectIndex(FI)) { 5013 // Can't change the alignment. FIXME: It's possible to compute 5014 // the exact stack offset and reference FI + adjust offset instead. 5015 // If someone *really* cares about this. That's the way to implement it. 5016 return SDValue(); 5017 } else { 5018 MFI->setObjectAlignment(FI, RequiredAlign); 5019 } 5020 } 5021 5022 // (Offset % 16 or 32) must be multiple of 4. Then address is then 5023 // Ptr + (Offset & ~15). 5024 if (Offset < 0) 5025 return SDValue(); 5026 if ((Offset % RequiredAlign) & 3) 5027 return SDValue(); 5028 int64_t StartOffset = Offset & ~(RequiredAlign-1); 5029 if (StartOffset) 5030 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 5031 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 5032 5033 int EltNo = (Offset - StartOffset) >> 2; 5034 unsigned NumElems = VT.getVectorNumElements(); 5035 5036 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 5037 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 5038 LD->getPointerInfo().getWithOffset(StartOffset), 5039 false, false, false, 0); 5040 5041 SmallVector<int, 8> Mask; 5042 for (unsigned i = 0; i != NumElems; ++i) 5043 Mask.push_back(EltNo); 5044 5045 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 5046 } 5047 5048 return SDValue(); 5049 } 5050 5051 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 5052 /// vector of type 'VT', see if the elements can be replaced by a single large 5053 /// load which has the same value as a build_vector whose operands are 'elts'. 5054 /// 5055 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 5056 /// 5057 /// FIXME: we'd also like to handle the case where the last elements are zero 5058 /// rather than undef via VZEXT_LOAD, but we do not detect that case today. 5059 /// There's even a handy isZeroNode for that purpose. 5060 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 5061 DebugLoc &DL, SelectionDAG &DAG) { 5062 EVT EltVT = VT.getVectorElementType(); 5063 unsigned NumElems = Elts.size(); 5064 5065 LoadSDNode *LDBase = NULL; 5066 unsigned LastLoadedElt = -1U; 5067 5068 // For each element in the initializer, see if we've found a load or an undef. 5069 // If we don't find an initial load element, or later load elements are 5070 // non-consecutive, bail out. 
5071 for (unsigned i = 0; i < NumElems; ++i) { 5072 SDValue Elt = Elts[i]; 5073 5074 if (!Elt.getNode() || 5075 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 5076 return SDValue(); 5077 if (!LDBase) { 5078 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 5079 return SDValue(); 5080 LDBase = cast<LoadSDNode>(Elt.getNode()); 5081 LastLoadedElt = i; 5082 continue; 5083 } 5084 if (Elt.getOpcode() == ISD::UNDEF) 5085 continue; 5086 5087 LoadSDNode *LD = cast<LoadSDNode>(Elt); 5088 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 5089 return SDValue(); 5090 LastLoadedElt = i; 5091 } 5092 5093 // If we have found an entire vector of loads and undefs, then return a large 5094 // load of the entire vector width starting at the base pointer. If we found 5095 // consecutive loads for the low half, generate a vzext_load node. 5096 if (LastLoadedElt == NumElems - 1) { 5097 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 5098 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5099 LDBase->getPointerInfo(), 5100 LDBase->isVolatile(), LDBase->isNonTemporal(), 5101 LDBase->isInvariant(), 0); 5102 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5103 LDBase->getPointerInfo(), 5104 LDBase->isVolatile(), LDBase->isNonTemporal(), 5105 LDBase->isInvariant(), LDBase->getAlignment()); 5106 } 5107 if (NumElems == 4 && LastLoadedElt == 1 && 5108 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 5109 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 5110 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 5111 SDValue ResNode = 5112 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, 5113 LDBase->getPointerInfo(), 5114 LDBase->getAlignment(), 5115 false/*isVolatile*/, true/*ReadMem*/, 5116 false/*WriteMem*/); 5117 5118 // Make sure the newly-created LOAD is in the same position as LDBase in 5119 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and 5120 // update uses of LDBase's output chain to use the TokenFactor. 5121 if (LDBase->hasAnyUseOfValue(1)) { 5122 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5123 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 5124 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5125 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5126 SDValue(ResNode.getNode(), 1)); 5127 } 5128 5129 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 5130 } 5131 return SDValue(); 5132 } 5133 5134 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 5135 /// to generate a splat value for the following cases: 5136 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 5137 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from 5138 /// a scalar load, or a constant. 5139 /// The VBROADCAST node is returned when a pattern is found, 5140 /// or SDValue() otherwise. 5141 SDValue 5142 X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { 5143 if (!Subtarget->hasFp256()) 5144 return SDValue(); 5145 5146 MVT VT = Op.getValueType().getSimpleVT(); 5147 DebugLoc dl = Op.getDebugLoc(); 5148 5149 assert((VT.is128BitVector() || VT.is256BitVector()) && 5150 "Unsupported vector type for broadcast."); 5151 5152 SDValue Ld; 5153 bool ConstSplatVal; 5154 5155 switch (Op.getOpcode()) { 5156 default: 5157 // Unknown pattern found. 5158 return SDValue(); 5159 5160 case ISD::BUILD_VECTOR: { 5161 // The BUILD_VECTOR node must be a splat. 
5162 if (!isSplatVector(Op.getNode())) 5163 return SDValue(); 5164 5165 Ld = Op.getOperand(0); 5166 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5167 Ld.getOpcode() == ISD::ConstantFP); 5168 5169 // The suspected load node has several users. Make sure that all 5170 // of its users are from the BUILD_VECTOR node. 5171 // Constants may have multiple users. 5172 if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 5173 return SDValue(); 5174 break; 5175 } 5176 5177 case ISD::VECTOR_SHUFFLE: { 5178 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5179 5180 // Shuffles must have a splat mask where the first element is 5181 // broadcasted. 5182 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5183 return SDValue(); 5184 5185 SDValue Sc = Op.getOperand(0); 5186 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5187 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5188 5189 if (!Subtarget->hasInt256()) 5190 return SDValue(); 5191 5192 // Use the register form of the broadcast instruction available on AVX2. 5193 if (VT.is256BitVector()) 5194 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5195 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5196 } 5197 5198 Ld = Sc.getOperand(0); 5199 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5200 Ld.getOpcode() == ISD::ConstantFP); 5201 5202 // The scalar_to_vector node and the suspected 5203 // load node must have exactly one user. 5204 // Constants may have multiple users. 5205 if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) 5206 return SDValue(); 5207 break; 5208 } 5209 } 5210 5211 bool Is256 = VT.is256BitVector(); 5212 5213 // Handle the broadcasting a single constant scalar from the constant pool 5214 // into a vector. On Sandybridge it is still better to load a constant vector 5215 // from the constant pool and not to broadcast it from a scalar. 5216 if (ConstSplatVal && Subtarget->hasInt256()) { 5217 EVT CVT = Ld.getValueType(); 5218 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5219 unsigned ScalarSize = CVT.getSizeInBits(); 5220 5221 if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { 5222 const Constant *C = 0; 5223 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5224 C = CI->getConstantIntValue(); 5225 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5226 C = CF->getConstantFPValue(); 5227 5228 assert(C && "Invalid constant type"); 5229 5230 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 5231 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5232 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 5233 MachinePointerInfo::getConstantPool(), 5234 false, false, false, Alignment); 5235 5236 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5237 } 5238 } 5239 5240 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5241 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5242 5243 // Handle AVX2 in-register broadcasts. 5244 if (!IsLoad && Subtarget->hasInt256() && 5245 (ScalarSize == 32 || (Is256 && ScalarSize == 64))) 5246 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5247 5248 // The scalar source must be a normal load. 
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match f64: there is no vbroadcastsd with an xmm destination.
  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}

SDValue
X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Skip if insert_vec_elt is not supported.
  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  DebugLoc DL = Op.getDebugLoc();
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than one element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if extracted from a vector of a different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    // Quit if the index is non-constant.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();

    if (VecIn1.getNode() == 0)
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (VecIn2.getNode() == 0)
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (VecIn1.getNode() == 0)
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
    unsigned Idx = InsertIndices[i];
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx));
  }

  return NV;
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  MVT VT = Op.getValueType().getSimpleVT();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Vectors containing all zeros can be matched by pxor and xorps later
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5352 if (VT == MVT::v4i32 || VT == MVT::v8i32) 5353 return Op; 5354 5355 return getZeroVector(VT, Subtarget, DAG, dl); 5356 } 5357 5358 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5359 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5360 // vpcmpeqd on 256-bit vectors. 5361 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 5362 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 5363 return Op; 5364 5365 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 5366 } 5367 5368 SDValue Broadcast = LowerVectorBroadcast(Op, DAG); 5369 if (Broadcast.getNode()) 5370 return Broadcast; 5371 5372 unsigned EVTBits = ExtVT.getSizeInBits(); 5373 5374 unsigned NumZero = 0; 5375 unsigned NumNonZero = 0; 5376 unsigned NonZeros = 0; 5377 bool IsAllConstants = true; 5378 SmallSet<SDValue, 8> Values; 5379 for (unsigned i = 0; i < NumElems; ++i) { 5380 SDValue Elt = Op.getOperand(i); 5381 if (Elt.getOpcode() == ISD::UNDEF) 5382 continue; 5383 Values.insert(Elt); 5384 if (Elt.getOpcode() != ISD::Constant && 5385 Elt.getOpcode() != ISD::ConstantFP) 5386 IsAllConstants = false; 5387 if (X86::isZeroNode(Elt)) 5388 NumZero++; 5389 else { 5390 NonZeros |= (1 << i); 5391 NumNonZero++; 5392 } 5393 } 5394 5395 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5396 if (NumNonZero == 0) 5397 return DAG.getUNDEF(VT); 5398 5399 // Special case for single non-zero, non-undef, element. 5400 if (NumNonZero == 1) { 5401 unsigned Idx = CountTrailingZeros_32(NonZeros); 5402 SDValue Item = Op.getOperand(Idx); 5403 5404 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5405 // the value are obviously zero, truncate the value to i32 and do the 5406 // insertion that way. Only do this if the value is non-constant or if the 5407 // value is a constant being inserted into element 0. It is cheaper to do 5408 // a constant pool load than it is to do a movd + shuffle. 5409 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5410 (!IsAllConstants || Idx == 0)) { 5411 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5412 // Handle SSE only. 5413 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5414 EVT VecVT = MVT::v4i32; 5415 unsigned VecElts = 4; 5416 5417 // Truncate the value (which may itself be a constant) to i32, and 5418 // convert it to a vector with movd (S2V+shuffle to zero extend). 5419 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5420 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5421 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5422 5423 // Now we have our 32-bit value zero extended in the low element of 5424 // a vector. If Idx != 0, swizzle it into place. 5425 if (Idx != 0) { 5426 SmallVector<int, 4> Mask; 5427 Mask.push_back(Idx); 5428 for (unsigned i = 1; i != VecElts; ++i) 5429 Mask.push_back(i); 5430 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 5431 &Mask[0]); 5432 } 5433 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5434 } 5435 } 5436 5437 // If we have a constant or non-constant insertion into the low element of 5438 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5439 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5440 // depending on what the source datatype is. 
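    // For example, v4f32 <x, 0, 0, 0> becomes a MOVSS of a zero vector and
    // (scalar_to_vector x).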
5441 if (Idx == 0) { 5442 if (NumZero == 0) 5443 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5444 5445 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5446 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5447 if (VT.is256BitVector()) { 5448 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5449 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5450 Item, DAG.getIntPtrConstant(0)); 5451 } 5452 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5453 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5454 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5455 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5456 } 5457 5458 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5459 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5460 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5461 if (VT.is256BitVector()) { 5462 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5463 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5464 } else { 5465 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5466 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5467 } 5468 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5469 } 5470 } 5471 5472 // Is it a vector logical left shift? 5473 if (NumElems == 2 && Idx == 1 && 5474 X86::isZeroNode(Op.getOperand(0)) && 5475 !X86::isZeroNode(Op.getOperand(1))) { 5476 unsigned NumBits = VT.getSizeInBits(); 5477 return getVShift(true, VT, 5478 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5479 VT, Op.getOperand(1)), 5480 NumBits/2, DAG, *this, dl); 5481 } 5482 5483 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5484 return SDValue(); 5485 5486 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5487 // is a non-constant being inserted into an element other than the low one, 5488 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5489 // movd/movss) to move this into the low element, then shuffle it into 5490 // place. 5491 if (EVTBits == 32) { 5492 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5493 5494 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5495 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5496 SmallVector<int, 8> MaskVec; 5497 for (unsigned i = 0; i != NumElems; ++i) 5498 MaskVec.push_back(i == Idx ? 0 : 1); 5499 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5500 } 5501 } 5502 5503 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5504 if (Values.size() == 1) { 5505 if (EVTBits == 32) { 5506 // Instead of a shuffle like this: 5507 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5508 // Check if it's possible to issue this instead. 5509 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5510 unsigned Idx = CountTrailingZeros_32(NonZeros); 5511 SDValue Item = Op.getOperand(Idx); 5512 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5513 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5514 } 5515 return SDValue(); 5516 } 5517 5518 // A vector full of immediates; various special cases are already 5519 // handled, so this is best done with a single constant-pool load. 5520 if (IsAllConstants) 5521 return SDValue(); 5522 5523 // For AVX-length vectors, build the individual 128-bit pieces and use 5524 // shuffles to put them in place. 
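  // For example, a v8i32 build_vector is split into two v4i32 halves that
  // are then recombined with VINSERTF128.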
5525 if (VT.is256BitVector()) { 5526 SmallVector<SDValue, 32> V; 5527 for (unsigned i = 0; i != NumElems; ++i) 5528 V.push_back(Op.getOperand(i)); 5529 5530 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5531 5532 // Build both the lower and upper subvector. 5533 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 5534 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 5535 NumElems/2); 5536 5537 // Recreate the wider vector with the lower and upper part. 5538 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 5539 } 5540 5541 // Let legalizer expand 2-wide build_vectors. 5542 if (EVTBits == 64) { 5543 if (NumNonZero == 1) { 5544 // One half is zero or undef. 5545 unsigned Idx = CountTrailingZeros_32(NonZeros); 5546 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5547 Op.getOperand(Idx)); 5548 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 5549 } 5550 return SDValue(); 5551 } 5552 5553 // If element VT is < 32 bits, convert it to inserts into a zero vector. 5554 if (EVTBits == 8 && NumElems == 16) { 5555 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5556 Subtarget, *this); 5557 if (V.getNode()) return V; 5558 } 5559 5560 if (EVTBits == 16 && NumElems == 8) { 5561 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5562 Subtarget, *this); 5563 if (V.getNode()) return V; 5564 } 5565 5566 // If element VT is == 32 bits, turn it into a number of shuffles. 5567 SmallVector<SDValue, 8> V(NumElems); 5568 if (NumElems == 4 && NumZero > 0) { 5569 for (unsigned i = 0; i < 4; ++i) { 5570 bool isZero = !(NonZeros & (1 << i)); 5571 if (isZero) 5572 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 5573 else 5574 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5575 } 5576 5577 for (unsigned i = 0; i < 2; ++i) { 5578 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5579 default: break; 5580 case 0: 5581 V[i] = V[i*2]; // Must be a zero vector. 5582 break; 5583 case 1: 5584 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5585 break; 5586 case 2: 5587 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5588 break; 5589 case 3: 5590 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5591 break; 5592 } 5593 } 5594 5595 bool Reverse1 = (NonZeros & 0x3) == 2; 5596 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5597 int MaskVec[] = { 5598 Reverse1 ? 1 : 0, 5599 Reverse1 ? 0 : 1, 5600 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 5601 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 5602 }; 5603 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5604 } 5605 5606 if (Values.size() > 1 && VT.is128BitVector()) { 5607 // Check for a build vector of consecutive loads. 5608 for (unsigned i = 0; i < NumElems; ++i) 5609 V[i] = Op.getOperand(i); 5610 5611 // Check for elements which are consecutive loads. 5612 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5613 if (LD.getNode()) 5614 return LD; 5615 5616 // Check for a build vector from mostly shuffle plus few inserting. 5617 SDValue Sh = buildFromShuffleMostly(Op, DAG); 5618 if (Sh.getNode()) 5619 return Sh; 5620 5621 // For SSE 4.1, use insertps to put the high elements into the low element. 
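    // For example, v4f32 <a,b,c,d> becomes (scalar_to_vector a) followed by
    // three INSERT_VECTOR_ELTs, each of which can be matched as an insertps.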
5622 if (getSubtarget()->hasSSE41()) { 5623 SDValue Result; 5624 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5625 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5626 else 5627 Result = DAG.getUNDEF(VT); 5628 5629 for (unsigned i = 1; i < NumElems; ++i) { 5630 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5631 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5632 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5633 } 5634 return Result; 5635 } 5636 5637 // Otherwise, expand into a number of unpckl*, start by extending each of 5638 // our (non-undef) elements to the full vector width with the element in the 5639 // bottom slot of the vector (which generates no code for SSE). 5640 for (unsigned i = 0; i < NumElems; ++i) { 5641 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5642 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5643 else 5644 V[i] = DAG.getUNDEF(VT); 5645 } 5646 5647 // Next, we iteratively mix elements, e.g. for v4f32: 5648 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5649 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5650 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5651 unsigned EltStride = NumElems >> 1; 5652 while (EltStride != 0) { 5653 for (unsigned i = 0; i < EltStride; ++i) { 5654 // If V[i+EltStride] is undef and this is the first round of mixing, 5655 // then it is safe to just drop this shuffle: V[i] is already in the 5656 // right place, the one element (since it's the first round) being 5657 // inserted as undef can be dropped. This isn't safe for successive 5658 // rounds because they will permute elements within both vectors. 5659 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5660 EltStride == NumElems/2) 5661 continue; 5662 5663 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5664 } 5665 EltStride >>= 1; 5666 } 5667 return V[0]; 5668 } 5669 return SDValue(); 5670 } 5671 5672 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5673 // to create 256-bit vectors from two other 128-bit ones. 5674 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5675 DebugLoc dl = Op.getDebugLoc(); 5676 MVT ResVT = Op.getValueType().getSimpleVT(); 5677 5678 assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); 5679 5680 SDValue V1 = Op.getOperand(0); 5681 SDValue V2 = Op.getOperand(1); 5682 unsigned NumElems = ResVT.getVectorNumElements(); 5683 5684 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 5685 } 5686 5687 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5688 assert(Op.getNumOperands() == 2); 5689 5690 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5691 // from two other 128-bit ones. 5692 return LowerAVXCONCAT_VECTORS(Op, DAG); 5693 } 5694 5695 // Try to lower a shuffle node into a simple blend instruction. 5696 static SDValue 5697 LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, 5698 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 5699 SDValue V1 = SVOp->getOperand(0); 5700 SDValue V2 = SVOp->getOperand(1); 5701 DebugLoc dl = SVOp->getDebugLoc(); 5702 MVT VT = SVOp->getValueType(0).getSimpleVT(); 5703 MVT EltVT = VT.getVectorElementType(); 5704 unsigned NumElems = VT.getVectorNumElements(); 5705 5706 if (!Subtarget->hasSSE41() || EltVT == MVT::i8) 5707 return SDValue(); 5708 if (!Subtarget->hasInt256() && VT == MVT::v16i16) 5709 return SDValue(); 5710 5711 // Check the mask for BLEND and build the value. 
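  // For example, the v4i32 mask <0,5,2,7> takes elements 1 and 3 from V2,
  // yielding the blend immediate 0b1010.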
  unsigned MaskValue = 0;
  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
  unsigned NumLanes = (NumElems-1)/8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;

  // Blend for v16i16 should be symmetric for both lanes.
  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    int SndLaneEltIdx = (NumLanes == 2) ?
      SVOp->getMaskElt(i + NumElemsInLane) : -1;
    int EltIdx = SVOp->getMaskElt(i);

    if ((EltIdx < 0 || EltIdx == (int)i) &&
        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
      continue;

    if (((unsigned)EltIdx == (i + NumElems)) &&
        (SndLaneEltIdx < 0 ||
         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
      MaskValue |= (1<<i);
    else
      return SDValue();
  }

  // Convert i32 vectors to floating point if AVX2 is not available.
  // AVX2 introduced the VPBLENDD instruction for 128 and 256-bit vectors.
  MVT BlendVT = VT;
  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
                               NumElems);
    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
  }

  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
                            DAG.getConstant(MaskValue, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}

// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all]   pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static SDValue
LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than one of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs. Undef mask values count as coming from any quadword, for better
  // codegen.
  unsigned LoQuad[] = { 0, 0, 0, 0 };
  unsigned HiQuad[] = { 0, 0, 0, 0 };
  std::bitset<4> InputQuads;
  for (unsigned i = 0; i < 8; ++i) {
    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
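  // For example, if the result words draw only on quad 0 (from V1) and
  // quad 3 (from V2), a single v2i64 shuffle packs those two quads into one
  // register so that one pshufb can finish the job.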
5808 bool V1Used = InputQuads[0] || InputQuads[1]; 5809 bool V2Used = InputQuads[2] || InputQuads[3]; 5810 if (Subtarget->hasSSSE3()) { 5811 if (InputQuads.count() == 2 && V1Used && V2Used) { 5812 BestLoQuad = InputQuads[0] ? 0 : 1; 5813 BestHiQuad = InputQuads[2] ? 2 : 3; 5814 } 5815 if (InputQuads.count() > 2) { 5816 BestLoQuad = -1; 5817 BestHiQuad = -1; 5818 } 5819 } 5820 5821 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5822 // the shuffle mask. If a quad is scored as -1, that means that it contains 5823 // words from all 4 input quadwords. 5824 SDValue NewV; 5825 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5826 int MaskV[] = { 5827 BestLoQuad < 0 ? 0 : BestLoQuad, 5828 BestHiQuad < 0 ? 1 : BestHiQuad 5829 }; 5830 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5831 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5832 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5833 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5834 5835 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5836 // source words for the shuffle, to aid later transformations. 5837 bool AllWordsInNewV = true; 5838 bool InOrder[2] = { true, true }; 5839 for (unsigned i = 0; i != 8; ++i) { 5840 int idx = MaskVals[i]; 5841 if (idx != (int)i) 5842 InOrder[i/4] = false; 5843 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5844 continue; 5845 AllWordsInNewV = false; 5846 break; 5847 } 5848 5849 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5850 if (AllWordsInNewV) { 5851 for (int i = 0; i != 8; ++i) { 5852 int idx = MaskVals[i]; 5853 if (idx < 0) 5854 continue; 5855 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5856 if ((idx != i) && idx < 4) 5857 pshufhw = false; 5858 if ((idx != i) && idx > 3) 5859 pshuflw = false; 5860 } 5861 V1 = NewV; 5862 V2Used = false; 5863 BestLoQuad = 0; 5864 BestHiQuad = 1; 5865 } 5866 5867 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5868 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5869 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5870 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5871 unsigned TargetMask = 0; 5872 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5873 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5874 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5875 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): 5876 getShufflePSHUFLWImmediate(SVOp); 5877 V1 = NewV.getOperand(0); 5878 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5879 } 5880 } 5881 5882 // Promote splats to a larger type which usually leads to more efficient code. 5883 // FIXME: Is this true if pshufb is available? 5884 if (SVOp->isSplat()) 5885 return PromoteSplat(SVOp, DAG); 5886 5887 // If we have SSSE3, and all words of the result are from 1 input vector, 5888 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5889 // is present, fall back to case 4. 5890 if (Subtarget->hasSSSE3()) { 5891 SmallVector<SDValue,16> pshufbMask; 5892 5893 // If we have elements from both input vectors, set the high bit of the 5894 // shuffle mask element to zero out elements that come from V2 in the V1 5895 // mask, and elements that come from V1 in the V2 mask, so that the two 5896 // results can be OR'd together. 5897 bool TwoInputs = V1Used && V2Used; 5898 for (unsigned i = 0; i != 8; ++i) { 5899 int EltIdx = MaskVals[i] * 2; 5900 int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 
0x80 : EltIdx; 5901 int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; 5902 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 5903 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 5904 } 5905 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5906 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5907 DAG.getNode(ISD::BUILD_VECTOR, dl, 5908 MVT::v16i8, &pshufbMask[0], 16)); 5909 if (!TwoInputs) 5910 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5911 5912 // Calculate the shuffle mask for the second input, shuffle it, and 5913 // OR it with the first shuffled input. 5914 pshufbMask.clear(); 5915 for (unsigned i = 0; i != 8; ++i) { 5916 int EltIdx = MaskVals[i] * 2; 5917 int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; 5918 int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; 5919 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 5920 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 5921 } 5922 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5923 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5924 DAG.getNode(ISD::BUILD_VECTOR, dl, 5925 MVT::v16i8, &pshufbMask[0], 16)); 5926 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5927 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5928 } 5929 5930 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5931 // and update MaskVals with new element order. 5932 std::bitset<8> InOrder; 5933 if (BestLoQuad >= 0) { 5934 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 5935 for (int i = 0; i != 4; ++i) { 5936 int idx = MaskVals[i]; 5937 if (idx < 0) { 5938 InOrder.set(i); 5939 } else if ((idx / 4) == BestLoQuad) { 5940 MaskV[i] = idx & 3; 5941 InOrder.set(i); 5942 } 5943 } 5944 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5945 &MaskV[0]); 5946 5947 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5948 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5949 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5950 NewV.getOperand(0), 5951 getShufflePSHUFLWImmediate(SVOp), DAG); 5952 } 5953 } 5954 5955 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5956 // and update MaskVals with the new element order. 5957 if (BestHiQuad >= 0) { 5958 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 5959 for (unsigned i = 4; i != 8; ++i) { 5960 int idx = MaskVals[i]; 5961 if (idx < 0) { 5962 InOrder.set(i); 5963 } else if ((idx / 4) == BestHiQuad) { 5964 MaskV[i] = (idx & 3) + 4; 5965 InOrder.set(i); 5966 } 5967 } 5968 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5969 &MaskV[0]); 5970 5971 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5972 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5973 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5974 NewV.getOperand(0), 5975 getShufflePSHUFHWImmediate(SVOp), DAG); 5976 } 5977 } 5978 5979 // In case BestHi & BestLo were both -1, which means each quadword has a word 5980 // from each of the four input quadwords, calculate the InOrder bitvector now 5981 // before falling through to the insert/extract cleanup. 5982 if (BestLoQuad == -1 && BestHiQuad == -1) { 5983 NewV = V1; 5984 for (int i = 0; i != 8; ++i) 5985 if (MaskVals[i] < 0 || MaskVals[i] == i) 5986 InOrder.set(i); 5987 } 5988 5989 // The other elements are put in the right place using pextrw and pinsrw. 
5990 for (unsigned i = 0; i != 8; ++i) { 5991 if (InOrder[i]) 5992 continue; 5993 int EltIdx = MaskVals[i]; 5994 if (EltIdx < 0) 5995 continue; 5996 SDValue ExtOp = (EltIdx < 8) ? 5997 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5998 DAG.getIntPtrConstant(EltIdx)) : 5999 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 6000 DAG.getIntPtrConstant(EltIdx - 8)); 6001 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 6002 DAG.getIntPtrConstant(i)); 6003 } 6004 return NewV; 6005 } 6006 6007 // v16i8 shuffles - Prefer shuffles in the following order: 6008 // 1. [ssse3] 1 x pshufb 6009 // 2. [ssse3] 2 x pshufb + 1 x por 6010 // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 6011 static 6012 SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 6013 SelectionDAG &DAG, 6014 const X86TargetLowering &TLI) { 6015 SDValue V1 = SVOp->getOperand(0); 6016 SDValue V2 = SVOp->getOperand(1); 6017 DebugLoc dl = SVOp->getDebugLoc(); 6018 ArrayRef<int> MaskVals = SVOp->getMask(); 6019 6020 // Promote splats to a larger type which usually leads to more efficient code. 6021 // FIXME: Is this true if pshufb is available? 6022 if (SVOp->isSplat()) 6023 return PromoteSplat(SVOp, DAG); 6024 6025 // If we have SSSE3, case 1 is generated when all result bytes come from 6026 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 6027 // present, fall back to case 3. 6028 6029 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 6030 if (TLI.getSubtarget()->hasSSSE3()) { 6031 SmallVector<SDValue,16> pshufbMask; 6032 6033 // If all result elements are from one input vector, then only translate 6034 // undef mask values to 0x80 (zero out result) in the pshufb mask. 6035 // 6036 // Otherwise, we have elements from both input vectors, and must zero out 6037 // elements that come from V2 in the first mask, and V1 in the second mask 6038 // so that we can OR them together. 6039 for (unsigned i = 0; i != 16; ++i) { 6040 int EltIdx = MaskVals[i]; 6041 if (EltIdx < 0 || EltIdx >= 16) 6042 EltIdx = 0x80; 6043 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6044 } 6045 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 6046 DAG.getNode(ISD::BUILD_VECTOR, dl, 6047 MVT::v16i8, &pshufbMask[0], 16)); 6048 6049 // As PSHUFB will zero elements with negative indices, it's safe to ignore 6050 // the 2nd operand if it's undefined or zero. 6051 if (V2.getOpcode() == ISD::UNDEF || 6052 ISD::isBuildVectorAllZeros(V2.getNode())) 6053 return V1; 6054 6055 // Calculate the shuffle mask for the second input, shuffle it, and 6056 // OR it with the first shuffled input. 6057 pshufbMask.clear(); 6058 for (unsigned i = 0; i != 16; ++i) { 6059 int EltIdx = MaskVals[i]; 6060 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; 6061 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6062 } 6063 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 6064 DAG.getNode(ISD::BUILD_VECTOR, dl, 6065 MVT::v16i8, &pshufbMask[0], 16)); 6066 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 6067 } 6068 6069 // No SSSE3 - Calculate in place words and then fix all out of place words 6070 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 6071 // the 16 different words that comprise the two doublequadword input vectors. 
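  // Each result word is rebuilt from its two source bytes: extract the word
  // containing each byte, shift or mask the byte into position, OR the two
  // halves together, and pinsrw the result into place. For example, the
  // result byte pair (5,2) becomes
  // ((pextrw word 2) >> 8) | ((pextrw word 1) << 8).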
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue NewV = V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if ((Elt0 == i*2) && (Elt1 == i*2+1))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
    // together with a single extract, extract the word and insert it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an OR.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8,
                                 TLI.getShiftAmountTy(InsElt.getValueType())));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8,
                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}

// v32i8 shuffles - Translate to VPSHUFB if possible.
static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                 const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getValueType(0).getSimpleVT();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());

  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // VPSHUFB may be generated if
  // (1) one of the input vectors is undefined or a zeroinitializer.
  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane boundary.
  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
    return SDValue();

  if (V1IsAllZero && !V2IsAllZero) {
    CommuteVectorShuffleMask(MaskVals, 32);
    V1 = V2;
  }
  SmallVector<SDValue, 32> pshufbMask;
  for (unsigned i = 0; i != 32; i++) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0 || EltIdx >= 32)
      EltIdx = 0x80;
    else {
      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
        // Crossing lanes is not allowed.
        return SDValue();
      EltIdx &= 0xf;
    }
    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
  }
  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v32i8, &pshufbMask[0], 32));
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements points to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getValueType(0).getSimpleVT();
  DebugLoc dl = SVOp->getDebugLoc();
  unsigned NumElems = VT.getVectorNumElements();
  MVT NewVT;
  unsigned Scale;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected!");
  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
  }

  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i != NumElems; i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx < 0)
        StartIdx = (EltIdx / Scale);
      if (EltIdx != (int)(StartIdx*Scale + j))
        return SDValue();
    }
    MaskVec.push_back(StartIdx);
  }

  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, EVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, DebugLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ?
                                      MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BITCAST, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                        .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BITCAST, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles that
/// could not be matched by any known target-specific shuffle.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {

  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getValueType(0).getSimpleVT();

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLaneElems = NumElems / 2;

  DebugLoc dl = SVOp->getDebugLoc();
  MVT EltVT = VT.getVectorElementType();
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    bool UseBuildVector = false;
    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
    unsigned LaneStart = l * NumLaneElems;
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      // The mask element. This indexes into the input.
      int Idx = SVOp->getMaskElt(i+LaneStart);
      if (Idx < 0) {
        // The mask element does not index into any input vector.
        Mask.push_back(-1);
        continue;
      }

      // The input vector this mask element indexes into.
      int Input = Idx / NumLaneElems;

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NumLaneElems;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input)
          // This input vector is already an operand.
          break;
        if (InputUsed[OpNo] < 0) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used! Give up on trying to create a
        // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Mask.push_back(Idx + OpNo * NumLaneElems);
    }

    if (UseBuildVector) {
      SmallVector<SDValue, 16> SVOps;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
        // The mask element. This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
        if (Idx < 0) {
          SVOps.push_back(DAG.getUNDEF(EltVT));
          continue;
        }

        // The input vector this mask element indexes into.
        int Input = Idx / NumElems;

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NumElems;

        // Extract the vector element by hand.
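        // E.g. in a v8i32 shuffle, mask element 9 means element 1 of
        // operand 1 is extracted here.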
        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    SVOp->getOperand(Input),
                                    DAG.getIntPtrConstant(Idx)));
      }

      // Construct the output using a BUILD_VECTOR.
      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
                              SVOps.size());
    } else if (InputUsed[0] < 0) {
      // No input vectors were used! The result is undefined.
      Output[l] = DAG.getUNDEF(NVT);
    } else {
      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
                                        (InputUsed[0] % 2) * NumLaneElems,
                                        DAG, dl);
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
      // At least one input vector was used. Create a new shuffle vector.
      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
    }

    Mask.clear();
  }

  // Concatenate the result back.
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}

/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  MVT VT = SVOp->getValueType(0).getSimpleVT();

  assert(VT.is128BitVector() && "Unsupported vector size");

  std::pair<int, int> Locs[4];
  int Mask1[] = { -1, -1, -1, -1 };
  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector, this can be
    // implemented with two shuffles. The first shuffle gathers the elements;
    // the second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    int Mask2[] = { -1, -1, -1, -1 };

    for (unsigned i = 0; i != 4; ++i)
      if (Locs[i].first != -1) {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  }

  if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y. First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes
    // don't matter). Then, use a shufps to build the final vector, taking the
    // half containing the element from Y from the intermediate, and the other
    // half from X.
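    // For example, for mask <0,1,2,4> the first shufps builds the
    // intermediate <Y0,u,X2,u> (mask <4,u,2,u>), and the second shufps of X
    // with that intermediate (mask <0,1,6,4>) produces <X0,X1,X2,Y0>.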
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, 4);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    }

    Mask1[0] = HiIndex & 1 ? 2 : 0;
    Mask1[1] = HiIndex & 1 ? 0 : 2;
    Mask1[2] = PermMask[2];
    Mask1[3] = PermMask[3];
    if (Mask1[2] >= 0)
      Mask1[2] += 4;
    if (Mask1[3] >= 0)
      Mask1[3] += 4;
    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  int LoMask[] = { -1, -1, -1, -1 };
  int HiMask[] = { -1, -1, -1, -1 };

  int *MaskPtr = LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      MaskPtr[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      MaskPtr[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  int MaskOps[] = { -1, -1, -1, -1 };
  for (unsigned i = 0; i != 4; ++i)
    if (Locs[i].first != -1)
      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

static bool MayFoldVectorLoad(SDValue V) {
  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    V = V.getOperand(0);
  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
    // BUILD_VECTOR (load), undef
    V = V.getOperand(0);

  return MayFoldLoad(V);
}

static
SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // Canonicalize to v2f64.
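  // MOVDDUP duplicates the low 64-bit lane, so with the bitcasts around it a
  // single v2f64 node also covers the v2i64 case.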
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
                                          V1, DAG));
}

static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
                        bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();

  assert(VT != MVT::v2i64 && "unsupported shuffle type");

  if (HasSSE2 && VT == MVT::v2f64)
    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);

  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
}

static
SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();

  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
         "unsupported shuffle type");

  if (V2.getOpcode() == ISD::UNDEF)
    V2 = V1;

  // v4i32 or v4f32
  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}

static
SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
  // operand of these instructions is only memory, so check if there's a
  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
  // same masks.
  bool CanFoldLoad = false;

  // Trivial case, when V2 comes from a load.
  if (MayFoldVectorLoad(V2))
    CanFoldLoad = true;

  // When V1 is a load, it can be folded later into a store in isel, example:
  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
  // turns into:
  //  (MOVLPSmr addr:$src1, VR128:$src2)
  // So, recognize this potential and also use MOVLPS or MOVLPD.
  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
    CanFoldLoad = true;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  if (CanFoldLoad) {
    if (HasSSE2 && NumElems == 2)
      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);

    if (NumElems == 4)
      // If we don't care about the second element, proceed to use movss.
      if (SVOp->getMaskElt(1) != -1)
        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
  }

  // Both movl and movlp match v2i64, but v2i64 is never matched by movl
  // earlier, because we keep that check strict to avoid interfering with the
  // movlp load-folding logic (see the code above the getMOVLP call). Match it
  // here instead; this is horrible, but will stay like this until we move all
  // shuffle matching to x86-specific nodes. Note that for the 1st condition
  // all types are matched with movsd.
  if (HasSSE2) {
    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
    // so as to remove this logic from here, as much as possible.
    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
  }

  assert(VT != MVT::v4i32 && "unsupported shuffle type");

  // Invert the operand order and use SHUFPS to match it.
  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
                              getShuffleSHUFImmediate(SVOp), DAG);
}

// Reduce a vector shuffle to zext.
SDValue
X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
  // PMOVZX is only available from SSE41.
  if (!Subtarget->hasSSE41())
    return SDValue();

  EVT VT = Op.getValueType();

  // Only AVX2 supports 256-bit vector integer extension.
  if (!Subtarget->hasInt256() && VT.is256BitVector())
    return SDValue();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  DebugLoc DL = Op.getDebugLoc();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();

  // Extension is a unary operation, and the element type of the source
  // vector must be smaller than i64.
  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
      VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
  while ((1U << Shift) < NumElems) {
    if (SVOp->getMaskElt(1U << Shift) == 1)
      break;
    Shift += 1;
    // The maximal ratio is 8, i.e. from i8 to i64.
    if (Shift > 3)
      return SDValue();
  }

  // Check the shuffle mask.
  unsigned Mask = (1U << Shift) - 1;
  for (unsigned i = 0; i != NumElems; ++i) {
    int EltIdx = SVOp->getMaskElt(i);
    if ((i & Mask) != 0 && EltIdx != -1)
      return SDValue();
    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
      return SDValue();
  }

  LLVMContext *Context = DAG.getContext();
  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);

  if (!isTypeLegal(NVT))
    return SDValue();

  // Simplify the operand, as it's prepared to be fed into a shuffle.
  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
  if (V1.getOpcode() == ISD::BITCAST &&
      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V1.getOperand(0)
        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
    ConstantSDNode *CIdx =
      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
    // If it's foldable, i.e. a normal load with a single use, we will let
    // code selection fold it. Otherwise, we will shorten the conversion
    // sequence.
    if (CIdx && CIdx->getZExtValue() == 0 &&
        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
        // The "ext_vec_elt" node is wider than the result node.
6705 // In this case we should extract subvector from V. 6706 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). 6707 unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); 6708 EVT FullVT = V.getValueType(); 6709 EVT SubVecVT = EVT::getVectorVT(*Context, 6710 FullVT.getVectorElementType(), 6711 FullVT.getVectorNumElements()/Ratio); 6712 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, 6713 DAG.getIntPtrConstant(0)); 6714 } 6715 V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); 6716 } 6717 } 6718 6719 return DAG.getNode(ISD::BITCAST, DL, VT, 6720 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); 6721 } 6722 6723 SDValue 6724 X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { 6725 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6726 MVT VT = Op.getValueType().getSimpleVT(); 6727 DebugLoc dl = Op.getDebugLoc(); 6728 SDValue V1 = Op.getOperand(0); 6729 SDValue V2 = Op.getOperand(1); 6730 6731 if (isZeroShuffle(SVOp)) 6732 return getZeroVector(VT, Subtarget, DAG, dl); 6733 6734 // Handle splat operations 6735 if (SVOp->isSplat()) { 6736 // Use vbroadcast whenever the splat comes from a foldable load 6737 SDValue Broadcast = LowerVectorBroadcast(Op, DAG); 6738 if (Broadcast.getNode()) 6739 return Broadcast; 6740 } 6741 6742 // Check integer expanding shuffles. 6743 SDValue NewOp = LowerVectorIntExtend(Op, DAG); 6744 if (NewOp.getNode()) 6745 return NewOp; 6746 6747 // If the shuffle can be profitably rewritten as a narrower shuffle, then 6748 // do it! 6749 if (VT == MVT::v8i16 || VT == MVT::v16i8 || 6750 VT == MVT::v16i16 || VT == MVT::v32i8) { 6751 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 6752 if (NewOp.getNode()) 6753 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 6754 } else if ((VT == MVT::v4i32 || 6755 (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 6756 // FIXME: Figure out a cleaner way to do this. 6757 // Try to make use of movq to zero out the top part. 6758 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6759 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 6760 if (NewOp.getNode()) { 6761 MVT NewVT = NewOp.getValueType().getSimpleVT(); 6762 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 6763 NewVT, true, false)) 6764 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), 6765 DAG, Subtarget, dl); 6766 } 6767 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6768 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 6769 if (NewOp.getNode()) { 6770 MVT NewVT = NewOp.getValueType().getSimpleVT(); 6771 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 6772 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), 6773 DAG, Subtarget, dl); 6774 } 6775 } 6776 } 6777 return SDValue(); 6778 } 6779 6780 SDValue 6781 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6782 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6783 SDValue V1 = Op.getOperand(0); 6784 SDValue V2 = Op.getOperand(1); 6785 MVT VT = Op.getValueType().getSimpleVT(); 6786 DebugLoc dl = Op.getDebugLoc(); 6787 unsigned NumElems = VT.getVectorNumElements(); 6788 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6789 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6790 bool V1IsSplat = false; 6791 bool V2IsSplat = false; 6792 bool HasSSE2 = Subtarget->hasSSE2(); 6793 bool HasFp256 = Subtarget->hasFp256(); 6794 bool HasInt256 = Subtarget->hasInt256(); 6795 MachineFunction &MF = DAG.getMachineFunction(); 6796 bool OptForSize = MF.getFunction()->getAttributes(). 
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");

  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");

  // Vector shuffle lowering takes 3 steps:
  //
  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
  //    narrowing and commutation of operands should be handled.
  // 2) Matching of shuffles with known shuffle masks to x86 target specific
  //    shuffle nodes.
  // 3) Rewriting of unmatched masks into new generic shuffle operations,
  //    so the shuffle can be broken into other shuffles and the legalizer can
  //    try the lowering again.
  //
  // The general idea is that no vector_shuffle operation should be left to
  // be matched during isel, all of them must be converted to a target
  // specific node here.

  // Normalize the input vectors. Here splats, zeroed vectors, profitable
  // narrowing and commutation of operands should be handled. The actual code
  // doesn't include all of those, work in progress...
  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
  if (NewOp.getNode())
    return NewOp;

  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());

  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
  // unpckh_undef). Only use pshufd if speed is more important than size.
  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
      V2IsUndef && MayFoldVectorLoad(V1))
    return getMOVDDup(Op, dl, V1, DAG);

  if (isMOVHLPS_v_undef_Mask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  // Used to match splats.
  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
      (VT == MVT::v2f64 || VT == MVT::v2i64))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isPSHUFDMask(M, VT)) {
    // The actual implementation will match the mask in the if above, and
    // then during isel it can match several different instructions, not only
    // pshufd as its name says. Sad but true; emulate the behavior for now...
    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);

    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);

    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);

    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
                                  DAG);

    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
                                TargetMask, DAG);
  }

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (isMOVLMask(M, VT)) {
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMOVLPMask(M, VT)) {
      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);

      if (VT == MVT::v4i32 || VT == MVT::v4f32)
        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
    }
  }

  // FIXME: fold these into legal mask.
  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);

  if (isMOVHLPSMask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);

  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);

  if (isMOVLPMask(M, VT))
    return getMOVLP(Op, dl, DAG, HasSSE2);

  if (ShouldXformToMOVHLPS(M, VT) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshldq / vsrldq.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = true;
  }

  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (isUNPCKLMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

  if (isUNPCKHMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);

  if (V2IsSplat) {
    // Normalize the mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again. If a match is found,
    // return a new vector_shuffle with the corrected mask.
    SmallVector<int, 8> NewMask(M.begin(), M.end());
    NormalizeMask(NewMask, NumElems);
    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  if (Commuted) {
    // Commute it back and try unpck* again.
    // FIXME: this seems wrong.
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = false;

    if (isUNPCKLMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

    if (isUNPCKHMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  // Normalize the node to match x86 shuffle ops if needed.
  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
    return CommuteVectorShuffle(SVOp, DAG);

  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here right now to enable us to directly emit target specific
  // nodes, and remove one by one until they don't return Op anymore.

  if (isPALIGNRMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
                                getShufflePALIGNRImmediate(SVOp),
                                DAG);

  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
      SVOp->getSplatIndex() == 0 && V2IsUndef) {
    if (VT == MVT::v2f64 || VT == MVT::v2i64)
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  }

  if (isPSHUFHWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                getShufflePSHUFHWImmediate(SVOp),
                                DAG);

  if (isPSHUFLWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                getShufflePSHUFLWImmediate(SVOp),
                                DAG);

  if (isSHUFPMask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                getShuffleSHUFImmediate(SVOp), DAG);

  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  //===--------------------------------------------------------------------===//
  // Generate target specific nodes for 128 or 256-bit shuffles only
  // supported in the AVX instruction set.
  //

  // Handle VMOVDDUPY permutations.
  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);

  // Handle VPERMILPS/D* permutations.
  if (isVPERMILPMask(M, VT, HasFp256)) {
    if (HasInt256 && VT == MVT::v8i32)
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                  getShuffleSHUFImmediate(SVOp), DAG);
    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
                                getShuffleSHUFImmediate(SVOp), DAG);
  }

  // Handle VPERM2F128/VPERM2I128 permutations.
  if (isVPERM2X128Mask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);

  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
  if (BlendOp.getNode())
    return BlendOp;

  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
    SmallVector<SDValue, 8> permclMask;
    for (unsigned i = 0; i != 8; ++i) {
      permclMask.push_back(DAG.getConstant((M[i]>=0) ?
                                           M[i] : 0, MVT::i32));
    }
    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
                               &permclMask[0], 8);
    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32.
    return DAG.getNode(X86ISD::VPERMV, dl, VT,
                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
  }

  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
                                getShuffleCLImmediate(SVOp), DAG);

  //===--------------------------------------------------------------------===//
  // Since no target specific shuffle was selected for this generic one,
  // lower it into other known shuffles. FIXME: this isn't true yet, but
  // this is the plan.
  //

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v32i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 128-bit wide vectors with 4 elements, and match them with
  // several different shuffle types.
  if (NumElems == 4 && VT.is128BitVector())
    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);

  // Handle general 256-bit shuffles.
  if (VT.is256BitVector())
    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);

  return SDValue();
}

static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType().getSimpleVT();
  DebugLoc dl = Op.getDebugLoc();

  if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
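    // E.g. if the only use is a bitcast to i32, the GR32 result of EXTRACTPS
    // can be used directly and no copy back to FR32 is needed.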
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq work with a constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getValueType().getSimpleVT();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    unsigned NumElems = VecVT.getVectorNumElements();
    SDValue Idx = Op.getOperand(1);
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Get the 128-bit vector.
    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);

    if (IdxVal >= NumElems/2)
      IdxVal -= NumElems/2;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  MVT VT = Op.getValueType().getSimpleVT();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    MVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
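    // For example, Idx == 2 gives the mask <2,-1,-1,-1>; the element lands in
    // lane 0, where the extract of element 0 is trivial.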
    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType().getSimpleVT();
  MVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (!VT.is128BitVector())
    return SDValue();

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    // v8i16 uses PINSRW; v16i8 (and any other byte-element type that lands
    // here) uses PINSRB.
    unsigned Opc = (VT == MVT::v8i16) ? X86ISD::PINSRW : X86ISD::PINSRB;

    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    // second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select. This will always be
    // zero here. The DAG Combiner may combine an extract_elt index into these
    // bits. For example (insert (extract, 3), 2) could be matched by putting
    // the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    // value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    // combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar to vector.
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  }

  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
    // PINSR* works with a constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getValueType().getSimpleVT();
  MVT EltVT = VT.getVectorElementType();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  // If this is a 256-bit vector result, first extract the 128-bit vector,
  // insert the element into the extracted half and then place it back.
  if (VT.is256BitVector()) {
    if (!isa<ConstantSDNode>(N2))
      return SDValue();

    // Get the desired 128-bit vector half.
    unsigned NumElems = VT.getVectorNumElements();
    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired half.
    bool Upper = IdxVal >= NumElems/2;
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal,
                                    MVT::i32));

    // Insert the changed part back to the 256-bit vector.
    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
  }

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  MVT OpVT = Op.getValueType().getSimpleVT();

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    EVT VT128 = EVT::getVectorVT(*Context,
                                 OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / 2);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }

  if (OpVT == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  assert(OpVT.is128BitVector() && "Expected an SSE type!");
  return DAG.getNode(ISD::BITCAST, dl, OpVT,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
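// For example, (v4f32 (extract_subvector (v8f32 %v), 4)) selects the upper
// half and is lowered through Extract128BitVector.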
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  if (Subtarget->hasFp256()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue Idx = Op.getNode()->getOperand(1);

    if (Op.getNode()->getValueType(0).is128BitVector() &&
        Vec.getNode()->getValueType(0).is256BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
      return Extract128BitVector(Vec, IdxVal, DAG, dl);
    }
  }
  return SDValue();
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
  if (Subtarget->hasFp256()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue SubVec = Op.getNode()->getOperand(1);
    SDValue Idx = Op.getNode()->getOperand(2);

    if (Op.getNode()->getValueType(0).is256BitVector() &&
        SubVec.getNode()->getValueType(0).is128BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
    }
  }
  return SDValue();
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing modes. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  DebugLoc DL = JT->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel)) {
    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
      OpFlag = X86II::MO_GOTPCREL;
    WrapperKind = X86ISD::WrapperRIP;
  } else if (Subtarget->isPICStyleGOT()) {
    OpFlag = X86II::MO_GOT;
  } else if (Subtarget->isPICStyleStubPIC()) {
    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
  } else if (Subtarget->isPICStyleStubNoDynamic()) {
    OpFlag = X86II::MO_DARWIN_NONLAZY;
  }

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  DebugLoc DL = Op.getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(), false, false, false, 0);

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
                                             OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset, SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();