1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 /// \file
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
14 ///
15 //===----------------------------------------------------------------------===//
16 /// A note about the cost model numbers used below: they correspond to some
17 /// "generic" X86 CPU rather than to a concrete CPU model. Usually the numbers
18 /// correspond to the CPU where the feature first appeared. For example, if we
19 /// check Subtarget.hasSSE42() in the lookups below, the cost is based on
20 /// Nehalem, as that was the first CPU to support that feature level and thus
21 /// most likely has the worst-case cost.
22 /// Some examples of other technologies/CPUs:
23 /// SSE 3 - Pentium4 / Athlon64
24 /// SSE 4.1 - Penryn
25 /// SSE 4.2 - Nehalem
26 /// AVX - Sandy Bridge
27 /// AVX2 - Haswell
28 /// AVX-512 - Xeon Phi / Skylake
29 /// And some examples of instruction target-dependent costs (latency):
30 /// divss sqrtss rsqrtss
31 /// AMD K7 11-16 19 3
32 /// Piledriver 9-24 13-15 5
33 /// Jaguar 14 16 2
34 /// Pentium II,III 18 30 2
35 /// Nehalem 7-14 7-18 3
36 /// Haswell 10-13 11 5
37 /// TODO: Develop and implement the target-dependent cost model and
38 /// specialize cost numbers for different Cost Model Targets such as throughput,
39 /// code size, latency and uop count.
40 //===----------------------------------------------------------------------===//
41 
42 #include "X86TargetTransformInfo.h"
43 #include "llvm/Analysis/TargetTransformInfo.h"
44 #include "llvm/CodeGen/BasicTTIImpl.h"
45 #include "llvm/CodeGen/CostTable.h"
46 #include "llvm/CodeGen/TargetLowering.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/Support/Debug.h"
49 
50 using namespace llvm;
51 
52 #define DEBUG_TYPE "x86tti"
53 
54 //===----------------------------------------------------------------------===//
55 //
56 // X86 cost model.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 TargetTransformInfo::PopcntSupportKind
61 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
62 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
63 // TODO: Currently the __builtin_popcount() implementation using SSE3
64 // instructions is inefficient. Once the problem is fixed, we should
65 // call ST->hasSSE3() instead of ST->hasPOPCNT().
66 return ST->hasPOPCNT() ?
TTI::PSK_FastHardware : TTI::PSK_Software;
67 }
68 
69 llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
70 TargetTransformInfo::CacheLevel Level) const {
71 switch (Level) {
72 case TargetTransformInfo::CacheLevel::L1D:
73 // - Penryn
74 // - Nehalem
75 // - Westmere
76 // - Sandy Bridge
77 // - Ivy Bridge
78 // - Haswell
79 // - Broadwell
80 // - Skylake
81 // - Kabylake
82 return 32 * 1024; // 32 KByte
83 case TargetTransformInfo::CacheLevel::L2D:
84 // - Penryn
85 // - Nehalem
86 // - Westmere
87 // - Sandy Bridge
88 // - Ivy Bridge
89 // - Haswell
90 // - Broadwell
91 // - Skylake
92 // - Kabylake
93 return 256 * 1024; // 256 KByte
94 }
95 
96 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
97 }
98 
99 llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
100 TargetTransformInfo::CacheLevel Level) const {
101 // - Penryn
102 // - Nehalem
103 // - Westmere
104 // - Sandy Bridge
105 // - Ivy Bridge
106 // - Haswell
107 // - Broadwell
108 // - Skylake
109 // - Kabylake
110 switch (Level) {
111 case TargetTransformInfo::CacheLevel::L1D:
112 LLVM_FALLTHROUGH;
113 case TargetTransformInfo::CacheLevel::L2D:
114 return 8;
115 }
116 
117 llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
118 }
119 
120 unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
121 if (Vector && !ST->hasSSE1())
122 return 0;
123 
124 if (ST->is64Bit()) {
125 if (Vector && ST->hasAVX512())
126 return 32;
127 return 16;
128 }
129 return 8;
130 }
131 
132 unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
133 unsigned PreferVectorWidth = ST->getPreferVectorWidth();
134 if (Vector) {
135 if (ST->hasAVX512() && PreferVectorWidth >= 512)
136 return 512;
137 if (ST->hasAVX() && PreferVectorWidth >= 256)
138 return 256;
139 if (ST->hasSSE1() && PreferVectorWidth >= 128)
140 return 128;
141 return 0;
142 }
143 
144 if (ST->is64Bit())
145 return 64;
146 
147 return 32;
148 }
149 
150 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
151 return getRegisterBitWidth(true);
152 }
153 
154 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
155 // If the loop will not be vectorized, don't interleave the loop.
156 // Let the regular unroller handle the loop instead, which saves the overflow
157 // check and memory check cost.
158 if (VF == 1)
159 return 1;
160 
161 if (ST->isAtom())
162 return 1;
163 
164 // Sandy Bridge and Haswell have multiple execution ports and pipelined
165 // vector units.
166 if (ST->hasAVX())
167 return 4;
168 
169 return 2;
170 }
171 
172 int X86TTIImpl::getArithmeticInstrCost(
173 unsigned Opcode, Type *Ty,
174 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
175 TTI::OperandValueProperties Opd1PropInfo,
176 TTI::OperandValueProperties Opd2PropInfo,
177 ArrayRef<const Value *> Args) {
178 // Legalize the type.
179 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
180 
181 int ISD = TLI->InstructionOpcodeToISD(Opcode);
182 assert(ISD && "Invalid opcode");
183 
184 static const CostTblEntry GLMCostTable[] = {
185 { ISD::FDIV, MVT::f32, 18 }, // divss
186 { ISD::FDIV, MVT::v4f32, 35 }, // divps
187 { ISD::FDIV, MVT::f64, 33 }, // divsd
188 { ISD::FDIV, MVT::v2f64, 65 }, // divpd
189 };
190 
191 if (ST->isGLM())
192 if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
193 LT.second))
194 return LT.first * Entry->Cost;
195 
196 static const CostTblEntry SLMCostTable[] = {
197 { ISD::MUL, MVT::v4i32, 11 }, // pmulld
198 { ISD::MUL, MVT::v8i16, 2 }, // pmullw
199 { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
200 { ISD::FMUL, MVT::f64, 2 }, // mulsd
201 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
202 { ISD::FMUL, MVT::v4f32, 2 }, // mulps
203 { ISD::FDIV, MVT::f32, 17 }, // divss
204 { ISD::FDIV, MVT::v4f32, 39 }, // divps
205 { ISD::FDIV, MVT::f64, 32 }, // divsd
206 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
207 { ISD::FADD, MVT::v2f64, 2 }, // addpd
208 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
209 // v2i64/v4i64 mul is custom lowered as a series of long
210 // multiplies(3), shifts(3) and adds(2).
211 // slm muldq version throughput is 2 and addq throughput 4
212 // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
213 // 2X4 (addq throughput) = 17
214 { ISD::MUL, MVT::v2i64, 17 },
215 // slm addq\subq throughput is 4
216 { ISD::ADD, MVT::v2i64, 4 },
217 { ISD::SUB, MVT::v2i64, 4 },
218 };
219 
220 if (ST->isSLM()) {
221 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
222 // Check if the operands can be shrunk into a smaller datatype.
223 bool Op1Signed = false;
224 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
225 bool Op2Signed = false;
226 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
227 
228 bool signedMode = Op1Signed | Op2Signed;
229 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
230 
231 if (OpMinSize <= 7)
232 return LT.first * 3; // pmullw/sext
233 if (!signedMode && OpMinSize <= 8)
234 return LT.first * 3; // pmullw/zext
235 if (OpMinSize <= 15)
236 return LT.first * 5; // pmullw/pmulhw/pshuf
237 if (!signedMode && OpMinSize <= 16)
238 return LT.first * 5; // pmullw/pmulhw/pshuf
239 }
240 
241 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
242 LT.second)) {
243 return LT.first * Entry->Cost;
244 }
245 }
246 
247 if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
248 ISD == ISD::UREM) &&
249 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
250 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
251 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
252 if (ISD == ISD::SDIV || ISD == ISD::SREM) {
253 // On X86, vector signed division by a power-of-two constant is
254 // normally expanded to the sequence SRA + SRL + ADD + SRA.
255 // The OperandValue properties may not be the same as that of the previous
256 // operation; conservatively assume OP_None.
257 int Cost =
258 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
259 TargetTransformInfo::OP_None,
260 TargetTransformInfo::OP_None);
261 Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
262 TargetTransformInfo::OP_None,
263 TargetTransformInfo::OP_None);
264 Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
265 TargetTransformInfo::OP_None,
266 TargetTransformInfo::OP_None);
267 
268 if (ISD == ISD::SREM) {
269 // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
270 Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
271 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
272 }
273 
274 return Cost;
275 }
276 
277 // Vector unsigned division/remainder will be simplified to shifts/masks.
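// (Illustrative example: for a uniform power-of-two divisor such as 16, a
// udiv becomes a logical shift right by 4 and a urem becomes an AND with 15,
// so only the shift or mask cost is charged below.)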
278 if (ISD == ISD::UDIV) 279 return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, 280 TargetTransformInfo::OP_None, 281 TargetTransformInfo::OP_None); 282 283 if (ISD == ISD::UREM) 284 return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info, 285 TargetTransformInfo::OP_None, 286 TargetTransformInfo::OP_None); 287 } 288 289 static const CostTblEntry AVX512BWUniformConstCostTable[] = { 290 { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. 291 { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. 292 { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. 293 294 { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence 295 { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence 296 { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence 297 { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence 298 }; 299 300 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 301 ST->hasBWI()) { 302 if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, 303 LT.second)) 304 return LT.first * Entry->Cost; 305 } 306 307 static const CostTblEntry AVX512UniformConstCostTable[] = { 308 { ISD::SRA, MVT::v2i64, 1 }, 309 { ISD::SRA, MVT::v4i64, 1 }, 310 { ISD::SRA, MVT::v8i64, 1 }, 311 312 { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence 313 { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence 314 { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence 315 { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence 316 }; 317 318 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 319 ST->hasAVX512()) { 320 if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, 321 LT.second)) 322 return LT.first * Entry->Cost; 323 } 324 325 static const CostTblEntry AVX2UniformConstCostTable[] = { 326 { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. 327 { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. 328 { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. 329 330 { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. 331 332 { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence 333 { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence 334 { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence 335 { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence 336 { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence 337 { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence 338 { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence 339 { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence 340 }; 341 342 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 343 ST->hasAVX2()) { 344 if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, 345 LT.second)) 346 return LT.first * Entry->Cost; 347 } 348 349 static const CostTblEntry SSE2UniformConstCostTable[] = { 350 { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. 351 { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. 352 { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. 353 354 { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. 355 { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. 356 { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. 357 358 { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. 359 { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split. 360 { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence 361 { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence 362 { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. 363 { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split. 
364 { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence 365 { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence 366 { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. 367 { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split. 368 { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence 369 { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence 370 { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. 371 { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split. 372 { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence 373 { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence 374 }; 375 376 if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && 377 ST->hasSSE2()) { 378 // pmuldq sequence. 379 if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) 380 return LT.first * 32; 381 if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX()) 382 return LT.first * 38; 383 if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) 384 return LT.first * 15; 385 if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41()) 386 return LT.first * 20; 387 388 // XOP has faster vXi8 shifts. 389 if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || 390 !ST->hasXOP()) 391 if (const auto *Entry = 392 CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) 393 return LT.first * Entry->Cost; 394 } 395 396 static const CostTblEntry AVX2UniformCostTable[] = { 397 // Uniform splats are cheaper for the following instructions. 398 { ISD::SHL, MVT::v16i16, 1 }, // psllw. 399 { ISD::SRL, MVT::v16i16, 1 }, // psrlw. 400 { ISD::SRA, MVT::v16i16, 1 }, // psraw. 401 }; 402 403 if (ST->hasAVX2() && 404 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || 405 (Op2Info == TargetTransformInfo::OK_UniformValue))) { 406 if (const auto *Entry = 407 CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) 408 return LT.first * Entry->Cost; 409 } 410 411 static const CostTblEntry SSE2UniformCostTable[] = { 412 // Uniform splats are cheaper for the following instructions. 413 { ISD::SHL, MVT::v8i16, 1 }, // psllw. 414 { ISD::SHL, MVT::v4i32, 1 }, // pslld 415 { ISD::SHL, MVT::v2i64, 1 }, // psllq. 416 417 { ISD::SRL, MVT::v8i16, 1 }, // psrlw. 418 { ISD::SRL, MVT::v4i32, 1 }, // psrld. 419 { ISD::SRL, MVT::v2i64, 1 }, // psrlq. 420 421 { ISD::SRA, MVT::v8i16, 1 }, // psraw. 422 { ISD::SRA, MVT::v4i32, 1 }, // psrad. 423 }; 424 425 if (ST->hasSSE2() && 426 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || 427 (Op2Info == TargetTransformInfo::OK_UniformValue))) { 428 if (const auto *Entry = 429 CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) 430 return LT.first * Entry->Cost; 431 } 432 433 static const CostTblEntry AVX512DQCostTable[] = { 434 { ISD::MUL, MVT::v2i64, 1 }, 435 { ISD::MUL, MVT::v4i64, 1 }, 436 { ISD::MUL, MVT::v8i64, 1 } 437 }; 438 439 // Look for AVX512DQ lowering tricks for custom cases. 
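// (Note: AVX512DQ adds a native 64-bit element multiply, vpmullq, which is
// why the v2i64/v4i64/v8i64 MUL entries above cost a single instruction.)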
440 if (ST->hasDQI())
441 if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
442 return LT.first * Entry->Cost;
443 
444 static const CostTblEntry AVX512BWCostTable[] = {
445 { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
446 { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
447 { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
448 
449 { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
450 { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
451 { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
452 
453 { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
454 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
455 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
456 
457 { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
458 { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
459 { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
460 
461 { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
462 { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
463 { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
464 };
465 
466 // Look for AVX512BW lowering tricks for custom cases.
467 if (ST->hasBWI())
468 if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
469 return LT.first * Entry->Cost;
470 
471 static const CostTblEntry AVX512CostTable[] = {
472 { ISD::SHL, MVT::v16i32, 1 },
473 { ISD::SRL, MVT::v16i32, 1 },
474 { ISD::SRA, MVT::v16i32, 1 },
475 
476 { ISD::SHL, MVT::v8i64, 1 },
477 { ISD::SRL, MVT::v8i64, 1 },
478 
479 { ISD::SRA, MVT::v2i64, 1 },
480 { ISD::SRA, MVT::v4i64, 1 },
481 { ISD::SRA, MVT::v8i64, 1 },
482 
483 { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
484 { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
485 { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
486 { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
487 { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
488 { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
489 
490 { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
491 { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
492 { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
493 
494 { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
495 { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
496 { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
497 };
498 
499 if (ST->hasAVX512())
500 if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
501 return LT.first * Entry->Cost;
502 
503 static const CostTblEntry AVX2ShiftCostTable[] = {
504 // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them
505 // custom so that we can detect the cases where the shift amount is a scalar.
506 { ISD::SHL, MVT::v4i32, 1 },
507 { ISD::SRL, MVT::v4i32, 1 },
508 { ISD::SRA, MVT::v4i32, 1 },
509 { ISD::SHL, MVT::v8i32, 1 },
510 { ISD::SRL, MVT::v8i32, 1 },
511 { ISD::SRA, MVT::v8i32, 1 },
512 { ISD::SHL, MVT::v2i64, 1 },
513 { ISD::SRL, MVT::v2i64, 1 },
514 { ISD::SHL, MVT::v4i64, 1 },
515 { ISD::SRL, MVT::v4i64, 1 },
516 };
517 
518 // Look for AVX2 lowering tricks.
519 if (ST->hasAVX2()) {
520 if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
521 (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
522 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
523 // On AVX2, a packed v16i16 shift left by a constant build_vector
524 // is lowered into a vector multiply (vpmullw).
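// (For example, a shift left by a constant 3 is equivalent to a multiply
// by 8, so the multiply cost is charged instead.)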
525 return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info, 526 TargetTransformInfo::OP_None, 527 TargetTransformInfo::OP_None); 528 529 if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) 530 return LT.first * Entry->Cost; 531 } 532 533 static const CostTblEntry XOPShiftCostTable[] = { 534 // 128bit shifts take 1cy, but right shifts require negation beforehand. 535 { ISD::SHL, MVT::v16i8, 1 }, 536 { ISD::SRL, MVT::v16i8, 2 }, 537 { ISD::SRA, MVT::v16i8, 2 }, 538 { ISD::SHL, MVT::v8i16, 1 }, 539 { ISD::SRL, MVT::v8i16, 2 }, 540 { ISD::SRA, MVT::v8i16, 2 }, 541 { ISD::SHL, MVT::v4i32, 1 }, 542 { ISD::SRL, MVT::v4i32, 2 }, 543 { ISD::SRA, MVT::v4i32, 2 }, 544 { ISD::SHL, MVT::v2i64, 1 }, 545 { ISD::SRL, MVT::v2i64, 2 }, 546 { ISD::SRA, MVT::v2i64, 2 }, 547 // 256bit shifts require splitting if AVX2 didn't catch them above. 548 { ISD::SHL, MVT::v32i8, 2+2 }, 549 { ISD::SRL, MVT::v32i8, 4+2 }, 550 { ISD::SRA, MVT::v32i8, 4+2 }, 551 { ISD::SHL, MVT::v16i16, 2+2 }, 552 { ISD::SRL, MVT::v16i16, 4+2 }, 553 { ISD::SRA, MVT::v16i16, 4+2 }, 554 { ISD::SHL, MVT::v8i32, 2+2 }, 555 { ISD::SRL, MVT::v8i32, 4+2 }, 556 { ISD::SRA, MVT::v8i32, 4+2 }, 557 { ISD::SHL, MVT::v4i64, 2+2 }, 558 { ISD::SRL, MVT::v4i64, 4+2 }, 559 { ISD::SRA, MVT::v4i64, 4+2 }, 560 }; 561 562 // Look for XOP lowering tricks. 563 if (ST->hasXOP()) 564 if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) 565 return LT.first * Entry->Cost; 566 567 static const CostTblEntry SSE2UniformShiftCostTable[] = { 568 // Uniform splats are cheaper for the following instructions. 569 { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. 570 { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. 571 { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. 572 573 { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. 574 { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. 575 { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. 576 577 { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. 578 { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. 579 { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. 580 { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. 581 }; 582 583 if (ST->hasSSE2() && 584 ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || 585 (Op2Info == TargetTransformInfo::OK_UniformValue))) { 586 587 // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. 588 if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) 589 return LT.first * 4; // 2*psrad + shuffle. 590 591 if (const auto *Entry = 592 CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) 593 return LT.first * Entry->Cost; 594 } 595 596 if (ISD == ISD::SHL && 597 Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { 598 MVT VT = LT.second; 599 // Vector shift left by non uniform constant can be lowered 600 // into vector multiply. 601 if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || 602 ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) 603 ISD = ISD::MUL; 604 } 605 606 static const CostTblEntry AVX2CostTable[] = { 607 { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. 608 { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. 609 610 { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. 611 { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. 612 613 { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. 614 { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. 615 { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. 
616 { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. 617 618 { ISD::SUB, MVT::v32i8, 1 }, // psubb 619 { ISD::ADD, MVT::v32i8, 1 }, // paddb 620 { ISD::SUB, MVT::v16i16, 1 }, // psubw 621 { ISD::ADD, MVT::v16i16, 1 }, // paddw 622 { ISD::SUB, MVT::v8i32, 1 }, // psubd 623 { ISD::ADD, MVT::v8i32, 1 }, // paddd 624 { ISD::SUB, MVT::v4i64, 1 }, // psubq 625 { ISD::ADD, MVT::v4i64, 1 }, // paddq 626 627 { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. 628 { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. 629 { ISD::MUL, MVT::v16i16, 1 }, // pmullw 630 { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org) 631 { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add 632 633 { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ 634 { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ 635 { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ 636 { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ 637 { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ 638 { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ 639 640 { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ 641 { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ 642 { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ 643 { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ 644 { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ 645 { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ 646 }; 647 648 // Look for AVX2 lowering tricks for custom cases. 649 if (ST->hasAVX2()) 650 if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) 651 return LT.first * Entry->Cost; 652 653 static const CostTblEntry AVX1CostTable[] = { 654 // We don't have to scalarize unsupported ops. We can issue two half-sized 655 // operations and we only need to extract the upper YMM half. 656 // Two ops + 1 extract + 1 insert = 4. 657 { ISD::MUL, MVT::v16i16, 4 }, 658 { ISD::MUL, MVT::v8i32, 4 }, 659 { ISD::SUB, MVT::v32i8, 4 }, 660 { ISD::ADD, MVT::v32i8, 4 }, 661 { ISD::SUB, MVT::v16i16, 4 }, 662 { ISD::ADD, MVT::v16i16, 4 }, 663 { ISD::SUB, MVT::v8i32, 4 }, 664 { ISD::ADD, MVT::v8i32, 4 }, 665 { ISD::SUB, MVT::v4i64, 4 }, 666 { ISD::ADD, MVT::v4i64, 4 }, 667 668 // A v4i64 multiply is custom lowered as two split v2i64 vectors that then 669 // are lowered as a series of long multiplies(3), shifts(3) and adds(2) 670 // Because we believe v4i64 to be a legal type, we must also include the 671 // extract+insert in the cost table. Therefore, the cost here is 18 672 // instead of 8. 673 { ISD::MUL, MVT::v4i64, 18 }, 674 675 { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. 
676 677 { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ 678 { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ 679 { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ 680 { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ 681 { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ 682 { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ 683 }; 684 685 if (ST->hasAVX()) 686 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) 687 return LT.first * Entry->Cost; 688 689 static const CostTblEntry SSE42CostTable[] = { 690 { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ 691 { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ 692 { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ 693 { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ 694 695 { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ 696 { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/ 697 { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ 698 { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ 699 700 { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ 701 { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ 702 { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ 703 { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ 704 705 { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ 706 { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ 707 { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ 708 { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ 709 }; 710 711 if (ST->hasSSE42()) 712 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) 713 return LT.first * Entry->Cost; 714 715 static const CostTblEntry SSE41CostTable[] = { 716 { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. 717 { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. 718 { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. 719 { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. 720 { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld 721 { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split 722 723 { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. 724 { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. 725 { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. 726 { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. 727 { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. 728 { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. 729 730 { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. 731 { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. 732 { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. 733 { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. 734 { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. 735 { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. 736 737 { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org) 738 }; 739 740 if (ST->hasSSE41()) 741 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) 742 return LT.first * Entry->Cost; 743 744 static const CostTblEntry SSE2CostTable[] = { 745 // We don't correctly identify costs of casts because they are marked as 746 // custom. 747 { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. 
748 { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
749 { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
750 { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
751 { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
752 
753 { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
754 { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
755 { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
756 { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
757 { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
758 
759 { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
760 { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
761 { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
762 { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
763 { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
764 
765 { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
766 { ISD::MUL, MVT::v8i16, 1 }, // pmullw
767 { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
768 { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
769 
770 { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
771 { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
772 { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
773 { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
774 };
775 
776 if (ST->hasSSE2())
777 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
778 return LT.first * Entry->Cost;
779 
780 static const CostTblEntry SSE1CostTable[] = {
781 { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
782 { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
783 };
784 
785 if (ST->hasSSE1())
786 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
787 return LT.first * Entry->Cost;
788 
789 // It is not a good idea to vectorize division. We have to scalarize it and
790 // in the process we will often end up having to spill regular
791 // registers. The overhead of division is going to dominate most kernels
792 // anyway, so try hard to prevent vectorization of division - it is
793 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
794 // to hide "20 cycles" for each lane.
795 if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
796 ISD == ISD::UDIV || ISD == ISD::UREM)) {
797 int ScalarCost = getArithmeticInstrCost(
798 Opcode, Ty->getScalarType(), Op1Info, Op2Info,
799 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
800 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
801 }
802 
803 // Fall back to the default implementation.
804 return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
805 }
806 
807 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
808 Type *SubTp) {
809 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
810 // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
811 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
812 
813 // For Broadcasts we are splatting the first element from the first input
814 // register, so we only need to reference that input and all the output
815 // registers are the same.
816 if (Kind == TTI::SK_Broadcast)
817 LT.first = 1;
818 
819 // We are going to permute multiple sources and the result will be in multiple
820 // destinations.
Providing an accurate cost only for splits where the element 821 // type remains the same. 822 if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { 823 MVT LegalVT = LT.second; 824 if (LegalVT.isVector() && 825 LegalVT.getVectorElementType().getSizeInBits() == 826 Tp->getVectorElementType()->getPrimitiveSizeInBits() && 827 LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { 828 829 unsigned VecTySize = DL.getTypeStoreSize(Tp); 830 unsigned LegalVTSize = LegalVT.getStoreSize(); 831 // Number of source vectors after legalization: 832 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; 833 // Number of destination vectors after legalization: 834 unsigned NumOfDests = LT.first; 835 836 Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), 837 LegalVT.getVectorNumElements()); 838 839 unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; 840 return NumOfShuffles * 841 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); 842 } 843 844 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); 845 } 846 847 // For 2-input shuffles, we must account for splitting the 2 inputs into many. 848 if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { 849 // We assume that source and destination have the same vector type. 850 int NumOfDests = LT.first; 851 int NumOfShufflesPerDest = LT.first * 2 - 1; 852 LT.first = NumOfDests * NumOfShufflesPerDest; 853 } 854 855 static const CostTblEntry AVX512VBMIShuffleTbl[] = { 856 { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb 857 { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb 858 859 { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb 860 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb 861 862 { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b 863 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b 864 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b 865 }; 866 867 if (ST->hasVBMI()) 868 if (const auto *Entry = 869 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) 870 return LT.first * Entry->Cost; 871 872 static const CostTblEntry AVX512BWShuffleTbl[] = { 873 { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw 874 { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb 875 876 { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw 877 { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw 878 { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2 879 880 { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw 881 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw 882 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw 883 { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 884 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc 885 886 { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w 887 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w 888 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w 889 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc 890 { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 891 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc 892 }; 893 894 if (ST->hasBWI()) 895 if (const auto *Entry = 896 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) 897 return LT.first * Entry->Cost; 898 899 static const CostTblEntry AVX512ShuffleTbl[] = { 900 { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd 901 { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps 902 { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq 903 { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd 904 905 { TTI::SK_Reverse, 
MVT::v8f64, 1 }, // vpermpd 906 { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps 907 { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq 908 { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd 909 910 { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd 911 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd 912 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd 913 { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps 914 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps 915 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps 916 { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq 917 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq 918 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq 919 { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd 920 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd 921 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd 922 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb 923 924 { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd 925 { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps 926 { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q 927 { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d 928 { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd 929 { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps 930 { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q 931 { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d 932 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd 933 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps 934 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q 935 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d 936 }; 937 938 if (ST->hasAVX512()) 939 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) 940 return LT.first * Entry->Cost; 941 942 static const CostTblEntry AVX2ShuffleTbl[] = { 943 { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd 944 { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps 945 { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq 946 { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd 947 { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw 948 { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb 949 950 { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd 951 { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps 952 { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq 953 { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd 954 { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb 955 { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb 956 957 { TTI::SK_Select, MVT::v16i16, 1 }, // vpblendvb 958 { TTI::SK_Select, MVT::v32i8, 1 }, // vpblendvb 959 960 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd 961 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps 962 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq 963 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd 964 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb 965 // + vpblendvb 966 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb 967 // + vpblendvb 968 969 { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd 970 { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps 971 { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd 972 { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd 973 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb 974 // + vpblendvb 975 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb 976 // + 
vpblendvb 977 }; 978 979 if (ST->hasAVX2()) 980 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) 981 return LT.first * Entry->Cost; 982 983 static const CostTblEntry XOPShuffleTbl[] = { 984 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd 985 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps 986 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd 987 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps 988 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm 989 // + vinsertf128 990 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm 991 // + vinsertf128 992 993 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm 994 // + vinsertf128 995 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm 996 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm 997 // + vinsertf128 998 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm 999 }; 1000 1001 if (ST->hasXOP()) 1002 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) 1003 return LT.first * Entry->Cost; 1004 1005 static const CostTblEntry AVX1ShuffleTbl[] = { 1006 { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd 1007 { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps 1008 { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd 1009 { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps 1010 { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 1011 { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 1012 1013 { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd 1014 { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps 1015 { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd 1016 { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps 1017 { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb 1018 // + vinsertf128 1019 { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb 1020 // + vinsertf128 1021 1022 { TTI::SK_Select, MVT::v4i64, 1 }, // vblendpd 1023 { TTI::SK_Select, MVT::v4f64, 1 }, // vblendpd 1024 { TTI::SK_Select, MVT::v8i32, 1 }, // vblendps 1025 { TTI::SK_Select, MVT::v8f32, 1 }, // vblendps 1026 { TTI::SK_Select, MVT::v16i16, 3 }, // vpand + vpandn + vpor 1027 { TTI::SK_Select, MVT::v32i8, 3 }, // vpand + vpandn + vpor 1028 1029 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vshufpd 1030 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vshufpd 1031 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps 1032 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps 1033 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb 1034 // + 2*por + vinsertf128 1035 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb 1036 // + 2*por + vinsertf128 1037 1038 { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd 1039 { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd 1040 { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps 1041 { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps 1042 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb 1043 // + 4*por + vinsertf128 1044 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb 1045 // + 4*por + vinsertf128 1046 }; 1047 1048 if (ST->hasAVX()) 1049 if (const auto *Entry = 
CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) 1050 return LT.first * Entry->Cost; 1051 1052 static const CostTblEntry SSE41ShuffleTbl[] = { 1053 { TTI::SK_Select, MVT::v2i64, 1 }, // pblendw 1054 { TTI::SK_Select, MVT::v2f64, 1 }, // movsd 1055 { TTI::SK_Select, MVT::v4i32, 1 }, // pblendw 1056 { TTI::SK_Select, MVT::v4f32, 1 }, // blendps 1057 { TTI::SK_Select, MVT::v8i16, 1 }, // pblendw 1058 { TTI::SK_Select, MVT::v16i8, 1 } // pblendvb 1059 }; 1060 1061 if (ST->hasSSE41()) 1062 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) 1063 return LT.first * Entry->Cost; 1064 1065 static const CostTblEntry SSSE3ShuffleTbl[] = { 1066 { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb 1067 { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb 1068 1069 { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb 1070 { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb 1071 1072 { TTI::SK_Select, MVT::v8i16, 3 }, // 2*pshufb + por 1073 { TTI::SK_Select, MVT::v16i8, 3 }, // 2*pshufb + por 1074 1075 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb 1076 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb 1077 1078 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por 1079 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por 1080 }; 1081 1082 if (ST->hasSSSE3()) 1083 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) 1084 return LT.first * Entry->Cost; 1085 1086 static const CostTblEntry SSE2ShuffleTbl[] = { 1087 { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd 1088 { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd 1089 { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd 1090 { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd 1091 { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd 1092 1093 { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd 1094 { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd 1095 { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd 1096 { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd 1097 { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw 1098 // + 2*pshufd + 2*unpck + packus 1099 1100 { TTI::SK_Select, MVT::v2i64, 1 }, // movsd 1101 { TTI::SK_Select, MVT::v2f64, 1 }, // movsd 1102 { TTI::SK_Select, MVT::v4i32, 2 }, // 2*shufps 1103 { TTI::SK_Select, MVT::v8i16, 3 }, // pand + pandn + por 1104 { TTI::SK_Select, MVT::v16i8, 3 }, // pand + pandn + por 1105 1106 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd 1107 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd 1108 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd 1109 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw 1110 // + pshufd/unpck 1111 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw 1112 // + 2*pshufd + 2*unpck + 2*packus 1113 1114 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd 1115 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd 1116 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} 1117 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute 1118 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute 1119 }; 1120 1121 if (ST->hasSSE2()) 1122 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) 1123 return LT.first * Entry->Cost; 1124 1125 static const CostTblEntry SSE1ShuffleTbl[] = { 1126 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps 1127 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps 1128 { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps 1129 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps 1130 { TTI::SK_PermuteTwoSrc, 
MVT::v4f32, 2 }, // 2*shufps 1131 }; 1132 1133 if (ST->hasSSE1()) 1134 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) 1135 return LT.first * Entry->Cost; 1136 1137 return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); 1138 } 1139 1140 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 1141 const Instruction *I) { 1142 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1143 assert(ISD && "Invalid opcode"); 1144 1145 // FIXME: Need a better design of the cost table to handle non-simple types of 1146 // potential massive combinations (elem_num x src_type x dst_type). 1147 1148 static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { 1149 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1150 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1151 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1152 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1153 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1154 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1155 1156 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, 1157 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, 1158 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, 1159 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, 1160 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, 1161 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, 1162 1163 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, 1164 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, 1165 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, 1166 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, 1167 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, 1168 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, 1169 1170 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, 1171 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, 1172 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, 1173 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, 1174 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, 1175 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, 1176 }; 1177 1178 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and 1179 // 256-bit wide vectors. 
1180 1181 static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { 1182 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, 1183 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, 1184 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, 1185 1186 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, 1187 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, 1188 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, 1189 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, 1190 1191 // v16i1 -> v16i32 - load + broadcast 1192 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1193 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, 1194 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1195 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, 1196 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1197 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, 1198 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1199 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, 1200 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1201 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, 1202 1203 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1204 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1205 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, 1206 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, 1207 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1208 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, 1209 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1210 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1211 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, 1212 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, 1213 1214 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, 1215 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, 1216 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, 1217 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, 1218 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, 1219 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, 1220 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, 1221 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, 1222 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, 1223 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, 1224 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, 1225 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, 1226 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, 1227 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, 1228 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1229 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 1230 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 1231 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, 1232 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, 1233 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, 1234 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, 1235 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, 1236 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, 1237 1238 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, 1239 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 1240 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, 1241 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, 1242 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, 1243 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, 1244 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, 1245 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, 1246 }; 1247 1248 static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { 1249 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 1250 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, 1251 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 1252 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, 1253 { 
ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, 1254 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, 1255 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1256 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 1257 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 1258 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, 1259 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1260 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1261 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 1262 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 1263 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 1264 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 1265 1266 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, 1267 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, 1268 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, 1269 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, 1270 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, 1271 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, 1272 1273 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, 1274 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, 1275 1276 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, 1277 }; 1278 1279 static const TypeConversionCostTblEntry AVXConversionTbl[] = { 1280 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, 1281 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, 1282 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, 1283 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, 1284 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, 1285 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, 1286 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, 1287 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, 1288 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1289 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1290 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, 1291 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1292 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1293 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1294 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, 1295 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, 1296 1297 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, 1298 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, 1299 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 1300 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, 1301 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, 1302 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, 1303 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, 1304 1305 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 1306 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, 1307 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, 1308 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 1309 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, 1310 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, 1311 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, 1312 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, 1313 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, 1314 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 1315 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, 1316 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, 1317 1318 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, 1319 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, 1320 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, 1321 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, 1322 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, 1323 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 }, 1324 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 1325 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, 1326 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, 1327 { ISD::UINT_TO_FP, 
MVT::v2f64, MVT::v2i32, 6 }, 1328 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, 1329 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, 1330 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, 1331 // The generic code to compute the scalar overhead is currently broken. 1332 // Workaround this limitation by estimating the scalarization overhead 1333 // here. We have roughly 10 instructions per scalar element. 1334 // Multiply that by the vector width. 1335 // FIXME: remove that when PR19268 is fixed. 1336 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 10 }, 1337 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 20 }, 1338 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, 1339 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, 1340 1341 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, 1342 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, 1343 // This node is expanded into scalarized operations but BasicTTI is overly 1344 // optimistic estimating its cost. It computes 3 per element (one 1345 // vector-extract, one scalar conversion and one vector-insert). The 1346 // problem is that the inserts form a read-modify-write chain so latency 1347 // should be factored in too. Inflating the cost per element by 1. 1348 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, 1349 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, 1350 1351 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, 1352 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, 1353 }; 1354 1355 static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { 1356 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, 1357 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 }, 1358 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, 1359 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 }, 1360 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1361 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, 1362 1363 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, 1364 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, 1365 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, 1366 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, 1367 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1368 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1369 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, 1370 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, 1371 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1372 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, 1373 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, 1374 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, 1375 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 1376 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 1377 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1378 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, 1379 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, 1380 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, 1381 1382 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, 1383 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, 1384 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, 1385 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 1386 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 1387 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, 1388 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, 1389 1390 }; 1391 1392 static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { 1393 // These are somewhat magic numbers justified by looking at the output of 1394 // Intel's IACA, running some kernels and making sure when we take 1395 // legalization into account the throughput will be overestimated. 
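// (The N*10 entries below appear to model full scalarization: roughly ten
// instructions per scalar element, multiplied by the number of elements,
// e.g. 16*10 for a v16i8 source.)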
1396 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, 1397 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, 1398 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, 1399 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, 1400 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, 1401 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, 1402 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, 1403 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, 1404 1405 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, 1406 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, 1407 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, 1408 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, 1409 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, 1410 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, 1411 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, 1412 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, 1413 1414 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, 1415 1416 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, 1417 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, 1418 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 1419 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, 1420 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, 1421 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, 1422 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 1423 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, 1424 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, 1425 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, 1426 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, 1427 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, 1428 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, 1429 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, 1430 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 1431 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, 1432 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 1433 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 }, 1434 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, 1435 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, 1436 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, 1437 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, 1438 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, 1439 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, 1440 1441 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, 1442 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, 1443 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, 1444 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, 1445 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, 1446 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, 1447 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, 1448 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, 1449 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, 1450 }; 1451 1452 std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); 1453 std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); 1454 1455 if (ST->hasSSE2() && !ST->hasAVX()) { 1456 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 1457 LTDest.second, LTSrc.second)) 1458 return LTSrc.first * Entry->Cost; 1459 } 1460 1461 EVT SrcTy = TLI->getValueType(DL, Src); 1462 EVT DstTy = TLI->getValueType(DL, Dst); 1463 1464 // The function getSimpleVT only handles simple value types. 
1465 if (!SrcTy.isSimple() || !DstTy.isSimple()) 1466 return BaseT::getCastInstrCost(Opcode, Dst, Src); 1467 1468 if (ST->hasDQI()) 1469 if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, 1470 DstTy.getSimpleVT(), 1471 SrcTy.getSimpleVT())) 1472 return Entry->Cost; 1473 1474 if (ST->hasAVX512()) 1475 if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, 1476 DstTy.getSimpleVT(), 1477 SrcTy.getSimpleVT())) 1478 return Entry->Cost; 1479 1480 if (ST->hasAVX2()) { 1481 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, 1482 DstTy.getSimpleVT(), 1483 SrcTy.getSimpleVT())) 1484 return Entry->Cost; 1485 } 1486 1487 if (ST->hasAVX()) { 1488 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, 1489 DstTy.getSimpleVT(), 1490 SrcTy.getSimpleVT())) 1491 return Entry->Cost; 1492 } 1493 1494 if (ST->hasSSE41()) { 1495 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, 1496 DstTy.getSimpleVT(), 1497 SrcTy.getSimpleVT())) 1498 return Entry->Cost; 1499 } 1500 1501 if (ST->hasSSE2()) { 1502 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, 1503 DstTy.getSimpleVT(), 1504 SrcTy.getSimpleVT())) 1505 return Entry->Cost; 1506 } 1507 1508 return BaseT::getCastInstrCost(Opcode, Dst, Src, I); 1509 } 1510 1511 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1512 const Instruction *I) { 1513 // Legalize the type. 1514 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 1515 1516 MVT MTy = LT.second; 1517 1518 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1519 assert(ISD && "Invalid opcode"); 1520 1521 static const CostTblEntry SSE2CostTbl[] = { 1522 { ISD::SETCC, MVT::v2i64, 8 }, 1523 { ISD::SETCC, MVT::v4i32, 1 }, 1524 { ISD::SETCC, MVT::v8i16, 1 }, 1525 { ISD::SETCC, MVT::v16i8, 1 }, 1526 }; 1527 1528 static const CostTblEntry SSE42CostTbl[] = { 1529 { ISD::SETCC, MVT::v2f64, 1 }, 1530 { ISD::SETCC, MVT::v4f32, 1 }, 1531 { ISD::SETCC, MVT::v2i64, 1 }, 1532 }; 1533 1534 static const CostTblEntry AVX1CostTbl[] = { 1535 { ISD::SETCC, MVT::v4f64, 1 }, 1536 { ISD::SETCC, MVT::v8f32, 1 }, 1537 // AVX1 does not support 8-wide integer compare. 
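    // A 256-bit integer compare is therefore split into two 128-bit pcmp* ops
    // plus extract/insert, which is roughly what the costs below reflect.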
1538 { ISD::SETCC, MVT::v4i64, 4 }, 1539 { ISD::SETCC, MVT::v8i32, 4 }, 1540 { ISD::SETCC, MVT::v16i16, 4 }, 1541 { ISD::SETCC, MVT::v32i8, 4 }, 1542 }; 1543 1544 static const CostTblEntry AVX2CostTbl[] = { 1545 { ISD::SETCC, MVT::v4i64, 1 }, 1546 { ISD::SETCC, MVT::v8i32, 1 }, 1547 { ISD::SETCC, MVT::v16i16, 1 }, 1548 { ISD::SETCC, MVT::v32i8, 1 }, 1549 }; 1550 1551 static const CostTblEntry AVX512CostTbl[] = { 1552 { ISD::SETCC, MVT::v8i64, 1 }, 1553 { ISD::SETCC, MVT::v16i32, 1 }, 1554 { ISD::SETCC, MVT::v8f64, 1 }, 1555 { ISD::SETCC, MVT::v16f32, 1 }, 1556 }; 1557 1558 static const CostTblEntry AVX512BWCostTbl[] = { 1559 { ISD::SETCC, MVT::v32i16, 1 }, 1560 { ISD::SETCC, MVT::v64i8, 1 }, 1561 }; 1562 1563 if (ST->hasBWI()) 1564 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 1565 return LT.first * Entry->Cost; 1566 1567 if (ST->hasAVX512()) 1568 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 1569 return LT.first * Entry->Cost; 1570 1571 if (ST->hasAVX2()) 1572 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 1573 return LT.first * Entry->Cost; 1574 1575 if (ST->hasAVX()) 1576 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 1577 return LT.first * Entry->Cost; 1578 1579 if (ST->hasSSE42()) 1580 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 1581 return LT.first * Entry->Cost; 1582 1583 if (ST->hasSSE2()) 1584 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 1585 return LT.first * Entry->Cost; 1586 1587 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); 1588 } 1589 1590 unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } 1591 1592 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 1593 ArrayRef<Type *> Tys, FastMathFlags FMF, 1594 unsigned ScalarizationCostPassed) { 1595 // Costs should match the codegen from: 1596 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll 1597 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll 1598 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll 1599 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll 1600 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll 1601 static const CostTblEntry AVX512CDCostTbl[] = { 1602 { ISD::CTLZ, MVT::v8i64, 1 }, 1603 { ISD::CTLZ, MVT::v16i32, 1 }, 1604 { ISD::CTLZ, MVT::v32i16, 8 }, 1605 { ISD::CTLZ, MVT::v64i8, 20 }, 1606 { ISD::CTLZ, MVT::v4i64, 1 }, 1607 { ISD::CTLZ, MVT::v8i32, 1 }, 1608 { ISD::CTLZ, MVT::v16i16, 4 }, 1609 { ISD::CTLZ, MVT::v32i8, 10 }, 1610 { ISD::CTLZ, MVT::v2i64, 1 }, 1611 { ISD::CTLZ, MVT::v4i32, 1 }, 1612 { ISD::CTLZ, MVT::v8i16, 4 }, 1613 { ISD::CTLZ, MVT::v16i8, 4 }, 1614 }; 1615 static const CostTblEntry AVX512BWCostTbl[] = { 1616 { ISD::BITREVERSE, MVT::v8i64, 5 }, 1617 { ISD::BITREVERSE, MVT::v16i32, 5 }, 1618 { ISD::BITREVERSE, MVT::v32i16, 5 }, 1619 { ISD::BITREVERSE, MVT::v64i8, 5 }, 1620 { ISD::CTLZ, MVT::v8i64, 23 }, 1621 { ISD::CTLZ, MVT::v16i32, 22 }, 1622 { ISD::CTLZ, MVT::v32i16, 18 }, 1623 { ISD::CTLZ, MVT::v64i8, 17 }, 1624 { ISD::CTPOP, MVT::v8i64, 7 }, 1625 { ISD::CTPOP, MVT::v16i32, 11 }, 1626 { ISD::CTPOP, MVT::v32i16, 9 }, 1627 { ISD::CTPOP, MVT::v64i8, 6 }, 1628 { ISD::CTTZ, MVT::v8i64, 10 }, 1629 { ISD::CTTZ, MVT::v16i32, 14 }, 1630 { ISD::CTTZ, MVT::v32i16, 12 }, 1631 { ISD::CTTZ, MVT::v64i8, 9 }, 1632 }; 1633 static const CostTblEntry AVX512CostTbl[] = { 1634 { ISD::BITREVERSE, MVT::v8i64, 36 }, 1635 { ISD::BITREVERSE, MVT::v16i32, 24 }, 1636 { ISD::CTLZ, MVT::v8i64, 29 }, 1637 { ISD::CTLZ, MVT::v16i32, 35 }, 1638 { 
ISD::CTPOP, MVT::v8i64, 16 }, 1639 { ISD::CTPOP, MVT::v16i32, 24 }, 1640 { ISD::CTTZ, MVT::v8i64, 20 }, 1641 { ISD::CTTZ, MVT::v16i32, 28 }, 1642 }; 1643 static const CostTblEntry XOPCostTbl[] = { 1644 { ISD::BITREVERSE, MVT::v4i64, 4 }, 1645 { ISD::BITREVERSE, MVT::v8i32, 4 }, 1646 { ISD::BITREVERSE, MVT::v16i16, 4 }, 1647 { ISD::BITREVERSE, MVT::v32i8, 4 }, 1648 { ISD::BITREVERSE, MVT::v2i64, 1 }, 1649 { ISD::BITREVERSE, MVT::v4i32, 1 }, 1650 { ISD::BITREVERSE, MVT::v8i16, 1 }, 1651 { ISD::BITREVERSE, MVT::v16i8, 1 }, 1652 { ISD::BITREVERSE, MVT::i64, 3 }, 1653 { ISD::BITREVERSE, MVT::i32, 3 }, 1654 { ISD::BITREVERSE, MVT::i16, 3 }, 1655 { ISD::BITREVERSE, MVT::i8, 3 } 1656 }; 1657 static const CostTblEntry AVX2CostTbl[] = { 1658 { ISD::BITREVERSE, MVT::v4i64, 5 }, 1659 { ISD::BITREVERSE, MVT::v8i32, 5 }, 1660 { ISD::BITREVERSE, MVT::v16i16, 5 }, 1661 { ISD::BITREVERSE, MVT::v32i8, 5 }, 1662 { ISD::BSWAP, MVT::v4i64, 1 }, 1663 { ISD::BSWAP, MVT::v8i32, 1 }, 1664 { ISD::BSWAP, MVT::v16i16, 1 }, 1665 { ISD::CTLZ, MVT::v4i64, 23 }, 1666 { ISD::CTLZ, MVT::v8i32, 18 }, 1667 { ISD::CTLZ, MVT::v16i16, 14 }, 1668 { ISD::CTLZ, MVT::v32i8, 9 }, 1669 { ISD::CTPOP, MVT::v4i64, 7 }, 1670 { ISD::CTPOP, MVT::v8i32, 11 }, 1671 { ISD::CTPOP, MVT::v16i16, 9 }, 1672 { ISD::CTPOP, MVT::v32i8, 6 }, 1673 { ISD::CTTZ, MVT::v4i64, 10 }, 1674 { ISD::CTTZ, MVT::v8i32, 14 }, 1675 { ISD::CTTZ, MVT::v16i16, 12 }, 1676 { ISD::CTTZ, MVT::v32i8, 9 }, 1677 { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ 1678 { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ 1679 { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ 1680 { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ 1681 { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ 1682 { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ 1683 }; 1684 static const CostTblEntry AVX1CostTbl[] = { 1685 { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert 1686 { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert 1687 { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert 1688 { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert 1689 { ISD::BSWAP, MVT::v4i64, 4 }, 1690 { ISD::BSWAP, MVT::v8i32, 4 }, 1691 { ISD::BSWAP, MVT::v16i16, 4 }, 1692 { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert 1693 { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert 1694 { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert 1695 { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert 1696 { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert 1697 { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert 1698 { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert 1699 { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert 1700 { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert 1701 { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert 1702 { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert 1703 { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert 1704 { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ 1705 { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ 1706 { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ 1707 { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ 1708 { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from 
http://www.agner.org/ 1709 { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ 1710 }; 1711 static const CostTblEntry GLMCostTbl[] = { 1712 { ISD::FSQRT, MVT::f32, 19 }, // sqrtss 1713 { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps 1714 { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd 1715 { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd 1716 }; 1717 static const CostTblEntry SLMCostTbl[] = { 1718 { ISD::FSQRT, MVT::f32, 20 }, // sqrtss 1719 { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps 1720 { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd 1721 { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd 1722 }; 1723 static const CostTblEntry SSE42CostTbl[] = { 1724 { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ 1725 { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ 1726 }; 1727 static const CostTblEntry SSSE3CostTbl[] = { 1728 { ISD::BITREVERSE, MVT::v2i64, 5 }, 1729 { ISD::BITREVERSE, MVT::v4i32, 5 }, 1730 { ISD::BITREVERSE, MVT::v8i16, 5 }, 1731 { ISD::BITREVERSE, MVT::v16i8, 5 }, 1732 { ISD::BSWAP, MVT::v2i64, 1 }, 1733 { ISD::BSWAP, MVT::v4i32, 1 }, 1734 { ISD::BSWAP, MVT::v8i16, 1 }, 1735 { ISD::CTLZ, MVT::v2i64, 23 }, 1736 { ISD::CTLZ, MVT::v4i32, 18 }, 1737 { ISD::CTLZ, MVT::v8i16, 14 }, 1738 { ISD::CTLZ, MVT::v16i8, 9 }, 1739 { ISD::CTPOP, MVT::v2i64, 7 }, 1740 { ISD::CTPOP, MVT::v4i32, 11 }, 1741 { ISD::CTPOP, MVT::v8i16, 9 }, 1742 { ISD::CTPOP, MVT::v16i8, 6 }, 1743 { ISD::CTTZ, MVT::v2i64, 10 }, 1744 { ISD::CTTZ, MVT::v4i32, 14 }, 1745 { ISD::CTTZ, MVT::v8i16, 12 }, 1746 { ISD::CTTZ, MVT::v16i8, 9 } 1747 }; 1748 static const CostTblEntry SSE2CostTbl[] = { 1749 { ISD::BITREVERSE, MVT::v2i64, 29 }, 1750 { ISD::BITREVERSE, MVT::v4i32, 27 }, 1751 { ISD::BITREVERSE, MVT::v8i16, 27 }, 1752 { ISD::BITREVERSE, MVT::v16i8, 20 }, 1753 { ISD::BSWAP, MVT::v2i64, 7 }, 1754 { ISD::BSWAP, MVT::v4i32, 7 }, 1755 { ISD::BSWAP, MVT::v8i16, 7 }, 1756 { ISD::CTLZ, MVT::v2i64, 25 }, 1757 { ISD::CTLZ, MVT::v4i32, 26 }, 1758 { ISD::CTLZ, MVT::v8i16, 20 }, 1759 { ISD::CTLZ, MVT::v16i8, 17 }, 1760 { ISD::CTPOP, MVT::v2i64, 12 }, 1761 { ISD::CTPOP, MVT::v4i32, 15 }, 1762 { ISD::CTPOP, MVT::v8i16, 13 }, 1763 { ISD::CTPOP, MVT::v16i8, 10 }, 1764 { ISD::CTTZ, MVT::v2i64, 14 }, 1765 { ISD::CTTZ, MVT::v4i32, 18 }, 1766 { ISD::CTTZ, MVT::v8i16, 16 }, 1767 { ISD::CTTZ, MVT::v16i8, 13 }, 1768 { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ 1769 { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ 1770 }; 1771 static const CostTblEntry SSE1CostTbl[] = { 1772 { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ 1773 { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ 1774 }; 1775 static const CostTblEntry X64CostTbl[] = { // 64-bit targets 1776 { ISD::BITREVERSE, MVT::i64, 14 } 1777 }; 1778 static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets 1779 { ISD::BITREVERSE, MVT::i32, 14 }, 1780 { ISD::BITREVERSE, MVT::i16, 14 }, 1781 { ISD::BITREVERSE, MVT::i8, 11 } 1782 }; 1783 1784 unsigned ISD = ISD::DELETED_NODE; 1785 switch (IID) { 1786 default: 1787 break; 1788 case Intrinsic::bitreverse: 1789 ISD = ISD::BITREVERSE; 1790 break; 1791 case Intrinsic::bswap: 1792 ISD = ISD::BSWAP; 1793 break; 1794 case Intrinsic::ctlz: 1795 ISD = ISD::CTLZ; 1796 break; 1797 case Intrinsic::ctpop: 1798 ISD = ISD::CTPOP; 1799 break; 1800 case Intrinsic::cttz: 1801 ISD = ISD::CTTZ; 1802 break; 1803 case Intrinsic::sqrt: 1804 ISD = ISD::FSQRT; 1805 break; 1806 } 1807 1808 // Legalize the type. 
1809 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); 1810 MVT MTy = LT.second; 1811 1812 // Attempt to lookup cost. 1813 if (ST->isGLM()) 1814 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) 1815 return LT.first * Entry->Cost; 1816 1817 if (ST->isSLM()) 1818 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) 1819 return LT.first * Entry->Cost; 1820 1821 if (ST->hasCDI()) 1822 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) 1823 return LT.first * Entry->Cost; 1824 1825 if (ST->hasBWI()) 1826 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) 1827 return LT.first * Entry->Cost; 1828 1829 if (ST->hasAVX512()) 1830 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) 1831 return LT.first * Entry->Cost; 1832 1833 if (ST->hasXOP()) 1834 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) 1835 return LT.first * Entry->Cost; 1836 1837 if (ST->hasAVX2()) 1838 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) 1839 return LT.first * Entry->Cost; 1840 1841 if (ST->hasAVX()) 1842 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) 1843 return LT.first * Entry->Cost; 1844 1845 if (ST->hasSSE42()) 1846 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) 1847 return LT.first * Entry->Cost; 1848 1849 if (ST->hasSSSE3()) 1850 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) 1851 return LT.first * Entry->Cost; 1852 1853 if (ST->hasSSE2()) 1854 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) 1855 return LT.first * Entry->Cost; 1856 1857 if (ST->hasSSE1()) 1858 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) 1859 return LT.first * Entry->Cost; 1860 1861 if (ST->is64Bit()) 1862 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) 1863 return LT.first * Entry->Cost; 1864 1865 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) 1866 return LT.first * Entry->Cost; 1867 1868 return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); 1869 } 1870 1871 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, 1872 ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { 1873 return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); 1874 } 1875 1876 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { 1877 assert(Val->isVectorTy() && "This must be a vector type"); 1878 1879 Type *ScalarType = Val->getScalarType(); 1880 1881 if (Index != -1U) { 1882 // Legalize the type. 1883 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); 1884 1885 // This type is legalized to a scalar type. 1886 if (!LT.second.isVector()) 1887 return 0; 1888 1889 // The type may be split. Normalize the index to the new type. 1890 unsigned Width = LT.second.getVectorNumElements(); 1891 Index = Index % Width; 1892 1893 // Floating point scalars are already located in index #0. 1894 if (ScalarType->isFloatingPointTy() && Index == 0) 1895 return 0; 1896 } 1897 1898 // Add to the base cost if we know that the extracted element of a vector is 1899 // destined to be moved to and used in the integer register file. 
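  // For example, a pointer element extracted from a vector typically needs an
  // extra move from the SIMD domain into a general purpose register before it
  // can feed address arithmetic, hence the extra unit of cost below.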
1900 int RegisterFileMoveCost = 0; 1901 if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) 1902 RegisterFileMoveCost = 1; 1903 1904 return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; 1905 } 1906 1907 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, 1908 unsigned AddressSpace, const Instruction *I) { 1909 // Handle non-power-of-two vectors such as <3 x float> 1910 if (VectorType *VTy = dyn_cast<VectorType>(Src)) { 1911 unsigned NumElem = VTy->getVectorNumElements(); 1912 1913 // Handle a few common cases: 1914 // <3 x float> 1915 if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) 1916 // Cost = 64 bit store + extract + 32 bit store. 1917 return 3; 1918 1919 // <3 x double> 1920 if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) 1921 // Cost = 128 bit store + unpack + 64 bit store. 1922 return 3; 1923 1924 // Assume that all other non-power-of-two numbers are scalarized. 1925 if (!isPowerOf2_32(NumElem)) { 1926 int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, 1927 AddressSpace); 1928 int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, 1929 Opcode == Instruction::Store); 1930 return NumElem * Cost + SplitCost; 1931 } 1932 } 1933 1934 // Legalize the type. 1935 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 1936 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 1937 "Invalid Opcode"); 1938 1939 // Each load/store unit costs 1. 1940 int Cost = LT.first * 1; 1941 1942 // This isn't exactly right. We're using slow unaligned 32-byte accesses as a 1943 // proxy for a double-pumped AVX memory interface such as on Sandybridge. 1944 if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow()) 1945 Cost *= 2; 1946 1947 return Cost; 1948 } 1949 1950 int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, 1951 unsigned Alignment, 1952 unsigned AddressSpace) { 1953 VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); 1954 if (!SrcVTy) 1955 // To calculate scalar take the regular cost, without mask 1956 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); 1957 1958 unsigned NumElem = SrcVTy->getVectorNumElements(); 1959 VectorType *MaskTy = 1960 VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); 1961 if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || 1962 (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || 1963 !isPowerOf2_32(NumElem)) { 1964 // Scalarization 1965 int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); 1966 int ScalarCompareCost = getCmpSelInstrCost( 1967 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr); 1968 int BranchCost = getCFInstrCost(Instruction::Br); 1969 int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); 1970 1971 int ValueSplitCost = getScalarizationOverhead( 1972 SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); 1973 int MemopCost = 1974 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 1975 Alignment, AddressSpace); 1976 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; 1977 } 1978 1979 // Legalize the type. 1980 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); 1981 auto VT = TLI->getValueType(DL, SrcVTy); 1982 int Cost = 0; 1983 if (VT.isSimple() && LT.second != VT.getSimpleVT() && 1984 LT.second.getVectorNumElements() == NumElem) 1985 // Promotion requires expand/truncate for data and a shuffle for mask. 
1986     Cost += getShuffleCost(TTI::SK_Select, SrcVTy, 0, nullptr) +
1987             getShuffleCost(TTI::SK_Select, MaskTy, 0, nullptr);
1988 
1989   else if (LT.second.getVectorNumElements() > NumElem) {
1990     VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
1991                                             LT.second.getVectorNumElements());
1992     // Expanding requires filling the mask with zeroes.
1993     Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
1994   }
1995   if (!ST->hasAVX512())
1996     return Cost + LT.first * 4; // Each maskmov costs 4.
1997 
1998   // AVX-512 masked load/store is cheaper.
1999   return Cost + LT.first;
2000 }
2001 
2002 int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
2003                                           const SCEV *Ptr) {
2004   // Address computations in vectorized code with non-consecutive addresses will
2005   // likely result in more instructions compared to scalar code where the
2006   // computation can more often be merged into the index mode. The resulting
2007   // extra micro-ops can significantly decrease throughput.
2008   unsigned NumVectorInstToHideOverhead = 10;
2009 
2010   // Cost modeling of Strided Access Computation is hidden by the indexing
2011   // modes of X86 regardless of the stride value. We don't believe that there
2012   // is a difference between a constant strided access in general and a
2013   // constant stride whose value is less than or equal to 64.
2014   // Even in the case of a (loop invariant) stride whose value is not known at
2015   // compile time, the address computation will not incur more than one extra
2016   // ADD instruction.
2017   if (Ty->isVectorTy() && SE) {
2018     if (!BaseT::isStridedAccess(Ptr))
2019       return NumVectorInstToHideOverhead;
2020     if (!BaseT::getConstantStrideStep(SE, Ptr))
2021       return 1;
2022   }
2023 
2024   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
2025 }
2026 
2027 int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
2028                                            bool IsPairwise) {
2029 
2030   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2031 
2032   MVT MTy = LT.second;
2033 
2034   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2035   assert(ISD && "Invalid opcode");
2036 
2037   // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
2038   // and use that as the cost.
2039 
2040   static const CostTblEntry SSE42CostTblPairWise[] = {
2041     { ISD::FADD, MVT::v2f64, 2 },
2042     { ISD::FADD, MVT::v4f32, 4 },
2043     { ISD::ADD, MVT::v2i64, 2 },  // The data reported by the IACA tool is "1.6".
2044     { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.5".
2045     { ISD::ADD, MVT::v8i16, 5 },
2046   };
2047 
2048   static const CostTblEntry AVX1CostTblPairWise[] = {
2049     { ISD::FADD, MVT::v4f32, 4 },
2050     { ISD::FADD, MVT::v4f64, 5 },
2051     { ISD::FADD, MVT::v8f32, 7 },
2052     { ISD::ADD, MVT::v2i64, 1 },  // The data reported by the IACA tool is "1.5".
2053     { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.5".
2054     { ISD::ADD, MVT::v4i64, 5 },  // The data reported by the IACA tool is "4.8".
2055     { ISD::ADD, MVT::v8i16, 5 },
2056     { ISD::ADD, MVT::v8i32, 5 },
2057   };
2058 
2059   static const CostTblEntry SSE42CostTblNoPairWise[] = {
2060     { ISD::FADD, MVT::v2f64, 2 },
2061     { ISD::FADD, MVT::v4f32, 4 },
2062     { ISD::ADD, MVT::v2i64, 2 },  // The data reported by the IACA tool is "1.6".
2063     { ISD::ADD, MVT::v4i32, 3 },  // The data reported by the IACA tool is "3.3".
2064     { ISD::ADD, MVT::v8i16, 4 },  // The data reported by the IACA tool is "4.3".
2065 }; 2066 2067 static const CostTblEntry AVX1CostTblNoPairWise[] = { 2068 { ISD::FADD, MVT::v4f32, 3 }, 2069 { ISD::FADD, MVT::v4f64, 3 }, 2070 { ISD::FADD, MVT::v8f32, 4 }, 2071 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". 2072 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". 2073 { ISD::ADD, MVT::v4i64, 3 }, 2074 { ISD::ADD, MVT::v8i16, 4 }, 2075 { ISD::ADD, MVT::v8i32, 5 }, 2076 }; 2077 2078 if (IsPairwise) { 2079 if (ST->hasAVX()) 2080 if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) 2081 return LT.first * Entry->Cost; 2082 2083 if (ST->hasSSE42()) 2084 if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) 2085 return LT.first * Entry->Cost; 2086 } else { 2087 if (ST->hasAVX()) 2088 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 2089 return LT.first * Entry->Cost; 2090 2091 if (ST->hasSSE42()) 2092 if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) 2093 return LT.first * Entry->Cost; 2094 } 2095 2096 return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); 2097 } 2098 2099 int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, 2100 bool IsPairwise, bool IsUnsigned) { 2101 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 2102 2103 MVT MTy = LT.second; 2104 2105 int ISD; 2106 if (ValTy->isIntOrIntVectorTy()) { 2107 ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; 2108 } else { 2109 assert(ValTy->isFPOrFPVectorTy() && 2110 "Expected float point or integer vector type."); 2111 ISD = ISD::FMINNUM; 2112 } 2113 2114 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 2115 // and make it as the cost. 2116 2117 static const CostTblEntry SSE42CostTblPairWise[] = { 2118 {ISD::FMINNUM, MVT::v2f64, 3}, 2119 {ISD::FMINNUM, MVT::v4f32, 2}, 2120 {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" 2121 {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" 2122 {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" 2123 {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" 2124 {ISD::SMIN, MVT::v8i16, 2}, 2125 {ISD::UMIN, MVT::v8i16, 2}, 2126 }; 2127 2128 static const CostTblEntry AVX1CostTblPairWise[] = { 2129 {ISD::FMINNUM, MVT::v4f32, 1}, 2130 {ISD::FMINNUM, MVT::v4f64, 1}, 2131 {ISD::FMINNUM, MVT::v8f32, 2}, 2132 {ISD::SMIN, MVT::v2i64, 3}, 2133 {ISD::UMIN, MVT::v2i64, 3}, 2134 {ISD::SMIN, MVT::v4i32, 1}, 2135 {ISD::UMIN, MVT::v4i32, 1}, 2136 {ISD::SMIN, MVT::v8i16, 1}, 2137 {ISD::UMIN, MVT::v8i16, 1}, 2138 {ISD::SMIN, MVT::v8i32, 3}, 2139 {ISD::UMIN, MVT::v8i32, 3}, 2140 }; 2141 2142 static const CostTblEntry AVX2CostTblPairWise[] = { 2143 {ISD::SMIN, MVT::v4i64, 2}, 2144 {ISD::UMIN, MVT::v4i64, 2}, 2145 {ISD::SMIN, MVT::v8i32, 1}, 2146 {ISD::UMIN, MVT::v8i32, 1}, 2147 {ISD::SMIN, MVT::v16i16, 1}, 2148 {ISD::UMIN, MVT::v16i16, 1}, 2149 {ISD::SMIN, MVT::v32i8, 2}, 2150 {ISD::UMIN, MVT::v32i8, 2}, 2151 }; 2152 2153 static const CostTblEntry AVX512CostTblPairWise[] = { 2154 {ISD::FMINNUM, MVT::v8f64, 1}, 2155 {ISD::FMINNUM, MVT::v16f32, 2}, 2156 {ISD::SMIN, MVT::v8i64, 2}, 2157 {ISD::UMIN, MVT::v8i64, 2}, 2158 {ISD::SMIN, MVT::v16i32, 1}, 2159 {ISD::UMIN, MVT::v16i32, 1}, 2160 }; 2161 2162 static const CostTblEntry SSE42CostTblNoPairWise[] = { 2163 {ISD::FMINNUM, MVT::v2f64, 3}, 2164 {ISD::FMINNUM, MVT::v4f32, 3}, 2165 {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" 2166 {ISD::UMIN, MVT::v2i64, 9}, // 
The data reported by the IACA is "8.6" 2167 {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" 2168 {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" 2169 {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" 2170 {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" 2171 }; 2172 2173 static const CostTblEntry AVX1CostTblNoPairWise[] = { 2174 {ISD::FMINNUM, MVT::v4f32, 1}, 2175 {ISD::FMINNUM, MVT::v4f64, 1}, 2176 {ISD::FMINNUM, MVT::v8f32, 1}, 2177 {ISD::SMIN, MVT::v2i64, 3}, 2178 {ISD::UMIN, MVT::v2i64, 3}, 2179 {ISD::SMIN, MVT::v4i32, 1}, 2180 {ISD::UMIN, MVT::v4i32, 1}, 2181 {ISD::SMIN, MVT::v8i16, 1}, 2182 {ISD::UMIN, MVT::v8i16, 1}, 2183 {ISD::SMIN, MVT::v8i32, 2}, 2184 {ISD::UMIN, MVT::v8i32, 2}, 2185 }; 2186 2187 static const CostTblEntry AVX2CostTblNoPairWise[] = { 2188 {ISD::SMIN, MVT::v4i64, 1}, 2189 {ISD::UMIN, MVT::v4i64, 1}, 2190 {ISD::SMIN, MVT::v8i32, 1}, 2191 {ISD::UMIN, MVT::v8i32, 1}, 2192 {ISD::SMIN, MVT::v16i16, 1}, 2193 {ISD::UMIN, MVT::v16i16, 1}, 2194 {ISD::SMIN, MVT::v32i8, 1}, 2195 {ISD::UMIN, MVT::v32i8, 1}, 2196 }; 2197 2198 static const CostTblEntry AVX512CostTblNoPairWise[] = { 2199 {ISD::FMINNUM, MVT::v8f64, 1}, 2200 {ISD::FMINNUM, MVT::v16f32, 2}, 2201 {ISD::SMIN, MVT::v8i64, 1}, 2202 {ISD::UMIN, MVT::v8i64, 1}, 2203 {ISD::SMIN, MVT::v16i32, 1}, 2204 {ISD::UMIN, MVT::v16i32, 1}, 2205 }; 2206 2207 if (IsPairwise) { 2208 if (ST->hasAVX512()) 2209 if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) 2210 return LT.first * Entry->Cost; 2211 2212 if (ST->hasAVX2()) 2213 if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) 2214 return LT.first * Entry->Cost; 2215 2216 if (ST->hasAVX()) 2217 if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) 2218 return LT.first * Entry->Cost; 2219 2220 if (ST->hasSSE42()) 2221 if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) 2222 return LT.first * Entry->Cost; 2223 } else { 2224 if (ST->hasAVX512()) 2225 if (const auto *Entry = 2226 CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) 2227 return LT.first * Entry->Cost; 2228 2229 if (ST->hasAVX2()) 2230 if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) 2231 return LT.first * Entry->Cost; 2232 2233 if (ST->hasAVX()) 2234 if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) 2235 return LT.first * Entry->Cost; 2236 2237 if (ST->hasSSE42()) 2238 if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) 2239 return LT.first * Entry->Cost; 2240 } 2241 2242 return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); 2243 } 2244 2245 /// Calculate the cost of materializing a 64-bit value. This helper 2246 /// method might only calculate a fraction of a larger immediate. Therefore it 2247 /// is valid to return a cost of ZERO. 2248 int X86TTIImpl::getIntImmCost(int64_t Val) { 2249 if (Val == 0) 2250 return TTI::TCC_Free; 2251 2252 if (isInt<32>(Val)) 2253 return TTI::TCC_Basic; 2254 2255 return 2 * TTI::TCC_Basic; 2256 } 2257 2258 int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { 2259 assert(Ty->isIntegerTy()); 2260 2261 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2262 if (BitSize == 0) 2263 return ~0U; 2264 2265 // Never hoist constants larger than 128bit, because this might lead to 2266 // incorrect code generation or assertions in codegen. 
2267 // Fixme: Create a cost model for types larger than i128 once the codegen 2268 // issues have been fixed. 2269 if (BitSize > 128) 2270 return TTI::TCC_Free; 2271 2272 if (Imm == 0) 2273 return TTI::TCC_Free; 2274 2275 // Sign-extend all constants to a multiple of 64-bit. 2276 APInt ImmVal = Imm; 2277 if (BitSize % 64 != 0) 2278 ImmVal = Imm.sext(alignTo(BitSize, 64)); 2279 2280 // Split the constant into 64-bit chunks and calculate the cost for each 2281 // chunk. 2282 int Cost = 0; 2283 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { 2284 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); 2285 int64_t Val = Tmp.getSExtValue(); 2286 Cost += getIntImmCost(Val); 2287 } 2288 // We need at least one instruction to materialize the constant. 2289 return std::max(1, Cost); 2290 } 2291 2292 int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, 2293 Type *Ty) { 2294 assert(Ty->isIntegerTy()); 2295 2296 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2297 // There is no cost model for constants with a bit size of 0. Return TCC_Free 2298 // here, so that constant hoisting will ignore this constant. 2299 if (BitSize == 0) 2300 return TTI::TCC_Free; 2301 2302 unsigned ImmIdx = ~0U; 2303 switch (Opcode) { 2304 default: 2305 return TTI::TCC_Free; 2306 case Instruction::GetElementPtr: 2307 // Always hoist the base address of a GetElementPtr. This prevents the 2308 // creation of new constants for every base constant that gets constant 2309 // folded with the offset. 2310 if (Idx == 0) 2311 return 2 * TTI::TCC_Basic; 2312 return TTI::TCC_Free; 2313 case Instruction::Store: 2314 ImmIdx = 0; 2315 break; 2316 case Instruction::ICmp: 2317 // This is an imperfect hack to prevent constant hoisting of 2318 // compares that might be trying to check if a 64-bit value fits in 2319 // 32-bits. The backend can optimize these cases using a right shift by 32. 2320 // Ideally we would check the compare predicate here. There also other 2321 // similar immediates the backend can use shifts for. 2322 if (Idx == 1 && Imm.getBitWidth() == 64) { 2323 uint64_t ImmVal = Imm.getZExtValue(); 2324 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) 2325 return TTI::TCC_Free; 2326 } 2327 ImmIdx = 1; 2328 break; 2329 case Instruction::And: 2330 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes 2331 // by using a 32-bit operation with implicit zero extension. Detect such 2332 // immediates here as the normal path expects bit 31 to be sign extended. 2333 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) 2334 return TTI::TCC_Free; 2335 ImmIdx = 1; 2336 break; 2337 case Instruction::Add: 2338 case Instruction::Sub: 2339 // For add/sub, we can use the opposite instruction for INT32_MIN. 2340 if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) 2341 return TTI::TCC_Free; 2342 ImmIdx = 1; 2343 break; 2344 case Instruction::Mul: 2345 case Instruction::UDiv: 2346 case Instruction::SDiv: 2347 case Instruction::URem: 2348 case Instruction::SRem: 2349 case Instruction::Or: 2350 case Instruction::Xor: 2351 ImmIdx = 1; 2352 break; 2353 // Always return TCC_Free for the shift value of a shift instruction. 
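  // (x86 shifts encode an immediate count directly in the instruction, so
  // hoisting the count into a register would only increase register pressure.)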
2354 case Instruction::Shl: 2355 case Instruction::LShr: 2356 case Instruction::AShr: 2357 if (Idx == 1) 2358 return TTI::TCC_Free; 2359 break; 2360 case Instruction::Trunc: 2361 case Instruction::ZExt: 2362 case Instruction::SExt: 2363 case Instruction::IntToPtr: 2364 case Instruction::PtrToInt: 2365 case Instruction::BitCast: 2366 case Instruction::PHI: 2367 case Instruction::Call: 2368 case Instruction::Select: 2369 case Instruction::Ret: 2370 case Instruction::Load: 2371 break; 2372 } 2373 2374 if (Idx == ImmIdx) { 2375 int NumConstants = divideCeil(BitSize, 64); 2376 int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); 2377 return (Cost <= NumConstants * TTI::TCC_Basic) 2378 ? static_cast<int>(TTI::TCC_Free) 2379 : Cost; 2380 } 2381 2382 return X86TTIImpl::getIntImmCost(Imm, Ty); 2383 } 2384 2385 int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, 2386 Type *Ty) { 2387 assert(Ty->isIntegerTy()); 2388 2389 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 2390 // There is no cost model for constants with a bit size of 0. Return TCC_Free 2391 // here, so that constant hoisting will ignore this constant. 2392 if (BitSize == 0) 2393 return TTI::TCC_Free; 2394 2395 switch (IID) { 2396 default: 2397 return TTI::TCC_Free; 2398 case Intrinsic::sadd_with_overflow: 2399 case Intrinsic::uadd_with_overflow: 2400 case Intrinsic::ssub_with_overflow: 2401 case Intrinsic::usub_with_overflow: 2402 case Intrinsic::smul_with_overflow: 2403 case Intrinsic::umul_with_overflow: 2404 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) 2405 return TTI::TCC_Free; 2406 break; 2407 case Intrinsic::experimental_stackmap: 2408 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 2409 return TTI::TCC_Free; 2410 break; 2411 case Intrinsic::experimental_patchpoint_void: 2412 case Intrinsic::experimental_patchpoint_i64: 2413 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) 2414 return TTI::TCC_Free; 2415 break; 2416 } 2417 return X86TTIImpl::getIntImmCost(Imm, Ty); 2418 } 2419 2420 unsigned X86TTIImpl::getUserCost(const User *U, 2421 ArrayRef<const Value *> Operands) { 2422 if (isa<StoreInst>(U)) { 2423 Value *Ptr = U->getOperand(1); 2424 // Store instruction with index and scale costs 2 Uops. 2425 // Check the preceding GEP to identify non-const indices. 2426 if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) { 2427 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) 2428 return TTI::TCC_Basic * 2; 2429 } 2430 return TTI::TCC_Basic; 2431 } 2432 return BaseT::getUserCost(U, Operands); 2433 } 2434 2435 // Return an average cost of Gather / Scatter instruction, maybe improved later 2436 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, 2437 unsigned Alignment, unsigned AddressSpace) { 2438 2439 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); 2440 unsigned VF = SrcVTy->getVectorNumElements(); 2441 2442 // Try to reduce index size from 64 bit (default for GEP) 2443 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the 2444 // operation will use 16 x 64 indices which do not fit in a zmm and needs 2445 // to split. Also check that the base pointer is the same for all lanes, 2446 // and that there's at most one variable index. 
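  // Illustrative example (not from this file): for a 16-lane gather whose only
  // variable GEP index is 'sext <16 x i32> %i to <16 x i64>', 32-bit indices
  // are sufficient, so the index vector fits in one register and the operation
  // can stay a single 512-bit gather instead of being split.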
2447 auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { 2448 unsigned IndexSize = DL.getPointerSizeInBits(); 2449 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); 2450 if (IndexSize < 64 || !GEP) 2451 return IndexSize; 2452 2453 unsigned NumOfVarIndices = 0; 2454 Value *Ptrs = GEP->getPointerOperand(); 2455 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) 2456 return IndexSize; 2457 for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { 2458 if (isa<Constant>(GEP->getOperand(i))) 2459 continue; 2460 Type *IndxTy = GEP->getOperand(i)->getType(); 2461 if (IndxTy->isVectorTy()) 2462 IndxTy = IndxTy->getVectorElementType(); 2463 if ((IndxTy->getPrimitiveSizeInBits() == 64 && 2464 !isa<SExtInst>(GEP->getOperand(i))) || 2465 ++NumOfVarIndices > 1) 2466 return IndexSize; // 64 2467 } 2468 return (unsigned)32; 2469 }; 2470 2471 2472 // Trying to reduce IndexSize to 32 bits for vector 16. 2473 // By default the IndexSize is equal to pointer size. 2474 unsigned IndexSize = (ST->hasAVX512() && VF >= 16) 2475 ? getIndexSizeInBits(Ptr, DL) 2476 : DL.getPointerSizeInBits(); 2477 2478 Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), 2479 IndexSize), VF); 2480 std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); 2481 std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); 2482 int SplitFactor = std::max(IdxsLT.first, SrcLT.first); 2483 if (SplitFactor > 1) { 2484 // Handle splitting of vector of pointers 2485 Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); 2486 return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, 2487 AddressSpace); 2488 } 2489 2490 // The gather / scatter cost is given by Intel architects. It is a rough 2491 // number since we are looking at one instruction in a time. 2492 const int GSOverhead = (Opcode == Instruction::Load) 2493 ? ST->getGatherOverhead() 2494 : ST->getScatterOverhead(); 2495 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 2496 Alignment, AddressSpace); 2497 } 2498 2499 /// Return the cost of full scalarization of gather / scatter operation. 2500 /// 2501 /// Opcode - Load or Store instruction. 2502 /// SrcVTy - The type of the data vector that should be gathered or scattered. 2503 /// VariableMask - The mask is non-constant at compile time. 2504 /// Alignment - Alignment for one element. 2505 /// AddressSpace - pointer[s] address space. 2506 /// 2507 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, 2508 bool VariableMask, unsigned Alignment, 2509 unsigned AddressSpace) { 2510 unsigned VF = SrcVTy->getVectorNumElements(); 2511 2512 int MaskUnpackCost = 0; 2513 if (VariableMask) { 2514 VectorType *MaskTy = 2515 VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); 2516 MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); 2517 int ScalarCompareCost = 2518 getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), 2519 nullptr); 2520 int BranchCost = getCFInstrCost(Instruction::Br); 2521 MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); 2522 } 2523 2524 // The cost of the scalar loads/stores. 
2525 int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), 2526 Alignment, AddressSpace); 2527 2528 int InsertExtractCost = 0; 2529 if (Opcode == Instruction::Load) 2530 for (unsigned i = 0; i < VF; ++i) 2531 // Add the cost of inserting each scalar load into the vector 2532 InsertExtractCost += 2533 getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); 2534 else 2535 for (unsigned i = 0; i < VF; ++i) 2536 // Add the cost of extracting each element out of the data vector 2537 InsertExtractCost += 2538 getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); 2539 2540 return MemoryOpCost + MaskUnpackCost + InsertExtractCost; 2541 } 2542 2543 /// Calculate the cost of Gather / Scatter operation 2544 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, 2545 Value *Ptr, bool VariableMask, 2546 unsigned Alignment) { 2547 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); 2548 unsigned VF = SrcVTy->getVectorNumElements(); 2549 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); 2550 if (!PtrTy && Ptr->getType()->isVectorTy()) 2551 PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); 2552 assert(PtrTy && "Unexpected type for Ptr argument"); 2553 unsigned AddressSpace = PtrTy->getAddressSpace(); 2554 2555 bool Scalarize = false; 2556 if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || 2557 (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) 2558 Scalarize = true; 2559 // Gather / Scatter for vector 2 is not profitable on KNL / SKX 2560 // Vector-4 of gather/scatter instruction does not exist on KNL. 2561 // We can extend it to 8 elements, but zeroing upper bits of 2562 // the mask vector will add more instructions. Right now we give the scalar 2563 // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction 2564 // is better in the VariableMask case. 2565 if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) 2566 Scalarize = true; 2567 2568 if (Scalarize) 2569 return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, 2570 AddressSpace); 2571 2572 return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); 2573 } 2574 2575 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, 2576 TargetTransformInfo::LSRCost &C2) { 2577 // X86 specific here are "instruction number 1st priority". 2578 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, 2579 C1.NumIVMuls, C1.NumBaseAdds, 2580 C1.ScaleCost, C1.ImmCost, C1.SetupCost) < 2581 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, 2582 C2.NumIVMuls, C2.NumBaseAdds, 2583 C2.ScaleCost, C2.ImmCost, C2.SetupCost); 2584 } 2585 2586 bool X86TTIImpl::canMacroFuseCmp() { 2587 return ST->hasMacroFusion(); 2588 } 2589 2590 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { 2591 // The backend can't handle a single element vector. 2592 if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1) 2593 return false; 2594 Type *ScalarTy = DataTy->getScalarType(); 2595 int DataWidth = isa<PointerType>(ScalarTy) ? 
2596 DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); 2597 2598 return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) || 2599 ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI()); 2600 } 2601 2602 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { 2603 return isLegalMaskedLoad(DataType); 2604 } 2605 2606 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { 2607 // This function is called now in two cases: from the Loop Vectorizer 2608 // and from the Scalarizer. 2609 // When the Loop Vectorizer asks about legality of the feature, 2610 // the vectorization factor is not calculated yet. The Loop Vectorizer 2611 // sends a scalar type and the decision is based on the width of the 2612 // scalar element. 2613 // Later on, the cost model will estimate usage this intrinsic based on 2614 // the vector type. 2615 // The Scalarizer asks again about legality. It sends a vector type. 2616 // In this case we can reject non-power-of-2 vectors. 2617 // We also reject single element vectors as the type legalizer can't 2618 // scalarize it. 2619 if (isa<VectorType>(DataTy)) { 2620 unsigned NumElts = DataTy->getVectorNumElements(); 2621 if (NumElts == 1 || !isPowerOf2_32(NumElts)) 2622 return false; 2623 } 2624 Type *ScalarTy = DataTy->getScalarType(); 2625 int DataWidth = isa<PointerType>(ScalarTy) ? 2626 DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); 2627 2628 // Some CPUs have better gather performance than others. 2629 // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only 2630 // enable gather with a -march. 2631 return (DataWidth == 32 || DataWidth == 64) && 2632 (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())); 2633 } 2634 2635 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { 2636 // AVX2 doesn't support scatter 2637 if (!ST->hasAVX512()) 2638 return false; 2639 return isLegalMaskedGather(DataType); 2640 } 2641 2642 bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { 2643 EVT VT = TLI->getValueType(DL, DataType); 2644 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); 2645 } 2646 2647 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 2648 return false; 2649 } 2650 2651 bool X86TTIImpl::areInlineCompatible(const Function *Caller, 2652 const Function *Callee) const { 2653 const TargetMachine &TM = getTLI()->getTargetMachine(); 2654 2655 // Work this as a subsetting of subtarget features. 2656 const FeatureBitset &CallerBits = 2657 TM.getSubtargetImpl(*Caller)->getFeatureBits(); 2658 const FeatureBitset &CalleeBits = 2659 TM.getSubtargetImpl(*Callee)->getFeatureBits(); 2660 2661 // FIXME: This is likely too limiting as it will include subtarget features 2662 // that we might not care about for inlining, but it is conservatively 2663 // correct. 2664 return (CallerBits & CalleeBits) == CalleeBits; 2665 } 2666 2667 const X86TTIImpl::TTI::MemCmpExpansionOptions * 2668 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { 2669 // Only enable vector loads for equality comparison. 2670 // Right now the vector version is not as fast, see #33329. 
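  // The allowed load sizes below are listed from widest to narrowest so that
  // the memcmp expansion can try the widest legal load first and fall back to
  // smaller loads for the remaining tail.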
2671 static const auto ThreeWayOptions = [this]() { 2672 TTI::MemCmpExpansionOptions Options; 2673 if (ST->is64Bit()) { 2674 Options.LoadSizes.push_back(8); 2675 } 2676 Options.LoadSizes.push_back(4); 2677 Options.LoadSizes.push_back(2); 2678 Options.LoadSizes.push_back(1); 2679 return Options; 2680 }(); 2681 static const auto EqZeroOptions = [this]() { 2682 TTI::MemCmpExpansionOptions Options; 2683 // TODO: enable AVX512 when the DAG is ready. 2684 // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); 2685 if (ST->hasAVX2()) Options.LoadSizes.push_back(32); 2686 if (ST->hasSSE2()) Options.LoadSizes.push_back(16); 2687 if (ST->is64Bit()) { 2688 Options.LoadSizes.push_back(8); 2689 } 2690 Options.LoadSizes.push_back(4); 2691 Options.LoadSizes.push_back(2); 2692 Options.LoadSizes.push_back(1); 2693 return Options; 2694 }(); 2695 return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; 2696 } 2697 2698 bool X86TTIImpl::enableInterleavedAccessVectorization() { 2699 // TODO: We expect this to be beneficial regardless of arch, 2700 // but there are currently some unexplained performance artifacts on Atom. 2701 // As a temporary solution, disable on Atom. 2702 return !(ST->isAtom()); 2703 } 2704 2705 // Get estimation for interleaved load/store operations for AVX2. 2706 // \p Factor is the interleaved-access factor (stride) - number of 2707 // (interleaved) elements in the group. 2708 // \p Indices contains the indices for a strided load: when the 2709 // interleaved load has gaps they indicate which elements are used. 2710 // If Indices is empty (or if the number of indices is equal to the size 2711 // of the interleaved-access as given in \p Factor) the access has no gaps. 2712 // 2713 // As opposed to AVX-512, AVX2 does not have generic shuffles that allow 2714 // computing the cost using a generic formula as a function of generic 2715 // shuffles. We therefore use a lookup table instead, filled according to 2716 // the instruction sequences that codegen currently generates. 2717 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, 2718 unsigned Factor, 2719 ArrayRef<unsigned> Indices, 2720 unsigned Alignment, 2721 unsigned AddressSpace) { 2722 2723 // We currently Support only fully-interleaved groups, with no gaps. 2724 // TODO: Support also strided loads (interleaved-groups with gaps). 2725 if (Indices.size() && Indices.size() != Factor) 2726 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2727 Alignment, AddressSpace); 2728 2729 // VecTy for interleave memop is <VF*Factor x Elt>. 2730 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have 2731 // VecTy = <12 x i32>. 2732 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; 2733 2734 // This function can be called with VecTy=<6xi128>, Factor=3, in which case 2735 // the VF=2, while v2i128 is an unsupported MVT vector type 2736 // (see MachineValueType.h::getVectorVT()). 2737 if (!LegalVT.isVector()) 2738 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2739 Alignment, AddressSpace); 2740 2741 unsigned VF = VecTy->getVectorNumElements() / Factor; 2742 Type *ScalarTy = VecTy->getVectorElementType(); 2743 2744 // Calculate the number of memory operations (NumOfMemOps), required 2745 // for load/store the VecTy. 2746 unsigned VecTySize = DL.getTypeStoreSize(VecTy); 2747 unsigned LegalVTSize = LegalVT.getStoreSize(); 2748 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; 2749 2750 // Get the cost of one memory operation. 
2751 Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), 2752 LegalVT.getVectorNumElements()); 2753 unsigned MemOpCost = 2754 getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); 2755 2756 VectorType *VT = VectorType::get(ScalarTy, VF); 2757 EVT ETy = TLI->getValueType(DL, VT); 2758 if (!ETy.isSimple()) 2759 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2760 Alignment, AddressSpace); 2761 2762 // TODO: Complete for other data-types and strides. 2763 // Each combination of Stride, ElementTy and VF results in a different 2764 // sequence; The cost tables are therefore accessed with: 2765 // Factor (stride) and VectorType=VFxElemType. 2766 // The Cost accounts only for the shuffle sequence; 2767 // The cost of the loads/stores is accounted for separately. 2768 // 2769 static const CostTblEntry AVX2InterleavedLoadTbl[] = { 2770 { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64 2771 { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64 2772 2773 { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 2774 { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 2775 { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 2776 { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 2777 { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 2778 { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32 2779 2780 { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 2781 { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 2782 { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 2783 { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 2784 { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8 2785 2786 { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32 2787 }; 2788 2789 static const CostTblEntry AVX2InterleavedStoreTbl[] = { 2790 { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store) 2791 { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store) 2792 2793 { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) 2794 { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) 2795 { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) 2796 { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store) 2797 { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store) 2798 2799 { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) 2800 { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) 2801 { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store) 2802 { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store) 2803 { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store) 2804 }; 2805 2806 if (Opcode == Instruction::Load) { 2807 if (const auto *Entry = 2808 CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) 2809 return NumOfMemOps * MemOpCost + Entry->Cost; 2810 } else { 2811 assert(Opcode == Instruction::Store && 2812 "Expected Store Instruction at this point"); 2813 if (const auto *Entry = 2814 CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) 2815 return NumOfMemOps * MemOpCost + Entry->Cost; 2816 } 2817 2818 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2819 Alignment, AddressSpace); 2820 } 2821 2822 // Get estimation for interleaved load/store operations and strided load. 
2823 // \p Indices contains indices for strided load. 2824 // \p Factor - the factor of interleaving. 2825 // AVX-512 provides 3-src shuffles that significantly reduces the cost. 2826 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, 2827 unsigned Factor, 2828 ArrayRef<unsigned> Indices, 2829 unsigned Alignment, 2830 unsigned AddressSpace) { 2831 2832 // VecTy for interleave memop is <VF*Factor x Elt>. 2833 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have 2834 // VecTy = <12 x i32>. 2835 2836 // Calculate the number of memory operations (NumOfMemOps), required 2837 // for load/store the VecTy. 2838 MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; 2839 unsigned VecTySize = DL.getTypeStoreSize(VecTy); 2840 unsigned LegalVTSize = LegalVT.getStoreSize(); 2841 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; 2842 2843 // Get the cost of one memory operation. 2844 Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), 2845 LegalVT.getVectorNumElements()); 2846 unsigned MemOpCost = 2847 getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); 2848 2849 unsigned VF = VecTy->getVectorNumElements() / Factor; 2850 MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); 2851 2852 if (Opcode == Instruction::Load) { 2853 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) 2854 // contain the cost of the optimized shuffle sequence that the 2855 // X86InterleavedAccess pass will generate. 2856 // The cost of loads and stores are computed separately from the table. 2857 2858 // X86InterleavedAccess support only the following interleaved-access group. 2859 static const CostTblEntry AVX512InterleavedLoadTbl[] = { 2860 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 2861 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 2862 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8 2863 }; 2864 2865 if (const auto *Entry = 2866 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) 2867 return NumOfMemOps * MemOpCost + Entry->Cost; 2868 //If an entry does not exist, fallback to the default implementation. 2869 2870 // Kind of shuffle depends on number of loaded values. 2871 // If we load the entire data in one register, we can use a 1-src shuffle. 2872 // Otherwise, we'll merge 2 sources in each operation. 2873 TTI::ShuffleKind ShuffleKind = 2874 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; 2875 2876 unsigned ShuffleCost = 2877 getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); 2878 2879 unsigned NumOfLoadsInInterleaveGrp = 2880 Indices.size() ? Indices.size() : Factor; 2881 Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), 2882 VecTy->getVectorNumElements() / Factor); 2883 unsigned NumOfResults = 2884 getTLI()->getTypeLegalizationCost(DL, ResultTy).first * 2885 NumOfLoadsInInterleaveGrp; 2886 2887 // About a half of the loads may be folded in shuffles when we have only 2888 // one result. If we have more than one result, we do not fold loads at all. 2889 unsigned NumOfUnfoldedLoads = 2890 NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; 2891 2892 // Get a number of shuffle operations per result. 2893 unsigned NumOfShufflesPerResult = 2894 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); 2895 2896 // The SK_MergeTwoSrc shuffle clobbers one of src operands. 2897 // When we have more than one destination, we need additional instructions 2898 // to keep sources. 
2899 unsigned NumOfMoves = 0; 2900 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) 2901 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; 2902 2903 int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + 2904 NumOfUnfoldedLoads * MemOpCost + NumOfMoves; 2905 2906 return Cost; 2907 } 2908 2909 // Store. 2910 assert(Opcode == Instruction::Store && 2911 "Expected Store Instruction at this point"); 2912 // X86InterleavedAccess support only the following interleaved-access group. 2913 static const CostTblEntry AVX512InterleavedStoreTbl[] = { 2914 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) 2915 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) 2916 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store) 2917 2918 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) 2919 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) 2920 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) 2921 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store) 2922 }; 2923 2924 if (const auto *Entry = 2925 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) 2926 return NumOfMemOps * MemOpCost + Entry->Cost; 2927 //If an entry does not exist, fallback to the default implementation. 2928 2929 // There is no strided stores meanwhile. And store can't be folded in 2930 // shuffle. 2931 unsigned NumOfSources = Factor; // The number of values to be merged. 2932 unsigned ShuffleCost = 2933 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); 2934 unsigned NumOfShufflesPerStore = NumOfSources - 1; 2935 2936 // The SK_MergeTwoSrc shuffle clobbers one of src operands. 2937 // We need additional instructions to keep sources. 2938 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; 2939 int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + 2940 NumOfMoves; 2941 return Cost; 2942 } 2943 2944 int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, 2945 unsigned Factor, 2946 ArrayRef<unsigned> Indices, 2947 unsigned Alignment, 2948 unsigned AddressSpace) { 2949 auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { 2950 Type *EltTy = VecTy->getVectorElementType(); 2951 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || 2952 EltTy->isIntegerTy(32) || EltTy->isPointerTy()) 2953 return true; 2954 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) 2955 return HasBW; 2956 return false; 2957 }; 2958 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) 2959 return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, 2960 Alignment, AddressSpace); 2961 if (ST->hasAVX2()) 2962 return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, 2963 Alignment, AddressSpace); 2964 2965 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, 2966 Alignment, AddressSpace); 2967 } 2968
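//===----------------------------------------------------------------------===//
// Usage sketch (illustrative only; not part of this file):
//
// Clients never call X86TTIImpl directly. They query the generic
// TargetTransformInfo analysis, which forwards to the hooks implemented above.
// A minimal legacy-pass-manager sketch, where `F` is the current Function and
// `Ctx` its LLVMContext (both assumed to be available in the caller), might
// look like:
//
//   const TargetTransformInfo &TTI =
//       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
//
//   // Cost of a signed v8i32 -> v8f32 conversion. On an AVX subtarget this
//   // resolves against the AVXConversionTbl entry above.
//   Type *SrcTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
//   Type *DstTy = VectorType::get(Type::getFloatTy(Ctx), 8);
//   int Cost = TTI.getCastInstrCost(Instruction::SIToFP, DstTy, SrcTy);
//
// The exact plumbing depends on the pass; this is only meant to show where
// the cost tables in this file end up being consulted.
//===----------------------------------------------------------------------===//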