//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Latency.h"
#include "../Uops.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/MC/MCInstBuilder.h"

namespace exegesis {

namespace {

// Common code for the X86 Uops and Latency runners.
template <typename Impl> class X86BenchmarkRunner : public Impl {
  using Impl::Impl;

  llvm::Expected<SnippetPrototype>
  generatePrototype(unsigned Opcode) const override {
    // Test whether we can generate a snippet for this instruction.
    const auto &InstrInfo = this->State.getInstrInfo();
    const auto OpcodeName = InstrInfo.getName(Opcode);
    if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
        OpcodeName.startswith("ADJCALLSTACK")) {
      return llvm::make_error<BenchmarkFailure>(
          "Unsupported opcode: Push/Pop/AdjCallStack");
    }

    // Handle X87.
    const auto &InstrDesc = InstrInfo.get(Opcode);
    const unsigned FPInstClass = InstrDesc.TSFlags & llvm::X86II::FPTypeMask;
    const Instruction Instr(InstrDesc, this->RATC);
    switch (FPInstClass) {
    case llvm::X86II::NotFP:
      break;
    case llvm::X86II::ZeroArgFP:
      return llvm::make_error<BenchmarkFailure>("Unsupported x87 ZeroArgFP");
    case llvm::X86II::OneArgFP:
      return llvm::make_error<BenchmarkFailure>("Unsupported x87 OneArgFP");
    case llvm::X86II::OneArgFPRW:
    case llvm::X86II::TwoArgFP: {
      // These are instructions like
      //   - `ST(0) = fsqrt(ST(0))`   (OneArgFPRW)
      //   - `ST(0) = ST(0) + ST(i)`  (TwoArgFP)
      // They are intrinsically serial and do not modify the state of the
      // stack. We generate the same code for latency and uops.
      return this->generateSelfAliasingPrototype(Instr);
    }
    case llvm::X86II::CompareFP:
      return Impl::handleCompareFP(Instr);
    case llvm::X86II::CondMovFP:
      return Impl::handleCondMovFP(Instr);
    case llvm::X86II::SpecialFP:
      return llvm::make_error<BenchmarkFailure>("Unsupported x87 SpecialFP");
    default:
      llvm_unreachable("Unknown FP Type!");
    }

    // Fall back to the generic implementation.
    return Impl::Base::generatePrototype(Opcode);
  }
};

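// Each Impl mixed into X86BenchmarkRunner must provide a `Base` alias naming
// the generic runner it extends (used by the fallback above) as well as the
// handleCompareFP() and handleCondMovFP() hooks, which let the latency and
// uops runners treat those x87 instruction classes differently.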
class X86LatencyImpl : public LatencyBenchmarkRunner {
protected:
  using Base = LatencyBenchmarkRunner;
  using Base::Base;
  llvm::Expected<SnippetPrototype>
  handleCompareFP(const Instruction &Instr) const {
    return llvm::make_error<BenchmarkFailure>("Unsupported x87 CompareFP");
  }
  llvm::Expected<SnippetPrototype>
  handleCondMovFP(const Instruction &Instr) const {
    return llvm::make_error<BenchmarkFailure>("Unsupported x87 CondMovFP");
  }
};

class X86UopsImpl : public UopsBenchmarkRunner {
protected:
  using Base = UopsBenchmarkRunner;
  using Base::Base;
  // We can compute uops for any FP instruction that does not grow or shrink
  // the stack (i.e. it either does not touch the stack or pushes as much as
  // it pops).
  llvm::Expected<SnippetPrototype>
  handleCompareFP(const Instruction &Instr) const {
    return generateUnconstrainedPrototype(
        Instr, "instruction does not grow/shrink the FP stack");
  }
  llvm::Expected<SnippetPrototype>
  handleCondMovFP(const Instruction &Instr) const {
    return generateUnconstrainedPrototype(
        Instr, "instruction does not grow/shrink the FP stack");
  }
};

class ExegesisX86Target : public ExegesisTarget {
  void addTargetSpecificPasses(llvm::PassManagerBase &PM) const override {
    // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
    PM.add(llvm::createX86FloatingPointStackifierPass());
  }

  std::vector<llvm::MCInst> setRegToConstant(const llvm::MCSubtargetInfo &STI,
                                             unsigned Reg) const override {
    // GPR.
    if (llvm::X86::GR8RegClass.contains(Reg))
      return {llvm::MCInstBuilder(llvm::X86::MOV8ri).addReg(Reg).addImm(1)};
    if (llvm::X86::GR16RegClass.contains(Reg))
      return {llvm::MCInstBuilder(llvm::X86::MOV16ri).addReg(Reg).addImm(1)};
    if (llvm::X86::GR32RegClass.contains(Reg))
      return {llvm::MCInstBuilder(llvm::X86::MOV32ri).addReg(Reg).addImm(1)};
    if (llvm::X86::GR64RegClass.contains(Reg))
      return {llvm::MCInstBuilder(llvm::X86::MOV64ri32).addReg(Reg).addImm(1)};
    // MMX.
    if (llvm::X86::VR64RegClass.contains(Reg))
      return setVectorRegToConstant(Reg, 8, llvm::X86::MMX_MOVQ64rm);
    // {X,Y,Z}MM.
    if (llvm::X86::VR128XRegClass.contains(Reg)) {
      if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
        return setVectorRegToConstant(Reg, 16, llvm::X86::VMOVDQU32Z128rm);
      if (STI.getFeatureBits()[llvm::X86::FeatureAVX])
        return setVectorRegToConstant(Reg, 16, llvm::X86::VMOVDQUrm);
      return setVectorRegToConstant(Reg, 16, llvm::X86::MOVDQUrm);
    }
    if (llvm::X86::VR256XRegClass.contains(Reg)) {
      if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
        return setVectorRegToConstant(Reg, 32, llvm::X86::VMOVDQU32Z256rm);
      return setVectorRegToConstant(Reg, 32, llvm::X86::VMOVDQUYrm);
    }
    if (llvm::X86::VR512RegClass.contains(Reg))
      return setVectorRegToConstant(Reg, 64, llvm::X86::VMOVDQU32Zrm);
    // X87.
    if (llvm::X86::RFP32RegClass.contains(Reg) ||
        llvm::X86::RFP64RegClass.contains(Reg) ||
        llvm::X86::RFP80RegClass.contains(Reg))
      return setVectorRegToConstant(Reg, 8, llvm::X86::LD_Fp64m);
    if (Reg == llvm::X86::EFLAGS) {
      // Set all flags to 0 except the bits that are "reserved and set to 1".
      constexpr const uint32_t kImmValue = 0x00007002u;
      std::vector<llvm::MCInst> Result;
      Result.push_back(allocateStackSpace(8));
      Result.push_back(fillStackSpace(llvm::X86::MOV64mi32, 0, kImmValue));
      // POPF64 loads the stored value into RFLAGS and releases the 8 bytes
      // allocated above, so no explicit releaseStackSpace() is needed.
      Result.push_back(llvm::MCInstBuilder(llvm::X86::POPF64));
      return Result;
    }
    return {};
  }

  std::unique_ptr<BenchmarkRunner>
  createLatencyBenchmarkRunner(const LLVMState &State) const override {
    return llvm::make_unique<X86BenchmarkRunner<X86LatencyImpl>>(State);
  }

  std::unique_ptr<BenchmarkRunner>
  createUopsBenchmarkRunner(const LLVMState &State) const override {
    return llvm::make_unique<X86BenchmarkRunner<X86UopsImpl>>(State);
  }

  bool matchesArch(llvm::Triple::ArchType Arch) const override {
    return Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::x86;
  }

private:
  // setRegToConstant() specialized for a vector register of size
  // `RegSizeBytes`. `RMOpcode` is the opcode used for a memory -> vector
  // register load.
  static std::vector<llvm::MCInst>
  setVectorRegToConstant(const unsigned Reg, const unsigned RegSizeBytes,
                         const unsigned RMOpcode) {
    // There is no instruction that sets a vector register directly to a
    // constant, so we go through memory. Vector values can be interpreted as
    // integers of various sizes (8 to 64 bits) as well as floats and doubles,
    // so we choose an immediate value that has a set bit in every byte and is
    // a normal float/double: 0x40404040 is ~32.5 when interpreted as a double
    // and ~3.0f when interpreted as a float.
    constexpr const uint32_t kImmValue = 0x40404040u;
    std::vector<llvm::MCInst> Result;
    Result.push_back(allocateStackSpace(RegSizeBytes));
    constexpr const unsigned kMov32NumBytes = 4;
    for (unsigned Disp = 0; Disp < RegSizeBytes; Disp += kMov32NumBytes) {
      Result.push_back(fillStackSpace(llvm::X86::MOV32mi, Disp, kImmValue));
    }
    Result.push_back(loadToReg(Reg, RMOpcode));
    Result.push_back(releaseStackSpace(RegSizeBytes));
    return Result;
  }
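
  // As an illustration (a sketch derived from the helpers below), on an
  // SSE-only subtarget setVectorRegToConstant(llvm::X86::XMM0, 16,
  // llvm::X86::MOVDQUrm) expands to:
  //   SUB64ri8 RSP, RSP, 16             (allocateStackSpace(16))
  //   MOV32mi  [RSP + 0],  0x40404040   (fillStackSpace, Disp = 0)
  //   MOV32mi  [RSP + 4],  0x40404040
  //   MOV32mi  [RSP + 8],  0x40404040
  //   MOV32mi  [RSP + 12], 0x40404040
  //   MOVDQUrm XMM0, [RSP]              (loadToReg)
  //   ADD64ri8 RSP, RSP, 16             (releaseStackSpace(16))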

  // Allocates scratch memory on the stack.
  static llvm::MCInst allocateStackSpace(unsigned Bytes) {
    return llvm::MCInstBuilder(llvm::X86::SUB64ri8)
        .addReg(llvm::X86::RSP)
        .addReg(llvm::X86::RSP)
        .addImm(Bytes);
  }

  // Fills scratch memory at offset `OffsetBytes` with value `Imm`.
  static llvm::MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                                     uint64_t Imm) {
    return llvm::MCInstBuilder(MovOpcode)
        // Address = RSP + OffsetBytes.
        .addReg(llvm::X86::RSP) // BaseReg
        .addImm(1)              // ScaleAmt
        .addReg(0)              // IndexReg
        .addImm(OffsetBytes)    // Disp
        .addReg(0)              // Segment
        // Immediate.
        .addImm(Imm);
  }

  // Loads scratch memory into register `Reg` using opcode `RMOpcode`.
  static llvm::MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
    return llvm::MCInstBuilder(RMOpcode)
        .addReg(Reg)
        // Address = RSP.
        .addReg(llvm::X86::RSP) // BaseReg
        .addImm(1)              // ScaleAmt
        .addReg(0)              // IndexReg
        .addImm(0)              // Disp
        .addReg(0);             // Segment
  }

  // Releases scratch memory.
  static llvm::MCInst releaseStackSpace(unsigned Bytes) {
    return llvm::MCInstBuilder(llvm::X86::ADD64ri8)
        .addReg(llvm::X86::RSP)
        .addReg(llvm::X86::RSP)
        .addImm(Bytes);
  }
};

} // namespace

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
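
// Note: it is assumed that the llvm-exegesis driver calls
// InitializeX86ExegesisTarget() once at startup; registerTarget() then makes
// this target available for the triples accepted by matchesArch().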