Home | History | Annotate | Download | only in AMDGPU
      1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 ///
     12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
     13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
     14 /// an MCObjectStreamer it outputs binary code.
     15 //
     16 //===----------------------------------------------------------------------===//
     17 //
     18 
     19 #include "AMDGPUAsmPrinter.h"
     20 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
     21 #include "InstPrinter/AMDGPUInstPrinter.h"
     22 #include "Utils/AMDGPUBaseInfo.h"
     23 #include "AMDGPU.h"
     24 #include "AMDKernelCodeT.h"
     25 #include "AMDGPUSubtarget.h"
     26 #include "R600Defines.h"
     27 #include "R600MachineFunctionInfo.h"
     28 #include "R600RegisterInfo.h"
     29 #include "SIDefines.h"
     30 #include "SIMachineFunctionInfo.h"
     31 #include "SIRegisterInfo.h"
     32 #include "llvm/CodeGen/MachineFrameInfo.h"
     33 #include "llvm/MC/MCContext.h"
     34 #include "llvm/MC/MCSectionELF.h"
     35 #include "llvm/MC/MCStreamer.h"
     36 #include "llvm/Support/ELF.h"
     37 #include "llvm/Support/MathExtras.h"
     38 #include "llvm/Support/TargetRegistry.h"
     39 #include "llvm/Target/TargetLoweringObjectFile.h"
     40 
     41 using namespace llvm;
     42 
     43 // TODO: This should get the default rounding mode from the kernel. We just set
     44 // the default here, but this could change if the OpenCL rounding mode pragmas
     45 // are used.
     46 //
     47 // The denormal mode here should match what is reported by the OpenCL runtime
     48 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
     49 // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
     50 //
     51 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
     52 // precision, and leaves single precision to flush all and does not report
     53 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
     54 // CL_FP_DENORM for both.
     55 //
     56 // FIXME: It seems some instructions do not support single precision denormals
     57 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
     58 // and sin_f32, cos_f32 on most parts).
     59 
     60 // We want to use these instructions, and using fp32 denormals also causes
     61 // instructions to run at the double precision rate for the device so it's
     62 // probably best to just report no single precision denormals.
     63 static uint32_t getFPMode(const MachineFunction &F) {
     64   const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
     65   // TODO: Is there any real use for the flush in only / flush out only modes?
     66 
     67   uint32_t FP32Denormals =
     68     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
     69 
     70   uint32_t FP64Denormals =
     71     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
     72 
     73   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
     74          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
     75          FP_DENORM_MODE_SP(FP32Denormals) |
     76          FP_DENORM_MODE_DP(FP64Denormals);
     77 }
     78 
     79 static AsmPrinter *
     80 createAMDGPUAsmPrinterPass(TargetMachine &tm,
     81                            std::unique_ptr<MCStreamer> &&Streamer) {
     82   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
     83 }
     84 
     85 extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
     86   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
     87   TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
     88 }
     89 
     90 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
     91                                    std::unique_ptr<MCStreamer> Streamer)
     92     : AsmPrinter(TM, std::move(Streamer)) {}
     93 
     94 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
     95   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
     96   SIProgramInfo KernelInfo;
     97   if (STM.isAmdHsaOS()) {
     98     getSIProgramInfo(KernelInfo, *MF);
     99     EmitAmdKernelCodeT(*MF, KernelInfo);
    100   }
    101 }
    102 
    103 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
    104   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    105   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
    106   if (MFI->isKernel() && STM.isAmdHsaOS()) {
    107     AMDGPUTargetStreamer *TS =
    108         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
    109     TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
    110                              ELF::STT_AMDGPU_HSA_KERNEL);
    111   }
    112 
    113   AsmPrinter::EmitFunctionEntryLabel();
    114 }
    115 
    116 static bool isModuleLinkage(const GlobalValue *GV) {
    117   switch (GV->getLinkage()) {
    118   case GlobalValue::InternalLinkage:
    119   case GlobalValue::CommonLinkage:
    120    return true;
    121   case GlobalValue::ExternalLinkage:
    122    return false;
    123   default: llvm_unreachable("unknown linkage type");
    124   }
    125 }
    126 
    127 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
    128 
    129   if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
    130     AsmPrinter::EmitGlobalVariable(GV);
    131     return;
    132   }
    133 
    134   if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
    135     AsmPrinter::EmitGlobalVariable(GV);
    136     return;
    137   }
    138 
    139   // Group segment variables aren't emitted in HSA.
    140   if (AMDGPU::isGroupSegment(GV))
    141     return;
    142 
    143   AMDGPUTargetStreamer *TS =
    144       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
    145   if (isModuleLinkage(GV)) {
    146     TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
    147   } else {
    148     TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
    149   }
    150 
    151   const DataLayout &DL = getDataLayout();
    152   OutStreamer->PushSection();
    153   OutStreamer->SwitchSection(
    154       getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
    155   MCSymbol *GVSym = getSymbol(GV);
    156   const Constant *C = GV->getInitializer();
    157   OutStreamer->EmitLabel(GVSym);
    158   EmitGlobalConstant(DL, C);
    159   OutStreamer->PopSection();
    160 }
    161 
    162 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
    163 
    164   // The starting address of all shader programs must be 256 bytes aligned.
    165   MF.setAlignment(8);
    166 
    167   SetupMachineFunction(MF);
    168 
    169   MCContext &Context = getObjFileLowering().getContext();
    170   MCSectionELF *ConfigSection =
    171       Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    172   OutStreamer->SwitchSection(ConfigSection);
    173 
    174   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
    175   SIProgramInfo KernelInfo;
    176   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    177     getSIProgramInfo(KernelInfo, MF);
    178     if (!STM.isAmdHsaOS()) {
    179       EmitProgramInfoSI(MF, KernelInfo);
    180     }
    181     // Emit directives
    182     AMDGPUTargetStreamer *TS =
    183         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
    184     TS->EmitDirectiveHSACodeObjectVersion(1, 0);
    185     AMDGPU::IsaVersion ISA = STM.getIsaVersion();
    186     TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
    187                                       "AMD", "AMDGPU");
    188   } else {
    189     EmitProgramInfoR600(MF);
    190   }
    191 
    192   DisasmLines.clear();
    193   HexLines.clear();
    194   DisasmLineMaxLen = 0;
    195 
    196   EmitFunctionBody();
    197 
    198   if (isVerbose()) {
    199     MCSectionELF *CommentSection =
    200         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    201     OutStreamer->SwitchSection(CommentSection);
    202 
    203     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    204       OutStreamer->emitRawComment(" Kernel info:", false);
    205       OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
    206                                   false);
    207       OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
    208                                   false);
    209       OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
    210                                   false);
    211       OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
    212                                   false);
    213       OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
    214                                   false);
    215       OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
    216                                   false);
    217 
    218       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
    219                                   Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
    220                                   false);
    221       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
    222                                   Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
    223                                   false);
    224       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
    225                                   Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
    226                                   false);
    227       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
    228                                   Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
    229                                   false);
    230       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
    231                                   Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
    232                                   false);
    233 
    234     } else {
    235       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    236       OutStreamer->emitRawComment(
    237         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
    238     }
    239   }
    240 
    241   if (STM.dumpCode()) {
    242 
    243     OutStreamer->SwitchSection(
    244         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
    245 
    246     for (size_t i = 0; i < DisasmLines.size(); ++i) {
    247       std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
    248       Comment += " ; " + HexLines[i] + "\n";
    249 
    250       OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
    251       OutStreamer->EmitBytes(StringRef(Comment));
    252     }
    253   }
    254 
    255   return false;
    256 }
    257 
    258 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
    259   unsigned MaxGPR = 0;
    260   bool killPixel = false;
    261   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
    262   const R600RegisterInfo *RI =
    263       static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
    264   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    265 
    266   for (const MachineBasicBlock &MBB : MF) {
    267     for (const MachineInstr &MI : MBB) {
    268       if (MI.getOpcode() == AMDGPU::KILLGT)
    269         killPixel = true;
    270       unsigned numOperands = MI.getNumOperands();
    271       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
    272         const MachineOperand &MO = MI.getOperand(op_idx);
    273         if (!MO.isReg())
    274           continue;
    275         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
    276 
    277         // Register with value > 127 aren't GPR
    278         if (HWReg > 127)
    279           continue;
    280         MaxGPR = std::max(MaxGPR, HWReg);
    281       }
    282     }
    283   }
    284 
    285   unsigned RsrcReg;
    286   if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
    287     // Evergreen / Northern Islands
    288     switch (MFI->getShaderType()) {
    289     default: // Fall through
    290     case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
    291     case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
    292     case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
    293     case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
    294     }
    295   } else {
    296     // R600 / R700
    297     switch (MFI->getShaderType()) {
    298     default: // Fall through
    299     case ShaderType::GEOMETRY: // Fall through
    300     case ShaderType::COMPUTE:  // Fall through
    301     case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
    302     case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
    303     }
    304   }
    305 
    306   OutStreamer->EmitIntValue(RsrcReg, 4);
    307   OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
    308                            S_STACK_SIZE(MFI->StackSize), 4);
    309   OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
    310   OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
    311 
    312   if (MFI->getShaderType() == ShaderType::COMPUTE) {
    313     OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
    314     OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
    315   }
    316 }
    317 
    318 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
    319                                         const MachineFunction &MF) const {
    320   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
    321   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    322   uint64_t CodeSize = 0;
    323   unsigned MaxSGPR = 0;
    324   unsigned MaxVGPR = 0;
    325   bool VCCUsed = false;
    326   bool FlatUsed = false;
    327   const SIRegisterInfo *RI =
    328       static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
    329 
    330   for (const MachineBasicBlock &MBB : MF) {
    331     for (const MachineInstr &MI : MBB) {
    332       // TODO: CodeSize should account for multiple functions.
    333 
    334       // TODO: Should we count size of debug info?
    335       if (MI.isDebugValue())
    336         continue;
    337 
    338       // FIXME: This is reporting 0 for many instructions.
    339       CodeSize += MI.getDesc().Size;
    340 
    341       unsigned numOperands = MI.getNumOperands();
    342       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
    343         const MachineOperand &MO = MI.getOperand(op_idx);
    344         unsigned width = 0;
    345         bool isSGPR = false;
    346 
    347         if (!MO.isReg())
    348           continue;
    349 
    350         unsigned reg = MO.getReg();
    351         switch (reg) {
    352         case AMDGPU::EXEC:
    353         case AMDGPU::SCC:
    354         case AMDGPU::M0:
    355           continue;
    356 
    357         case AMDGPU::VCC:
    358         case AMDGPU::VCC_LO:
    359         case AMDGPU::VCC_HI:
    360           VCCUsed = true;
    361           continue;
    362 
    363         case AMDGPU::FLAT_SCR:
    364         case AMDGPU::FLAT_SCR_LO:
    365         case AMDGPU::FLAT_SCR_HI:
    366           FlatUsed = true;
    367           continue;
    368 
    369         default:
    370           break;
    371         }
    372 
    373         if (AMDGPU::SReg_32RegClass.contains(reg)) {
    374           isSGPR = true;
    375           width = 1;
    376         } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
    377           isSGPR = false;
    378           width = 1;
    379         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
    380           isSGPR = true;
    381           width = 2;
    382         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
    383           isSGPR = false;
    384           width = 2;
    385         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
    386           isSGPR = false;
    387           width = 3;
    388         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
    389           isSGPR = true;
    390           width = 4;
    391         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
    392           isSGPR = false;
    393           width = 4;
    394         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
    395           isSGPR = true;
    396           width = 8;
    397         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
    398           isSGPR = false;
    399           width = 8;
    400         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
    401           isSGPR = true;
    402           width = 16;
    403         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
    404           isSGPR = false;
    405           width = 16;
    406         } else {
    407           llvm_unreachable("Unknown register class");
    408         }
    409         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
    410         unsigned maxUsed = hwReg + width - 1;
    411         if (isSGPR) {
    412           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
    413         } else {
    414           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
    415         }
    416       }
    417     }
    418   }
    419 
    420   if (VCCUsed || FlatUsed)
    421     MaxSGPR += 2;
    422 
    423   if (FlatUsed) {
    424     MaxSGPR += 2;
    425     // 2 additional for VI+.
    426     if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    427       MaxSGPR += 2;
    428   }
    429 
    430   // We found the maximum register index. They start at 0, so add one to get the
    431   // number of registers.
    432   ProgInfo.NumVGPR = MaxVGPR + 1;
    433   ProgInfo.NumSGPR = MaxSGPR + 1;
    434 
    435   if (STM.hasSGPRInitBug()) {
    436     if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
    437       LLVMContext &Ctx = MF.getFunction()->getContext();
    438       Ctx.emitError("too many SGPRs used with the SGPR init bug");
    439     }
    440 
    441     ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
    442   }
    443 
    444   if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
    445     LLVMContext &Ctx = MF.getFunction()->getContext();
    446     Ctx.emitError("too many user SGPRs used");
    447   }
    448 
    449   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
    450   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
    451   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
    452   // register.
    453   ProgInfo.FloatMode = getFPMode(MF);
    454 
    455   // XXX: Not quite sure what this does, but sc seems to unset this.
    456   ProgInfo.IEEEMode = 0;
    457 
    458   // Do not clamp NAN to 0.
    459   ProgInfo.DX10Clamp = 0;
    460 
    461   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
    462   ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
    463 
    464   ProgInfo.FlatUsed = FlatUsed;
    465   ProgInfo.VCCUsed = VCCUsed;
    466   ProgInfo.CodeLen = CodeSize;
    467 
    468   unsigned LDSAlignShift;
    469   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    470     // LDS is allocated in 64 dword blocks.
    471     LDSAlignShift = 8;
    472   } else {
    473     // LDS is allocated in 128 dword blocks.
    474     LDSAlignShift = 9;
    475   }
    476 
    477   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
    478                           MFI->getMaximumWorkGroupSize(MF);
    479 
    480   ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
    481   ProgInfo.LDSBlocks =
    482      RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
    483 
    484   // Scratch is allocated in 256 dword blocks.
    485   unsigned ScratchAlignShift = 10;
    486   // We need to program the hardware with the amount of scratch memory that
    487   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
    488   // scratch memory used per thread.
    489   ProgInfo.ScratchBlocks =
    490     RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
    491                        1 << ScratchAlignShift) >> ScratchAlignShift;
    492 
    493   ProgInfo.ComputePGMRSrc1 =
    494       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
    495       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
    496       S_00B848_PRIORITY(ProgInfo.Priority) |
    497       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
    498       S_00B848_PRIV(ProgInfo.Priv) |
    499       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
    500       S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
    501       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
    502 
    503   // 0 = X, 1 = XY, 2 = XYZ
    504   unsigned TIDIGCompCnt = 0;
    505   if (MFI->hasWorkItemIDZ())
    506     TIDIGCompCnt = 2;
    507   else if (MFI->hasWorkItemIDY())
    508     TIDIGCompCnt = 1;
    509 
    510   ProgInfo.ComputePGMRSrc2 =
    511       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
    512       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
    513       S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
    514       S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
    515       S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
    516       S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
    517       S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
    518       S_00B84C_EXCP_EN_MSB(0) |
    519       S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
    520       S_00B84C_EXCP_EN(0);
    521 }
    522 
    523 static unsigned getRsrcReg(unsigned ShaderType) {
    524   switch (ShaderType) {
    525   default: // Fall through
    526   case ShaderType::COMPUTE:  return R_00B848_COMPUTE_PGM_RSRC1;
    527   case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
    528   case ShaderType::PIXEL:    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
    529   case ShaderType::VERTEX:   return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
    530   }
    531 }
    532 
    533 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
    534                                          const SIProgramInfo &KernelInfo) {
    535   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
    536   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    537   unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
    538 
    539   if (MFI->getShaderType() == ShaderType::COMPUTE) {
    540     OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
    541 
    542     OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
    543 
    544     OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    545     OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
    546 
    547     OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    548     OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
    549 
    550     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    551     // 0" comment but I don't see a corresponding field in the register spec.
    552   } else {
    553     OutStreamer->EmitIntValue(RsrcReg, 4);
    554     OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
    555                               S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
    556     if (STM.isVGPRSpillingEnabled(MFI)) {
    557       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
    558       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
    559     }
    560   }
    561 
    562   if (MFI->getShaderType() == ShaderType::PIXEL) {
    563     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    564     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
    565     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    566     OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
    567   }
    568 }
    569 
    570 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
    571                                          const SIProgramInfo &KernelInfo) const {
    572   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    573   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
    574   amd_kernel_code_t header;
    575 
    576   AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
    577 
    578   header.compute_pgm_resource_registers =
    579       KernelInfo.ComputePGMRSrc1 |
    580       (KernelInfo.ComputePGMRSrc2 << 32);
    581   header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
    582 
    583   if (MFI->hasPrivateSegmentBuffer()) {
    584     header.code_properties |=
    585       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
    586   }
    587 
    588   if (MFI->hasDispatchPtr())
    589     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
    590 
    591   if (MFI->hasQueuePtr())
    592     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
    593 
    594   if (MFI->hasKernargSegmentPtr())
    595     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
    596 
    597   if (MFI->hasDispatchID())
    598     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
    599 
    600   if (MFI->hasFlatScratchInit())
    601     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
    602 
    603   // TODO: Private segment size
    604 
    605   if (MFI->hasGridWorkgroupCountX()) {
    606     header.code_properties |=
    607       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
    608   }
    609 
    610   if (MFI->hasGridWorkgroupCountY()) {
    611     header.code_properties |=
    612       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
    613   }
    614 
    615   if (MFI->hasGridWorkgroupCountZ()) {
    616     header.code_properties |=
    617       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
    618   }
    619 
    620   if (MFI->hasDispatchPtr())
    621     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
    622 
    623   header.kernarg_segment_byte_size = MFI->ABIArgOffset;
    624   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
    625   header.workitem_vgpr_count = KernelInfo.NumVGPR;
    626   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
    627   header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
    628 
    629   AMDGPUTargetStreamer *TS =
    630       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
    631   TS->EmitAMDKernelCodeT(header);
    632 }
    633 
    634 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
    635                                        unsigned AsmVariant,
    636                                        const char *ExtraCode, raw_ostream &O) {
    637   if (ExtraCode && ExtraCode[0]) {
    638     if (ExtraCode[1] != 0)
    639       return true; // Unknown modifier.
    640 
    641     switch (ExtraCode[0]) {
    642     default:
    643       // See if this is a generic print operand
    644       return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
    645     case 'r':
    646       break;
    647     }
    648   }
    649 
    650   AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
    651                    *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
    652   return false;
    653 }
    654