Home | History | Annotate | Download | only in AMDGPU
      1 //===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 /// \file AMDKernelCodeT.h
     10 //===----------------------------------------------------------------------===//
     11 
     12 #ifndef AMDKERNELCODET_H
     13 #define AMDKERNELCODET_H
     14 
     15 #include "llvm/MC/SubtargetFeature.h"
     16 
     17 #include <cstddef>
     18 #include <cstdint>
     19 
     20 #include "llvm/Support/Debug.h"
     21 //---------------------------------------------------------------------------//
     22 // AMD Kernel Code, and its dependencies                                     //
     23 //---------------------------------------------------------------------------//
     24 
     25 typedef uint8_t hsa_powertwo8_t;
     26 typedef uint32_t hsa_ext_code_kind_t;
     27 typedef uint8_t hsa_ext_brig_profile8_t;
     28 typedef uint8_t hsa_ext_brig_machine_model8_t;
     29 typedef uint64_t hsa_ext_control_directive_present64_t;
     30 typedef uint16_t hsa_ext_exception_kind16_t;
     31 typedef uint32_t hsa_ext_code_kind32_t;
     32 
     33 typedef struct hsa_dim3_s {
     34   uint32_t x;
     35   uint32_t y;
     36   uint32_t z;
     37 } hsa_dim3_t;
     38 
     39 /// The version of the amd_*_code_t struct. Minor versions must be
     40 /// backward compatible.
     41 typedef uint32_t amd_code_version32_t;
     42 enum amd_code_version_t {
     43   AMD_CODE_VERSION_MAJOR = 0,
     44   AMD_CODE_VERSION_MINOR = 1
     45 };
     46 
     47 /// The values used to define the number of bytes to use for the
     48 /// swizzle element size.
     49 enum amd_element_byte_size_t {
     50   AMD_ELEMENT_2_BYTES = 0,
     51   AMD_ELEMENT_4_BYTES = 1,
     52   AMD_ELEMENT_8_BYTES = 2,
     53   AMD_ELEMENT_16_BYTES = 3
     54 };
     55 
     56 /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
     57 /// COMPUTE_PGM_RSRC2 registers.
     58 typedef uint64_t amd_compute_pgm_resource_register64_t;
     59 
     60 /// Every amd_*_code_t has the following properties, which are composed of
     61 /// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
     62 /// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount
     63 /// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
     64 ///
     65 /// (Note that bit fields cannot be used as their layout is
     66 /// implementation defined in the C standard and so cannot be used to
     67 /// specify an ABI)
     68 typedef uint32_t amd_code_property32_t;
     69 enum amd_code_property_mask_t {
     70 
     71   /// Enable the setup of the SGPR user data registers
     72   /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
     73   /// for initial register state.
     74   ///
     75   /// The total number of SGPRuser data registers requested must not
     76   /// exceed 16. Any requests beyond 16 will be ignored.
     77   ///
     78   /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
     79   /// SGPR user data registers enabled up to 16).
     80 
     81   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
     82   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
     83   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
     84 
     85   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
     86   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
     87   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
     88 
     89   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
     90   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
     91   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
     92 
     93   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
     94   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
     95   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
     96 
     97   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
     98   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
     99   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
    100 
    101   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
    102   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
    103   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
    104 
    105   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
    106   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
    107   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
    108 
    109   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
    110   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
    111   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
    112 
    113   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
    114   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
    115   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
    116 
    117   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
    118   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
    119   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
    120 
    121   /// Control wave ID base counter for GDS ordered-append. Used to set
    122   /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
    123   /// ORDERED_APPEND_MODE also needs to be settable)
    124   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
    125   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
    126   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
    127 
    128   /// The interleave (swizzle) element size in bytes required by the
    129   /// code for private memory. This must be 2, 4, 8 or 16. This value
    130   /// is provided to the finalizer when it is invoked and is recorded
    131   /// here. The hardware will interleave the memory requests of each
    132   /// lane of a wavefront by this element size to ensure each
    133   /// work-item gets a distinct memory memory location. Therefore, the
    134   /// finalizer ensures that all load and store operations done to
    135   /// private memory do not exceed this size. For example, if the
    136   /// element size is 4 (32-bits or dword) and a 64-bit value must be
    137   /// loaded, the finalizer will generate two 32-bit loads. This
    138   /// ensures that the interleaving will get the work-item
    139   /// specific dword for both halves of the 64-bit value. If it just
    140   /// did a 64-bit load then it would get one dword which belonged to
    141   /// its own work-item, but the second dword would belong to the
    142   /// adjacent lane work-item since the interleaving is in dwords.
    143   ///
    144   /// The value used must match the value that the runtime configures
    145   /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
    146   /// is generally DWORD.
    147   ///
    148   /// uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
    149   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
    150   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
    151   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
    152 
    153   /// Are global memory addresses 64 bits. Must match
    154   /// amd_kernel_code_t.hsail_machine_model ==
    155   /// HSA_MACHINE_LARGE. Must also match
    156   /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
    157   /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
    158   AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
    159   AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
    160   AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
    161 
    162   /// Indicate if the generated ISA is using a dynamically sized call
    163   /// stack. This can happen if calls are implemented using a call
    164   /// stack and recursion, alloca or calls to indirect functions are
    165   /// present. In these cases the Finalizer cannot compute the total
    166   /// private segment size at compile time. In this case the
    167   /// workitem_private_segment_byte_size only specifies the statically
    168   /// know private segment size, and additional space must be added
    169   /// for the call stack.
    170   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
    171   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
    172   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
    173 
    174   /// Indicate if code generated has support for debugging.
    175   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
    176   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
    177   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
    178 
    179   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15,
    180   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
    181   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT
    182 };
    183 
    184 /// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
    185 /// control directives. These control how the finalizer generates code. This
    186 /// struct is used both as an argument to hsaFinalizeKernel to specify values for
    187 /// the control directives, and is used in HsaKernelCode to record the values of
    188 /// the control directives that the finalize used when generating the code which
    189 /// either came from the finalizer argument or explicit HSAIL control
    190 /// directives. See the definition of the control directives in HSA Programmer's
    191 /// Reference Manual which also defines how the values specified as finalizer
    192 /// arguments have to agree with the control directives in the HSAIL code.
    193 typedef struct hsa_ext_control_directives_s {
    194   /// This is a bit set indicating which control directives have been
    195   /// specified. If the value is 0 then there are no control directives specified
    196   /// and the rest of the fields can be ignored. The bits are accessed using the
    197   /// hsa_ext_control_directives_present_mask_t. Any control directive that is not
    198   /// enabled in this bit set must have the value of all 0s.
    199   hsa_ext_control_directive_present64_t enabled_control_directives;
    200 
    201   /// If enableBreakExceptions is not enabled then must be 0, otherwise must be
    202   /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK
    203   /// policy enabled. If this set is not empty then the generated code may have
    204   /// lower performance than if the set is empty. If the kernel being finalized
    205   /// has any enablebreakexceptions control directives, then the values specified
    206   /// by this argument are unioned with the values in these control
    207   /// directives. If any of the functions the kernel calls have an
    208   /// enablebreakexceptions control directive, then they must be equal or a
    209   /// subset of, this union.
    210   hsa_ext_exception_kind16_t enable_break_exceptions;
    211 
    212   /// If enableDetectExceptions is not enabled then must be 0, otherwise must be
    213   /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT
    214   /// policy enabled. If this set is not empty then the generated code may have
    215   /// lower performance than if the set is empty. However, an implementation
    216   /// should endeavour to make the performance impact small. If the kernel being
    217   /// finalized has any enabledetectexceptions control directives, then the
    218   /// values specified by this argument are unioned with the values in these
    219   /// control directives. If any of the functions the kernel calls have an
    220   /// enabledetectexceptions control directive, then they must be equal or a
    221   /// subset of, this union.
    222   hsa_ext_exception_kind16_t enable_detect_exceptions;
    223 
    224   /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of
    225   /// dynamic group segment can be allocated for a dispatch, otherwise the value
    226   /// specifies the maximum number of bytes of dynamic group segment that can be
    227   /// allocated for a dispatch. If the kernel being finalized has any
    228   /// maxdynamicsize control directives, then the values must be the same, and
    229   /// must be the same as this argument if it is enabled. This value can be used
    230   /// by the finalizer to determine the maximum number of bytes of group memory
    231   /// used by each work-group by adding this value to the group memory required
    232   /// for all group segment variables used by the kernel and all functions it
    233   /// calls, and group memory used to implement other HSAIL features such as
    234   /// fbarriers and the detect exception operations. This can allow the finalizer
    235   /// to determine the expected number of work-groups that can be executed by a
    236   /// compute unit and allow more resources to be allocated to the work-items if
    237   /// it is known that fewer work-groups can be executed due to group memory
    238   /// limitations.
    239   uint32_t max_dynamic_group_size;
    240 
    241   /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater
    242   /// than 0. See HSA Programmer's Reference Manual description of
    243   /// maxflatgridsize control directive.
    244   uint32_t max_flat_grid_size;
    245 
    246   /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be
    247   /// greater than 0. See HSA Programmer's Reference Manual description of
    248   /// maxflatworkgroupsize control directive.
    249   uint32_t max_flat_workgroup_size;
    250 
    251   /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the
    252   /// finalizer is free to generate ISA that may result in any number of
    253   /// work-groups executing on a single compute unit. Otherwise, the finalizer
    254   /// should attempt to generate ISA that will allow the specified number of
    255   /// work-groups to execute on a single compute unit. This is only a hint and
    256   /// can be ignored by the finalizer. If the kernel being finalized, or any of
    257   /// the functions it calls, has a requested control directive, then the values
    258   /// must be the same. This can be used to determine the number of resources
    259   /// that should be allocated to a single work-group and work-item. For example,
    260   /// a low value may allow more resources to be allocated, resulting in higher
    261   /// per work-item performance, as it is known there will never be more than the
    262   /// specified number of work-groups actually executing on the compute
    263   /// unit. Conversely, a high value may allocate fewer resources, resulting in
    264   /// lower per work-item performance, which is offset by the fact it allows more
    265   /// work-groups to actually execute on the compute unit.
    266   uint32_t requested_workgroups_per_cu;
    267 
    268   /// If not enabled then all elements for Dim3 must be 0, otherwise every
    269   /// element must be greater than 0. See HSA Programmer's Reference Manual
    270   /// description of requiredgridsize control directive.
    271   hsa_dim3_t required_grid_size;
    272 
    273   /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be
    274   /// 0, and the produced code can be dispatched with any legal work-group range
    275   /// consistent with the dispatch dimensions. Otherwise, the code produced must
    276   /// always be dispatched with the specified work-group range. No element of the
    277   /// specified range must be 0. It must be consistent with required_dimensions
    278   /// and max_flat_workgroup_size. If the kernel being finalized, or any of the
    279   /// functions it calls, has a requiredworkgroupsize control directive, then the
    280   /// values must be the same. Specifying a value can allow the finalizer to
    281   /// optimize work-group id operations, and if the number of work-items in the
    282   /// work-group is less than the WAVESIZE then barrier operations can be
    283   /// optimized to just a memory fence.
    284   hsa_dim3_t required_workgroup_size;
    285 
    286   /// If requiredDim is not enabled then must be 0 and the produced kernel code
    287   /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is
    288   /// 1..3 and the code produced must only be dispatched with a dimension that
    289   /// matches. Other values are illegal. If the kernel being finalized, or any of
    290   /// the functions it calls, has a requireddimsize control directive, then the
    291   /// values must be the same. This can be used to optimize the code generated to
    292   /// compute the absolute and flat work-group and work-item id, and the dim
    293   /// HSAIL operations.
    294   uint8_t required_dim;
    295 
    296   /// Reserved. Must be 0.
    297   uint8_t reserved[75];
    298 } hsa_ext_control_directives_t;
    299 
    300 /// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
    301 /// Code Object to set up the hardware to execute the kernel dispatch.
    302 ///
    303 /// Initial Kernel Register State.
    304 ///
    305 /// Initial kernel register state will be set up by CP/SPI prior to the start
    306 /// of execution of every wavefront. This is limited by the constraints of the
    307 /// current hardware.
    308 ///
    309 /// The order of the SGPR registers is defined, but the Finalizer can specify
    310 /// which ones are actually setup in the amd_kernel_code_t object using the
    311 /// enable_sgpr_* bit fields. The register numbers used for enabled registers
    312 /// are dense starting at SGPR0: the first enabled register is SGPR0, the next
    313 /// enabled register is SGPR1 etc.; disabled registers do not have an SGPR
    314 /// number.
    315 ///
    316 /// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and
    317 /// apply to all waves of the grid. It is possible to specify more than 16 User
    318 /// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
    319 /// are actually initialized. These are then immediately followed by the System
    320 /// SGPRs that are set up by ADC/SPI and can have different values for each wave
    321 /// of the grid dispatch.
    322 ///
    323 /// SGPR register initial state is defined as follows:
    324 ///
    325 /// Private Segment Buffer (enable_sgpr_private_segment_buffer):
    326 ///   Number of User SGPR registers: 4. V# that can be used, together with
    327 ///   Scratch Wave Offset as an offset, to access the Private/Spill/Arg
    328 ///   segments using a segment address. It must be set as follows:
    329 ///     - Base address: of the scratch memory area used by the dispatch. It
    330 ///       does not include the scratch wave offset. It will be the per process
    331 ///       SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
    332 ///       example there may be a per pipe offset, or per AQL Queue offset).
    333 ///     - Stride + data_format: Element Size * Index Stride (???)
    334 ///     - Cache swizzle: ???
    335 ///     - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
    336 ///       scratch)
    337 ///     - Num records: Flat Scratch Work Item Size / Element Size (???)
    338 ///     - Dst_sel_*: ???
    339 ///     - Num_format: ???
    340 ///     - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
    341 ///       agree with amd_kernel_code_t.privateElementSize)
    342 ///     - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
    343 ///       be number of wavefront lanes for scratch, must agree with
    344 ///       amd_kernel_code_t.wavefrontSize)
    345 ///     - Add tid enable: 1
    346 ///     - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
    347 ///     - Hash_enable: ???
    348 ///     - Heap: ???
    349 ///     - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
    350 ///     - Type: 0 (a buffer) (???)
    351 ///
    352 /// Dispatch Ptr (enable_sgpr_dispatch_ptr):
    353 ///   Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
    354 ///   for kernel actually executing.
    355 ///
    356 /// Queue Ptr (enable_sgpr_queue_ptr):
    357 ///   Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
    358 ///   AQL queue on which the dispatch packet was queued.
    359 ///
    360 /// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
    361 ///   Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
    362 ///   is directly copied from the kernargPtr in the dispatch packet. Having CP
    363 ///   load it once avoids loading it at the beginning of every wavefront.
    364 ///
    365 /// Dispatch Id (enable_sgpr_dispatch_id):
    366 ///   Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
    367 ///   packet being executed.
    368 ///
    369 /// Flat Scratch Init (enable_sgpr_flat_scratch_init):
    370 ///   Number of User SGPR registers: 2. This is 2 SGPRs.
    371 ///
    372 ///   For CI/VI:
    373 ///     The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
    374 ///     to base of memory for scratch for this dispatch. This is the same offset
    375 ///     used in computing the Scratch Segment Buffer base address. The value of
    376 ///     Scratch Wave Offset must be added by the kernel code and moved to
    377 ///     SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
    378 ///
    379 ///     The second SGPR is 32 bit byte size of a single work-item's scratch
    380 ///     memory usage. This is directly loaded from the dispatch packet Private
    381 ///     Segment Byte Size and rounded up to a multiple of DWORD.
    382 ///
    383 ///     \todo [Does CP need to round this to >4 byte alignment?]
    384 ///
    385 ///     The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
    386 ///     flat memory instructions. Having CP load it once avoids loading it at
    387 ///     the beginning of every wavefront.
    388 ///
    389 ///   For PI:
    390 ///     This is the 64 bit base address of the scratch backing memory for
    391 ///     allocated by CP for this dispatch.
    392 ///
    393 /// Private Segment Size (enable_sgpr_private_segment_size):
    394 ///   Number of User SGPR registers: 1. The 32 bit byte size of a single
    395 ///   work-item's scratch memory allocation. This is the value from the dispatch
    396 ///   packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
    397 ///
    398 ///   \todo [Does CP need to round this to >4 byte alignment?]
    399 ///
    400 ///   Having CP load it once avoids loading it at the beginning of every
    401 ///   wavefront.
    402 ///
    403 ///   \todo [This will not be used for CI/VI since it is the same value as
    404 ///   the second SGPR of Flat Scratch Init. However, it is need for PI which
    405 ///   changes meaning of Flat Scratchg Init..]
    406 ///
    407 /// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
    408 ///   Number of User SGPR registers: 1. 32 bit count of the number of
    409 ///   work-groups in the X dimension for the grid being executed. Computed from
    410 ///   the fields in the HsaDispatchPacket as
    411 ///   ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
    412 ///
    413 /// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
    414 ///   Number of User SGPR registers: 1. 32 bit count of the number of
    415 ///   work-groups in the Y dimension for the grid being executed. Computed from
    416 ///   the fields in the HsaDispatchPacket as
    417 ///   ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
    418 ///
    419 ///   Only initialized if <16 previous SGPRs initialized.
    420 ///
    421 /// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
    422 ///   Number of User SGPR registers: 1. 32 bit count of the number of
    423 ///   work-groups in the Z dimension for the grid being executed. Computed
    424 ///   from the fields in the HsaDispatchPacket as
    425 ///   ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
    426 ///
    427 ///   Only initialized if <16 previous SGPRs initialized.
    428 ///
    429 /// Work-Group Id X (enable_sgpr_workgroup_id_x):
    430 ///   Number of System SGPR registers: 1. 32 bit work group id in X dimension
    431 ///   of grid for wavefront. Always present.
    432 ///
    433 /// Work-Group Id Y (enable_sgpr_workgroup_id_y):
    434 ///   Number of System SGPR registers: 1. 32 bit work group id in Y dimension
    435 ///   of grid for wavefront.
    436 ///
    437 /// Work-Group Id Z (enable_sgpr_workgroup_id_z):
    438 ///   Number of System SGPR registers: 1. 32 bit work group id in Z dimension
    439 ///   of grid for wavefront. If present then Work-group Id Y will also be
    440 ///   present
    441 ///
    442 /// Work-Group Info (enable_sgpr_workgroup_info):
    443 ///   Number of System SGPR registers: 1. {first_wave, 14'b0000,
    444 ///   ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
    445 ///
    446 /// Private Segment Wave Byte Offset
    447 /// (enable_sgpr_private_segment_wave_byte_offset):
    448 ///   Number of System SGPR registers: 1. 32 bit byte offset from base of
    449 ///   dispatch scratch base. Must be used as an offset with Private/Spill/Arg
    450 ///   segment address when using Scratch Segment Buffer. It must be added to
    451 ///   Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
    452 ///
    453 ///
    454 /// The order of the VGPR registers is defined, but the Finalizer can specify
    455 /// which ones are actually setup in the amd_kernel_code_t object using the
    456 /// enableVgpr*  bit fields. The register numbers used for enabled registers
    457 /// are dense starting at VGPR0: the first enabled register is VGPR0, the next
    458 /// enabled register is VGPR1 etc.; disabled registers do not have an VGPR
    459 /// number.
    460 ///
    461 /// VGPR register initial state is defined as follows:
    462 ///
    463 /// Work-Item Id X (always initialized):
    464 ///   Number of registers: 1. 32 bit work item id in X dimension of work-group
    465 ///   for wavefront lane.
    466 ///
    467 /// Work-Item Id X (enable_vgpr_workitem_id > 0):
    468 ///   Number of registers: 1. 32 bit work item id in Y dimension of work-group
    469 ///   for wavefront lane.
    470 ///
    471 /// Work-Item Id X (enable_vgpr_workitem_id > 0):
    472 ///   Number of registers: 1. 32 bit work item id in Z dimension of work-group
    473 ///   for wavefront lane.
    474 ///
    475 ///
    476 /// The setting of registers is being done by existing GPU hardware as follows:
    477 ///   1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
    478 ///      registers.
    479 ///   2) Work-group Id registers X, Y, Z are set by SPI which supports any
    480 ///      combination including none.
    481 ///   3) Scratch Wave Offset is also set by SPI which is why its value cannot
    482 ///      be added into the value Flat Scratch Offset which would avoid the
    483 ///      Finalizer generated prolog having to do the add.
    484 ///   4) The VGPRs are set by SPI which only supports specifying either (X),
    485 ///      (X, Y) or (X, Y, Z).
    486 ///
    487 /// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so
    488 /// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
    489 /// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
    490 ///
    491 /// The global segment can be accessed either using flat operations or buffer
    492 /// operations. If buffer operations are used then the Global Buffer used to
    493 /// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a
    494 /// segment address is not passed into the kernel code by CP since its base
    495 /// address is always 0. Instead the Finalizer generates prolog code to
    496 /// initialize 4 SGPRs with a V# that has the following properties, and then
    497 /// uses that in the buffer instructions:
    498 ///   - base address of 0
    499 ///   - no swizzle
    500 ///   - ATC=1
    501 ///   - MTYPE set to support memory coherence specified in
    502 ///     amd_kernel_code_t.globalMemoryCoherence
    503 ///
    504 /// When the Global Buffer is used to access the Kernarg segment, must add the
    505 /// dispatch packet kernArgPtr to a kernarg segment address before using this V#.
    506 /// Alternatively scalar loads can be used if the kernarg offset is uniform, as
    507 /// the kernarg segment is constant for the duration of the kernel execution.
    508 ///
    509 
    510 typedef struct amd_kernel_code_s {
    511   uint32_t amd_kernel_code_version_major;
    512   uint32_t amd_kernel_code_version_minor;
    513   uint16_t amd_machine_kind;
    514   uint16_t amd_machine_version_major;
    515   uint16_t amd_machine_version_minor;
    516   uint16_t amd_machine_version_stepping;
    517 
    518   /// Byte offset (possibly negative) from start of amd_kernel_code_t
    519   /// object to kernel's entry point instruction. The actual code for
    520   /// the kernel is required to be 256 byte aligned to match hardware
    521   /// requirements (SQ cache line is 16). The code must be position
    522   /// independent code (PIC) for AMD devices to give runtime the
    523   /// option of copying code to discrete GPU memory or APU L2
    524   /// cache. The Finalizer should endeavour to allocate all kernel
    525   /// machine code in contiguous memory pages so that a device
    526   /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
    527   /// improving cache performance.
    528   int64_t kernel_code_entry_byte_offset;
    529 
    530   /// Range of bytes to consider prefetching expressed as an offset
    531   /// and size. The offset is from the start (possibly negative) of
    532   /// amd_kernel_code_t object. Set both to 0 if no prefetch
    533   /// information is available.
    534   int64_t kernel_code_prefetch_byte_offset;
    535   uint64_t kernel_code_prefetch_byte_size;
    536 
    537   /// Number of bytes of scratch backing memory required for full
    538   /// occupancy of target chip. This takes into account the number of
    539   /// bytes of scratch per work-item, the wavefront size, the maximum
    540   /// number of wavefronts per CU, and the number of CUs. This is an
    541   /// upper limit on scratch. If the grid being dispatched is small it
    542   /// may only need less than this. If the kernel uses no scratch, or
    543   /// the Finalizer has not computed this value, it must be 0.
    544   uint64_t max_scratch_backing_memory_byte_size;
    545 
    546   /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
    547   /// COMPUTE_PGM_RSRC2 registers.
    548   uint64_t compute_pgm_resource_registers;
    549 
    550   /// Code properties. See amd_code_property_mask_t for a full list of
    551   /// properties.
    552   uint32_t code_properties;
    553 
    554   /// The amount of memory required for the combined private, spill
    555   /// and arg segments for a work-item in bytes. If
    556   /// is_dynamic_callstack is 1 then additional space must be added to
    557   /// this value for the call stack.
    558   uint32_t workitem_private_segment_byte_size;
    559 
    560   /// The amount of group segment memory required by a work-group in
    561   /// bytes. This does not include any dynamically allocated group
    562   /// segment memory that may be added when the kernel is
    563   /// dispatched.
    564   uint32_t workgroup_group_segment_byte_size;
    565 
    566   /// Number of byte of GDS required by kernel dispatch. Must be 0 if
    567   /// not using GDS.
    568   uint32_t gds_segment_byte_size;
    569 
    570   /// The size in bytes of the kernarg segment that holds the values
    571   /// of the arguments to the kernel. This could be used by CP to
    572   /// prefetch the kernarg segment pointed to by the dispatch packet.
    573   uint64_t kernarg_segment_byte_size;
    574 
    575   /// Number of fbarrier's used in the kernel and all functions it
    576   /// calls. If the implementation uses group memory to allocate the
    577   /// fbarriers then that amount must already be included in the
    578   /// workgroup_group_segment_byte_size total.
    579   uint32_t workgroup_fbarrier_count;
    580 
    581   /// Number of scalar registers used by a wavefront. This includes
    582   /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
    583   /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
    584   /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
    585   uint16_t wavefront_sgpr_count;
    586 
    587   /// Number of vector registers used by each work-item. Used to set
    588   /// COMPUTE_PGM_RSRC1.VGPRS.
    589   uint16_t workitem_vgpr_count;
    590 
    591   /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
    592   /// first fixed VGPR number reserved.
    593   uint16_t reserved_vgpr_first;
    594 
    595   /// The number of consecutive VGPRs reserved by the client. If
    596   /// is_debug_supported then this count includes VGPRs reserved
    597   /// for debugger use.
    598   uint16_t reserved_vgpr_count;
    599 
    600   /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
    601   /// first fixed SGPR number reserved.
    602   uint16_t reserved_sgpr_first;
    603 
    604   /// The number of consecutive SGPRs reserved by the client. If
    605   /// is_debug_supported then this count includes SGPRs reserved
    606   /// for debugger use.
    607   uint16_t reserved_sgpr_count;
    608 
    609   /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
    610   /// fixed SGPR number used to hold the wave scratch offset for the
    611   /// entire kernel execution, or uint16_t(-1) if the register is not
    612   /// used or not known.
    613   uint16_t debug_wavefront_private_segment_offset_sgpr;
    614 
    615   /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
    616   /// fixed SGPR number of the first of 4 SGPRs used to hold the
    617   /// scratch V# used for the entire kernel execution, or uint16_t(-1)
    618   /// if the registers are not used or not known.
    619   uint16_t debug_private_segment_buffer_sgpr;
    620 
    621   /// The maximum byte alignment of variables used by the kernel in
    622   /// the specified memory segment. Expressed as a power of two. Must
    623   /// be at least HSA_POWERTWO_16.
    624   uint8_t kernarg_segment_alignment;
    625   uint8_t group_segment_alignment;
    626   uint8_t private_segment_alignment;
    627 
    628   /// Wavefront size expressed as a power of two. Must be a power of 2
    629   /// in range 1..64 inclusive. Used to support runtime query that
    630   /// obtains wavefront size, which may be used by application to
    631   /// allocated dynamic group memory and set the dispatch work-group
    632   /// size.
    633   uint8_t wavefront_size;
    634 
    635   int32_t call_convention;
    636   uint8_t reserved3[12];
    637   uint64_t runtime_loader_kernel_symbol;
    638   uint64_t control_directives[16];
    639 } amd_kernel_code_t;
    640 
    641 #endif // AMDKERNELCODET_H
    642