/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "brw_context.h"
#include "brw_cs.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_shader.h"
#include "intel_mipmap_tree.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
#include "program/prog_statevars.h"
#include "compiler/glsl/ir_uniform.h"
#include "main/shaderapi.h"

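/**
 * Emits MEDIA_VFE_STATE, MEDIA_CURBE_LOAD (when there are push constants)
 * and MEDIA_INTERFACE_DESCRIPTOR_LOAD to set up the media pipeline for
 * compute shader dispatch.
 */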
static void
brw_upload_cs_state(struct brw_context *brw)
{
   if (!brw->cs.base.prog_data)
      return;

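   /* Reserve an 8-DWord (8 * 4 byte) chunk of batch state space for the
    * interface descriptor, 64-byte aligned.  It is filled in through desc[]
    * below and handed to the hardware by MEDIA_INTERFACE_DESCRIPTOR_LOAD at
    * the end of this function.
    */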
   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
                                                8 * 4, 64, &offset);
   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      brw_emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
         brw->shader_time.bo->size, 1, true);
   }

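   /* Reserve batch space for the binding table now; its contents are copied
    * from stage_state->surf_offset[] further down.
    */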
   uint32_t *bind = (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
                                            prog_data->binding_table.size_bytes,
                                            32, &stage_state->bind_bo_offset);

   uint32_t dwords = brw->gen < 8 ? 8 : 9;
   BEGIN_BATCH(dwords);
   OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));

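   /* Worked examples of the encodings described in the comments below: a
    * 2kB per-thread scratch size is ffs(2048) - 11 = 1 on gen8+,
    * ffs(2048) - 12 = 0 on Haswell, and 2048 / 1024 - 1 = 1 on gen7.
    */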
   if (prog_data->total_scratch) {
      if (brw->gen >= 8) {
         /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
          * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
          */
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(stage_state->per_thread_scratch) - 11);
      } else if (brw->is_haswell) {
         /* Haswell's Per Thread Scratch Space is in the range [0, 10]
          * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(stage_state->per_thread_scratch) - 12);
      } else {
         /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
          * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   stage_state->per_thread_scratch / 1024 - 1);
      }
   } else {
      OUT_BATCH(0);
      if (brw->gen >= 8)
         OUT_BATCH(0);
   }

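   /* This DWord packs the maximum thread count (the field holds the count
    * minus one), the URB entry count, the gateway controls and, on gen7,
    * the GPGPU mode bit.  max_cs_threads is presumably a per-subslice limit,
    * hence the scaling by the total subslice count.
    */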
   const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0;
   const uint32_t vfe_gpgpu_mode =
      brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0;
   const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
   OUT_BATCH(SET_FIELD(devinfo->max_cs_threads * subslices - 1,
                       MEDIA_VFE_STATE_MAX_THREADS) |
             SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) |
             SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) |
             SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) |
             vfe_gpgpu_mode);

   OUT_BATCH(0);
   const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;

   /* We are uploading duplicated copies of push constant uniforms for each
    * thread. Although the local id data needs to vary per thread, the rest
    * of the uniform data does not change across threads. Unfortunately this
    * duplication is required for gen7. As of Haswell, the duplication can be
    * avoided, but this older mechanism with duplicated data continues to
    * work.
    *
    * FINISHME: As of Haswell, we could make use of the
    * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" field
    * to only store one copy of uniform data.
    *
    * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
    * which is described in the GPGPU_WALKER command and in the Broadwell PRM
    * Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
    * Operations => GPGPU Mode => Indirect Payload Storage.
    *
    * Note: The constant data is built in brw_upload_cs_push_constants below.
    */
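   /* A small worked example, assuming the CURBE allocation is counted in
    * 32-byte registers (hence the ALIGN to an even register count, i.e.
    * 64 bytes): with cross_thread.regs = 1, per_thread.regs = 2 and
    * 8 threads, the allocation is ALIGN(2 * 8 + 1, 2) = 18 registers.
    */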
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
            cs_prog_data->push.cross_thread.regs, 2);
   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
             SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
   OUT_BATCH(0);
   OUT_BATCH(0);
   OUT_BATCH(0);
   ADVANCE_BATCH();

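   /* MEDIA_CURBE_LOAD points the hardware at the push constant data written
    * by brw_upload_cs_push_constants() below; the length is rounded up to a
    * multiple of 64 bytes to match that function's 64-byte-aligned
    * allocation.
    */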
   if (cs_prog_data->push.total.size > 0) {
      BEGIN_BATCH(4);
      OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));
      OUT_BATCH(stage_state->push_const_offset);
      ADVANCE_BATCH();
   }

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);

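   /* Fill in the interface descriptor: kernel start pointer, sampler state,
    * binding table, CURBE read lengths, barrier enable, thread count and
    * shared local memory size.  dw tracks the next descriptor DWord; gen8+
    * inserts an extra "Kernel Start Pointer High" DWord.
    */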
   memset(desc, 0, 8 * 4);

   int dw = 0;
   desc[dw++] = brw->cs.base.prog_offset;
   if (brw->gen >= 8)
      desc[dw++] = 0; /* Kernel Start Pointer High */
   desc[dw++] = 0;
   desc[dw++] = stage_state->sampler_offset |
      ((stage_state->sampler_count + 3) / 4);
   desc[dw++] = stage_state->bind_bo_offset;
   desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,
                          MEDIA_CURBE_READ_LENGTH);
   const uint32_t media_threads =
      brw->gen >= 8 ?
      SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
      SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT);
   assert(cs_prog_data->threads <= devinfo->max_cs_threads);

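   /* encode_slm_size() converts the shared local memory requirement in
    * bytes into the generation-specific encoding expected by the
    * descriptor's Shared Local Memory Size field.
    */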
   const uint32_t slm_size =
      encode_slm_size(devinfo->gen, prog_data->total_shared);

   desc[dw++] =
      SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) |
      SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |
      media_threads;

   desc[dw++] =
      SET_FIELD(cs_prog_data->push.cross_thread.regs, CROSS_THREAD_READ_LENGTH);

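   /* MEDIA_INTERFACE_DESCRIPTOR_LOAD makes the hardware read the descriptor
    * we just built; the 8 * 4 length matches the allocation at the top of
    * this function.
    */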
   BEGIN_BATCH(4);
   OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
   OUT_BATCH(0);
   OUT_BATCH(8 * 4);
   OUT_BATCH(offset);
   ADVANCE_BATCH();
}

const struct brw_tracked_state brw_cs_state = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_CS_PROG_DATA |
             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
             BRW_NEW_SAMPLER_STATE_TABLE |
             BRW_NEW_SURFACES,
   },
   .emit = brw_upload_cs_state
};


/**
 * Creates a region containing the push constants for the CS on gen7+.
 *
 * Push constants are constant values (such as GLSL uniforms) that are
 * pre-loaded into a shader stage's register space at thread spawn time.
 *
 * For other stages, see brw_curbe.c:brw_upload_constant_buffer for the
 * equivalent gen4/5 code and gen6_vs_state.c:gen6_upload_push_constants for
 * gen6+.
 */
static void
brw_upload_cs_push_constants(struct brw_context *brw,
                             const struct gl_program *prog,
                             const struct brw_cs_prog_data *cs_prog_data,
                             struct brw_stage_state *stage_state,
                             enum aub_state_struct_type type)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_prog_data *prog_data =
      (struct brw_stage_prog_data*) cs_prog_data;

   /* Updates the ParameterValues[i] pointers for all parameters of the
    * basic type of PROGRAM_STATE_VAR.
    */
   /* XXX: Should this happen somewhere before to get our state flag set? */
   _mesa_load_state_parameters(ctx, prog->Parameters);

   if (cs_prog_data->push.total.size == 0) {
      stage_state->push_const_size = 0;
      return;
   }

   gl_constant_value *param = (gl_constant_value*)
      brw_state_batch(brw, type, ALIGN(cs_prog_data->push.total.size, 64),
                      64, &stage_state->push_const_offset);
   assert(param);

   STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));

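   /* Cross-thread constants are stored once at the start of the CURBE and
    * shared by all threads.  The thread-local id, which must vary per
    * thread, cannot live in this region, which the assert below checks.
    */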
   if (cs_prog_data->push.cross_thread.size > 0) {
      gl_constant_value *param_copy = param;
      assert(cs_prog_data->thread_local_id_index < 0 ||
             cs_prog_data->thread_local_id_index >=
                cs_prog_data->push.cross_thread.dwords);
      for (unsigned i = 0;
           i < cs_prog_data->push.cross_thread.dwords;
           i++) {
         param_copy[i] = *prog_data->param[i];
      }
   }

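   /* Each thread then gets its own block of per_thread.regs registers
    * immediately after the cross-thread data; dst is a dword index, with
    * 8 dwords per 32-byte register.  The slot at thread_local_id_index is
    * filled with t * simd_size, presumably the index of the first local
    * invocation handled by thread t.
    */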
   gl_constant_value thread_id;
   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < cs_prog_data->threads; t++) {
         unsigned dst =
            8 * (cs_prog_data->push.per_thread.regs * t +
                 cs_prog_data->push.cross_thread.regs);
         unsigned src = cs_prog_data->push.cross_thread.dwords;
         for ( ; src < prog_data->nr_params; src++, dst++) {
            if (src != cs_prog_data->thread_local_id_index)
               param[dst] = *prog_data->param[src];
            else {
               thread_id.u = t * cs_prog_data->simd_size;
               param[dst] = thread_id;
            }
         }
      }
   }

   stage_state->push_const_size =
      cs_prog_data->push.cross_thread.regs +
      cs_prog_data->push.per_thread.regs;
}


static void
gen7_upload_cs_push_constants(struct brw_context *brw)
{
   struct brw_stage_state *stage_state = &brw->cs.base;

   /* BRW_NEW_COMPUTE_PROGRAM */
   const struct brw_program *cp = (struct brw_program *) brw->compute_program;

   if (cp) {
      /* BRW_NEW_CS_PROG_DATA */
      struct brw_cs_prog_data *cs_prog_data =
         brw_cs_prog_data(brw->cs.base.prog_data);

      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
      brw_upload_cs_push_constants(brw, &cp->program, cs_prog_data,
                                   stage_state, AUB_TRACE_WM_CONSTANTS);
   }
}

const struct brw_tracked_state gen7_cs_push_constants = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_COMPUTE_PROGRAM |
             BRW_NEW_CS_PROG_DATA |
             BRW_NEW_PUSH_CONSTANT_ALLOCATION,
   },
   .emit = gen7_upload_cs_push_constants,
};

/**
 * Creates a new CS constant buffer reflecting the current CS program's
 * constants, if needed by the CS program.
 */
static void
brw_upload_cs_pull_constants(struct brw_context *brw)
{
   struct brw_stage_state *stage_state = &brw->cs.base;

   /* BRW_NEW_COMPUTE_PROGRAM */
   struct brw_program *cp = (struct brw_program *) brw->compute_program;

   /* BRW_NEW_CS_PROG_DATA */
   const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;

   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
   /* _NEW_PROGRAM_CONSTANTS */
   brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
                             stage_state, prog_data);
}

const struct brw_tracked_state brw_cs_pull_constants = {
   .dirty = {
      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_COMPUTE_PROGRAM |
             BRW_NEW_CS_PROG_DATA,
   },
   .emit = brw_upload_cs_pull_constants,
};
    339