/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <sys/errno.h>

#include "main/condrender.h"
#include "main/mtypes.h"
#include "main/state.h"
#include "brw_context.h"
#include "brw_draw.h"
#include "brw_state.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "brw_defines.h"


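/**
 * Set up register state for an indirect GPGPU_WALKER dispatch: the three
 * group counts are streamed from the indirect buffer object into the
 * GEN7_GPGPU_DISPATCHDIM{X,Y,Z} registers via MI_LOAD_REGISTER_MEM.
 *
 * On Gen7 the walker is additionally predicated (GEN7_GPGPU_PREDICATE_ENABLE
 * is set by the caller), so MI_PREDICATE is also programmed here to skip the
 * dispatch when any of the indirect dimensions is zero.
 */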
static void
prepare_indirect_gpgpu_walker(struct brw_context *brw)
{
   GLintptr indirect_offset = brw->compute.num_work_groups_offset;
   drm_intel_bo *bo = brw->compute.num_work_groups_bo;

   brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo,
                         I915_GEM_DOMAIN_VERTEX, 0,
                         indirect_offset + 0);
   brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo,
                         I915_GEM_DOMAIN_VERTEX, 0,
                         indirect_offset + 4);
   brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo,
                         I915_GEM_DOMAIN_VERTEX, 0,
                         indirect_offset + 8);

   if (brw->gen > 7)
      return;

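   /* The sequence below computes:
    *
    *    predicate = !((DIMX == 0) || (DIMY == 0) || (DIMZ == 0));
    *
    * so the predicated GPGPU_WALKER only executes when all three indirect
    * dimensions are non-zero.
    */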
   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
   BEGIN_BATCH(7);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2));
   OUT_BATCH(MI_PREDICATE_SRC0 + 4);
   OUT_BATCH(0u);
   OUT_BATCH(MI_PREDICATE_SRC1 + 0);
   OUT_BATCH(0u);
   OUT_BATCH(MI_PREDICATE_SRC1 + 4);
   OUT_BATCH(0u);
   ADVANCE_BATCH();

   /* Load compute_dispatch_indirect_x_size into SRC0 */
   brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo,
                         I915_GEM_DOMAIN_INSTRUCTION, 0,
                         indirect_offset + 0);

   /* predicate = (compute_dispatch_indirect_x_size == 0); */
   BEGIN_BATCH(1);
   OUT_BATCH(GEN7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOAD |
             MI_PREDICATE_COMBINEOP_SET |
             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
   ADVANCE_BATCH();

   /* Load compute_dispatch_indirect_y_size into SRC0 */
   brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo,
                         I915_GEM_DOMAIN_INSTRUCTION, 0,
                         indirect_offset + 4);

   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
   BEGIN_BATCH(1);
   OUT_BATCH(GEN7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOAD |
             MI_PREDICATE_COMBINEOP_OR |
             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
   ADVANCE_BATCH();

   /* Load compute_dispatch_indirect_z_size into SRC0 */
   brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo,
                         I915_GEM_DOMAIN_INSTRUCTION, 0,
                         indirect_offset + 8);

   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
   BEGIN_BATCH(1);
   OUT_BATCH(GEN7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOAD |
             MI_PREDICATE_COMBINEOP_OR |
             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
   ADVANCE_BATCH();

   /* predicate = !predicate; */
   BEGIN_BATCH(1);
   OUT_BATCH(GEN7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOADINV |
             MI_PREDICATE_COMBINEOP_OR |
             MI_PREDICATE_COMPAREOP_FALSE);
   ADVANCE_BATCH();
}

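/**
 * Emit the GPGPU_WALKER packet that launches the compute grid, followed by
 * a MEDIA_STATE_FLUSH.  For indirect dispatches the group counts come from
 * the GEN7_GPGPU_DISPATCHDIM registers programmed by
 * prepare_indirect_gpgpu_walker(); otherwise they are taken from
 * brw->compute.num_work_groups.
 */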
static void
brw_emit_gpgpu_walker(struct brw_context *brw)
{
   const struct brw_cs_prog_data *prog_data =
      brw_cs_prog_data(brw->cs.base.prog_data);

   const GLuint *num_groups = brw->compute.num_work_groups;
   uint32_t indirect_flag;

   if (brw->compute.num_work_groups_bo == NULL) {
      indirect_flag = 0;
   } else {
      indirect_flag =
         GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE |
         (brw->gen == 7 ? GEN7_GPGPU_PREDICATE_ENABLE : 0);
      prepare_indirect_gpgpu_walker(brw);
   }

   const unsigned simd_size = prog_data->simd_size;
   unsigned group_size = prog_data->local_size[0] *
      prog_data->local_size[1] * prog_data->local_size[2];
   unsigned thread_width_max =
      (group_size + simd_size - 1) / simd_size;

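   /* Right execution mask: enabled lanes for the last (possibly partial)
    * thread of a group.  Start with all simd_size lanes enabled; if
    * group_size is not a multiple of simd_size, keep only the low
    * (group_size % simd_size) lanes enabled for that trailing thread.
    */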
   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
   const unsigned right_non_aligned = group_size & (simd_size - 1);
   if (right_non_aligned != 0)
      right_mask >>= (simd_size - right_non_aligned);

   uint32_t dwords = brw->gen < 8 ? 11 : 15;
   BEGIN_BATCH(dwords);
   OUT_BATCH(GPGPU_WALKER << 16 | (dwords - 2) | indirect_flag);
   OUT_BATCH(0);
   if (brw->gen >= 8) {
      OUT_BATCH(0);                     /* Indirect Data Length */
      OUT_BATCH(0);                     /* Indirect Data Start Address */
   }
   assert(thread_width_max <= brw->screen->devinfo.max_cs_threads);
   OUT_BATCH(SET_FIELD(simd_size / 16, GPGPU_WALKER_SIMD_SIZE) |
             SET_FIELD(thread_width_max - 1, GPGPU_WALKER_THREAD_WIDTH_MAX));
   OUT_BATCH(0);                        /* Thread Group ID Starting X */
   if (brw->gen >= 8)
      OUT_BATCH(0);                     /* MBZ */
   OUT_BATCH(num_groups[0]);            /* Thread Group ID X Dimension */
   OUT_BATCH(0);                        /* Thread Group ID Starting Y */
   if (brw->gen >= 8)
      OUT_BATCH(0);                     /* MBZ */
   OUT_BATCH(num_groups[1]);            /* Thread Group ID Y Dimension */
   OUT_BATCH(0);                        /* Thread Group ID Starting/Resume Z */
   OUT_BATCH(num_groups[2]);            /* Thread Group ID Z Dimension */
   OUT_BATCH(right_mask);               /* Right Execution Mask */
   OUT_BATCH(0xffffffff);               /* Bottom Execution Mask */
   ADVANCE_BATCH();

   BEGIN_BATCH(2);
   OUT_BATCH(MEDIA_STATE_FLUSH << 16 | (2 - 2));
   OUT_BATCH(0);
   ADVANCE_BATCH();
}


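/**
 * Common path for direct and indirect dispatches: validate state, reserve
 * batchbuffer space, upload compute state, and emit the walker, retrying
 * once on a fresh batch if the kernel reports that the batch would not fit
 * in the GPU aperture.
 */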
static void
brw_dispatch_compute_common(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   int estimated_buffer_space_needed;
   bool fail_next = false;

   if (!_mesa_check_conditional_render(ctx))
      return;

   if (ctx->NewState)
      _mesa_update_state(ctx);

   brw_validate_textures(brw);

   const int sampler_state_size = 16; /* 16 bytes */
   estimated_buffer_space_needed = 512; /* batchbuffer commands */
   estimated_buffer_space_needed += (BRW_MAX_TEX_UNIT *
                                     (sampler_state_size +
                                      sizeof(struct gen5_sampler_default_color)));
   estimated_buffer_space_needed += 1024; /* push constants */
   estimated_buffer_space_needed += 512; /* misc. pad */

   /* Flush the batch if it's approaching full, so that we don't wrap while
    * we've got validated state that needs to be in the same batch as the
    * primitives.
    */
   intel_batchbuffer_require_space(brw, estimated_buffer_space_needed,
                                   RENDER_RING);
   intel_batchbuffer_save_state(brw);

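   /* Emit the dispatch with batch wrapping disabled.  If the resulting
    * batch would not fit in the aperture, roll back to the state saved
    * above, flush, and retry once on an empty batch before giving up.
    */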
 retry:
   brw->no_batch_wrap = true;
   brw_upload_compute_state(brw);

   brw_emit_gpgpu_walker(brw);

   brw->no_batch_wrap = false;

   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
      if (!fail_next) {
         intel_batchbuffer_reset_to_saved(brw);
         intel_batchbuffer_flush(brw);
         fail_next = true;
         goto retry;
      } else {
         if (intel_batchbuffer_flush(brw) == -ENOSPC) {
            static bool warned = false;

            if (!warned) {
               fprintf(stderr, "i965: Single compute shader dispatch "
                       "exceeded available aperture space\n");
               warned = true;
            }
         }
      }
   }

   /* Now that we know we haven't run out of aperture space, we can safely
    * reset the dirty bits.
    */
   brw_compute_state_finished(brw);

   if (brw->always_flush_batch)
      intel_batchbuffer_flush(brw);

   brw_program_cache_check_size(brw);

   /* Note: since compute shaders can't write to framebuffers, there's no need
    * to call brw_postdraw_set_buffers_need_resolve().
    */
}

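/**
 * DispatchCompute driver hook: the three group counts are supplied directly
 * by the application, so no indirect buffer object is involved.
 */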
static void
brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups)
{
   struct brw_context *brw = brw_context(ctx);

   brw->compute.num_work_groups_bo = NULL;
   brw->compute.num_work_groups = num_groups;
   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

   brw_dispatch_compute_common(ctx);
}

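/**
 * DispatchComputeIndirect driver hook: the group counts live in the bound
 * dispatch indirect buffer at byte offset 'indirect', so record the backing
 * BO and offset and leave the CPU-side counts at zero.
 */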
static void
brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)
{
   struct brw_context *brw = brw_context(ctx);
   static const GLuint indirect_group_counts[3] = { 0, 0, 0 };
   struct gl_buffer_object *indirect_buffer = ctx->DispatchIndirectBuffer;
   drm_intel_bo *bo =
      intel_bufferobj_buffer(brw,
                             intel_buffer_object(indirect_buffer),
                             indirect, 3 * sizeof(GLuint));

   brw->compute.num_work_groups_bo = bo;
   brw->compute.num_work_groups_offset = indirect;
   brw->compute.num_work_groups = indirect_group_counts;
   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

   brw_dispatch_compute_common(ctx);
}

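/**
 * Hook the compute dispatch entry points into Mesa's driver function table.
 */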
void
brw_init_compute_functions(struct dd_function_table *functions)
{
   functions->DispatchCompute = brw_dispatch_compute;
   functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;
}
    285