      1 /*
      2  * Copyright 2006 VMware, Inc.
      3  * All Rights Reserved.
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the
      7  * "Software"), to deal in the Software without restriction, including
      8  * without limitation the rights to use, copy, modify, merge, publish,
      9  * distribute, sublicense, and/or sell copies of the Software, and to
     10  * permit persons to whom the Software is furnished to do so, subject to
     11  * the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the
     14  * next paragraph) shall be included in all copies or substantial portions
     15  * of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     19  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     20  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     21  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     22  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     23  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     24  */
     25 
     26 #include "intel_batchbuffer.h"
     27 #include "intel_buffer_objects.h"
     28 #include "brw_bufmgr.h"
     29 #include "intel_buffers.h"
     30 #include "intel_fbo.h"
     31 #include "brw_context.h"
     32 #include "brw_defines.h"
     33 #include "brw_state.h"
     34 #include "common/gen_decoder.h"
     35 
     36 #include "util/hash_table.h"
     37 
     38 #include <xf86drm.h>
     39 #include <i915_drm.h>
     40 
     41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
     42 
     43 /**
     44  * Target sizes of the batch and state buffers.  We create the initial
     45  * buffers at these sizes, and flush when they're nearly full.  If we
     46  * underestimate how close we are to the end, and suddenly need more space
     47  * in the middle of a draw, we can grow the buffers, and finish the draw.
     48  * At that point, we'll be over our target size, so the next operation
     49  * should flush.  Each time we flush the batch, we recreate both buffers
 * at the original target size, so they don't grow without bound.
     51  */
     52 #define BATCH_SZ (20 * 1024)
     53 #define STATE_SZ (16 * 1024)
     54 
     55 static void
     56 intel_batchbuffer_reset(struct brw_context *brw);
     57 
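/* Trivial hash/compare callbacks for state_batch_sizes, which is keyed
 * directly on integer statebuffer offsets (used for debug decoding).
 */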
     58 static bool
     59 uint_key_compare(const void *a, const void *b)
     60 {
     61    return a == b;
     62 }
     63 
     64 static uint32_t
     65 uint_key_hash(const void *key)
     66 {
     67    return (uintptr_t) key;
     68 }
     69 
     70 static void
     71 init_reloc_list(struct brw_reloc_list *rlist, int count)
     72 {
     73    rlist->reloc_count = 0;
     74    rlist->reloc_array_size = count;
     75    rlist->relocs = malloc(rlist->reloc_array_size *
     76                           sizeof(struct drm_i915_gem_relocation_entry));
     77 }
     78 
     79 void
     80 intel_batchbuffer_init(struct brw_context *brw)
     81 {
     82    struct intel_screen *screen = brw->screen;
     83    struct intel_batchbuffer *batch = &brw->batch;
     84    const struct gen_device_info *devinfo = &screen->devinfo;
     85 
     86    batch->use_shadow_copy = !devinfo->has_llc;
     87 
     88    if (batch->use_shadow_copy) {
     89       batch->batch.map = malloc(BATCH_SZ);
     90       batch->map_next = batch->batch.map;
     91       batch->state.map = malloc(STATE_SZ);
     92    }
     93 
     94    init_reloc_list(&batch->batch_relocs, 250);
     95    init_reloc_list(&batch->state_relocs, 250);
     96 
     97    batch->exec_count = 0;
     98    batch->exec_array_size = 100;
     99    batch->exec_bos =
    100       malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
    101    batch->validation_list =
    102       malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
    103 
    104    if (INTEL_DEBUG & DEBUG_BATCH) {
    105       batch->state_batch_sizes =
    106          _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
    107    }
    108 
    109    batch->use_batch_first =
    110       screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
    111 
   /* PIPE_CONTROL needs a workaround, but only on Gen6: its write target
    * must have a global GTT mapping (hence EXEC_OBJECT_NEEDS_GTT below).
    */
    113    batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
    114    if (devinfo->gen == 6)
    115       batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
    116 
    117    intel_batchbuffer_reset(brw);
    118 }
    119 
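/* Read a field exactly once, without letting the compiler re-load it;
 * used for brw_bo::index, which other batches sharing the BO may update.
 */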
    120 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
    121 
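/**
 * Add a BO to the current batch's validation list (the list of buffers the
 * kernel needs resident while executing this batch) and return its index.
 * If the BO is already on the list, the existing index is returned.
 */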
    122 static unsigned
    123 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
    124 {
    125    unsigned index = READ_ONCE(bo->index);
    126 
    127    if (index < batch->exec_count && batch->exec_bos[index] == bo)
    128       return index;
    129 
    130    /* May have been shared between multiple active batches */
    131    for (index = 0; index < batch->exec_count; index++) {
    132       if (batch->exec_bos[index] == bo)
    133          return index;
    134    }
    135 
    136    brw_bo_reference(bo);
    137 
    138    if (batch->exec_count == batch->exec_array_size) {
    139       batch->exec_array_size *= 2;
    140       batch->exec_bos =
    141          realloc(batch->exec_bos,
    142                  batch->exec_array_size * sizeof(batch->exec_bos[0]));
    143       batch->validation_list =
    144          realloc(batch->validation_list,
    145                  batch->exec_array_size * sizeof(batch->validation_list[0]));
    146    }
    147 
    148    batch->validation_list[batch->exec_count] =
    149       (struct drm_i915_gem_exec_object2) {
    150          .handle = bo->gem_handle,
    151          .alignment = bo->align,
    152          .offset = bo->gtt_offset,
    153          .flags = bo->kflags,
    154       };
    155 
    156    bo->index = batch->exec_count;
    157    batch->exec_bos[batch->exec_count] = bo;
    158    batch->aperture_space += bo->size;
    159 
    160    return batch->exec_count++;
    161 }
    162 
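/**
 * Allocate a fresh batch or state buffer at its original target size and
 * reset the "grow" bookkeeping.  Called on every batch reset, so buffers
 * that grew during the previous batch shrink back to the target size.
 */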
    163 static void
    164 recreate_growing_buffer(struct brw_context *brw,
    165                         struct brw_growing_bo *grow,
    166                         const char *name, unsigned size)
    167 {
    168    struct intel_screen *screen = brw->screen;
    169    struct intel_batchbuffer *batch = &brw->batch;
    170    struct brw_bufmgr *bufmgr = screen->bufmgr;
    171 
    172    grow->bo = brw_bo_alloc(bufmgr, name, size, 4096);
    173    grow->bo->kflags = can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
    174    grow->partial_bo = NULL;
    175    grow->partial_bo_map = NULL;
    176    grow->partial_bytes = 0;
    177 
    178    if (!batch->use_shadow_copy)
    179       grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
    180 }
    181 
    182 static void
    183 intel_batchbuffer_reset(struct brw_context *brw)
    184 {
    185    struct intel_batchbuffer *batch = &brw->batch;
    186 
    187    if (batch->last_bo != NULL) {
    188       brw_bo_unreference(batch->last_bo);
    189       batch->last_bo = NULL;
    190    }
    191    batch->last_bo = batch->batch.bo;
    192 
    193    recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ);
    194    batch->map_next = batch->batch.map;
    195 
    196    recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ);
    197 
   /* Avoid making 0 a valid state offset - otherwise the decoder will try
    * to decode data when we use offset 0 as a null pointer.
    200     */
    201    batch->state_used = 1;
    202 
    203    add_exec_bo(batch, batch->batch.bo);
    204    assert(batch->batch.bo->index == 0);
    205 
    206    batch->needs_sol_reset = false;
    207    batch->state_base_address_emitted = false;
    208 
    209    /* We don't know what ring the new batch will be sent to until we see the
    210     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    211     */
    212    batch->ring = UNKNOWN_RING;
    213 
    214    if (batch->state_batch_sizes)
    215       _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
    216 }
    217 
    218 static void
    219 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
    220 {
    221    intel_batchbuffer_reset(brw);
    222    brw_cache_sets_clear(brw);
    223 }
    224 
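/**
 * Snapshot the batch write pointer, relocation counts, and exec BO count so
 * that commands emitted after this point can be discarded again with
 * intel_batchbuffer_reset_to_saved().
 */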
    225 void
    226 intel_batchbuffer_save_state(struct brw_context *brw)
    227 {
    228    brw->batch.saved.map_next = brw->batch.map_next;
    229    brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
    230    brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
    231    brw->batch.saved.exec_count = brw->batch.exec_count;
    232 }
    233 
    234 void
    235 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
    236 {
    237    for (int i = brw->batch.saved.exec_count;
    238         i < brw->batch.exec_count; i++) {
    239       brw_bo_unreference(brw->batch.exec_bos[i]);
    240    }
    241    brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
    242    brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
    243    brw->batch.exec_count = brw->batch.saved.exec_count;
    244 
    245    brw->batch.map_next = brw->batch.saved.map_next;
    246    if (USED_BATCH(brw->batch) == 0)
    247       brw->batch.ring = UNKNOWN_RING;
    248 }
    249 
    250 void
    251 intel_batchbuffer_free(struct intel_batchbuffer *batch)
    252 {
    253    if (batch->use_shadow_copy) {
    254       free(batch->batch.map);
    255       free(batch->state.map);
    256    }
    257 
    258    for (int i = 0; i < batch->exec_count; i++) {
    259       brw_bo_unreference(batch->exec_bos[i]);
    260    }
    261    free(batch->batch_relocs.relocs);
    262    free(batch->state_relocs.relocs);
    263    free(batch->exec_bos);
    264    free(batch->validation_list);
    265 
    266    brw_bo_unreference(batch->last_bo);
    267    brw_bo_unreference(batch->batch.bo);
    268    brw_bo_unreference(batch->state.bo);
    269    if (batch->state_batch_sizes)
    270       _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
    271 }
    272 
    273 /**
    274  * Finish copying the old batch/state buffer's contents to the new one
    275  * after we tried to "grow" the buffer in an earlier operation.
    276  */
    277 static void
    278 finish_growing_bos(struct brw_growing_bo *grow)
    279 {
    280    struct brw_bo *old_bo = grow->partial_bo;
    281    if (!old_bo)
    282       return;
    283 
    284    memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
    285 
    286    grow->partial_bo = NULL;
    287    grow->partial_bo_map = NULL;
    288    grow->partial_bytes = 0;
    289 
    290    brw_bo_unreference(old_bo);
    291 }
    292 
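/**
 * Rewrite any relocation entries that referenced the old GEM handle so they
 * point at the replacement (grown) buffer instead.
 */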
    293 static void
    294 replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
    295                          uint32_t old_handle, uint32_t new_handle)
    296 {
    297    for (int i = 0; i < rlist->reloc_count; i++) {
    298       if (rlist->relocs[i].target_handle == old_handle)
    299          rlist->relocs[i].target_handle = new_handle;
    300    }
    301 }
    302 
    303 /**
    304  * Grow either the batch or state buffer to a new larger size.
    305  *
    306  * We can't actually grow buffers, so we allocate a new one, copy over
    307  * the existing contents, and update our lists to refer to the new one.
    308  *
    309  * Note that this is only temporary - each new batch recreates the buffers
    310  * at their original target size (BATCH_SZ or STATE_SZ).
    311  */
    312 static void
    313 grow_buffer(struct brw_context *brw,
    314             struct brw_growing_bo *grow,
    315             unsigned existing_bytes,
    316             unsigned new_size)
    317 {
    318    struct intel_batchbuffer *batch = &brw->batch;
    319    struct brw_bufmgr *bufmgr = brw->bufmgr;
    320    struct brw_bo *bo = grow->bo;
    321 
    322    perf_debug("Growing %s - ran out of space\n", bo->name);
    323 
    324    if (grow->partial_bo) {
    325       /* We've already grown once, and now we need to do it again.
    326        * Finish our last grow operation so we can start a new one.
    327        * This should basically never happen.
    328        */
    329       perf_debug("Had to grow multiple times");
    330       finish_growing_bos(grow);
    331    }
    332 
    333    struct brw_bo *new_bo = brw_bo_alloc(bufmgr, bo->name, new_size, bo->align);
    334 
    335    /* Copy existing data to the new larger buffer */
    336    grow->partial_bo_map = grow->map;
    337 
    338    if (batch->use_shadow_copy) {
    339       /* We can't safely use realloc, as it may move the existing buffer,
    340        * breaking existing pointers the caller may still be using.  Just
    341        * malloc a new copy and memcpy it like the normal BO path.
    342        *
       * Use new_bo->size rather than new_size because the bufmgr may have
       * rounded up the size, and we want the shadow size to match.
    345        */
    346       grow->map = malloc(new_bo->size);
    347    } else {
    348       grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
    349    }
    350 
    351    /* Try to put the new BO at the same GTT offset as the old BO (which
    352     * we're throwing away, so it doesn't need to be there).
    353     *
    354     * This guarantees that our relocations continue to work: values we've
    355     * already written into the buffer, values we're going to write into the
    356     * buffer, and the validation/relocation lists all will match.
    357     *
    358     * Also preserve kflags for EXEC_OBJECT_CAPTURE.
    359     */
    360    new_bo->gtt_offset = bo->gtt_offset;
    361    new_bo->index = bo->index;
    362    new_bo->kflags = bo->kflags;
    363 
    364    /* Batch/state buffers are per-context, and if we've run out of space,
    365     * we must have actually used them before, so...they will be in the list.
    366     */
    367    assert(bo->index < batch->exec_count);
    368    assert(batch->exec_bos[bo->index] == bo);
    369 
    370    /* Update the validation list to use the new BO. */
    371    batch->validation_list[bo->index].handle = new_bo->gem_handle;
    372 
    373    if (!batch->use_batch_first) {
    374       /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
    375        * update the relocation list entries to point at the new BO as well.
    376        * (With newer kernels, the "handle" is an offset into the validation
    377        * list, which remains unchanged, so we can skip this.)
    378        */
    379       replace_bo_in_reloc_list(&batch->batch_relocs,
    380                                bo->gem_handle, new_bo->gem_handle);
    381       replace_bo_in_reloc_list(&batch->state_relocs,
    382                                bo->gem_handle, new_bo->gem_handle);
    383    }
    384 
    385    /* Exchange the two BOs...without breaking pointers to the old BO.
    386     *
    387     * Consider this scenario:
    388     *
    * 1. Somebody calls brw_state_batch() to get a region of memory, and
    *    then creates a brw_address pointing to brw->batch.state.bo.
    391     * 2. They then call brw_state_batch() a second time, which happens to
    392     *    grow and replace the state buffer.  They then try to emit a
    393     *    relocation to their first section of memory.
    394     *
    395     * If we replace the brw->batch.state.bo pointer at step 2, we would
    396     * break the address created in step 1.  They'd have a pointer to the
    397     * old destroyed BO.  Emitting a relocation would add this dead BO to
    398     * the validation list...causing /both/ statebuffers to be in the list,
    399     * and all kinds of disasters.
    400     *
    401     * This is not a contrived case - BLORP vertex data upload hits this.
    402     *
    403     * There are worse scenarios too.  Fences for GL sync objects reference
    404     * brw->batch.batch.bo.  If we replaced the batch pointer when growing,
    405     * we'd need to chase down every fence and update it to point to the
    406     * new BO.  Otherwise, it would refer to a "batch" that never actually
    407     * gets submitted, and would fail to trigger.
    408     *
    409     * To work around both of these issues, we transmutate the buffers in
    410     * place, making the existing struct brw_bo represent the new buffer,
    411     * and "new_bo" represent the old BO.  This is highly unusual, but it
    412     * seems like a necessary evil.
    413     *
    414     * We also defer the memcpy of the existing batch's contents.  Callers
    415     * may make multiple brw_state_batch calls, and retain pointers to the
    * old BO's map.  We'll perform the memcpy in finish_growing_bos() when
    417     * we finally submit the batch, at which point we've finished uploading
    418     * state, and nobody should have any old references anymore.
    419     *
    420     * To do that, we keep a reference to the old BO in grow->partial_bo,
    421     * and store the number of bytes to copy in grow->partial_bytes.  We
    422     * can monkey with the refcounts directly without atomics because these
    423     * are per-context BOs and they can only be touched by this thread.
    424     */
    425    assert(new_bo->refcount == 1);
    426    new_bo->refcount = bo->refcount;
    427    bo->refcount = 1;
    428 
    429    struct brw_bo tmp;
    430    memcpy(&tmp, bo, sizeof(struct brw_bo));
    431    memcpy(bo, new_bo, sizeof(struct brw_bo));
    432    memcpy(new_bo, &tmp, sizeof(struct brw_bo));
    433 
    434    grow->partial_bo = new_bo; /* the one reference of the OLD bo */
    435    grow->partial_bytes = existing_bytes;
    436 }
    437 
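/**
 * Ensure there is room for "sz" more bytes of commands, flushing or growing
 * the batch as needed.  The BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() macros
 * in intel_batchbuffer.h reserve their space through this path.
 */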
    438 void
    439 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
    440                                 enum brw_gpu_ring ring)
    441 {
    442    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    443    struct intel_batchbuffer *batch = &brw->batch;
    444 
    445    /* If we're switching rings, implicitly flush the batch. */
    446    if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
    447        devinfo->gen >= 6) {
    448       intel_batchbuffer_flush(brw);
    449    }
    450 
    451    const unsigned batch_used = USED_BATCH(*batch) * 4;
    452    if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
    453       intel_batchbuffer_flush(brw);
    454    } else if (batch_used + sz >= batch->batch.bo->size) {
    455       const unsigned new_size =
    456          MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
    457               MAX_BATCH_SIZE);
    458       grow_buffer(brw, &batch->batch, batch_used, new_size);
    459       batch->map_next = (void *) batch->batch.map + batch_used;
    460       assert(batch_used + sz < batch->batch.bo->size);
    461    }
    462 
    463    /* The intel_batchbuffer_flush() calls above might have changed
    464     * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    465     */
    466    brw->batch.ring = ring;
    467 }
    468 
    469 #ifdef DEBUG
    470 #define CSI "\e["
    471 #define BLUE_HEADER  CSI "0;44m"
    472 #define NORMAL       CSI "0m"
    473 
    474 
    475 static void
    476 decode_struct(struct brw_context *brw, struct gen_spec *spec,
    477               const char *struct_name, uint32_t *data,
    478               uint32_t gtt_offset, uint32_t offset, bool color)
    479 {
    480    struct gen_group *group = gen_spec_find_struct(spec, struct_name);
    481    if (!group)
    482       return;
    483 
    484    fprintf(stderr, "%s\n", struct_name);
    485    gen_print_group(stderr, group, gtt_offset + offset,
    486                    &data[offset / 4], 0, color);
    487 }
    488 
    489 static void
    490 decode_structs(struct brw_context *brw, struct gen_spec *spec,
    491                const char *struct_name,
    492                uint32_t *data, uint32_t gtt_offset, uint32_t offset,
    493                int struct_size, bool color)
    494 {
    495    struct gen_group *group = gen_spec_find_struct(spec, struct_name);
    496    if (!group)
    497       return;
    498 
    499    int entries = brw_state_batch_size(brw, offset) / struct_size;
    500    for (int i = 0; i < entries; i++) {
    501       fprintf(stderr, "%s %d\n", struct_name, i);
    502       gen_print_group(stderr, group, gtt_offset + offset,
    503                       &data[(offset + i * struct_size) / 4], 0, color);
    504    }
    505 }
    506 
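/**
 * Decode the submitted batch, and the indirect state it points at, to stderr
 * using the genxml decoder.  Only used when DEBUG_BATCH dumping is enabled.
 */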
    507 static void
    508 do_batch_dump(struct brw_context *brw)
    509 {
    510    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    511    struct intel_batchbuffer *batch = &brw->batch;
    512    struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
    513 
    514    if (batch->ring != RENDER_RING)
    515       return;
    516 
    517    uint32_t *batch_data = brw_bo_map(brw, batch->batch.bo, MAP_READ);
    518    uint32_t *state = brw_bo_map(brw, batch->state.bo, MAP_READ);
    519    if (batch_data == NULL || state == NULL) {
    520       fprintf(stderr, "WARNING: failed to map batchbuffer/statebuffer\n");
    521       return;
    522    }
    523 
    524    uint32_t *end = batch_data + USED_BATCH(*batch);
    525    uint32_t batch_gtt_offset = batch->batch.bo->gtt_offset;
    526    uint32_t state_gtt_offset = batch->state.bo->gtt_offset;
    527    int length;
    528 
    529    bool color = INTEL_DEBUG & DEBUG_COLOR;
    530    const char *header_color = color ? BLUE_HEADER : "";
    531    const char *reset_color  = color ? NORMAL : "";
    532 
    533    for (uint32_t *p = batch_data; p < end; p += length) {
    534       struct gen_group *inst = gen_spec_find_instruction(spec, p);
    535       length = gen_group_get_length(inst, p);
    536       assert(inst == NULL || length > 0);
    537       length = MAX2(1, length);
    538       if (inst == NULL) {
    539          fprintf(stderr, "unknown instruction %08x\n", p[0]);
    540          continue;
    541       }
    542 
    543       uint64_t offset = batch_gtt_offset + 4 * (p - batch_data);
    544 
    545       fprintf(stderr, "%s0x%08"PRIx64":  0x%08x:  %-80s%s\n", header_color,
    546               offset, p[0], gen_group_get_name(inst), reset_color);
    547 
    548       gen_print_group(stderr, inst, offset, p, 0, color);
    549 
    550       switch (gen_group_get_opcode(inst) >> 16) {
    551       case _3DSTATE_PIPELINED_POINTERS:
    552          /* Note: these Gen4-5 pointers are full relocations rather than
    553           * offsets from the start of the statebuffer.  So we need to subtract
    554           * gtt_offset (the start of the statebuffer) to obtain an offset we
    555           * can add to the map and get at the data.
    556           */
    557          decode_struct(brw, spec, "VS_STATE", state, state_gtt_offset,
    558                        (p[1] & ~0x1fu) - state_gtt_offset, color);
    559          if (p[2] & 1) {
    560             decode_struct(brw, spec, "GS_STATE", state, state_gtt_offset,
    561                           (p[2] & ~0x1fu) - state_gtt_offset, color);
    562          }
    563          if (p[3] & 1) {
    564             decode_struct(brw, spec, "CLIP_STATE", state, state_gtt_offset,
    565                           (p[3] & ~0x1fu) - state_gtt_offset, color);
    566          }
    567          decode_struct(brw, spec, "SF_STATE", state, state_gtt_offset,
    568                        (p[4] & ~0x1fu) - state_gtt_offset, color);
    569          decode_struct(brw, spec, "WM_STATE", state, state_gtt_offset,
    570                        (p[5] & ~0x1fu) - state_gtt_offset, color);
    571          decode_struct(brw, spec, "COLOR_CALC_STATE", state, state_gtt_offset,
    572                        (p[6] & ~0x3fu) - state_gtt_offset, color);
    573          break;
    574       case _3DSTATE_BINDING_TABLE_POINTERS_VS:
    575       case _3DSTATE_BINDING_TABLE_POINTERS_HS:
    576       case _3DSTATE_BINDING_TABLE_POINTERS_DS:
    577       case _3DSTATE_BINDING_TABLE_POINTERS_GS:
    578       case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
    579          struct gen_group *group =
    580             gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
    581          if (!group)
    582             break;
    583 
    584          uint32_t bt_offset = p[1] & ~0x1fu;
    585          int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
    586          uint32_t *bt_pointers = &state[bt_offset / 4];
    587          for (int i = 0; i < bt_entries; i++) {
    588             fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
    589             gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
    590                             &state[bt_pointers[i] / 4], 0, color);
    591          }
    592          break;
    593       }
    594       case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
    595       case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
    596       case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
    597       case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
    598       case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
    599          decode_structs(brw, spec, "SAMPLER_STATE", state,
    600                         state_gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
    601          break;
    602       case _3DSTATE_VIEWPORT_STATE_POINTERS:
    603          decode_structs(brw, spec, "CLIP_VIEWPORT", state,
    604                         state_gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
    605          decode_structs(brw, spec, "SF_VIEWPORT", state,
    606                         state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
    607          decode_structs(brw, spec, "CC_VIEWPORT", state,
    608                         state_gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
    609          break;
    610       case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
    611          decode_structs(brw, spec, "CC_VIEWPORT", state,
    612                         state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
    613          break;
    614       case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
    615          decode_structs(brw, spec, "SF_CLIP_VIEWPORT", state,
    616                         state_gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
    617          break;
    618       case _3DSTATE_SCISSOR_STATE_POINTERS:
    619          decode_structs(brw, spec, "SCISSOR_RECT", state,
    620                         state_gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
    621          break;
    622       case _3DSTATE_BLEND_STATE_POINTERS:
    623          /* TODO: handle Gen8+ extra dword at the beginning */
    624          decode_structs(brw, spec, "BLEND_STATE", state,
    625                         state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
    626          break;
    627       case _3DSTATE_CC_STATE_POINTERS:
    628          if (devinfo->gen >= 7) {
    629             decode_struct(brw, spec, "COLOR_CALC_STATE", state,
    630                           state_gtt_offset, p[1] & ~0x3fu, color);
    631          } else if (devinfo->gen == 6) {
    632             decode_structs(brw, spec, "BLEND_STATE", state,
    633                            state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
    634             decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
    635                           state_gtt_offset, p[2] & ~0x3fu, color);
    636             decode_struct(brw, spec, "COLOR_CALC_STATE", state,
    637                           state_gtt_offset, p[3] & ~0x3fu, color);
    638          }
    639          break;
    640       case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
    641          decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
    642                        state_gtt_offset, p[1] & ~0x3fu, color);
    643          break;
    644       case MEDIA_INTERFACE_DESCRIPTOR_LOAD: {
    645          struct gen_group *group =
    646             gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
    647          if (!group)
    648             break;
    649 
    650          uint32_t idd_offset = p[3] & ~0x1fu;
    651          decode_struct(brw, spec, "INTERFACE_DESCRIPTOR_DATA", state,
    652                        state_gtt_offset, idd_offset, color);
    653 
    654          uint32_t ss_offset = state[idd_offset / 4 + 3] & ~0x1fu;
    655          decode_structs(brw, spec, "SAMPLER_STATE", state,
    656                         state_gtt_offset, ss_offset, 4 * 4, color);
    657 
    658          uint32_t bt_offset = state[idd_offset / 4 + 4] & ~0x1fu;
    659          int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
    660          uint32_t *bt_pointers = &state[bt_offset / 4];
    661          for (int i = 0; i < bt_entries; i++) {
    662             fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
    663             gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
    664                             &state[bt_pointers[i] / 4], 0, color);
    665          }
    666          break;
    667       }
    668       }
    669    }
    670 
    671    brw_bo_unmap(batch->batch.bo);
    672    brw_bo_unmap(batch->state.bo);
    673 }
    674 #else
    675 static void do_batch_dump(struct brw_context *brw) { }
    676 #endif
    677 
    678 /**
    679  * Called when starting a new batch buffer.
    680  */
    681 static void
    682 brw_new_batch(struct brw_context *brw)
    683 {
    684    /* Unreference any BOs held by the previous batch, and reset counts. */
    685    for (int i = 0; i < brw->batch.exec_count; i++) {
    686       brw_bo_unreference(brw->batch.exec_bos[i]);
    687       brw->batch.exec_bos[i] = NULL;
    688    }
    689    brw->batch.batch_relocs.reloc_count = 0;
    690    brw->batch.state_relocs.reloc_count = 0;
    691    brw->batch.exec_count = 0;
    692    brw->batch.aperture_space = 0;
    693 
    694    brw_bo_unreference(brw->batch.state.bo);
    695 
    696    /* Create a new batchbuffer and reset the associated state: */
    697    intel_batchbuffer_reset_and_clear_render_cache(brw);
    698 
    699    /* If the kernel supports hardware contexts, then most hardware state is
    700     * preserved between batches; we only need to re-emit state that is required
    701     * to be in every batch.  Otherwise we need to re-emit all the state that
    702     * would otherwise be stored in the context (which for all intents and
    703     * purposes means everything).
    704     */
    705    if (brw->hw_ctx == 0) {
    706       brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
    707       brw_upload_invariant_state(brw);
    708    }
    709 
    710    brw->ctx.NewDriverState |= BRW_NEW_BATCH;
    711 
    712    brw->ib.index_size = -1;
    713 
    714    /* We need to periodically reap the shader time results, because rollover
    715     * happens every few seconds.  We also want to see results every once in a
    716     * while, because many programs won't cleanly destroy our context, so the
    717     * end-of-run printout may not happen.
    718     */
    719    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
    720       brw_collect_and_report_shader_time(brw);
    721 }
    722 
    723 /**
    724  * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
    725  * sending it off.
    726  *
    727  * This function can emit state (say, to preserve registers that aren't saved
    728  * between batches).
    729  */
    730 static void
    731 brw_finish_batch(struct brw_context *brw)
    732 {
    733    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    734 
    735    brw->batch.no_wrap = true;
    736 
    737    /* Capture the closing pipeline statistics register values necessary to
    738     * support query objects (in the non-hardware context world).
    739     */
    740    brw_emit_query_end(brw);
    741 
    742    if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaking into other contexts: contexts that set
       * MI_RESTORE_INHIBIT assume the L3 cache is configured to the hardware
       * defaults, so restore those defaults before ending the batch.
    746        */
    747       if (devinfo->gen >= 7)
    748          gen7_restore_default_l3_config(brw);
    749 
    750       if (devinfo->is_haswell) {
    751          /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
    752           * 3DSTATE_CC_STATE_POINTERS > "Note":
    753           *
    754           * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
    755           *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
    756           *
    757           * From the example in the docs, it seems to expect a regular pipe control
    758           * flush here as well. We may have done it already, but meh.
    759           *
    760           * See also WaAvoidRCZCounterRollover.
    761           */
    762          brw_emit_mi_flush(brw);
    763          BEGIN_BATCH(2);
    764          OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
    765          OUT_BATCH(brw->cc.state_offset | 1);
    766          ADVANCE_BATCH();
    767          brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
    768                                           PIPE_CONTROL_CS_STALL);
    769       }
    770 
    771       /* Do not restore push constant packets during context restore. */
    772       if (devinfo->gen == 10)
    773          gen10_emit_isp_disable(brw);
    774    }
    775 
    776    /* Emit MI_BATCH_BUFFER_END to finish our batch.  Note that execbuf2
    777     * requires our batch size to be QWord aligned, so we pad it out if
    778     * necessary by emitting an extra MI_NOOP after the end.
    779     */
    780    intel_batchbuffer_require_space(brw, 8, brw->batch.ring);
    781    *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
    782    if (USED_BATCH(brw->batch) & 1) {
    783       *brw->batch.map_next++ = MI_NOOP;
    784    }
    785 
    786    brw->batch.no_wrap = false;
    787 }
    788 
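/**
 * Keep the CPU from getting too far ahead of the GPU: wait on an earlier
 * frame's batch (swap throttling) or ask the kernel to throttle us.
 */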
    789 static void
    790 throttle(struct brw_context *brw)
    791 {
    792    /* Wait for the swapbuffers before the one we just emitted, so we
    793     * don't get too many swaps outstanding for apps that are GPU-heavy
    794     * but not CPU-heavy.
    795     *
    796     * We're using intelDRI2Flush (called from the loader before
    797     * swapbuffer) and glFlush (for front buffer rendering) as the
    798     * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point, the
    * round trips for swap/copy and getting new buffers are done and
    801     * we'll spend less time waiting on the GPU.
    802     *
    803     * Unfortunately, we don't have a handle to the batch containing
    804     * the swap, and getting our hands on that doesn't seem worth it,
    805     * so we just use the first batch we emitted after the last swap.
    806     */
    807    if (brw->need_swap_throttle && brw->throttle_batch[0]) {
    808       if (brw->throttle_batch[1]) {
    809          if (!brw->disable_throttling) {
    810             /* Pass NULL rather than brw so we avoid perf_debug warnings;
    811              * stalling is common and expected here...
    812              */
    813             brw_bo_wait_rendering(brw->throttle_batch[1]);
    814          }
    815          brw_bo_unreference(brw->throttle_batch[1]);
    816       }
    817       brw->throttle_batch[1] = brw->throttle_batch[0];
    818       brw->throttle_batch[0] = NULL;
    819       brw->need_swap_throttle = false;
    820       /* Throttling here is more precise than the throttle ioctl, so skip it */
    821       brw->need_flush_throttle = false;
    822    }
    823 
    824    if (brw->need_flush_throttle) {
    825       __DRIscreen *dri_screen = brw->screen->driScrnPriv;
    826       drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
    827       brw->need_flush_throttle = false;
    828    }
    829 }
    830 
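/**
 * Thin wrapper around DRM_IOCTL_I915_GEM_EXECBUFFER2(_WR): submits the
 * validation list, wires up the optional in/out fences, and writes back the
 * offsets the kernel actually chose for each BO.
 */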
    831 static int
    832 execbuffer(int fd,
    833            struct intel_batchbuffer *batch,
    834            uint32_t ctx_id,
    835            int used,
    836            int in_fence,
    837            int *out_fence,
    838            int flags)
    839 {
    840    struct drm_i915_gem_execbuffer2 execbuf = {
    841       .buffers_ptr = (uintptr_t) batch->validation_list,
    842       .buffer_count = batch->exec_count,
    843       .batch_start_offset = 0,
    844       .batch_len = used,
    845       .flags = flags,
    846       .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
    847    };
    848 
    849    unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
    850 
    851    if (in_fence != -1) {
    852       execbuf.rsvd2 = in_fence;
    853       execbuf.flags |= I915_EXEC_FENCE_IN;
    854    }
    855 
    856    if (out_fence != NULL) {
    857       cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
    858       *out_fence = -1;
    859       execbuf.flags |= I915_EXEC_FENCE_OUT;
    860    }
    861 
    862    int ret = drmIoctl(fd, cmd, &execbuf);
    863    if (ret != 0)
    864       ret = -errno;
    865 
    866    for (int i = 0; i < batch->exec_count; i++) {
    867       struct brw_bo *bo = batch->exec_bos[i];
    868 
    869       bo->idle = false;
    870       bo->index = -1;
    871 
    872       /* Update brw_bo::gtt_offset */
    873       if (batch->validation_list[i].offset != bo->gtt_offset) {
    874          DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
    875              bo->gem_handle, bo->gtt_offset,
    876              batch->validation_list[i].offset);
    877          bo->gtt_offset = batch->validation_list[i].offset;
    878       }
    879    }
    880 
    881    if (ret == 0 && out_fence != NULL)
    882       *out_fence = execbuf.rsvd2 >> 32;
    883 
    884    return ret;
    885 }
    886 
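/**
 * Copy any shadow maps into their BOs, attach the relocation lists to the
 * batch and state validation entries, and hand the batch to the kernel.
 */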
    887 static int
    888 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    889 {
    890    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    891    __DRIscreen *dri_screen = brw->screen->driScrnPriv;
    892    struct intel_batchbuffer *batch = &brw->batch;
    893    int ret = 0;
    894 
    895    if (batch->use_shadow_copy) {
    896       void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
    897       memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));
    898 
    899       bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
    900       memcpy(bo_map, batch->state.map, batch->state_used);
    901    }
    902 
    903    brw_bo_unmap(batch->batch.bo);
    904    brw_bo_unmap(batch->state.bo);
    905 
    906    if (!brw->screen->no_hw) {
      /* The requirements for using I915_EXEC_NO_RELOC are:
    908        *
    909        *   The addresses written in the objects must match the corresponding
    910        *   reloc.gtt_offset which in turn must match the corresponding
    911        *   execobject.offset.
    912        *
    913        *   Any render targets written to in the batch must be flagged with
    914        *   EXEC_OBJECT_WRITE.
    915        *
    916        *   To avoid stalling, execobject.offset should match the current
    917        *   address of that object within the active context.
    918        */
    919       int flags = I915_EXEC_NO_RELOC;
    920 
    921       if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
    922          flags |= I915_EXEC_BLT;
    923       } else {
    924          flags |= I915_EXEC_RENDER;
    925       }
    926       if (batch->needs_sol_reset)
    927          flags |= I915_EXEC_GEN7_SOL_RESET;
    928 
    929       uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
    930 
    931       /* Set statebuffer relocations */
    932       const unsigned state_index = batch->state.bo->index;
    933       if (state_index < batch->exec_count &&
    934           batch->exec_bos[state_index] == batch->state.bo) {
    935          struct drm_i915_gem_exec_object2 *entry =
    936             &batch->validation_list[state_index];
    937          assert(entry->handle == batch->state.bo->gem_handle);
    938          entry->relocation_count = batch->state_relocs.reloc_count;
    939          entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
    940       }
    941 
    942       /* Set batchbuffer relocations */
    943       struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
    944       assert(entry->handle == batch->batch.bo->gem_handle);
    945       entry->relocation_count = batch->batch_relocs.reloc_count;
    946       entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;
    947 
    948       if (batch->use_batch_first) {
    949          flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
    950       } else {
    951          /* Move the batch to the end of the validation list */
    952          struct drm_i915_gem_exec_object2 tmp;
    953          const unsigned index = batch->exec_count - 1;
    954 
    955          tmp = *entry;
    956          *entry = batch->validation_list[index];
    957          batch->validation_list[index] = tmp;
    958       }
    959 
    960       ret = execbuffer(dri_screen->fd, batch, hw_ctx,
    961                        4 * USED_BATCH(*batch),
    962                        in_fence_fd, out_fence_fd, flags);
    963 
    964       throttle(brw);
    965    }
    966 
    967    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
    968       do_batch_dump(brw);
    969 
    970    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
    971       brw_check_for_reset(brw);
    972 
    973    if (ret != 0) {
    974       fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
    975               strerror(-ret));
    976       exit(1);
    977    }
    978 
    979    return ret;
    980 }
    981 
    982 /**
    983  * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
    984  * of the fd.
    985  *
    986  * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
    987  * of the returned fd.
    988  */
    989 int
    990 _intel_batchbuffer_flush_fence(struct brw_context *brw,
    991                                int in_fence_fd, int *out_fence_fd,
    992                                const char *file, int line)
    993 {
    994    int ret;
    995 
    996    if (USED_BATCH(brw->batch) == 0)
    997       return 0;
    998 
    999    /* Check that we didn't just wrap our batchbuffer at a bad time. */
   1000    assert(!brw->batch.no_wrap);
   1001 
   1002    brw_finish_batch(brw);
   1003    intel_upload_finish(brw);
   1004 
   1005    finish_growing_bos(&brw->batch.batch);
   1006    finish_growing_bos(&brw->batch.state);
   1007 
   1008    if (brw->throttle_batch[0] == NULL) {
   1009       brw->throttle_batch[0] = brw->batch.batch.bo;
   1010       brw_bo_reference(brw->throttle_batch[0]);
   1011    }
   1012 
   1013    if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
   1014       int bytes_for_commands = 4 * USED_BATCH(brw->batch);
   1015       int bytes_for_state = brw->batch.state_used;
   1016       fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
   1017               " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
   1018               " %4d batch relocs, %4d state relocs\n", file, line,
   1019               bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
   1020               bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
   1021               brw->batch.exec_count,
   1022               (float) brw->batch.aperture_space / (1024 * 1024),
   1023               brw->batch.batch_relocs.reloc_count,
   1024               brw->batch.state_relocs.reloc_count);
   1025    }
   1026 
   1027    ret = submit_batch(brw, in_fence_fd, out_fence_fd);
   1028 
   1029    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
   1030       fprintf(stderr, "waiting for idle\n");
   1031       brw_bo_wait_rendering(brw->batch.batch.bo);
   1032    }
   1033 
   1034    /* Start a new batch buffer. */
   1035    brw_new_batch(brw);
   1036 
   1037    return ret;
   1038 }
   1039 
   1040 bool
   1041 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
   1042 {
   1043    return brw->batch.aperture_space + extra_space <=
   1044           brw->screen->aperture_threshold;
   1045 }
   1046 
   1047 bool
   1048 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
   1049 {
   1050    unsigned index = READ_ONCE(bo->index);
   1051    if (index < batch->exec_count && batch->exec_bos[index] == bo)
   1052       return true;
   1053 
   1054    for (int i = 0; i < batch->exec_count; i++) {
   1055       if (batch->exec_bos[i] == bo)
   1056          return true;
   1057    }
   1058    return false;
   1059 }
   1060 
/* This is the only way buffers get added to the validation list.
 */
   1063 static uint64_t
   1064 emit_reloc(struct intel_batchbuffer *batch,
   1065            struct brw_reloc_list *rlist, uint32_t offset,
   1066            struct brw_bo *target, int32_t target_offset,
   1067            unsigned int reloc_flags)
   1068 {
   1069    assert(target != NULL);
   1070 
   1071    if (rlist->reloc_count == rlist->reloc_array_size) {
   1072       rlist->reloc_array_size *= 2;
   1073       rlist->relocs = realloc(rlist->relocs,
   1074                               rlist->reloc_array_size *
   1075                               sizeof(struct drm_i915_gem_relocation_entry));
   1076    }
   1077 
   1078    unsigned int index = add_exec_bo(batch, target);
   1079    struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
   1080 
   1081    if (reloc_flags)
   1082       entry->flags |= reloc_flags & batch->valid_reloc_flags;
   1083 
   1084    rlist->relocs[rlist->reloc_count++] =
   1085       (struct drm_i915_gem_relocation_entry) {
   1086          .offset = offset,
   1087          .delta = target_offset,
   1088          .target_handle = batch->use_batch_first ? index : target->gem_handle,
   1089          .presumed_offset = entry->offset,
   1090       };
   1091 
   1092    /* Using the old buffer offset, write in what the right data would be, in
   1093     * case the buffer doesn't move and we can short-circuit the relocation
   1094     * processing in the kernel
   1095     */
   1096    return entry->offset + target_offset;
   1097 }
   1098 
   1099 uint64_t
   1100 brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
   1101                 struct brw_bo *target, uint32_t target_offset,
   1102                 unsigned int reloc_flags)
   1103 {
   1104    assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));
   1105 
   1106    return emit_reloc(batch, &batch->batch_relocs, batch_offset,
   1107                      target, target_offset, reloc_flags);
   1108 }
   1109 
   1110 uint64_t
   1111 brw_state_reloc(struct intel_batchbuffer *batch, uint32_t state_offset,
   1112                 struct brw_bo *target, uint32_t target_offset,
   1113                 unsigned int reloc_flags)
   1114 {
   1115    assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
   1116 
   1117    return emit_reloc(batch, &batch->state_relocs, state_offset,
   1118                      target, target_offset, reloc_flags);
   1119 }
   1120 
   1121 
   1122 uint32_t
   1123 brw_state_batch_size(struct brw_context *brw, uint32_t offset)
   1124 {
   1125    struct hash_entry *entry =
   1126       _mesa_hash_table_search(brw->batch.state_batch_sizes,
   1127                               (void *) (uintptr_t) offset);
   1128    return entry ? (uintptr_t) entry->data : 0;
   1129 }
   1130 
   1131 /**
   1132  * Reserve some space in the statebuffer, or flush.
   1133  *
   1134  * This is used to estimate when we're near the end of the batch,
   1135  * so we can flush early.
   1136  */
   1137 void
   1138 brw_require_statebuffer_space(struct brw_context *brw, int size)
   1139 {
   1140    if (brw->batch.state_used + size >= STATE_SZ)
   1141       intel_batchbuffer_flush(brw);
   1142 }
   1143 
   1144 /**
 * Allocates a block of space in the statebuffer for indirect state.
   1146  */
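/* A minimal usage sketch (illustrative only; the size and alignment values
 * are assumptions, not taken from this file):
 *
 *    uint32_t offset;
 *    uint32_t *ss = brw_state_batch(brw, 16 * sizeof(uint32_t), 64, &offset);
 *    memset(ss, 0, 16 * sizeof(uint32_t));
 *
 * The returned pointer is where the caller writes the state; "offset" is the
 * location of that state relative to the start of brw->batch.state.bo.
 */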
   1147 void *
   1148 brw_state_batch(struct brw_context *brw,
   1149                 int size,
   1150                 int alignment,
   1151                 uint32_t *out_offset)
   1152 {
   1153    struct intel_batchbuffer *batch = &brw->batch;
   1154 
   1155    assert(size < batch->state.bo->size);
   1156 
   1157    uint32_t offset = ALIGN(batch->state_used, alignment);
   1158 
   1159    if (offset + size >= STATE_SZ && !batch->no_wrap) {
   1160       intel_batchbuffer_flush(brw);
   1161       offset = ALIGN(batch->state_used, alignment);
   1162    } else if (offset + size >= batch->state.bo->size) {
   1163       const unsigned new_size =
   1164          MIN2(batch->state.bo->size + batch->state.bo->size / 2,
   1165               MAX_STATE_SIZE);
   1166       grow_buffer(brw, &batch->state, batch->state_used, new_size);
   1167       assert(offset + size < batch->state.bo->size);
   1168    }
   1169 
   1170    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
   1171       _mesa_hash_table_insert(batch->state_batch_sizes,
   1172                               (void *) (uintptr_t) offset,
   1173                               (void *) (uintptr_t) size);
   1174    }
   1175 
   1176    batch->state_used = offset + size;
   1177 
   1178    *out_offset = offset;
   1179    return batch->state.map + (offset >> 2);
   1180 }
   1181 
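/**
 * Copy a block of pre-built DWords into the batch ("bytes" must be a
 * multiple of 4), reserving space first.
 */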
   1182 void
   1183 intel_batchbuffer_data(struct brw_context *brw,
   1184                        const void *data, GLuint bytes, enum brw_gpu_ring ring)
   1185 {
   1186    assert((bytes & 3) == 0);
   1187    intel_batchbuffer_require_space(brw, bytes, ring);
   1188    memcpy(brw->batch.map_next, data, bytes);
   1189    brw->batch.map_next += bytes >> 2;
   1190 }
   1191 
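/**
 * Emit "size" consecutive MI_LOAD_REGISTER_MEM commands to load a run of
 * adjacent 32-bit registers from a buffer object.
 */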
   1192 static void
   1193 load_sized_register_mem(struct brw_context *brw,
   1194                         uint32_t reg,
   1195                         struct brw_bo *bo,
   1196                         uint32_t offset,
   1197                         int size)
   1198 {
   1199    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1200    int i;
   1201 
   1202    /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   1203    assert(devinfo->gen >= 7);
   1204 
   1205    if (devinfo->gen >= 8) {
   1206       BEGIN_BATCH(4 * size);
   1207       for (i = 0; i < size; i++) {
   1208          OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
   1209          OUT_BATCH(reg + i * 4);
   1210          OUT_RELOC64(bo, 0, offset + i * 4);
   1211       }
   1212       ADVANCE_BATCH();
   1213    } else {
   1214       BEGIN_BATCH(3 * size);
   1215       for (i = 0; i < size; i++) {
   1216          OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
   1217          OUT_BATCH(reg + i * 4);
   1218          OUT_RELOC(bo, 0, offset + i * 4);
   1219       }
   1220       ADVANCE_BATCH();
   1221    }
   1222 }
   1223 
   1224 void
   1225 brw_load_register_mem(struct brw_context *brw,
   1226                       uint32_t reg,
   1227                       struct brw_bo *bo,
   1228                       uint32_t offset)
   1229 {
   1230    load_sized_register_mem(brw, reg, bo, offset, 1);
   1231 }
   1232 
   1233 void
   1234 brw_load_register_mem64(struct brw_context *brw,
   1235                         uint32_t reg,
   1236                         struct brw_bo *bo,
   1237                         uint32_t offset)
   1238 {
   1239    load_sized_register_mem(brw, reg, bo, offset, 2);
   1240 }
   1241 
   1242 /*
   1243  * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
   1244  */
   1245 void
   1246 brw_store_register_mem32(struct brw_context *brw,
   1247                          struct brw_bo *bo, uint32_t reg, uint32_t offset)
   1248 {
   1249    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1250 
   1251    assert(devinfo->gen >= 6);
   1252 
   1253    if (devinfo->gen >= 8) {
   1254       BEGIN_BATCH(4);
   1255       OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
   1256       OUT_BATCH(reg);
   1257       OUT_RELOC64(bo, RELOC_WRITE, offset);
   1258       ADVANCE_BATCH();
   1259    } else {
   1260       BEGIN_BATCH(3);
   1261       OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
   1262       OUT_BATCH(reg);
   1263       OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
   1264       ADVANCE_BATCH();
   1265    }
   1266 }
   1267 
   1268 /*
   1269  * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
   1270  */
   1271 void
   1272 brw_store_register_mem64(struct brw_context *brw,
   1273                          struct brw_bo *bo, uint32_t reg, uint32_t offset)
   1274 {
   1275    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1276 
   1277    assert(devinfo->gen >= 6);
   1278 
   1279    /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
   1280     * read a full 64-bit register, we need to do two of them.
   1281     */
   1282    if (devinfo->gen >= 8) {
   1283       BEGIN_BATCH(8);
   1284       OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
   1285       OUT_BATCH(reg);
   1286       OUT_RELOC64(bo, RELOC_WRITE, offset);
   1287       OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
   1288       OUT_BATCH(reg + sizeof(uint32_t));
   1289       OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
   1290       ADVANCE_BATCH();
   1291    } else {
   1292       BEGIN_BATCH(6);
   1293       OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
   1294       OUT_BATCH(reg);
   1295       OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
   1296       OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
   1297       OUT_BATCH(reg + sizeof(uint32_t));
   1298       OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
   1299       ADVANCE_BATCH();
   1300    }
   1301 }
   1302 
   1303 /*
   1304  * Write a 32-bit register using immediate data.
   1305  */
   1306 void
   1307 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
   1308 {
   1309    assert(brw->screen->devinfo.gen >= 6);
   1310 
   1311    BEGIN_BATCH(3);
   1312    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   1313    OUT_BATCH(reg);
   1314    OUT_BATCH(imm);
   1315    ADVANCE_BATCH();
   1316 }
   1317 
   1318 /*
   1319  * Write a 64-bit register using immediate data.
   1320  */
   1321 void
   1322 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
   1323 {
   1324    assert(brw->screen->devinfo.gen >= 6);
   1325 
   1326    BEGIN_BATCH(5);
   1327    OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   1328    OUT_BATCH(reg);
   1329    OUT_BATCH(imm & 0xffffffff);
   1330    OUT_BATCH(reg + 4);
   1331    OUT_BATCH(imm >> 32);
   1332    ADVANCE_BATCH();
   1333 }
   1334 
   1335 /*
   1336  * Copies a 32-bit register.
   1337  */
   1338 void
   1339 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
   1340 {
   1341    assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
   1342 
   1343    BEGIN_BATCH(3);
   1344    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   1345    OUT_BATCH(src);
   1346    OUT_BATCH(dest);
   1347    ADVANCE_BATCH();
   1348 }
   1349 
   1350 /*
   1351  * Copies a 64-bit register.
   1352  */
   1353 void
   1354 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
   1355 {
   1356    assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
   1357 
   1358    BEGIN_BATCH(6);
   1359    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   1360    OUT_BATCH(src);
   1361    OUT_BATCH(dest);
   1362    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   1363    OUT_BATCH(src + sizeof(uint32_t));
   1364    OUT_BATCH(dest + sizeof(uint32_t));
   1365    ADVANCE_BATCH();
   1366 }
   1367 
   1368 /*
   1369  * Write 32-bits of immediate data to a GPU memory buffer.
   1370  */
   1371 void
   1372 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
   1373                      uint32_t offset, uint32_t imm)
   1374 {
   1375    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1376 
   1377    assert(devinfo->gen >= 6);
   1378 
   1379    BEGIN_BATCH(4);
   1380    OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   1381    if (devinfo->gen >= 8)
   1382       OUT_RELOC64(bo, RELOC_WRITE, offset);
   1383    else {
   1384       OUT_BATCH(0); /* MBZ */
   1385       OUT_RELOC(bo, RELOC_WRITE, offset);
   1386    }
   1387    OUT_BATCH(imm);
   1388    ADVANCE_BATCH();
   1389 }
   1390 
   1391 /*
   1392  * Write 64-bits of immediate data to a GPU memory buffer.
   1393  */
   1394 void
   1395 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
   1396                      uint32_t offset, uint64_t imm)
   1397 {
   1398    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1399 
   1400    assert(devinfo->gen >= 6);
   1401 
   1402    BEGIN_BATCH(5);
   1403    OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   1404    if (devinfo->gen >= 8)
   1405       OUT_RELOC64(bo, RELOC_WRITE, offset);
   1406    else {
   1407       OUT_BATCH(0); /* MBZ */
   1408       OUT_RELOC(bo, RELOC_WRITE, offset);
   1409    }
   1410    OUT_BATCH(imm & 0xffffffffu);
   1411    OUT_BATCH(imm >> 32);
   1412    ADVANCE_BATCH();
   1413 }
   1414