/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                        bool has_llc);

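/**
 * One-time initialization of the batchbuffer: allocate the first batch BO
 * via intel_batchbuffer_reset() and, on non-LLC platforms, a malloc'ed CPU
 * shadow copy that is uploaded into the BO at flush time.
 */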
void
intel_batchbuffer_init(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                       bool has_llc)
{
   intel_batchbuffer_reset(batch, bufmgr, has_llc);

   if (!has_llc) {
      batch->cpu_map = malloc(BATCH_SZ);
      batch->map = batch->cpu_map;
      batch->map_next = batch->cpu_map;
   }
}

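/**
 * Throw away the current batch contents and allocate a fresh batch BO.  The
 * reference to the previous batch BO is kept in last_bo (and the one from two
 * batches ago is released); on LLC platforms the new BO is mapped so commands
 * can be written into it directly.
 */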
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                        bool has_llc)
{
   if (batch->last_bo != NULL) {
      drm_intel_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->bo;

   batch->bo = drm_intel_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
   if (has_llc) {
      drm_intel_bo_map(batch->bo, true);
      batch->map = batch->bo->virtual;
   }
   batch->map_next = batch->map;

   batch->reserved_space = BATCH_RESERVED;
   batch->state_batch_offset = batch->bo->size;
   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   batch->ring = UNKNOWN_RING;
}

static void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
   brw_render_cache_set_clear(brw);
}

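/**
 * Record the current batch write pointer and relocation count so that a
 * partially built command sequence can be rolled back with
 * intel_batchbuffer_reset_to_saved().
 */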
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}

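/**
 * Release the CPU shadow copy (if any) and the BO references held by the
 * batchbuffer.
 */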
void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
   free(batch->cpu_map);
   drm_intel_bo_unreference(batch->last_bo);
   drm_intel_bo_unreference(batch->bo);
}

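/**
 * Guarantee that at least sz bytes are available in the current batch,
 * flushing it if necessary.  Also flushes when the caller targets a different
 * ring than the one the current batch is already bound to.
 */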
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       brw->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

#ifdef DEBUG
   assert(sz < BATCH_SZ - BATCH_RESERVED);
#endif
   if (intel_batchbuffer_space(&brw->batch) < sz)
      intel_batchbuffer_flush(brw);

   enum brw_gpu_ring prev_ring = brw->batch.ring;
   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;

   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
      intel_batchbuffer_emit_render_ring_prelude(brw);
}

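/**
 * Decode the current batch and print it to stderr with libdrm's instruction
 * decoder (used when INTEL_DEBUG & DEBUG_BATCH is set).
 */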
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->screen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         USED_BATCH(*batch));
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(-ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         USED_BATCH(*batch));
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* Currently unused. */
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCH_BUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaking into contexts that are set up with
       * MI_RESTORE_INHIBIT and therefore assume the L3 cache is configured
       * according to the hardware defaults.
       */
      if (brw->gen >= 7)
         gen7_restore_default_l3_config(brw);

      if (brw->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well. We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

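/**
 * Keep the GPU from getting too far ahead of the CPU: either wait on the
 * batch associated with an earlier frame (swap throttling) or ask the kernel
 * to throttle us via the throttle ioctl (flush throttling).
 */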
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* Drop when RS headers get pulled to libdrm */
#ifndef I915_EXEC_RESOURCE_STREAMER
#define I915_EXEC_RESOURCE_STREAMER (1<<15)
#endif

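/* Upload the batch contents into its BO (unless they were written in place
 * through an LLC mapping) and submit the batch to the kernel via execbuffer,
 * with the appropriate ring, hardware context and execution flags.
 */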
/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->screen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER |
            (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
                                        NULL, 0, 0, flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * USED_BATCH(*batch), flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

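/**
 * Finish the current batch: emit the closing state and MI_BATCH_BUFFER_END,
 * submit it to the kernel, and set up a fresh batch.  Normally reached
 * through the intel_batchbuffer_flush() macro, which supplies the file/line
 * arguments used for the DEBUG_BATCH output.
 */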
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   if (brw->use_resource_streamer)
      gen7_reset_hw_bt_pool_offsets(brw);

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
uint32_t
intel_batchbuffer_reloc(struct intel_batchbuffer *batch,
                        drm_intel_bo *buffer, uint32_t offset,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(batch->bo, offset,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   return buffer->offset64 + delta;
}

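/* 64-bit variant of intel_batchbuffer_reloc(), used by OUT_RELOC64 on Gen8+
 * where presumed addresses no longer fit in a single DWord.
 */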
uint64_t
intel_batchbuffer_reloc64(struct intel_batchbuffer *batch,
                          drm_intel_bo *buffer, uint32_t offset,
                          uint32_t read_domains, uint32_t write_domain,
                          uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(batch->bo, offset,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   return buffer->offset64 + delta;
}


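/* Copy a block of DWord-aligned data directly into the batch. */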
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

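/* Emit size consecutive MI_LOAD_REGISTER_MEM commands, loading registers
 * reg, reg + 4, ... from successive DWords starting at offset within bo.
 */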
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

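/*
 * Load a 32-bit register from a buffer via MI_LOAD_REGISTER_MEM.
 */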
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

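/*
 * Load a 64-bit register from a buffer using two MI_LOAD_REGISTER_MEM
 * commands.
 */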
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}

/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      ADVANCE_BATCH();
   }
}

/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (brw->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}

/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}

/*
 * Write 32 bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write 64 bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}