Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright © 2011 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 /**
     25  * @file gen7_sol_state.c
     26  *
     27  * Controls the stream output logic (SOL) stage of the gen7 hardware, which is
     28  * used to implement GL_EXT_transform_feedback.
     29  */
     30 
     31 #include "brw_context.h"
     32 #include "brw_state.h"
     33 #include "brw_defines.h"
     34 #include "intel_batchbuffer.h"
     35 #include "intel_buffer_objects.h"
     36 #include "main/transformfeedback.h"
     37 
     38 static void
     39 upload_3dstate_so_buffers(struct brw_context *brw)
     40 {
     41    struct gl_context *ctx = &brw->ctx;
     42    /* BRW_NEW_TRANSFORM_FEEDBACK */
     43    struct gl_transform_feedback_object *xfb_obj =
     44       ctx->TransformFeedback.CurrentObject;
     45    const struct gl_transform_feedback_info *linked_xfb_info =
     46       xfb_obj->program->sh.LinkedTransformFeedback;
     47    int i;
     48 
     49    /* Set up the up to 4 output buffers.  These are the ranges defined in the
     50     * gl_transform_feedback_object.
     51     */
     52    for (i = 0; i < 4; i++) {
     53       struct intel_buffer_object *bufferobj =
     54 	 intel_buffer_object(xfb_obj->Buffers[i]);
     55       drm_intel_bo *bo;
     56       uint32_t start, end;
     57       uint32_t stride;
     58 
     59       if (!xfb_obj->Buffers[i]) {
     60 	 /* The pitch of 0 in this command indicates that the buffer is
     61 	  * unbound and won't be written to.
     62 	  */
     63 	 BEGIN_BATCH(4);
     64 	 OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
     65 	 OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
     66 	 OUT_BATCH(0);
     67 	 OUT_BATCH(0);
     68 	 ADVANCE_BATCH();
     69 
     70 	 continue;
     71       }
     72 
     73       stride = linked_xfb_info->Buffers[i].Stride * 4;
     74 
     75       start = xfb_obj->Offset[i];
     76       assert(start % 4 == 0);
     77       end = ALIGN(start + xfb_obj->Size[i], 4);
     78       bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start);
     79       assert(end <= bo->size);
     80 
     81       BEGIN_BATCH(4);
     82       OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
     83       OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride);
     84       OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
     85       OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end);
     86       ADVANCE_BATCH();
     87    }
     88 }
     89 
     90 /**
     91  * Outputs the 3DSTATE_SO_DECL_LIST command.
     92  *
     93  * The data output is a series of 64-bit entries containing a SO_DECL per
     94  * stream.  We only have one stream of rendering coming out of the GS unit, so
     95  * we only emit stream 0 (low 16 bits) SO_DECLs.
     96  */
     97 void
     98 gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
     99                                  const struct brw_vue_map *vue_map)
    100 {
    101    struct gl_context *ctx = &brw->ctx;
    102    /* BRW_NEW_TRANSFORM_FEEDBACK */
    103    struct gl_transform_feedback_object *xfb_obj =
    104       ctx->TransformFeedback.CurrentObject;
    105    const struct gl_transform_feedback_info *linked_xfb_info =
    106       xfb_obj->program->sh.LinkedTransformFeedback;
    107    uint16_t so_decl[MAX_VERTEX_STREAMS][128];
    108    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
    109    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
    110    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
    111    int max_decls = 0;
    112    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
    113 
    114    memset(so_decl, 0, sizeof(so_decl));
    115 
    116    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    117     * command is feels strange -- each dword pair contains a SO_DECL per stream.
    118     */
    119    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
    120       int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
    121       uint16_t decl = 0;
    122       int varying = linked_xfb_info->Outputs[i].OutputRegister;
    123       const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
    124       unsigned component_mask = (1 << components) - 1;
    125       unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
    126       unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
    127       assert(stream_id < MAX_VERTEX_STREAMS);
    128 
    129       /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
    130        * gl_Layer is stored in VARYING_SLOT_PSIZ.y
    131        * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
    132        */
    133       if (varying == VARYING_SLOT_PSIZ) {
    134          assert(components == 1);
    135          component_mask <<= 3;
    136       } else if (varying == VARYING_SLOT_LAYER) {
    137          assert(components == 1);
    138          component_mask <<= 1;
    139       } else if (varying == VARYING_SLOT_VIEWPORT) {
    140          assert(components == 1);
    141          component_mask <<= 2;
    142       } else {
    143          component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
    144       }
    145 
    146       buffer_mask[stream_id] |= 1 << buffer;
    147 
    148       decl |= decl_buffer_slot;
    149       if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
    150          decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] <<
    151             SO_DECL_REGISTER_INDEX_SHIFT;
    152       } else {
    153          assert(vue_map->varying_to_slot[varying] >= 0);
    154          decl |= vue_map->varying_to_slot[varying] <<
    155             SO_DECL_REGISTER_INDEX_SHIFT;
    156       }
    157       decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT;
    158 
    159       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
    160        * array.  Instead, it simply increments DstOffset for the following
    161        * input by the number of components that should be skipped.
    162        *
    163        * Our hardware is unusual in that it requires us to program SO_DECLs
    164        * for fake "hole" components, rather than simply taking the offset
    165        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
    166        * program as many size = 4 holes as we can, then a final hole to
    167        * accommodate the final 1, 2, or 3 remaining.
    168        */
    169       int skip_components =
    170          linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
    171 
    172       next_offset[buffer] += skip_components;
    173 
    174       while (skip_components >= 4) {
    175          so_decl[stream_id][decls[stream_id]++] =
    176             SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot;
    177          skip_components -= 4;
    178       }
    179       if (skip_components > 0)
    180          so_decl[stream_id][decls[stream_id]++] =
    181             SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) |
    182             decl_buffer_slot;
    183 
    184       assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
    185 
    186       next_offset[buffer] += components;
    187 
    188       so_decl[stream_id][decls[stream_id]++] = decl;
    189 
    190       if (decls[stream_id] > max_decls)
    191          max_decls = decls[stream_id];
    192    }
    193 
    194    BEGIN_BATCH(max_decls * 2 + 3);
    195    OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1));
    196 
    197    OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) |
    198              (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) |
    199              (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) |
    200              (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT));
    201 
    202    OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) |
    203              (decls[1] << SO_NUM_ENTRIES_1_SHIFT) |
    204              (decls[2] << SO_NUM_ENTRIES_2_SHIFT) |
    205              (decls[3] << SO_NUM_ENTRIES_3_SHIFT));
    206 
    207    for (int i = 0; i < max_decls; i++) {
    208       /* Stream 1 | Stream 0 */
    209       OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]);
    210       /* Stream 3 | Stream 2 */
    211       OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]);
    212    }
    213 
    214    ADVANCE_BATCH();
    215 }
    216 
    217 static bool
    218 query_active(struct gl_query_object *q)
    219 {
    220    return q && q->Active;
    221 }
    222 
    223 static void
    224 upload_3dstate_streamout(struct brw_context *brw, bool active,
    225 			 const struct brw_vue_map *vue_map)
    226 {
    227    struct gl_context *ctx = &brw->ctx;
    228    /* BRW_NEW_TRANSFORM_FEEDBACK */
    229    struct gl_transform_feedback_object *xfb_obj =
    230       ctx->TransformFeedback.CurrentObject;
    231    uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0;
    232    int i;
    233 
    234    if (active) {
    235       const struct gl_transform_feedback_info *linked_xfb_info =
    236          xfb_obj->program->sh.LinkedTransformFeedback;
    237       int urb_entry_read_offset = 0;
    238       int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
    239 	 urb_entry_read_offset;
    240 
    241       dw1 |= SO_FUNCTION_ENABLE;
    242       dw1 |= SO_STATISTICS_ENABLE;
    243 
    244       /* BRW_NEW_RASTERIZER_DISCARD */
    245       if (ctx->RasterDiscard) {
    246          if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
    247             dw1 |= SO_RENDERING_DISABLE;
    248          } else {
    249             perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
    250                        "query active relies on the clipper.");
    251          }
    252       }
    253 
    254       /* _NEW_LIGHT */
    255       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
    256 	 dw1 |= SO_REORDER_TRAILING;
    257 
    258       if (brw->gen < 8) {
    259          for (i = 0; i < 4; i++) {
    260             if (xfb_obj->Buffers[i]) {
    261                dw1 |= SO_BUFFER_ENABLE(i);
    262             }
    263          }
    264       }
    265 
    266       /* We always read the whole vertex.  This could be reduced at some
    267        * point by reading less and offsetting the register index in the
    268        * SO_DECLs.
    269        */
    270       dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET);
    271       dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_0_VERTEX_READ_LENGTH);
    272 
    273       dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET);
    274       dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_1_VERTEX_READ_LENGTH);
    275 
    276       dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET);
    277       dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_2_VERTEX_READ_LENGTH);
    278 
    279       dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET);
    280       dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_3_VERTEX_READ_LENGTH);
    281 
    282       if (brw->gen >= 8) {
    283 	 /* Set buffer pitches; 0 means unbound. */
    284 	 if (xfb_obj->Buffers[0])
    285 	    dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
    286 	 if (xfb_obj->Buffers[1])
    287 	    dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
    288 	 if (xfb_obj->Buffers[2])
    289 	    dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
    290 	 if (xfb_obj->Buffers[3])
    291 	    dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
    292       }
    293    }
    294 
    295    const int dwords = brw->gen >= 8 ? 5 : 3;
    296 
    297    BEGIN_BATCH(dwords);
    298    OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2));
    299    OUT_BATCH(dw1);
    300    OUT_BATCH(dw2);
    301    if (dwords > 3) {
    302       OUT_BATCH(dw3);
    303       OUT_BATCH(dw4);
    304    }
    305    ADVANCE_BATCH();
    306 }
    307 
    308 static void
    309 upload_sol_state(struct brw_context *brw)
    310 {
    311    struct gl_context *ctx = &brw->ctx;
    312    /* BRW_NEW_TRANSFORM_FEEDBACK */
    313    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
    314 
    315    if (active) {
    316       if (brw->gen >= 8)
    317          gen8_upload_3dstate_so_buffers(brw);
    318       else
    319          upload_3dstate_so_buffers(brw);
    320 
    321       /* BRW_NEW_VUE_MAP_GEOM_OUT */
    322       gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out);
    323    }
    324 
    325    /* Finally, set up the SOL stage.  This command must always follow updates to
    326     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
    327     * MMIO register updates (current performed by the kernel at each batch
    328     * emit).
    329     */
    330    upload_3dstate_streamout(brw, active, &brw->vue_map_geom_out);
    331 }
    332 
    333 const struct brw_tracked_state gen7_sol_state = {
    334    .dirty = {
    335       .mesa  = _NEW_LIGHT,
    336       .brw   = BRW_NEW_BATCH |
    337                BRW_NEW_BLORP |
    338                BRW_NEW_RASTERIZER_DISCARD |
    339                BRW_NEW_VUE_MAP_GEOM_OUT |
    340                BRW_NEW_TRANSFORM_FEEDBACK,
    341    },
    342    .emit = upload_sol_state,
    343 };
    344 
    345 /**
    346  * Tally the number of primitives generated so far.
    347  *
    348  * The buffer contains a series of pairs:
    349  * (<start0, start1, start2, start3>, <end0, end1, end2, end3>) ;
    350  * (<start0, start1, start2, start3>, <end0, end1, end2, end3>) ;
    351  *
    352  * For each stream, we subtract the pair of values (end - start) to get the
    353  * number of primitives generated during one section.  We accumulate these
    354  * values, adding them up to get the total number of primitives generated.
    355  */
    356 static void
    357 gen7_tally_prims_generated(struct brw_context *brw,
    358                            struct brw_transform_feedback_object *obj)
    359 {
    360    /* If the current batch is still contributing to the number of primitives
    361     * generated, flush it now so the results will be present when mapped.
    362     */
    363    if (drm_intel_bo_references(brw->batch.bo, obj->prim_count_bo))
    364       intel_batchbuffer_flush(brw);
    365 
    366    if (unlikely(brw->perf_debug && drm_intel_bo_busy(obj->prim_count_bo)))
    367       perf_debug("Stalling for # of transform feedback primitives written.\n");
    368 
    369    drm_intel_bo_map(obj->prim_count_bo, false);
    370    uint64_t *prim_counts = obj->prim_count_bo->virtual;
    371 
    372    assert(obj->prim_count_buffer_index % (2 * BRW_MAX_XFB_STREAMS) == 0);
    373    int pairs = obj->prim_count_buffer_index / (2 * BRW_MAX_XFB_STREAMS);
    374 
    375    for (int i = 0; i < pairs; i++) {
    376       for (int s = 0; s < BRW_MAX_XFB_STREAMS; s++) {
    377          obj->prims_generated[s] +=
    378             prim_counts[BRW_MAX_XFB_STREAMS + s] - prim_counts[s];
    379       }
    380       prim_counts += 2 * BRW_MAX_XFB_STREAMS; /* move to the next pair */
    381    }
    382 
    383    drm_intel_bo_unmap(obj->prim_count_bo);
    384 
    385    /* We've already gathered up the old data; we can safely overwrite it now. */
    386    obj->prim_count_buffer_index = 0;
    387 }
    388 
    389 /**
    390  * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values)
    391  * to prim_count_bo.
    392  *
    393  * If prim_count_bo is out of space, gather up the results so far into
    394  * prims_generated[] and allocate a new buffer with enough space.
    395  *
    396  * The number of primitives written is used to compute the number of vertices
    397  * written to a transform feedback stream, which is required to implement
    398  * DrawTransformFeedback().
    399  */
    400 static void
    401 gen7_save_primitives_written_counters(struct brw_context *brw,
    402                                 struct brw_transform_feedback_object *obj)
    403 {
    404    const int streams = BRW_MAX_XFB_STREAMS;
    405 
    406    /* Check if there's enough space for a new pair of four values. */
    407    if (obj->prim_count_bo != NULL &&
    408        obj->prim_count_buffer_index + 2 * streams >= 4096 / sizeof(uint64_t)) {
    409       /* Gather up the results so far and release the BO. */
    410       gen7_tally_prims_generated(brw, obj);
    411    }
    412 
    413    /* Flush any drawing so that the counters have the right values. */
    414    brw_emit_mi_flush(brw);
    415 
    416    /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
    417    for (int i = 0; i < streams; i++) {
    418       int offset = (obj->prim_count_buffer_index + i) * sizeof(uint64_t);
    419       brw_store_register_mem64(brw, obj->prim_count_bo,
    420                                GEN7_SO_NUM_PRIMS_WRITTEN(i),
    421                                offset);
    422    }
    423 
    424    /* Update where to write data to. */
    425    obj->prim_count_buffer_index += streams;
    426 }
    427 
    428 /**
    429  * Compute the number of vertices written by this transform feedback operation.
    430  */
    431 static void
    432 brw_compute_xfb_vertices_written(struct brw_context *brw,
    433                                  struct brw_transform_feedback_object *obj)
    434 {
    435    if (obj->vertices_written_valid || !obj->base.EndedAnytime)
    436       return;
    437 
    438    unsigned vertices_per_prim = 0;
    439 
    440    switch (obj->primitive_mode) {
    441    case GL_POINTS:
    442       vertices_per_prim = 1;
    443       break;
    444    case GL_LINES:
    445       vertices_per_prim = 2;
    446       break;
    447    case GL_TRIANGLES:
    448       vertices_per_prim = 3;
    449       break;
    450    default:
    451       unreachable("Invalid transform feedback primitive mode.");
    452    }
    453 
    454    /* Get the number of primitives generated. */
    455    gen7_tally_prims_generated(brw, obj);
    456 
    457    for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
    458       obj->vertices_written[i] = vertices_per_prim * obj->prims_generated[i];
    459    }
    460    obj->vertices_written_valid = true;
    461 }
    462 
    463 /**
    464  * GetTransformFeedbackVertexCount() driver hook.
    465  *
    466  * Returns the number of vertices written to a particular stream by the last
    467  * Begin/EndTransformFeedback block.  Used to implement DrawTransformFeedback().
    468  */
    469 GLsizei
    470 brw_get_transform_feedback_vertex_count(struct gl_context *ctx,
    471                                         struct gl_transform_feedback_object *obj,
    472                                         GLuint stream)
    473 {
    474    struct brw_context *brw = brw_context(ctx);
    475    struct brw_transform_feedback_object *brw_obj =
    476       (struct brw_transform_feedback_object *) obj;
    477 
    478    assert(obj->EndedAnytime);
    479    assert(stream < BRW_MAX_XFB_STREAMS);
    480 
    481    brw_compute_xfb_vertices_written(brw, brw_obj);
    482    return brw_obj->vertices_written[stream];
    483 }
    484 
    485 void
    486 gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
    487                               struct gl_transform_feedback_object *obj)
    488 {
    489    struct brw_context *brw = brw_context(ctx);
    490    struct brw_transform_feedback_object *brw_obj =
    491       (struct brw_transform_feedback_object *) obj;
    492 
    493    /* Reset the SO buffer offsets to 0. */
    494    if (brw->gen >= 8) {
    495       brw_obj->zero_offsets = true;
    496    } else {
    497       intel_batchbuffer_flush(brw);
    498       brw->batch.needs_sol_reset = true;
    499    }
    500 
    501    /* We're about to lose the information needed to compute the number of
    502     * vertices written during the last Begin/EndTransformFeedback section,
    503     * so we can't delay it any further.
    504     */
    505    brw_compute_xfb_vertices_written(brw, brw_obj);
    506 
    507    /* No primitives have been generated yet. */
    508    for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
    509       brw_obj->prims_generated[i] = 0;
    510    }
    511 
    512    /* Store the starting value of the SO_NUM_PRIMS_WRITTEN counters. */
    513    gen7_save_primitives_written_counters(brw, brw_obj);
    514 
    515    brw_obj->primitive_mode = mode;
    516 }
    517 
    518 void
    519 gen7_end_transform_feedback(struct gl_context *ctx,
    520 			    struct gl_transform_feedback_object *obj)
    521 {
    522    /* After EndTransformFeedback, it's likely that the client program will try
    523     * to draw using the contents of the transform feedback buffer as vertex
    524     * input.  In order for this to work, we need to flush the data through at
    525     * least the GS stage of the pipeline, and flush out the render cache.  For
    526     * simplicity, just do a full flush.
    527     */
    528    struct brw_context *brw = brw_context(ctx);
    529    struct brw_transform_feedback_object *brw_obj =
    530       (struct brw_transform_feedback_object *) obj;
    531 
    532    /* Store the ending value of the SO_NUM_PRIMS_WRITTEN counters. */
    533    if (!obj->Paused)
    534       gen7_save_primitives_written_counters(brw, brw_obj);
    535 
    536    /* EndTransformFeedback() means that we need to update the number of
    537     * vertices written.  Since it's only necessary if DrawTransformFeedback()
    538     * is called and it means mapping a buffer object, we delay computing it
    539     * until it's absolutely necessary to try and avoid stalls.
    540     */
    541    brw_obj->vertices_written_valid = false;
    542 }
    543 
    544 void
    545 gen7_pause_transform_feedback(struct gl_context *ctx,
    546                               struct gl_transform_feedback_object *obj)
    547 {
    548    struct brw_context *brw = brw_context(ctx);
    549    struct brw_transform_feedback_object *brw_obj =
    550       (struct brw_transform_feedback_object *) obj;
    551 
    552    /* Flush any drawing so that the counters have the right values. */
    553    brw_emit_mi_flush(brw);
    554 
    555    /* Save the SOL buffer offset register values. */
    556    if (brw->gen < 8) {
    557       for (int i = 0; i < 4; i++) {
    558          BEGIN_BATCH(3);
    559          OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
    560          OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
    561          OUT_RELOC(brw_obj->offset_bo,
    562                    I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
    563                    i * sizeof(uint32_t));
    564          ADVANCE_BATCH();
    565       }
    566    }
    567 
    568    /* Store the temporary ending value of the SO_NUM_PRIMS_WRITTEN counters.
    569     * While this operation is paused, other transform feedback actions may
    570     * occur, which will contribute to the counters.  We need to exclude that
    571     * from our counts.
    572     */
    573    gen7_save_primitives_written_counters(brw, brw_obj);
    574 }
    575 
    576 void
    577 gen7_resume_transform_feedback(struct gl_context *ctx,
    578                                struct gl_transform_feedback_object *obj)
    579 {
    580    struct brw_context *brw = brw_context(ctx);
    581    struct brw_transform_feedback_object *brw_obj =
    582       (struct brw_transform_feedback_object *) obj;
    583 
    584    /* Reload the SOL buffer offset registers. */
    585    if (brw->gen < 8) {
    586       for (int i = 0; i < 4; i++) {
    587          BEGIN_BATCH(3);
    588          OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
    589          OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
    590          OUT_RELOC(brw_obj->offset_bo,
    591                    I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
    592                    i * sizeof(uint32_t));
    593          ADVANCE_BATCH();
    594       }
    595    }
    596 
    597    /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
    598    gen7_save_primitives_written_counters(brw, brw_obj);
    599 }
    600