      1 /*
      2  * Copyright © 2015 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include <assert.h>
     25 #include <stdbool.h>
     26 
     27 #include "anv_private.h"
     28 #include "vk_format_info.h"
     29 
     30 #include "common/gen_l3_config.h"
     31 #include "genxml/gen_macros.h"
     32 #include "genxml/genX_pack.h"
     33 
     34 static void
     35 emit_lrm(struct anv_batch *batch,
     36          uint32_t reg, struct anv_bo *bo, uint32_t offset)
     37 {
     38    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
     39       lrm.RegisterAddress  = reg;
     40       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
     41    }
     42 }
     43 
     44 static void
     45 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
     46 {
     47    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
     48       lri.RegisterOffset   = reg;
     49       lri.DataDWord        = imm;
     50    }
     51 }
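         /* Usage note (added commentary, not part of the original source): these
          * helpers are thin wrappers that load an MMIO register either from memory
          * (MI_LOAD_REGISTER_MEM) or from an immediate (MI_LOAD_REGISTER_IMM).  They
          * are used further down for register setup such as the L3 partitioning,
          * e.g.:
          *
          *    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
          */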
     52 
     53 void
     54 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
     55 {
     56    struct anv_device *device = cmd_buffer->device;
     57 
     58    /* Emit a render target cache flush.
     59     *
     60     * This isn't documented anywhere in the PRM.  However, it seems to be
      61  * necessary prior to changing the surface state base address.  Without
     62     * this, we get GPU hangs when using multi-level command buffers which
     63     * clear depth, reset state base address, and then go render stuff.
     64     */
     65    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
     66       pc.DCFlushEnable = true;
     67       pc.RenderTargetCacheFlushEnable = true;
     68       pc.CommandStreamerStallEnable = true;
     69    }
     70 
     71    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
     72       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
     73       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
     74       sba.GeneralStateBaseAddressModifyEnable = true;
     75 
     76       sba.SurfaceStateBaseAddress =
     77          anv_cmd_buffer_surface_base_address(cmd_buffer);
     78       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
     79       sba.SurfaceStateBaseAddressModifyEnable = true;
     80 
     81       sba.DynamicStateBaseAddress =
     82          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
     83       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
     84       sba.DynamicStateBaseAddressModifyEnable = true;
     85 
     86       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
     87       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
     88       sba.IndirectObjectBaseAddressModifyEnable = true;
     89 
     90       sba.InstructionBaseAddress =
     91          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
     92       sba.InstructionMemoryObjectControlState = GENX(MOCS);
     93       sba.InstructionBaseAddressModifyEnable = true;
     94 
     95 #  if (GEN_GEN >= 8)
     96       /* Broadwell requires that we specify a buffer size for a bunch of
      97        * these fields.  However, since we will be growing the BOs live, we
     98        * just set them all to the maximum.
     99        */
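               /* Added note: 0xfffff is the largest value these size fields can
                * encode; they are believed to be expressed in 4 KB pages (an
                * assumption based on the STATE_BASE_ADDRESS layout), which would
                * cover the entire 4 GB range.
                */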
    100       sba.GeneralStateBufferSize                = 0xfffff;
    101       sba.GeneralStateBufferSizeModifyEnable    = true;
    102       sba.DynamicStateBufferSize                = 0xfffff;
    103       sba.DynamicStateBufferSizeModifyEnable    = true;
    104       sba.IndirectObjectBufferSize              = 0xfffff;
    105       sba.IndirectObjectBufferSizeModifyEnable  = true;
    106       sba.InstructionBufferSize                 = 0xfffff;
    107       sba.InstructionBuffersizeModifyEnable     = true;
    108 #  endif
    109    }
    110 
    111    /* After re-setting the surface state base address, we have to do some
     112     * cache flushing so that the sampler engine will pick up the new
    113     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    114     * Shared Function > 3D Sampler > State > State Caching (page 96):
    115     *
    116     *    Coherency with system memory in the state cache, like the texture
    117     *    cache is handled partially by software. It is expected that the
    118     *    command stream or shader will issue Cache Flush operation or
    119     *    Cache_Flush sampler message to ensure that the L1 cache remains
    120     *    coherent with system memory.
    121     *
    122     *    [...]
    123     *
    124     *    Whenever the value of the Dynamic_State_Base_Addr,
    125     *    Surface_State_Base_Addr are altered, the L1 state cache must be
    126     *    invalidated to ensure the new surface or sampler state is fetched
    127     *    from system memory.
    128     *
    129     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
     130     * which, according to the PIPE_CONTROL instruction documentation in the
    131     * Broadwell PRM:
    132     *
    133     *    Setting this bit is independent of any other bit in this packet.
    134     *    This bit controls the invalidation of the L1 and L2 state caches
    135     *    at the top of the pipe i.e. at the parsing time.
    136     *
    137     * Unfortunately, experimentation seems to indicate that state cache
    138     * invalidation through a PIPE_CONTROL does nothing whatsoever in
     139     * regard to surface state and binding tables.  Instead, it seems that
    140     * invalidating the texture cache is what is actually needed.
    141     *
    142     * XXX:  As far as we have been able to determine through
     143     * experimentation, flushing the texture cache appears to be
    144     * sufficient.  The theory here is that all of the sampling/rendering
    145     * units cache the binding table in the texture cache.  However, we have
    146     * yet to be able to actually confirm this.
    147     */
    148    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
    149       pc.TextureCacheInvalidationEnable = true;
    150       pc.ConstantCacheInvalidationEnable = true;
    151       pc.StateCacheInvalidationEnable = true;
    152    }
    153 }
    154 
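         /* Added commentary: this helper records a relocation so that the address
          * field of an already-packed RENDER_SURFACE_STATE gets patched with the
          * BO's final GPU address at submit time; isl_dev->ss.addr_offset is the
          * byte offset of that address field within the surface state.
          */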
    155 static void
    156 add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
    157                         struct anv_state state,
    158                         struct anv_bo *bo, uint32_t offset)
    159 {
    160    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
    161 
    162    anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
    163                       state.offset + isl_dev->ss.addr_offset, bo, offset);
    164 }
    165 
    166 static void
    167 add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
    168                       const struct anv_image_view *iview,
    169                       enum isl_aux_usage aux_usage,
    170                       struct anv_state state)
    171 {
    172    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
    173 
    174    anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
    175                       state.offset + isl_dev->ss.addr_offset,
    176                       iview->bo, iview->offset);
    177 
    178    if (aux_usage != ISL_AUX_USAGE_NONE) {
    179       uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset;
    180 
    181       /* On gen7 and prior, the bottom 12 bits of the MCS base address are
    182        * used to store other information.  This should be ok, however, because
     183        * surface buffer addresses are always 4K page aligned.
    184        */
    185       assert((aux_offset & 0xfff) == 0);
    186       uint32_t *aux_addr_dw = state.map + isl_dev->ss.aux_addr_offset;
    187       aux_offset += *aux_addr_dw & 0xfff;
    188 
    189       anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
    190                          state.offset + isl_dev->ss.aux_addr_offset,
    191                          iview->bo, aux_offset);
    192    }
    193 }
    194 
    195 static bool
    196 color_is_zero_one(VkClearColorValue value, enum isl_format format)
    197 {
    198    if (isl_format_has_int_channel(format)) {
    199       for (unsigned i = 0; i < 4; i++) {
    200          if (value.int32[i] != 0 && value.int32[i] != 1)
    201             return false;
    202       }
    203    } else {
    204       for (unsigned i = 0; i < 4; i++) {
    205          if (value.float32[i] != 0.0f && value.float32[i] != 1.0f)
    206             return false;
    207       }
    208    }
    209 
    210    return true;
    211 }
    212 
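         /* Added summary (commentary only): this helper picks the auxiliary surface
          * usage for a color attachment.  Roughly: no aux surface means
          * ISL_AUX_USAGE_NONE; a format that supports lossless compression gets
          * CCS_E; otherwise, a clear that covers the full render area and satisfies
          * the gen-specific clear-color restrictions below enables a CCS_D fast
          * clear.
          */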
    213 static void
    214 color_attachment_compute_aux_usage(struct anv_device *device,
    215                                    struct anv_attachment_state *att_state,
    216                                    struct anv_image_view *iview,
    217                                    VkRect2D render_area,
    218                                    union isl_color_value *fast_clear_color)
    219 {
    220    if (iview->image->aux_surface.isl.size == 0) {
    221       att_state->aux_usage = ISL_AUX_USAGE_NONE;
    222       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
    223       att_state->fast_clear = false;
    224       return;
    225    }
    226 
    227    assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT);
    228 
    229    att_state->clear_color_is_zero_one =
    230       color_is_zero_one(att_state->clear_value.color, iview->isl.format);
    231 
    232    if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
    233       /* Start off assuming fast clears are possible */
    234       att_state->fast_clear = true;
    235 
    236       /* Potentially, we could do partial fast-clears but doing so has crazy
    237        * alignment restrictions.  It's easier to just restrict to full size
    238        * fast clears for now.
    239        */
    240       if (render_area.offset.x != 0 ||
    241           render_area.offset.y != 0 ||
    242           render_area.extent.width != iview->extent.width ||
    243           render_area.extent.height != iview->extent.height)
    244          att_state->fast_clear = false;
    245 
    246       if (GEN_GEN <= 7) {
    247          /* On gen7, we can't do multi-LOD or multi-layer fast-clears.  We
    248           * technically can, but it comes with crazy restrictions that we
    249           * don't want to deal with now.
    250           */
    251          if (iview->isl.base_level > 0 ||
    252              iview->isl.base_array_layer > 0 ||
    253              iview->isl.array_len > 1)
    254             att_state->fast_clear = false;
    255       }
    256 
    257       /* On Broadwell and earlier, we can only handle 0/1 clear colors */
    258       if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one)
    259          att_state->fast_clear = false;
    260 
    261       if (att_state->fast_clear) {
    262          memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32,
    263                 sizeof(fast_clear_color->u32));
    264       }
    265    } else {
    266       att_state->fast_clear = false;
    267    }
    268 
    269    if (isl_format_supports_lossless_compression(&device->info,
    270                                                 iview->isl.format)) {
    271       att_state->aux_usage = ISL_AUX_USAGE_CCS_E;
    272       att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
    273    } else if (att_state->fast_clear) {
    274       att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
    275       if (GEN_GEN >= 9) {
    276          /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
    277           *
    278           *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
    279           *    setting is only allowed if Surface Format supported for Fast
    280           *    Clear. In addition, if the surface is bound to the sampling
    281           *    engine, Surface Format must be supported for Render Target
    282           *    Compression for surfaces bound to the sampling engine."
    283           *
    284           * In other words, we can't sample from a fast-cleared image if it
    285           * doesn't also support color compression.
    286           */
    287          att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
    288       } else if (GEN_GEN == 8) {
    289          /* Broadwell can sample from fast-cleared images */
    290          att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;
    291       } else {
    292          /* Ivy Bridge and Haswell cannot */
    293          att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
    294       }
    295    } else {
    296       att_state->aux_usage = ISL_AUX_USAGE_NONE;
    297       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
    298    }
    299 }
    300 
    301 static bool
    302 need_input_attachment_state(const struct anv_render_pass_attachment *att)
    303 {
    304    if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
    305       return false;
    306 
    307    /* We only allocate input attachment states for color surfaces. Compression
    308     * is not yet enabled for depth textures and stencil doesn't allow
    309     * compression so we can just use the texture surface state from the view.
    310     */
    311    return vk_format_is_color(att->format);
    312 }
    313 
    314 static enum isl_aux_usage
    315 layout_to_hiz_usage(VkImageLayout layout, uint8_t samples)
    316 {
    317    switch (layout) {
    318    case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
    319       return ISL_AUX_USAGE_HIZ;
    320    case VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL:
    321    case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
    322       if (anv_can_sample_with_hiz(GEN_GEN, samples))
    323          return ISL_AUX_USAGE_HIZ;
    324       /* Fall-through */
    325    case VK_IMAGE_LAYOUT_GENERAL:
    326       /* This buffer could be used as a source or destination in a transfer
     327        * operation. Transfer operations currently don't perform HiZ-enabled reads
    328        * and writes.
    329        */
    330    default:
    331       return ISL_AUX_USAGE_NONE;
    332    }
    333 }
    334 
    335 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
    336  * the initial layout is undefined, the HiZ buffer and depth buffer will
    337  * represent the same data at the end of this operation.
    338  */
    339 static void
    340 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
    341                         const struct anv_image *image,
    342                         VkImageLayout initial_layout,
    343                         VkImageLayout final_layout)
    344 {
    345    assert(image);
    346 
    347    if (image->aux_usage != ISL_AUX_USAGE_HIZ || final_layout == initial_layout)
    348       return;
    349 
    350    const bool hiz_enabled = layout_to_hiz_usage(initial_layout, image->samples) ==
    351                             ISL_AUX_USAGE_HIZ;
    352    const bool enable_hiz = layout_to_hiz_usage(final_layout, image->samples) ==
    353                            ISL_AUX_USAGE_HIZ;
    354 
    355    enum blorp_hiz_op hiz_op;
    356    if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
    357       /* We've already initialized the aux HiZ buffer at BindImageMemory time,
    358        * so there's no need to perform a HIZ resolve or clear to avoid GPU hangs.
    359        * This initial layout indicates that the user doesn't care about the data
    360        * that's currently in the buffer, so resolves are not necessary except
    361        * for the special case noted below.
    362        */
    363       hiz_op = BLORP_HIZ_OP_NONE;
    364    } else if (hiz_enabled && !enable_hiz) {
    365       hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE;
    366    } else if (!hiz_enabled && enable_hiz) {
    367       hiz_op = BLORP_HIZ_OP_HIZ_RESOLVE;
    368    } else {
    369       assert(hiz_enabled == enable_hiz);
    370       /* If the same buffer will be used, no resolves are necessary except for
    371        * the special case noted below.
    372        */
    373       hiz_op = BLORP_HIZ_OP_NONE;
    374    }
    375 
    376    if (hiz_op != BLORP_HIZ_OP_NONE)
    377       anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op);
    378 
     379    /* Images sampled with HiZ enabled cause all shader sampling to load data
     380     * through the HiZ buffer. Therefore, in the case of transitioning to
    381     * the general layout - which currently routes all writes to the depth
    382     * buffer - we must ensure that the HiZ buffer remains consistent with the
    383     * depth buffer by performing an additional HIZ resolve if the operation
    384     * required by this transition was not already a HiZ resolve.
    385     */
    386    if (final_layout == VK_IMAGE_LAYOUT_GENERAL &&
    387        anv_can_sample_with_hiz(GEN_GEN, image->samples) &&
    388        hiz_op != BLORP_HIZ_OP_HIZ_RESOLVE) {
    389       anv_gen8_hiz_op_resolve(cmd_buffer, image, BLORP_HIZ_OP_HIZ_RESOLVE);
    390    }
    391 }
    392 
    393 
    394 /**
    395  * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
    396  */
    397 static void
    398 genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
    399                                    struct anv_render_pass *pass,
    400                                    const VkRenderPassBeginInfo *begin)
    401 {
    402    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
    403    struct anv_cmd_state *state = &cmd_buffer->state;
    404 
    405    vk_free(&cmd_buffer->pool->alloc, state->attachments);
    406 
    407    if (pass->attachment_count == 0) {
    408       state->attachments = NULL;
    409       return;
    410    }
    411 
    412    state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
    413                                  pass->attachment_count *
    414                                       sizeof(state->attachments[0]),
    415                                  8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    416    if (state->attachments == NULL) {
    417       /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
    418       abort();
    419    }
    420 
    421    bool need_null_state = false;
    422    unsigned num_states = 0;
    423    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
    424       if (vk_format_is_color(pass->attachments[i].format)) {
    425          num_states++;
    426       } else {
    427          /* We need a null state for any depth-stencil-only subpasses.
    428           * Importantly, this includes depth/stencil clears so we create one
    429           * whenever we have depth or stencil
    430           */
    431          need_null_state = true;
    432       }
    433 
    434       if (need_input_attachment_state(&pass->attachments[i]))
    435          num_states++;
    436    }
    437    num_states += need_null_state;
    438 
    439    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
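            /* Added layout note: the per-render-pass surface states are packed back
             * to back in one stream allocation.  Per the assignments below, the
             * order is: the optional null surface state first, then for each
             * attachment a color render-target state and/or an input-attachment
             * state, each ss_stride bytes apart.
             */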
    440    state->render_pass_states =
    441       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
    442                              num_states * ss_stride, isl_dev->ss.align);
    443 
    444    struct anv_state next_state = state->render_pass_states;
    445    next_state.alloc_size = isl_dev->ss.size;
    446 
    447    if (need_null_state) {
    448       state->null_surface_state = next_state;
    449       next_state.offset += ss_stride;
    450       next_state.map += ss_stride;
    451    }
    452 
    453    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
    454       if (vk_format_is_color(pass->attachments[i].format)) {
    455          state->attachments[i].color_rt_state = next_state;
    456          next_state.offset += ss_stride;
    457          next_state.map += ss_stride;
    458       }
    459 
    460       if (need_input_attachment_state(&pass->attachments[i])) {
    461          state->attachments[i].input_att_state = next_state;
    462          next_state.offset += ss_stride;
    463          next_state.map += ss_stride;
    464       }
    465    }
    466    assert(next_state.offset == state->render_pass_states.offset +
    467                                state->render_pass_states.alloc_size);
    468 
    469    if (begin) {
    470       ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer);
    471       assert(pass->attachment_count == framebuffer->attachment_count);
    472 
    473       if (need_null_state) {
    474          struct GENX(RENDER_SURFACE_STATE) null_ss = {
    475             .SurfaceType = SURFTYPE_NULL,
    476             .SurfaceArray = framebuffer->layers > 0,
    477             .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
    478 #if GEN_GEN >= 8
    479             .TileMode = YMAJOR,
    480 #else
    481             .TiledSurface = true,
    482 #endif
    483             .Width = framebuffer->width - 1,
    484             .Height = framebuffer->height - 1,
    485             .Depth = framebuffer->layers - 1,
    486             .RenderTargetViewExtent = framebuffer->layers - 1,
    487          };
    488          GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map,
    489                                          &null_ss);
    490       }
    491 
    492       for (uint32_t i = 0; i < pass->attachment_count; ++i) {
    493          struct anv_render_pass_attachment *att = &pass->attachments[i];
    494          VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
    495          VkImageAspectFlags clear_aspects = 0;
    496 
    497          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
    498             /* color attachment */
    499             if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
    500                clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
    501             }
    502          } else {
    503             /* depthstencil attachment */
    504             if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
    505                 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
    506                clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
    507             }
    508             if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
    509                 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
    510                clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
    511             }
    512          }
    513 
    514          state->attachments[i].current_layout = att->initial_layout;
    515          state->attachments[i].pending_clear_aspects = clear_aspects;
    516          if (clear_aspects)
    517             state->attachments[i].clear_value = begin->pClearValues[i];
    518 
    519          struct anv_image_view *iview = framebuffer->attachments[i];
    520          assert(iview->vk_format == att->format);
    521 
    522          union isl_color_value clear_color = { .u32 = { 0, } };
    523          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
    524             color_attachment_compute_aux_usage(cmd_buffer->device,
    525                                                &state->attachments[i],
    526                                                iview, begin->renderArea,
    527                                                &clear_color);
    528 
    529             struct isl_view view = iview->isl;
    530             view.usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
    531             isl_surf_fill_state(isl_dev,
    532                                 state->attachments[i].color_rt_state.map,
    533                                 .surf = &iview->image->color_surface.isl,
    534                                 .view = &view,
    535                                 .aux_surf = &iview->image->aux_surface.isl,
    536                                 .aux_usage = state->attachments[i].aux_usage,
    537                                 .clear_color = clear_color,
    538                                 .mocs = cmd_buffer->device->default_mocs);
    539 
    540             add_image_view_relocs(cmd_buffer, iview,
    541                                   state->attachments[i].aux_usage,
    542                                   state->attachments[i].color_rt_state);
    543          } else {
    544             if (iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
    545                state->attachments[i].aux_usage =
    546                   layout_to_hiz_usage(att->initial_layout, iview->image->samples);
    547             } else {
    548                state->attachments[i].aux_usage = ISL_AUX_USAGE_NONE;
    549             }
    550             state->attachments[i].input_aux_usage = ISL_AUX_USAGE_NONE;
    551          }
    552 
    553          if (need_input_attachment_state(&pass->attachments[i])) {
    554             struct isl_view view = iview->isl;
    555             view.usage |= ISL_SURF_USAGE_TEXTURE_BIT;
    556             isl_surf_fill_state(isl_dev,
    557                                 state->attachments[i].input_att_state.map,
    558                                 .surf = &iview->image->color_surface.isl,
    559                                 .view = &view,
    560                                 .aux_surf = &iview->image->aux_surface.isl,
    561                                 .aux_usage = state->attachments[i].input_aux_usage,
    562                                 .clear_color = clear_color,
    563                                 .mocs = cmd_buffer->device->default_mocs);
    564 
    565             add_image_view_relocs(cmd_buffer, iview,
    566                                   state->attachments[i].input_aux_usage,
    567                                   state->attachments[i].input_att_state);
    568          }
    569       }
    570 
    571       if (!cmd_buffer->device->info.has_llc)
    572          anv_state_clflush(state->render_pass_states);
    573    }
    574 }
    575 
    576 VkResult
    577 genX(BeginCommandBuffer)(
    578     VkCommandBuffer                             commandBuffer,
    579     const VkCommandBufferBeginInfo*             pBeginInfo)
    580 {
    581    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    582 
    583    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
    584     * command buffer's state. Otherwise, we must *reset* its state. In both
    585     * cases we reset it.
    586     *
    587     * From the Vulkan 1.0 spec:
    588     *
    589     *    If a command buffer is in the executable state and the command buffer
    590     *    was allocated from a command pool with the
    591     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
    592     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
    593     *    as if vkResetCommandBuffer had been called with
    594     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
    595     *    the command buffer in the recording state.
    596     */
    597    anv_cmd_buffer_reset(cmd_buffer);
    598 
    599    cmd_buffer->usage_flags = pBeginInfo->flags;
    600 
    601    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
    602           !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
    603 
    604    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
    605 
    606    if (cmd_buffer->usage_flags &
    607        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
    608       cmd_buffer->state.pass =
    609          anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
    610       cmd_buffer->state.subpass =
    611          &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
    612       cmd_buffer->state.framebuffer = NULL;
    613 
    614       genX(cmd_buffer_setup_attachments)(cmd_buffer, cmd_buffer->state.pass,
    615                                          NULL);
    616 
    617       cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
    618    }
    619 
    620    return VK_SUCCESS;
    621 }
    622 
    623 VkResult
    624 genX(EndCommandBuffer)(
    625     VkCommandBuffer                             commandBuffer)
    626 {
    627    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    628 
    629    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
    630 
    631    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
    632 
    633    return VK_SUCCESS;
    634 }
    635 
    636 void
    637 genX(CmdExecuteCommands)(
    638     VkCommandBuffer                             commandBuffer,
    639     uint32_t                                    commandBufferCount,
    640     const VkCommandBuffer*                      pCmdBuffers)
    641 {
    642    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
    643 
    644    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
    645 
    646    /* The secondary command buffer doesn't know which textures etc. have been
    647     * flushed prior to their execution.  Apply those flushes now.
    648     */
    649    genX(cmd_buffer_apply_pipe_flushes)(primary);
    650 
    651    for (uint32_t i = 0; i < commandBufferCount; i++) {
    652       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
    653 
    654       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
    655 
    656       if (secondary->usage_flags &
    657           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
    658          /* If we're continuing a render pass from the primary, we need to
    659           * copy the surface states for the current subpass into the storage
    660           * we allocated for them in BeginCommandBuffer.
    661           */
    662          struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo;
    663          struct anv_state src_state = primary->state.render_pass_states;
    664          struct anv_state dst_state = secondary->state.render_pass_states;
    665          assert(src_state.alloc_size == dst_state.alloc_size);
    666 
    667          genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset,
    668                                      ss_bo, src_state.offset,
    669                                      src_state.alloc_size);
    670       }
    671 
    672       anv_cmd_buffer_add_secondary(primary, secondary);
    673    }
    674 
    675    /* Each of the secondary command buffers will use its own state base
    676     * address.  We need to re-emit state base address for the primary after
    677     * all of the secondaries are done.
    678     *
    679     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
    680     * address calls?
    681     */
    682    genX(cmd_buffer_emit_state_base_address)(primary);
    683 }
    684 
    685 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
    686 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
    687 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
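         /* Added note: these constants are believed to be the per-platform default
          * values of the SQGHPCI field of L3SQCREG1; they are OR'ed into the packed
          * register value in the pre-gen8 path below.
          */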
    688 
    689 /**
    690  * Program the hardware to use the specified L3 configuration.
    691  */
    692 void
    693 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
    694                            const struct gen_l3_config *cfg)
    695 {
    696    assert(cfg);
    697    if (cfg == cmd_buffer->state.current_l3_config)
    698       return;
    699 
    700    if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
    701       fprintf(stderr, "L3 config transition: ");
    702       gen_dump_l3_config(cfg, stderr);
    703    }
    704 
    705    const bool has_slm = cfg->n[GEN_L3P_SLM];
    706 
    707    /* According to the hardware docs, the L3 partitioning can only be changed
    708     * while the pipeline is completely drained and the caches are flushed,
    709     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
    710     */
    711    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
    712       pc.DCFlushEnable = true;
    713       pc.PostSyncOperation = NoWrite;
    714       pc.CommandStreamerStallEnable = true;
    715    }
    716 
    717    /* ...followed by a second pipelined PIPE_CONTROL that initiates
    718     * invalidation of the relevant caches.  Note that because RO invalidation
    719     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    720     * command is processed by the CS) we cannot combine it with the previous
    721     * stalling flush as the hardware documentation suggests, because that
    722     * would cause the CS to stall on previous rendering *after* RO
    723     * invalidation and wouldn't prevent the RO caches from being polluted by
    724     * concurrent rendering before the stall completes.  This intentionally
    725     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    726     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    727     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    728     * already guarantee that there is no concurrent GPGPU kernel execution
    729     * (see SKL HSD 2132585).
    730     */
    731    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
    732       pc.TextureCacheInvalidationEnable = true;
    733       pc.ConstantCacheInvalidationEnable = true;
    734       pc.InstructionCacheInvalidateEnable = true;
    735       pc.StateCacheInvalidationEnable = true;
    736       pc.PostSyncOperation = NoWrite;
    737    }
    738 
    739    /* Now send a third stalling flush to make sure that invalidation is
    740     * complete when the L3 configuration registers are modified.
    741     */
    742    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
    743       pc.DCFlushEnable = true;
    744       pc.PostSyncOperation = NoWrite;
    745       pc.CommandStreamerStallEnable = true;
    746    }
    747 
    748 #if GEN_GEN >= 8
    749 
    750    assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
    751 
    752    uint32_t l3cr;
    753    anv_pack_struct(&l3cr, GENX(L3CNTLREG),
    754                    .SLMEnable = has_slm,
    755                    .URBAllocation = cfg->n[GEN_L3P_URB],
    756                    .ROAllocation = cfg->n[GEN_L3P_RO],
    757                    .DCAllocation = cfg->n[GEN_L3P_DC],
    758                    .AllAllocation = cfg->n[GEN_L3P_ALL]);
    759 
    760    /* Set up the L3 partitioning. */
    761    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
    762 
    763 #else
    764 
    765    const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
    766    const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
    767                        cfg->n[GEN_L3P_ALL];
    768    const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
    769                       cfg->n[GEN_L3P_ALL];
    770    const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
    771                       cfg->n[GEN_L3P_ALL];
    772 
    773    assert(!cfg->n[GEN_L3P_ALL]);
    774 
     775    /* When enabled, SLM only uses a portion of the L3 on half of the banks,
    776     * the matching space on the remaining banks has to be allocated to a
    777     * client (URB for all validated configurations) set to the
    778     * lower-bandwidth 2-bank address hashing mode.
    779     */
    780    const struct gen_device_info *devinfo = &cmd_buffer->device->info;
    781    const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
    782    assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
    783 
    784    /* Minimum number of ways that can be allocated to the URB. */
    785    MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0;
    786    assert(cfg->n[GEN_L3P_URB] >= n0_urb);
    787 
    788    uint32_t l3sqcr1, l3cr2, l3cr3;
    789    anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1),
    790                    .ConvertDC_UC = !has_dc,
    791                    .ConvertIS_UC = !has_is,
    792                    .ConvertC_UC = !has_c,
    793                    .ConvertT_UC = !has_t);
    794    l3sqcr1 |=
    795       GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
    796       devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
    797       IVB_L3SQCREG1_SQGHPCI_DEFAULT;
    798 
    799    anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
    800                    .SLMEnable = has_slm,
    801                    .URBLowBandwidth = urb_low_bw,
    802                    .URBAllocation = cfg->n[GEN_L3P_URB],
    803 #if !GEN_IS_HASWELL
    804                    .ALLAllocation = cfg->n[GEN_L3P_ALL],
    805 #endif
    806                    .ROAllocation = cfg->n[GEN_L3P_RO],
    807                    .DCAllocation = cfg->n[GEN_L3P_DC]);
    808 
    809    anv_pack_struct(&l3cr3, GENX(L3CNTLREG3),
    810                    .ISAllocation = cfg->n[GEN_L3P_IS],
    811                    .ISLowBandwidth = 0,
    812                    .CAllocation = cfg->n[GEN_L3P_C],
    813                    .CLowBandwidth = 0,
    814                    .TAllocation = cfg->n[GEN_L3P_T],
    815                    .TLowBandwidth = 0);
    816 
    817    /* Set up the L3 partitioning. */
    818    emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1);
    819    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2);
    820    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
    821 
    822 #if GEN_IS_HASWELL
    823    if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
    824       /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
    825        * them disabled to avoid crashing the system hard.
    826        */
    827       uint32_t scratch1, chicken3;
    828       anv_pack_struct(&scratch1, GENX(SCRATCH1),
    829                       .L3AtomicDisable = !has_dc);
    830       anv_pack_struct(&chicken3, GENX(CHICKEN3),
    831                       .L3AtomicDisableMask = true,
    832                       .L3AtomicDisable = !has_dc);
    833       emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1);
    834       emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3);
    835    }
    836 #endif
    837 
    838 #endif
    839 
    840    cmd_buffer->state.current_l3_config = cfg;
    841 }
    842 
    843 void
    844 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
    845 {
    846    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
    847 
    848    /* Flushes are pipelined while invalidations are handled immediately.
    849     * Therefore, if we're flushing anything then we need to schedule a stall
    850     * before any invalidations can happen.
    851     */
    852    if (bits & ANV_PIPE_FLUSH_BITS)
    853       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
    854 
    855    /* If we're going to do an invalidate and we have a pending CS stall that
    856     * has yet to be resolved, we do the CS stall now.
    857     */
    858    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
    859        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
    860       bits |= ANV_PIPE_CS_STALL_BIT;
    861       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
    862    }
    863 
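            /* Added example: a barrier that both flushes the render target cache and
             * invalidates the texture cache therefore becomes two PIPE_CONTROLs:
             * first a flushing one with CommandStreamerStallEnable set, then an
             * invalidating one, emitted by the two blocks below.
             */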
    864    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
    865       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
    866          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
    867          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
    868          pipe.RenderTargetCacheFlushEnable =
    869             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
    870 
    871          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
    872          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
    873          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
    874 
    875          /*
    876           * According to the Broadwell documentation, any PIPE_CONTROL with the
    877           * "Command Streamer Stall" bit set must also have another bit set,
    878           * with five different options:
    879           *
    880           *  - Render Target Cache Flush
    881           *  - Depth Cache Flush
    882           *  - Stall at Pixel Scoreboard
    883           *  - Post-Sync Operation
    884           *  - Depth Stall
    885           *  - DC Flush Enable
    886           *
    887           * I chose "Stall at Pixel Scoreboard" since that's what we use in
    888           * mesa and it seems to work fine. The choice is fairly arbitrary.
    889           */
    890          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
    891              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
    892                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
    893             pipe.StallAtPixelScoreboard = true;
    894       }
    895 
    896       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
    897    }
    898 
    899    if (bits & ANV_PIPE_INVALIDATE_BITS) {
    900       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
    901          pipe.StateCacheInvalidationEnable =
    902             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
    903          pipe.ConstantCacheInvalidationEnable =
    904             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
    905          pipe.VFCacheInvalidationEnable =
    906             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
    907          pipe.TextureCacheInvalidationEnable =
    908             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
    909          pipe.InstructionCacheInvalidateEnable =
    910             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
    911       }
    912 
    913       bits &= ~ANV_PIPE_INVALIDATE_BITS;
    914    }
    915 
    916    cmd_buffer->state.pending_pipe_bits = bits;
    917 }
    918 
    919 void genX(CmdPipelineBarrier)(
    920     VkCommandBuffer                             commandBuffer,
    921     VkPipelineStageFlags                        srcStageMask,
    922     VkPipelineStageFlags                        destStageMask,
    923     VkBool32                                    byRegion,
    924     uint32_t                                    memoryBarrierCount,
    925     const VkMemoryBarrier*                      pMemoryBarriers,
    926     uint32_t                                    bufferMemoryBarrierCount,
    927     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
    928     uint32_t                                    imageMemoryBarrierCount,
    929     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
    930 {
    931    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    932    uint32_t b;
    933 
    934    /* XXX: Right now, we're really dumb and just flush whatever categories
    935     * the app asks for.  One of these days we may make this a bit better
    936     * but right now that's all the hardware allows for in most areas.
    937     */
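            /* Added example: per the mappings below, a barrier from
             * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT to VK_ACCESS_SHADER_READ_BIT ends
             * up as a render target cache flush plus a texture cache invalidation in
             * the next call to cmd_buffer_apply_pipe_flushes.
             */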
    938    VkAccessFlags src_flags = 0;
    939    VkAccessFlags dst_flags = 0;
    940 
    941    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
    942       src_flags |= pMemoryBarriers[i].srcAccessMask;
    943       dst_flags |= pMemoryBarriers[i].dstAccessMask;
    944    }
    945 
    946    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
    947       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
    948       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
    949    }
    950 
    951    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
    952       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
    953       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
    954       ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
    955       if (pImageMemoryBarriers[i].subresourceRange.aspectMask &
    956           VK_IMAGE_ASPECT_DEPTH_BIT) {
    957          transition_depth_buffer(cmd_buffer, image,
    958                                  pImageMemoryBarriers[i].oldLayout,
    959                                  pImageMemoryBarriers[i].newLayout);
    960       }
    961    }
    962 
    963    enum anv_pipe_bits pipe_bits = 0;
    964 
    965    for_each_bit(b, src_flags) {
    966       switch ((VkAccessFlagBits)(1 << b)) {
    967       case VK_ACCESS_SHADER_WRITE_BIT:
    968          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
    969          break;
    970       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
    971          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
    972          break;
    973       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
    974          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
    975          break;
    976       case VK_ACCESS_TRANSFER_WRITE_BIT:
    977          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
    978          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
    979          break;
    980       default:
    981          break; /* Nothing to do */
    982       }
    983    }
    984 
    985    for_each_bit(b, dst_flags) {
    986       switch ((VkAccessFlagBits)(1 << b)) {
    987       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
    988       case VK_ACCESS_INDEX_READ_BIT:
    989       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
    990          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
    991          break;
    992       case VK_ACCESS_UNIFORM_READ_BIT:
    993          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
    994          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
    995          break;
    996       case VK_ACCESS_SHADER_READ_BIT:
    997       case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
    998       case VK_ACCESS_TRANSFER_READ_BIT:
    999          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
   1000          break;
   1001       default:
   1002          break; /* Nothing to do */
   1003       }
   1004    }
   1005 
   1006    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
   1007 }
   1008 
   1009 static void
   1010 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
   1011 {
   1012    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
   1013 
    1014    /* In order to avoid thrashing, we assume that vertex and fragment stages
   1015     * always exist.  In the rare case where one is missing *and* the other
    1016     * uses push constants, this may be suboptimal.  However, avoiding stalls
   1017     * seems more important.
   1018     */
   1019    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
   1020 
   1021    if (stages == cmd_buffer->state.push_constant_stages)
   1022       return;
   1023 
   1024 #if GEN_GEN >= 8
   1025    const unsigned push_constant_kb = 32;
   1026 #elif GEN_IS_HASWELL
   1027    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
   1028 #else
   1029    const unsigned push_constant_kb = 16;
   1030 #endif
   1031 
   1032    const unsigned num_stages =
   1033       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
   1034    unsigned size_per_stage = push_constant_kb / num_stages;
   1035 
   1036    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
   1037     * units of 2KB.  Incidentally, these are the same platforms that have
   1038     * 32KB worth of push constant space.
   1039     */
   1040    if (push_constant_kb == 32)
   1041       size_per_stage &= ~1u;
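            /* Added worked example: with all five graphics stages active on a 32 KB
             * platform, size_per_stage is 32 / 5 = 6 KB (already a multiple of 2 KB),
             * so VS..GS are allocated 6 KB each at offsets 0/6/12/18 KB and the PS
             * allocation below gets the remaining 32 - 24 = 8 KB.
             */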
   1042 
   1043    uint32_t kb_used = 0;
   1044    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
   1045       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
   1046       anv_batch_emit(&cmd_buffer->batch,
   1047                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
   1048          alloc._3DCommandSubOpcode  = 18 + i;
   1049          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
   1050          alloc.ConstantBufferSize   = push_size;
   1051       }
   1052       kb_used += push_size;
   1053    }
   1054 
   1055    anv_batch_emit(&cmd_buffer->batch,
   1056                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
   1057       alloc.ConstantBufferOffset = kb_used;
   1058       alloc.ConstantBufferSize = push_constant_kb - kb_used;
   1059    }
   1060 
   1061    cmd_buffer->state.push_constant_stages = stages;
   1062 
   1063    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
   1064     *
   1065     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
   1066     *    the next 3DPRIMITIVE command after programming the
   1067     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
   1068     *
   1069     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
   1070     * pipeline setup, we need to dirty push constants.
   1071     */
   1072    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
   1073 }
   1074 
   1075 static VkResult
   1076 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
   1077                    gl_shader_stage stage,
   1078                    struct anv_state *bt_state)
   1079 {
   1080    struct anv_subpass *subpass = cmd_buffer->state.subpass;
   1081    struct anv_pipeline *pipeline;
   1082    uint32_t bias, state_offset;
   1083 
   1084    switch (stage) {
    1085    case MESA_SHADER_COMPUTE:
   1086       pipeline = cmd_buffer->state.compute_pipeline;
   1087       bias = 1;
   1088       break;
   1089    default:
   1090       pipeline = cmd_buffer->state.pipeline;
   1091       bias = 0;
   1092       break;
   1093    }
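            /* Added note: for compute, binding table slot 0 is reserved for the
             * NumWorkgroups surface set up further down, so "bias" shifts every other
             * surface entry up by one slot.
             */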
   1094 
   1095    if (!anv_pipeline_has_stage(pipeline, stage)) {
   1096       *bt_state = (struct anv_state) { 0, };
   1097       return VK_SUCCESS;
   1098    }
   1099 
   1100    struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
   1101    if (bias + map->surface_count == 0) {
   1102       *bt_state = (struct anv_state) { 0, };
   1103       return VK_SUCCESS;
   1104    }
   1105 
   1106    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
   1107                                                   bias + map->surface_count,
   1108                                                   &state_offset);
   1109    uint32_t *bt_map = bt_state->map;
   1110 
   1111    if (bt_state->map == NULL)
   1112       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
   1113 
   1114    if (stage == MESA_SHADER_COMPUTE &&
   1115        get_cs_prog_data(cmd_buffer->state.compute_pipeline)->uses_num_work_groups) {
   1116       struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
   1117       uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;
   1118 
   1119       struct anv_state surface_state;
   1120       surface_state =
   1121          anv_cmd_buffer_alloc_surface_state(cmd_buffer);
   1122 
   1123       const enum isl_format format =
   1124          anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
   1125       anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
   1126                                     format, bo_offset, 12, 1);
   1127 
   1128       bt_map[0] = surface_state.offset + state_offset;
   1129       add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
   1130    }
   1131 
   1132    if (map->surface_count == 0)
   1133       goto out;
   1134 
   1135    if (map->image_count > 0) {
   1136       VkResult result =
   1137          anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
   1138       if (result != VK_SUCCESS)
   1139          return result;
   1140 
   1141       cmd_buffer->state.push_constants_dirty |= 1 << stage;
   1142    }
   1143 
   1144    uint32_t image = 0;
   1145    for (uint32_t s = 0; s < map->surface_count; s++) {
   1146       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
   1147 
   1148       struct anv_state surface_state;
   1149 
   1150       if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) {
   1151          /* Color attachment binding */
   1152          assert(stage == MESA_SHADER_FRAGMENT);
   1153          assert(binding->binding == 0);
   1154          if (binding->index < subpass->color_count) {
   1155             const unsigned att = subpass->color_attachments[binding->index];
   1156             surface_state = cmd_buffer->state.attachments[att].color_rt_state;
   1157          } else {
   1158             surface_state = cmd_buffer->state.null_surface_state;
   1159          }
   1160 
   1161          bt_map[bias + s] = surface_state.offset + state_offset;
   1162          continue;
   1163       }
   1164 
   1165       struct anv_descriptor_set *set =
   1166          cmd_buffer->state.descriptors[binding->set];
   1167       uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
   1168       struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
   1169 
   1170       switch (desc->type) {
   1171       case VK_DESCRIPTOR_TYPE_SAMPLER:
   1172          /* Nothing for us to do here */
   1173          continue;
   1174 
   1175       case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
   1176       case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
   1177          surface_state = desc->image_view->sampler_surface_state;
   1178          assert(surface_state.alloc_size);
   1179          add_image_view_relocs(cmd_buffer, desc->image_view,
   1180                                desc->image_view->image->aux_usage,
   1181                                surface_state);
   1182          break;
   1183 
   1184       case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
   1185          assert(stage == MESA_SHADER_FRAGMENT);
   1186          if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) {
   1187             /* For depth and stencil input attachments, we treat it like any
   1188              * old texture that a user may have bound.
   1189              */
   1190             surface_state = desc->image_view->sampler_surface_state;
   1191             assert(surface_state.alloc_size);
   1192             add_image_view_relocs(cmd_buffer, desc->image_view,
   1193                                   desc->image_view->image->aux_usage,
   1194                                   surface_state);
   1195          } else {
   1196             /* For color input attachments, we create the surface state at
   1197              * vkBeginRenderPass time so that we can include aux and clear
   1198              * color information.
   1199              */
   1200             assert(binding->input_attachment_index < subpass->input_count);
   1201             const unsigned subpass_att = binding->input_attachment_index;
   1202             const unsigned att = subpass->input_attachments[subpass_att];
   1203             surface_state = cmd_buffer->state.attachments[att].input_att_state;
   1204          }
   1205          break;
   1206 
   1207       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
   1208          surface_state = desc->image_view->storage_surface_state;
   1209          assert(surface_state.alloc_size);
   1210          add_image_view_relocs(cmd_buffer, desc->image_view,
   1211                                desc->image_view->image->aux_usage,
   1212                                surface_state);
   1213 
   1214          struct brw_image_param *image_param =
   1215             &cmd_buffer->state.push_constants[stage]->images[image++];
   1216 
   1217          *image_param = desc->image_view->storage_image_param;
   1218          image_param->surface_idx = bias + s;
   1219          break;
   1220       }
   1221 
   1222       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
   1223       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
   1224       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
   1225       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
   1226       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
   1227          surface_state = desc->buffer_view->surface_state;
   1228          assert(surface_state.alloc_size);
   1229          add_surface_state_reloc(cmd_buffer, surface_state,
   1230                                  desc->buffer_view->bo,
   1231                                  desc->buffer_view->offset);
   1232          break;
   1233 
   1234       case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
   1235          surface_state = desc->buffer_view->storage_surface_state;
   1236          assert(surface_state.alloc_size);
   1237          add_surface_state_reloc(cmd_buffer, surface_state,
   1238                                  desc->buffer_view->bo,
   1239                                  desc->buffer_view->offset);
   1240 
   1241          struct brw_image_param *image_param =
   1242             &cmd_buffer->state.push_constants[stage]->images[image++];
   1243 
   1244          *image_param = desc->buffer_view->storage_image_param;
   1245          image_param->surface_idx = bias + s;
   1246          break;
   1247 
   1248       default:
   1249          assert(!"Invalid descriptor type");
   1250          continue;
   1251       }
   1252 
   1253       bt_map[bias + s] = surface_state.offset + state_offset;
   1254    }
   1255    assert(image == map->image_count);
   1256 
   1257  out:
   1258    if (!cmd_buffer->device->info.has_llc)
   1259       anv_state_clflush(*bt_state);
   1260 
   1261    return VK_SUCCESS;
   1262 }
   1263 
   1264 static VkResult
   1265 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
   1266               gl_shader_stage stage,
   1267               struct anv_state *state)
   1268 {
   1269    struct anv_pipeline *pipeline;
   1270 
   1271    if (stage == MESA_SHADER_COMPUTE)
   1272       pipeline = cmd_buffer->state.compute_pipeline;
   1273    else
   1274       pipeline = cmd_buffer->state.pipeline;
   1275 
   1276    if (!anv_pipeline_has_stage(pipeline, stage)) {
   1277       *state = (struct anv_state) { 0, };
   1278       return VK_SUCCESS;
   1279    }
   1280 
   1281    struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
   1282    if (map->sampler_count == 0) {
   1283       *state = (struct anv_state) { 0, };
   1284       return VK_SUCCESS;
   1285    }
   1286 
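            /* Each SAMPLER_STATE entry is 4 dwords (16 bytes), so the sampler
             * table needs 16 bytes per binding.
             */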
   1287    uint32_t size = map->sampler_count * 16;
   1288    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
   1289 
   1290    if (state->map == NULL)
   1291       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
   1292 
   1293    for (uint32_t s = 0; s < map->sampler_count; s++) {
   1294       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
   1295       struct anv_descriptor_set *set =
   1296          cmd_buffer->state.descriptors[binding->set];
   1297       uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
   1298       struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
   1299 
   1300       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
   1301           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
   1302          continue;
   1303 
   1304       struct anv_sampler *sampler = desc->sampler;
   1305 
    1306       /* This can happen if the slot was never filled: unwritten descriptors
    1307        * are zero-initialized, and VK_DESCRIPTOR_TYPE_SAMPLER happens to be zero.
   1308        */
   1309       if (sampler == NULL)
   1310          continue;
   1311 
   1312       memcpy(state->map + (s * 16),
   1313              sampler->state, sizeof(sampler->state));
   1314    }
   1315 
   1316    if (!cmd_buffer->device->info.has_llc)
   1317       anv_state_clflush(*state);
   1318 
   1319    return VK_SUCCESS;
   1320 }
   1321 
   1322 static uint32_t
   1323 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
   1324 {
   1325    VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty &
   1326                               cmd_buffer->state.pipeline->active_stages;
   1327 
   1328    VkResult result = VK_SUCCESS;
   1329    anv_foreach_stage(s, dirty) {
   1330       result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
   1331       if (result != VK_SUCCESS)
   1332          break;
   1333       result = emit_binding_table(cmd_buffer, s,
   1334                                   &cmd_buffer->state.binding_tables[s]);
   1335       if (result != VK_SUCCESS)
   1336          break;
   1337    }
   1338 
   1339    if (result != VK_SUCCESS) {
   1340       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
   1341 
   1342       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
   1343       assert(result == VK_SUCCESS);
   1344 
   1345       /* Re-emit state base addresses so we get the new surface state base
   1346        * address before we start emitting binding tables etc.
   1347        */
   1348       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
   1349 
   1350       /* Re-emit all active binding tables */
   1351       dirty |= cmd_buffer->state.pipeline->active_stages;
   1352       anv_foreach_stage(s, dirty) {
   1353          result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
   1354          if (result != VK_SUCCESS)
   1355             return result;
   1356          result = emit_binding_table(cmd_buffer, s,
   1357                                      &cmd_buffer->state.binding_tables[s]);
   1358          if (result != VK_SUCCESS)
   1359             return result;
   1360       }
   1361    }
   1362 
   1363    cmd_buffer->state.descriptors_dirty &= ~dirty;
   1364 
   1365    return dirty;
   1366 }
   1367 
   1368 static void
   1369 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
   1370                                     uint32_t stages)
   1371 {
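            /* Per-stage _3DCommandSubOpcode values.  The *_VS packets below are
             * used as templates and the sub-opcode is patched to select the
             * intended stage.
             */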
   1372    static const uint32_t sampler_state_opcodes[] = {
   1373       [MESA_SHADER_VERTEX]                      = 43,
   1374       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
   1375       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
   1376       [MESA_SHADER_GEOMETRY]                    = 46,
   1377       [MESA_SHADER_FRAGMENT]                    = 47,
   1378       [MESA_SHADER_COMPUTE]                     = 0,
   1379    };
   1380 
   1381    static const uint32_t binding_table_opcodes[] = {
   1382       [MESA_SHADER_VERTEX]                      = 38,
   1383       [MESA_SHADER_TESS_CTRL]                   = 39,
   1384       [MESA_SHADER_TESS_EVAL]                   = 40,
   1385       [MESA_SHADER_GEOMETRY]                    = 41,
   1386       [MESA_SHADER_FRAGMENT]                    = 42,
   1387       [MESA_SHADER_COMPUTE]                     = 0,
   1388    };
   1389 
   1390    anv_foreach_stage(s, stages) {
   1391       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
   1392          anv_batch_emit(&cmd_buffer->batch,
   1393                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
   1394             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
   1395             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
   1396          }
   1397       }
   1398 
   1399       /* Always emit binding table pointers if we're asked to, since on SKL
   1400        * this is what flushes push constants. */
   1401       anv_batch_emit(&cmd_buffer->batch,
   1402                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
   1403          btp._3DCommandSubOpcode = binding_table_opcodes[s];
   1404          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
   1405       }
   1406    }
   1407 }
   1408 
   1409 static uint32_t
   1410 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
   1411 {
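            /* _3DCommandSubOpcode values for the per-stage 3DSTATE_CONSTANT_*
             * packets; as above, 3DSTATE_CONSTANT_VS is used as a template and
             * the sub-opcode is patched per stage.
             */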
   1412    static const uint32_t push_constant_opcodes[] = {
   1413       [MESA_SHADER_VERTEX]                      = 21,
   1414       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
   1415       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
   1416       [MESA_SHADER_GEOMETRY]                    = 22,
   1417       [MESA_SHADER_FRAGMENT]                    = 23,
   1418       [MESA_SHADER_COMPUTE]                     = 0,
   1419    };
   1420 
   1421    VkShaderStageFlags flushed = 0;
   1422 
   1423    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
   1424       if (stage == MESA_SHADER_COMPUTE)
   1425          continue;
   1426 
   1427       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
   1428 
   1429       if (state.offset == 0) {
   1430          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
   1431             c._3DCommandSubOpcode = push_constant_opcodes[stage];
   1432       } else {
   1433          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
    1434             c._3DCommandSubOpcode = push_constant_opcodes[stage];
   1435             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
   1436 #if GEN_GEN >= 9
   1437                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
   1438                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
   1439 #else
   1440                .PointerToConstantBuffer0 = { .offset = state.offset },
   1441                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
   1442 #endif
   1443             };
   1444          }
   1445       }
   1446 
   1447       flushed |= mesa_to_vk_shader_stage(stage);
   1448    }
   1449 
   1450    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
   1451 
   1452    return flushed;
   1453 }
   1454 
   1455 void
   1456 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
   1457 {
   1458    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   1459    uint32_t *p;
   1460 
   1461    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
   1462 
   1463    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
   1464 
   1465    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
   1466 
   1467    genX(flush_pipeline_select_3d)(cmd_buffer);
   1468 
   1469    if (vb_emit) {
   1470       const uint32_t num_buffers = __builtin_popcount(vb_emit);
   1471       const uint32_t num_dwords = 1 + num_buffers * 4;
   1472 
   1473       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
   1474                           GENX(3DSTATE_VERTEX_BUFFERS));
   1475       uint32_t vb, i = 0;
   1476       for_each_bit(vb, vb_emit) {
   1477          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
   1478          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
   1479 
   1480          struct GENX(VERTEX_BUFFER_STATE) state = {
   1481             .VertexBufferIndex = vb,
   1482 
   1483 #if GEN_GEN >= 8
   1484             .MemoryObjectControlState = GENX(MOCS),
   1485 #else
   1486             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
   1487             .InstanceDataStepRate = 1,
   1488             .VertexBufferMemoryObjectControlState = GENX(MOCS),
   1489 #endif
   1490 
   1491             .AddressModifyEnable = true,
   1492             .BufferPitch = pipeline->binding_stride[vb],
   1493             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
   1494 
   1495 #if GEN_GEN >= 8
   1496             .BufferSize = buffer->size - offset
   1497 #else
   1498             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
   1499 #endif
   1500          };
   1501 
   1502          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
   1503          i++;
   1504       }
   1505    }
   1506 
   1507    cmd_buffer->state.vb_dirty &= ~vb_emit;
   1508 
   1509    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
   1510       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
   1511 
   1512       /* The exact descriptor layout is pulled from the pipeline, so we need
   1513        * to re-emit binding tables on every pipeline change.
   1514        */
   1515       cmd_buffer->state.descriptors_dirty |=
   1516          cmd_buffer->state.pipeline->active_stages;
   1517 
   1518       /* If the pipeline changed, we may need to re-allocate push constant
   1519        * space in the URB.
   1520        */
   1521       cmd_buffer_alloc_push_constants(cmd_buffer);
   1522    }
   1523 
   1524 #if GEN_GEN <= 7
   1525    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
   1526        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
   1527       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
   1528        *
   1529        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
   1530        *    stall needs to be sent just prior to any 3DSTATE_VS,
   1531        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
   1532        *    3DSTATE_BINDING_TABLE_POINTER_VS,
   1533        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
   1534        *    PIPE_CONTROL needs to be sent before any combination of VS
   1535        *    associated 3DSTATE."
   1536        */
   1537       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   1538          pc.DepthStallEnable  = true;
   1539          pc.PostSyncOperation = WriteImmediateData;
   1540          pc.Address           =
   1541             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
   1542       }
   1543    }
   1544 #endif
   1545 
   1546    /* Render targets live in the same binding table as fragment descriptors */
   1547    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
   1548       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
   1549 
   1550    /* We emit the binding tables and sampler tables first, then emit push
   1551     * constants and then finally emit binding table and sampler table
   1552     * pointers.  It has to happen in this order, since emitting the binding
   1553     * tables may change the push constants (in case of storage images). After
   1554     * emitting push constants, on SKL+ we have to emit the corresponding
   1555     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
   1556     */
   1557    uint32_t dirty = 0;
   1558    if (cmd_buffer->state.descriptors_dirty)
   1559       dirty = flush_descriptor_sets(cmd_buffer);
   1560 
   1561    if (cmd_buffer->state.push_constants_dirty) {
   1562 #if GEN_GEN >= 9
   1563       /* On Sky Lake and later, the binding table pointers commands are
   1564        * what actually flush the changes to push constant state so we need
   1565        * to dirty them so they get re-emitted below.
   1566        */
   1567       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
   1568 #else
   1569       cmd_buffer_flush_push_constants(cmd_buffer);
   1570 #endif
   1571    }
   1572 
   1573    if (dirty)
   1574       cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
   1575 
   1576    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
   1577       gen8_cmd_buffer_emit_viewport(cmd_buffer);
   1578 
   1579    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
   1580                                   ANV_CMD_DIRTY_PIPELINE)) {
   1581       gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
   1582                                           pipeline->depth_clamp_enable);
   1583    }
   1584 
   1585    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
   1586       gen7_cmd_buffer_emit_scissor(cmd_buffer);
   1587 
   1588    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
   1589 
   1590    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   1591 }
   1592 
   1593 static void
   1594 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
   1595                              struct anv_bo *bo, uint32_t offset)
   1596 {
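            /* Bind an 8-byte buffer holding the (base vertex, base instance)
             * pair at vertex buffer index 32, which is reserved for this
             * purpose, so the vertex shader can source those two values.
             */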
   1597    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
   1598                                  GENX(3DSTATE_VERTEX_BUFFERS));
   1599 
   1600    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
   1601       &(struct GENX(VERTEX_BUFFER_STATE)) {
   1602          .VertexBufferIndex = 32, /* Reserved for this */
   1603          .AddressModifyEnable = true,
   1604          .BufferPitch = 0,
   1605 #if (GEN_GEN >= 8)
   1606          .MemoryObjectControlState = GENX(MOCS),
   1607          .BufferStartingAddress = { bo, offset },
   1608          .BufferSize = 8
   1609 #else
   1610          .VertexBufferMemoryObjectControlState = GENX(MOCS),
   1611          .BufferStartingAddress = { bo, offset },
   1612          .EndAddress = { bo, offset + 8 },
   1613 #endif
   1614       });
   1615 }
   1616 
   1617 static void
   1618 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
   1619                           uint32_t base_vertex, uint32_t base_instance)
   1620 {
   1621    struct anv_state id_state =
   1622       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
   1623 
   1624    ((uint32_t *)id_state.map)[0] = base_vertex;
   1625    ((uint32_t *)id_state.map)[1] = base_instance;
   1626 
   1627    if (!cmd_buffer->device->info.has_llc)
   1628       anv_state_clflush(id_state);
   1629 
   1630    emit_base_vertex_instance_bo(cmd_buffer,
   1631       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
   1632 }
   1633 
   1634 void genX(CmdDraw)(
   1635     VkCommandBuffer                             commandBuffer,
   1636     uint32_t                                    vertexCount,
   1637     uint32_t                                    instanceCount,
   1638     uint32_t                                    firstVertex,
   1639     uint32_t                                    firstInstance)
   1640 {
   1641    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   1642    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   1643    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   1644 
   1645    genX(cmd_buffer_flush_state)(cmd_buffer);
   1646 
   1647    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
   1648       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
   1649 
   1650    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
   1651       prim.VertexAccessType         = SEQUENTIAL;
   1652       prim.PrimitiveTopologyType    = pipeline->topology;
   1653       prim.VertexCountPerInstance   = vertexCount;
   1654       prim.StartVertexLocation      = firstVertex;
   1655       prim.InstanceCount            = instanceCount;
   1656       prim.StartInstanceLocation    = firstInstance;
   1657       prim.BaseVertexLocation       = 0;
   1658    }
   1659 }
   1660 
   1661 void genX(CmdDrawIndexed)(
   1662     VkCommandBuffer                             commandBuffer,
   1663     uint32_t                                    indexCount,
   1664     uint32_t                                    instanceCount,
   1665     uint32_t                                    firstIndex,
   1666     int32_t                                     vertexOffset,
   1667     uint32_t                                    firstInstance)
   1668 {
   1669    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   1670    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   1671    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   1672 
   1673    genX(cmd_buffer_flush_state)(cmd_buffer);
   1674 
   1675    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
   1676       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
   1677 
   1678    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
   1679       prim.VertexAccessType         = RANDOM;
   1680       prim.PrimitiveTopologyType    = pipeline->topology;
   1681       prim.VertexCountPerInstance   = indexCount;
   1682       prim.StartVertexLocation      = firstIndex;
   1683       prim.InstanceCount            = instanceCount;
   1684       prim.StartInstanceLocation    = firstInstance;
   1685       prim.BaseVertexLocation       = vertexOffset;
   1686    }
   1687 }
   1688 
   1689 /* Auto-Draw / Indirect Registers */
   1690 #define GEN7_3DPRIM_END_OFFSET          0x2420
   1691 #define GEN7_3DPRIM_START_VERTEX        0x2430
   1692 #define GEN7_3DPRIM_VERTEX_COUNT        0x2434
   1693 #define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
   1694 #define GEN7_3DPRIM_START_INSTANCE      0x243C
   1695 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
   1696 
   1697 void genX(CmdDrawIndirect)(
   1698     VkCommandBuffer                             commandBuffer,
   1699     VkBuffer                                    _buffer,
   1700     VkDeviceSize                                offset,
   1701     uint32_t                                    drawCount,
   1702     uint32_t                                    stride)
   1703 {
   1704    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   1705    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   1706    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   1707    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   1708    struct anv_bo *bo = buffer->bo;
   1709    uint32_t bo_offset = buffer->offset + offset;
   1710 
   1711    genX(cmd_buffer_flush_state)(cmd_buffer);
   1712 
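            /* The indirect buffer is laid out as a VkDrawIndirectCommand:
             *
             *    uint32_t vertexCount;     bo_offset + 0
             *    uint32_t instanceCount;   bo_offset + 4
             *    uint32_t firstVertex;     bo_offset + 8
             *    uint32_t firstInstance;   bo_offset + 12
             *
             * firstVertex/firstInstance double as the base vertex/instance
             * data, and the individual fields feed the 3DPRIM registers below.
             */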
   1713    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
   1714       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
   1715 
   1716    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
   1717    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
   1718    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
   1719    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
   1720    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
   1721 
   1722    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
   1723       prim.IndirectParameterEnable  = true;
   1724       prim.VertexAccessType         = SEQUENTIAL;
   1725       prim.PrimitiveTopologyType    = pipeline->topology;
   1726    }
   1727 }
   1728 
   1729 void genX(CmdDrawIndexedIndirect)(
   1730     VkCommandBuffer                             commandBuffer,
   1731     VkBuffer                                    _buffer,
   1732     VkDeviceSize                                offset,
   1733     uint32_t                                    drawCount,
   1734     uint32_t                                    stride)
   1735 {
   1736    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   1737    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   1738    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   1739    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   1740    struct anv_bo *bo = buffer->bo;
   1741    uint32_t bo_offset = buffer->offset + offset;
   1742 
   1743    genX(cmd_buffer_flush_state)(cmd_buffer);
   1744 
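            /* Here the indirect buffer is a VkDrawIndexedIndirectCommand:
             *
             *    uint32_t indexCount;      bo_offset + 0
             *    uint32_t instanceCount;   bo_offset + 4
             *    uint32_t firstIndex;      bo_offset + 8
             *    int32_t  vertexOffset;    bo_offset + 12
             *    uint32_t firstInstance;   bo_offset + 16
             *
             * which matches the register loads below.
             */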
   1745    /* TODO: We need to stomp base vertex to 0 somehow */
   1746    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
   1747       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
   1748 
   1749    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
   1750    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
   1751    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
   1752    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
   1753    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
   1754 
   1755    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
   1756       prim.IndirectParameterEnable  = true;
   1757       prim.VertexAccessType         = RANDOM;
   1758       prim.PrimitiveTopologyType    = pipeline->topology;
   1759    }
   1760 }
   1761 
   1762 static VkResult
   1763 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
   1764 {
   1765    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   1766    struct anv_state surfaces = { 0, }, samplers = { 0, };
   1767    VkResult result;
   1768 
   1769    result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
   1770    if (result != VK_SUCCESS) {
   1771       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
   1772       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
   1773       assert(result == VK_SUCCESS);
   1774 
   1775       /* Re-emit state base addresses so we get the new surface state base
   1776        * address before we start emitting binding tables etc.
   1777        */
   1778       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
   1779 
   1780       result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
   1781       assert(result == VK_SUCCESS);
   1782    }
   1783 
   1784    result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers);
   1785    assert(result == VK_SUCCESS);
   1786 
   1787    uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
   1788    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
   1789       .BindingTablePointer = surfaces.offset,
   1790       .SamplerStatePointer = samplers.offset,
   1791    };
   1792    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
   1793 
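            /* Combine the binding table and sampler pointers packed above with
             * the INTERFACE_DESCRIPTOR_DATA template stored in the pipeline,
             * then point MEDIA_INTERFACE_DESCRIPTOR_LOAD at the result.
             */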
   1794    struct anv_state state =
   1795       anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
   1796                                    pipeline->interface_descriptor_data,
   1797                                    GENX(INTERFACE_DESCRIPTOR_DATA_length),
   1798                                    64);
   1799 
   1800    uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
   1801    anv_batch_emit(&cmd_buffer->batch,
   1802                   GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
   1803       mid.InterfaceDescriptorTotalLength        = size;
   1804       mid.InterfaceDescriptorDataStartAddress   = state.offset;
   1805    }
   1806 
   1807    return VK_SUCCESS;
   1808 }
   1809 
   1810 void
   1811 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
   1812 {
   1813    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   1814    MAYBE_UNUSED VkResult result;
   1815 
   1816    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
   1817 
   1818    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
   1819 
   1820    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
   1821 
   1822    if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE) {
   1823       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
   1824        *
   1825        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
   1826        *    the only bits that are changed are scoreboard related: Scoreboard
    1827        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
   1828        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
   1829        *    sufficient."
   1830        */
   1831       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
   1832       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   1833 
   1834       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
   1835    }
   1836 
   1837    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
   1838        (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
   1839       /* FIXME: figure out descriptors for gen7 */
   1840       result = flush_compute_descriptor_set(cmd_buffer);
   1841       assert(result == VK_SUCCESS);
   1842       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   1843    }
   1844 
   1845    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
   1846       struct anv_state push_state =
   1847          anv_cmd_buffer_cs_push_constants(cmd_buffer);
   1848 
   1849       if (push_state.alloc_size) {
   1850          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
   1851             curbe.CURBETotalDataLength    = push_state.alloc_size;
   1852             curbe.CURBEDataStartAddress   = push_state.offset;
   1853          }
   1854       }
   1855    }
   1856 
   1857    cmd_buffer->state.compute_dirty = 0;
   1858 
   1859    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   1860 }
   1861 
   1862 #if GEN_GEN == 7
   1863 
   1864 static bool
   1865 verify_cmd_parser(const struct anv_device *device,
   1866                   int required_version,
   1867                   const char *function)
   1868 {
   1869    if (device->instance->physicalDevice.cmd_parser_version < required_version) {
   1870       vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
   1871                 "cmd parser version %d is required for %s",
   1872                 required_version, function);
   1873       return false;
   1874    } else {
   1875       return true;
   1876    }
   1877 }
   1878 
   1879 #endif
   1880 
   1881 void genX(CmdDispatch)(
   1882     VkCommandBuffer                             commandBuffer,
   1883     uint32_t                                    x,
   1884     uint32_t                                    y,
   1885     uint32_t                                    z)
   1886 {
   1887    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   1888    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   1889    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   1890 
   1891    if (prog_data->uses_num_work_groups) {
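               /* Stash the dispatch dimensions in dynamic state so the compute
                * shader can read gl_NumWorkGroups.
                */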
   1892       struct anv_state state =
   1893          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
   1894       uint32_t *sizes = state.map;
   1895       sizes[0] = x;
   1896       sizes[1] = y;
   1897       sizes[2] = z;
   1898       if (!cmd_buffer->device->info.has_llc)
   1899          anv_state_clflush(state);
   1900       cmd_buffer->state.num_workgroups_offset = state.offset;
   1901       cmd_buffer->state.num_workgroups_bo =
   1902          &cmd_buffer->device->dynamic_state_block_pool.bo;
   1903    }
   1904 
   1905    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
   1906 
   1907    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
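               /* prog_data->simd_size is 8, 16 or 32; dividing by 16 yields the
                * hardware encoding (SIMD8 = 0, SIMD16 = 1, SIMD32 = 2).
                */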
   1908       ggw.SIMDSize                     = prog_data->simd_size / 16;
   1909       ggw.ThreadDepthCounterMaximum    = 0;
   1910       ggw.ThreadHeightCounterMaximum   = 0;
   1911       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
   1912       ggw.ThreadGroupIDXDimension      = x;
   1913       ggw.ThreadGroupIDYDimension      = y;
   1914       ggw.ThreadGroupIDZDimension      = z;
   1915       ggw.RightExecutionMask           = pipeline->cs_right_mask;
   1916       ggw.BottomExecutionMask          = 0xffffffff;
   1917    }
   1918 
   1919    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
   1920 }
   1921 
   1922 #define GPGPU_DISPATCHDIMX 0x2500
   1923 #define GPGPU_DISPATCHDIMY 0x2504
   1924 #define GPGPU_DISPATCHDIMZ 0x2508
   1925 
   1926 #define MI_PREDICATE_SRC0  0x2400
   1927 #define MI_PREDICATE_SRC1  0x2408
   1928 
   1929 void genX(CmdDispatchIndirect)(
   1930     VkCommandBuffer                             commandBuffer,
   1931     VkBuffer                                    _buffer,
   1932     VkDeviceSize                                offset)
   1933 {
   1934    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   1935    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   1936    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   1937    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   1938    struct anv_bo *bo = buffer->bo;
   1939    uint32_t bo_offset = buffer->offset + offset;
   1940    struct anv_batch *batch = &cmd_buffer->batch;
   1941 
   1942 #if GEN_GEN == 7
   1943    /* Linux 4.4 added command parser version 5 which allows the GPGPU
   1944     * indirect dispatch registers to be written.
   1945     */
   1946    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
   1947       return;
   1948 #endif
   1949 
   1950    if (prog_data->uses_num_work_groups) {
   1951       cmd_buffer->state.num_workgroups_offset = bo_offset;
   1952       cmd_buffer->state.num_workgroups_bo = bo;
   1953    }
   1954 
   1955    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
   1956 
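            /* The indirect buffer is a VkDispatchIndirectCommand: three
             * uint32_t group counts (x, y, z) at offsets 0, 4 and 8.
             */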
   1957    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
   1958    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
   1959    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
   1960 
   1961 #if GEN_GEN <= 7
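            /* Set up MI_PREDICATE so that the predicated GPGPU_WALKER below
             * only runs when all three dispatch dimensions are non-zero.
             */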
   1962    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
   1963    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
   1964    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
   1965    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
   1966 
   1967    /* Load compute_dispatch_indirect_x_size into SRC0 */
   1968    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
   1969 
   1970    /* predicate = (compute_dispatch_indirect_x_size == 0); */
   1971    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
   1972       mip.LoadOperation    = LOAD_LOAD;
   1973       mip.CombineOperation = COMBINE_SET;
   1974       mip.CompareOperation = COMPARE_SRCS_EQUAL;
   1975    }
   1976 
   1977    /* Load compute_dispatch_indirect_y_size into SRC0 */
   1978    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
   1979 
   1980    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
   1981    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
   1982       mip.LoadOperation    = LOAD_LOAD;
   1983       mip.CombineOperation = COMBINE_OR;
   1984       mip.CompareOperation = COMPARE_SRCS_EQUAL;
   1985    }
   1986 
   1987    /* Load compute_dispatch_indirect_z_size into SRC0 */
   1988    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
   1989 
   1990    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
   1991    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
   1992       mip.LoadOperation    = LOAD_LOAD;
   1993       mip.CombineOperation = COMBINE_OR;
   1994       mip.CompareOperation = COMPARE_SRCS_EQUAL;
   1995    }
   1996 
   1997    /* predicate = !predicate; */
   1998 #define COMPARE_FALSE                           1
   1999    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
   2000       mip.LoadOperation    = LOAD_LOADINV;
   2001       mip.CombineOperation = COMBINE_OR;
   2002       mip.CompareOperation = COMPARE_FALSE;
   2003    }
   2004 #endif
   2005 
   2006    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
   2007       ggw.IndirectParameterEnable      = true;
   2008       ggw.PredicateEnable              = GEN_GEN <= 7;
   2009       ggw.SIMDSize                     = prog_data->simd_size / 16;
   2010       ggw.ThreadDepthCounterMaximum    = 0;
   2011       ggw.ThreadHeightCounterMaximum   = 0;
   2012       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
   2013       ggw.RightExecutionMask           = pipeline->cs_right_mask;
   2014       ggw.BottomExecutionMask          = 0xffffffff;
   2015    }
   2016 
   2017    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
   2018 }
   2019 
   2020 static void
   2021 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
   2022                                       uint32_t pipeline)
   2023 {
   2024 #if GEN_GEN >= 8 && GEN_GEN < 10
   2025    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
   2026     *
   2027     *   Software must clear the COLOR_CALC_STATE Valid field in
   2028     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
   2029     *   with Pipeline Select set to GPGPU.
   2030     *
   2031     * The internal hardware docs recommend the same workaround for Gen9
   2032     * hardware too.
   2033     */
   2034    if (pipeline == GPGPU)
   2035       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
   2036 #endif
   2037 
    2038    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
   2039     * PIPELINE_SELECT [DevBWR+]":
   2040     *
   2041     *   Project: DEVSNB+
   2042     *
   2043     *   Software must ensure all the write caches are flushed through a
   2044     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
   2045     *   command to invalidate read only caches prior to programming
   2046     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
   2047     */
   2048    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2049       pc.RenderTargetCacheFlushEnable  = true;
   2050       pc.DepthCacheFlushEnable         = true;
   2051       pc.DCFlushEnable                 = true;
   2052       pc.PostSyncOperation             = NoWrite;
   2053       pc.CommandStreamerStallEnable    = true;
   2054    }
   2055 
   2056    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2057       pc.TextureCacheInvalidationEnable   = true;
   2058       pc.ConstantCacheInvalidationEnable  = true;
   2059       pc.StateCacheInvalidationEnable     = true;
   2060       pc.InstructionCacheInvalidateEnable = true;
   2061       pc.PostSyncOperation                = NoWrite;
   2062    }
   2063 }
   2064 
   2065 void
   2066 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
   2067 {
   2068    if (cmd_buffer->state.current_pipeline != _3D) {
   2069       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
   2070 
   2071       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
   2072 #if GEN_GEN >= 9
   2073          ps.MaskBits = 3;
   2074 #endif
   2075          ps.PipelineSelection = _3D;
   2076       }
   2077 
   2078       cmd_buffer->state.current_pipeline = _3D;
   2079    }
   2080 }
   2081 
   2082 void
   2083 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
   2084 {
   2085    if (cmd_buffer->state.current_pipeline != GPGPU) {
   2086       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
   2087 
   2088       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
   2089 #if GEN_GEN >= 9
   2090          ps.MaskBits = 3;
   2091 #endif
   2092          ps.PipelineSelection = GPGPU;
   2093       }
   2094 
   2095       cmd_buffer->state.current_pipeline = GPGPU;
   2096    }
   2097 }
   2098 
   2099 void
   2100 genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
   2101 {
   2102    if (GEN_GEN >= 8)
   2103       return;
   2104 
   2105    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
   2106     *
   2107     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
   2108     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
   2109     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
   2110     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
   2111     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
    2112     *    Depth Flush Bit set), followed by another pipelined depth stall
   2113     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
   2114     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
   2115     *    via a preceding MI_FLUSH)."
   2116     */
   2117    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
   2118       pipe.DepthStallEnable = true;
   2119    }
   2120    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
   2121       pipe.DepthCacheFlushEnable = true;
   2122    }
   2123    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
   2124       pipe.DepthStallEnable = true;
   2125    }
   2126 }
   2127 
   2128 static uint32_t
   2129 depth_stencil_surface_type(enum isl_surf_dim dim)
   2130 {
   2131    switch (dim) {
   2132    case ISL_SURF_DIM_1D:
   2133       if (GEN_GEN >= 9) {
    2134          /* From the Sky Lake PRM, 3DSTATE_DEPTH_BUFFER::SurfaceType
   2135           *
   2136           *    Programming Notes:
   2137           *    The Surface Type of the depth buffer must be the same as the
   2138           *    Surface Type of the render target(s) (defined in
   2139           *    SURFACE_STATE), unless either the depth buffer or render
   2140           *    targets are SURFTYPE_NULL (see exception below for SKL).  1D
   2141           *    surface type not allowed for depth surface and stencil surface.
   2142           *
   2143           *    Workaround:
   2144           *    If depth/stencil is enabled with 1D render target,
   2145           *    depth/stencil surface type needs to be set to 2D surface type
   2146           *    and height set to 1. Depth will use (legacy) TileY and stencil
   2147           *    will use TileW. For this case only, the Surface Type of the
   2148           *    depth buffer can be 2D while the Surface Type of the render
   2149           *    target(s) are 1D, representing an exception to a programming
   2150           *    note above.
   2151           */
   2152          return SURFTYPE_2D;
   2153       } else {
   2154          return SURFTYPE_1D;
   2155       }
   2156    case ISL_SURF_DIM_2D:
   2157       return SURFTYPE_2D;
   2158    case ISL_SURF_DIM_3D:
   2159       if (GEN_GEN >= 9) {
   2160          /* The Sky Lake docs list the value for 3D as "Reserved".  However,
   2161           * they have the exact same layout as 2D arrays on gen9+, so we can
   2162           * just use 2D here.
   2163           */
   2164          return SURFTYPE_2D;
   2165       } else {
   2166          return SURFTYPE_3D;
   2167       }
   2168    default:
   2169       unreachable("Invalid surface dimension");
   2170    }
   2171 }
   2172 
   2173 static void
   2174 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
   2175 {
   2176    struct anv_device *device = cmd_buffer->device;
   2177    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
   2178    const struct anv_image_view *iview =
   2179       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   2180    const struct anv_image *image = iview ? iview->image : NULL;
   2181    const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
   2182    const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
   2183    const bool has_hiz = image != NULL &&
   2184       cmd_buffer->state.attachments[ds].aux_usage == ISL_AUX_USAGE_HIZ;
   2185    const bool has_stencil =
   2186       image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
   2187 
   2188    /* FIXME: Implement the PMA stall W/A */
   2189    /* FIXME: Width and Height are wrong */
   2190 
   2191    genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
   2192 
   2193    /* Emit 3DSTATE_DEPTH_BUFFER */
   2194    if (has_depth) {
   2195       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
   2196          db.SurfaceType                   =
   2197             depth_stencil_surface_type(image->depth_surface.isl.dim);
   2198          db.DepthWriteEnable              = true;
   2199          db.StencilWriteEnable            = has_stencil;
   2200          db.HierarchicalDepthBufferEnable = has_hiz;
   2201 
   2202          db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
   2203                                                       &image->depth_surface.isl);
   2204 
   2205          db.SurfaceBaseAddress = (struct anv_address) {
   2206             .bo = image->bo,
   2207             .offset = image->offset + image->depth_surface.offset,
   2208          };
   2209          db.DepthBufferObjectControlState = GENX(MOCS);
   2210 
   2211          db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
   2212          db.Height               = image->extent.height - 1;
   2213          db.Width                = image->extent.width - 1;
   2214          db.LOD                  = iview->isl.base_level;
   2215          db.MinimumArrayElement  = iview->isl.base_array_layer;
   2216 
   2217          assert(image->depth_surface.isl.dim != ISL_SURF_DIM_3D);
   2218          db.Depth =
   2219          db.RenderTargetViewExtent =
   2220             iview->isl.array_len - iview->isl.base_array_layer - 1;
   2221 
   2222 #if GEN_GEN >= 8
   2223          db.SurfaceQPitch =
   2224             isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
   2225 #endif
   2226       }
   2227    } else {
   2228       /* Even when no depth buffer is present, the hardware requires that
   2229        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
   2230        *
   2231        *    If a null depth buffer is bound, the driver must instead bind depth as:
   2232        *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
   2233        *       3DSTATE_DEPTH.Width = 1
   2234        *       3DSTATE_DEPTH.Height = 1
    2235        *       3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
   2236        *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
   2237        *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
   2238        *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
   2239        *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
   2240        *
    2241        * The PRM is wrong, though.  The width and height must be programmed to
    2242        * the actual framebuffer's width and height, even when neither a depth
    2243        * buffer nor a stencil buffer is present.  Also, D16_UNORM may not be
    2244        * combined with a stencil buffer, so we use D32_FLOAT instead.
   2245        */
   2246       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
   2247          if (has_stencil) {
   2248             db.SurfaceType       =
   2249                depth_stencil_surface_type(image->stencil_surface.isl.dim);
   2250          } else {
   2251             db.SurfaceType       = SURFTYPE_2D;
   2252          }
   2253          db.SurfaceFormat        = D32_FLOAT;
   2254          db.Width                = MAX2(fb->width, 1) - 1;
   2255          db.Height               = MAX2(fb->height, 1) - 1;
   2256          db.StencilWriteEnable   = has_stencil;
   2257       }
   2258    }
   2259 
   2260    if (has_hiz) {
   2261       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
   2262          hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
   2263          hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1;
   2264          hdb.SurfaceBaseAddress = (struct anv_address) {
   2265             .bo = image->bo,
   2266             .offset = image->offset + image->aux_surface.offset,
   2267          };
   2268 #if GEN_GEN >= 8
   2269          /* From the SKL PRM Vol2a:
   2270           *
   2271           *    The interpretation of this field is dependent on Surface Type
   2272           *    as follows:
   2273           *    - SURFTYPE_1D: distance in pixels between array slices
   2274           *    - SURFTYPE_2D/CUBE: distance in rows between array slices
    2275           *    - SURFTYPE_3D: distance in rows between R-slices
   2276           *
   2277           * Unfortunately, the docs aren't 100% accurate here.  They fail to
   2278           * mention that the 1-D rule only applies to linear 1-D images.
   2279           * Since depth and HiZ buffers are always tiled, they are treated as
   2280           * 2-D images.  Prior to Sky Lake, this field is always in rows.
   2281           */
   2282          hdb.SurfaceQPitch =
   2283             isl_surf_get_array_pitch_sa_rows(&image->aux_surface.isl) >> 2;
   2284 #endif
   2285       }
   2286    } else {
   2287       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
   2288    }
   2289 
   2290    /* Emit 3DSTATE_STENCIL_BUFFER */
   2291    if (has_stencil) {
   2292       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
   2293 #if GEN_GEN >= 8 || GEN_IS_HASWELL
   2294          sb.StencilBufferEnable = true;
   2295 #endif
   2296          sb.StencilBufferObjectControlState = GENX(MOCS);
   2297 
   2298          sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;
   2299 
   2300 #if GEN_GEN >= 8
   2301          sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
   2302 #endif
   2303          sb.SurfaceBaseAddress = (struct anv_address) {
   2304             .bo = image->bo,
   2305             .offset = image->offset + image->stencil_surface.offset,
   2306          };
   2307       }
   2308    } else {
   2309       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
   2310    }
   2311 
   2312    /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
   2313     *
    2314     *    3DSTATE_CLEAR_PARAMS must always be programmed along with the
    2315     *    other Depth/Stencil state commands (i.e. 3DSTATE_DEPTH_BUFFER,
   2316     *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
   2317     *
    2318     * Testing also shows that some variant of this restriction may exist on HSW+.
   2319     * On BDW+, it is not possible to emit 2 of these packets consecutively when
   2320     * both have DepthClearValueValid set. An analysis of such state programming
   2321     * on SKL showed that the GPU doesn't register the latter packet's clear
   2322     * value.
   2323     */
   2324    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
   2325       if (has_hiz) {
   2326          cp.DepthClearValueValid = true;
   2327          cp.DepthClearValue = ANV_HZ_FC_VAL;
   2328       }
   2329    }
   2330 }
   2331 
   2332 static void
   2333 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
   2334                              struct anv_subpass *subpass)
   2335 {
   2336    cmd_buffer->state.subpass = subpass;
   2337 
   2338    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
   2339 
   2340    const struct anv_image_view *iview =
   2341       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   2342 
   2343    if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
   2344       const uint32_t ds = subpass->depth_stencil_attachment;
   2345       transition_depth_buffer(cmd_buffer, iview->image,
   2346                               cmd_buffer->state.attachments[ds].current_layout,
   2347                               cmd_buffer->state.subpass->depth_stencil_layout);
   2348       cmd_buffer->state.attachments[ds].current_layout =
   2349          cmd_buffer->state.subpass->depth_stencil_layout;
   2350       cmd_buffer->state.attachments[ds].aux_usage =
   2351          layout_to_hiz_usage(cmd_buffer->state.subpass->depth_stencil_layout,
   2352                              iview->image->samples);
   2353    }
   2354 
   2355    cmd_buffer_emit_depth_stencil(cmd_buffer);
   2356 
   2357    anv_cmd_buffer_clear_subpass(cmd_buffer);
   2358 }
   2359 
   2360 void genX(CmdBeginRenderPass)(
   2361     VkCommandBuffer                             commandBuffer,
   2362     const VkRenderPassBeginInfo*                pRenderPassBegin,
   2363     VkSubpassContents                           contents)
   2364 {
   2365    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2366    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
   2367    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
   2368 
   2369    cmd_buffer->state.framebuffer = framebuffer;
   2370    cmd_buffer->state.pass = pass;
   2371    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
   2372    genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin);
   2373 
   2374    genX(flush_pipeline_select_3d)(cmd_buffer);
   2375 
   2376    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
   2377 }
   2378 
   2379 void genX(CmdNextSubpass)(
   2380     VkCommandBuffer                             commandBuffer,
   2381     VkSubpassContents                           contents)
   2382 {
   2383    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2384 
   2385    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
   2386 
   2387    const struct anv_image_view *iview =
   2388       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   2389 
   2390    if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
   2391       const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
   2392 
   2393       if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses ==
   2394           cmd_buffer->state.pass->attachments[ds].last_subpass_idx) {
   2395          transition_depth_buffer(cmd_buffer, iview->image,
   2396                                  cmd_buffer->state.attachments[ds].current_layout,
   2397                                  cmd_buffer->state.pass->attachments[ds].final_layout);
   2398       }
   2399    }
   2400 
   2401    anv_cmd_buffer_resolve_subpass(cmd_buffer);
   2402    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
   2403 }
   2404 
   2405 void genX(CmdEndRenderPass)(
   2406     VkCommandBuffer                             commandBuffer)
   2407 {
   2408    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2409 
   2410    const struct anv_image_view *iview =
   2411       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   2412 
   2413    if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
   2414       const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
   2415 
   2416       if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses ==
   2417           cmd_buffer->state.pass->attachments[ds].last_subpass_idx) {
   2418          transition_depth_buffer(cmd_buffer, iview->image,
   2419                                  cmd_buffer->state.attachments[ds].current_layout,
   2420                                  cmd_buffer->state.pass->attachments[ds].final_layout);
   2421       }
   2422    }
   2423 
   2424    anv_cmd_buffer_resolve_subpass(cmd_buffer);
   2425 
   2426 #ifndef NDEBUG
   2427    anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
   2428 #endif
   2429 }
   2430 
   2431 static void
   2432 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
   2433                     struct anv_bo *bo, uint32_t offset)
   2434 {
   2435    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2436       pc.DestinationAddressType  = DAT_PPGTT;
   2437       pc.PostSyncOperation       = WritePSDepthCount;
   2438       pc.DepthStallEnable        = true;
   2439       pc.Address                 = (struct anv_address) { bo, offset };
   2440 
   2441       if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
   2442          pc.CommandStreamerStallEnable = true;
   2443    }
   2444 }
   2445 
   2446 static void
   2447 emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
   2448                         struct anv_bo *bo, uint32_t offset)
   2449 {
   2450    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2451       pc.DestinationAddressType  = DAT_PPGTT;
   2452       pc.PostSyncOperation       = WriteImmediateData;
   2453       pc.Address                 = (struct anv_address) { bo, offset };
   2454       pc.ImmediateData           = 1;
   2455    }
   2456 }
   2457 
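        /* Each query slot holds a begin value, an end value and an
         * availability flag (the +0 / +8 / +16 offsets used below).
         * Resetting a query just clears its availability flag.
         */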
   2458 void genX(CmdResetQueryPool)(
   2459     VkCommandBuffer                             commandBuffer,
   2460     VkQueryPool                                 queryPool,
   2461     uint32_t                                    firstQuery,
   2462     uint32_t                                    queryCount)
   2463 {
   2464    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2465    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   2466 
   2467    for (uint32_t i = 0; i < queryCount; i++) {
   2468       switch (pool->type) {
   2469       case VK_QUERY_TYPE_OCCLUSION:
   2470       case VK_QUERY_TYPE_TIMESTAMP: {
   2471          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
   2472             sdm.Address = (struct anv_address) {
   2473                .bo = &pool->bo,
   2474                .offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot) +
   2475                          offsetof(struct anv_query_pool_slot, available),
   2476             };
   2477             sdm.DataDWord0 = 0;
   2478             sdm.DataDWord1 = 0;
   2479          }
   2480          break;
   2481       }
   2482       default:
   2483          assert(!"Invalid query type");
   2484       }
   2485    }
   2486 }
   2487 
   2488 void genX(CmdBeginQuery)(
   2489     VkCommandBuffer                             commandBuffer,
   2490     VkQueryPool                                 queryPool,
   2491     uint32_t                                    query,
   2492     VkQueryControlFlags                         flags)
   2493 {
   2494    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2495    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   2496 
   2497    /* Workaround: when meta uses a pipeline with the VS disabled, the
   2498     * pipelining of the depth write appears to break: samples from the
   2499     * render-pass clear leak into the first query recorded immediately
   2500     * after the clear.  Emitting a PIPE_CONTROL with a depth cache flush
   2501     * and a depth stall seems to work around the issue.
   2502     */
   2503    if (cmd_buffer->state.need_query_wa) {
   2504       cmd_buffer->state.need_query_wa = false;
   2505       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2506          pc.DepthCacheFlushEnable   = true;
   2507          pc.DepthStallEnable        = true;
   2508       }
   2509    }
   2510 
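           /* Occlusion queries: capture the begin PS_DEPTH_COUNT snapshot at
            * slot offset 0.
            */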
   2511    switch (pool->type) {
   2512    case VK_QUERY_TYPE_OCCLUSION:
   2513       emit_ps_depth_count(cmd_buffer, &pool->bo,
   2514                           query * sizeof(struct anv_query_pool_slot));
   2515       break;
   2516 
   2517    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   2518    default:
   2519       unreachable("unsupported query type");
   2520    }
   2521 }
   2522 
   2523 void genX(CmdEndQuery)(
   2524     VkCommandBuffer                             commandBuffer,
   2525     VkQueryPool                                 queryPool,
   2526     uint32_t                                    query)
   2527 {
   2528    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2529    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   2530 
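           /* Capture the end PS_DEPTH_COUNT snapshot at slot offset 8, then
            * flag the slot as available at offset 16.
            */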
   2531    switch (pool->type) {
   2532    case VK_QUERY_TYPE_OCCLUSION:
   2533       emit_ps_depth_count(cmd_buffer, &pool->bo,
   2534                           query * sizeof(struct anv_query_pool_slot) + 8);
   2535 
   2536       emit_query_availability(cmd_buffer, &pool->bo,
   2537                               query * sizeof(struct anv_query_pool_slot) + 16);
   2538       break;
   2539 
   2540    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   2541    default:
   2542       unreachable("unsupported query type");
   2543    }
   2544 }
   2545 
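        /* MMIO offset of the command streamer's 64-bit TIMESTAMP register;
         * the low dword is at the base and the high dword at +4.
         */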
   2546 #define TIMESTAMP 0x2358
   2547 
   2548 void genX(CmdWriteTimestamp)(
   2549     VkCommandBuffer                             commandBuffer,
   2550     VkPipelineStageFlagBits                     pipelineStage,
   2551     VkQueryPool                                 queryPool,
   2552     uint32_t                                    query)
   2553 {
   2554    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2555    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   2556    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
   2557 
   2558    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
   2559 
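           /* For TOP_OF_PIPE, snapshot the CS TIMESTAMP register immediately
            * with MI_STORE_REGISTER_MEM (low and high dwords separately);
            * all other stages use a bottom-of-pipe PIPE_CONTROL timestamp
            * write.
            */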
   2560    switch (pipelineStage) {
   2561    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
   2562       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
   2563          srm.RegisterAddress  = TIMESTAMP;
   2564          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
   2565       }
   2566       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
   2567          srm.RegisterAddress  = TIMESTAMP + 4;
   2568          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
   2569       }
   2570       break;
   2571 
   2572    default:
   2573       /* Everything else is bottom-of-pipe */
   2574       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2575          pc.DestinationAddressType  = DAT_PPGTT;
   2576          pc.PostSyncOperation       = WriteTimestamp;
   2577          pc.Address = (struct anv_address) { &pool->bo, offset };
   2578 
   2579          if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
   2580             pc.CommandStreamerStallEnable = true;
   2581       }
   2582       break;
   2583    }
   2584 
   2585    emit_query_availability(cmd_buffer, &pool->bo, offset + 16);
   2586 }
   2587 
   2588 #if GEN_GEN > 7 || GEN_IS_HASWELL
   2589 
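        /* MI_MATH ALU instructions pack an opcode in bits 31:20 and two
         * operands in bits 19:10 and 9:0; the macros below build those
         * dwords and name the opcode and operand encodings used here.
         */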
   2590 #define alu_opcode(v)   __gen_uint((v),  20, 31)
   2591 #define alu_operand1(v) __gen_uint((v),  10, 19)
   2592 #define alu_operand2(v) __gen_uint((v),   0,  9)
   2593 #define alu(opcode, operand1, operand2) \
   2594    alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
   2595 
   2596 #define OPCODE_NOOP      0x000
   2597 #define OPCODE_LOAD      0x080
   2598 #define OPCODE_LOADINV   0x480
   2599 #define OPCODE_LOAD0     0x081
   2600 #define OPCODE_LOAD1     0x481
   2601 #define OPCODE_ADD       0x100
   2602 #define OPCODE_SUB       0x101
   2603 #define OPCODE_AND       0x102
   2604 #define OPCODE_OR        0x103
   2605 #define OPCODE_XOR       0x104
   2606 #define OPCODE_STORE     0x180
   2607 #define OPCODE_STOREINV  0x580
   2608 
   2609 #define OPERAND_R0   0x00
   2610 #define OPERAND_R1   0x01
   2611 #define OPERAND_R2   0x02
   2612 #define OPERAND_R3   0x03
   2613 #define OPERAND_R4   0x04
   2614 #define OPERAND_SRCA 0x20
   2615 #define OPERAND_SRCB 0x21
   2616 #define OPERAND_ACCU 0x31
   2617 #define OPERAND_ZF   0x32
   2618 #define OPERAND_CF   0x33
   2619 
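        /* MMIO offset of 64-bit command streamer general purpose register n. */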
   2620 #define CS_GPR(n) (0x2600 + (n) * 8)
   2621 
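        /* Load a 64-bit value from the BO into a CS GPR as two 32-bit
         * MI_LOAD_REGISTER_MEM reads.
         */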
   2622 static void
   2623 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
   2624                       struct anv_bo *bo, uint32_t offset)
   2625 {
   2626    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
   2627       lrm.RegisterAddress  = reg;
   2628       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   2629    }
   2630    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
   2631       lrm.RegisterAddress  = reg + 4;
   2632       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   2633    }
   2634 }
   2635 
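        /* Store the low 32 bits of a CS GPR to the destination buffer, and
         * the high 32 bits as well when 64-bit results were requested.
         */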
   2636 static void
   2637 store_query_result(struct anv_batch *batch, uint32_t reg,
   2638                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
   2639 {
   2640    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
   2641       srm.RegisterAddress  = reg;
   2642       srm.MemoryAddress    = (struct anv_address) { bo, offset };
   2643    }
   2644 
   2645    if (flags & VK_QUERY_RESULT_64_BIT) {
   2646       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
   2647          srm.RegisterAddress  = reg + 4;
   2648          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   2649       }
   2650    }
   2651 }
   2652 
   2653 void genX(CmdCopyQueryPoolResults)(
   2654     VkCommandBuffer                             commandBuffer,
   2655     VkQueryPool                                 queryPool,
   2656     uint32_t                                    firstQuery,
   2657     uint32_t                                    queryCount,
   2658     VkBuffer                                    destBuffer,
   2659     VkDeviceSize                                destOffset,
   2660     VkDeviceSize                                destStride,
   2661     VkQueryResultFlags                          flags)
   2662 {
   2663    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   2664    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   2665    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   2666    uint32_t slot_offset, dst_offset;
   2667 
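           /* For VK_QUERY_RESULT_WAIT_BIT, stall the command streamer (and
            * pixel scoreboard) so the copies below see completed query
            * writes from earlier in the command buffer.
            */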
   2668    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
   2669       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
   2670          pc.CommandStreamerStallEnable = true;
   2671          pc.StallAtPixelScoreboard     = true;
   2672       }
   2673    }
   2674 
   2675    dst_offset = buffer->offset + destOffset;
   2676    for (uint32_t i = 0; i < queryCount; i++) {
   2677 
   2678       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
   2679       switch (pool->type) {
   2680       case VK_QUERY_TYPE_OCCLUSION:
   2681          emit_load_alu_reg_u64(&cmd_buffer->batch,
   2682                                CS_GPR(0), &pool->bo, slot_offset);
   2683          emit_load_alu_reg_u64(&cmd_buffer->batch,
   2684                                CS_GPR(1), &pool->bo, slot_offset + 8);
   2685 
   2686          /* FIXME: We need to clamp the result for the 32-bit case. */
   2687 
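                 /* Compute end - begin with the CS ALU: SUB yields
                  * SRCA - SRCB = R1 - R0, and the accumulator is stored to
                  * R2, which is written to the destination buffer below.
                  */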
   2688          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
   2689          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   2690          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   2691          dw[3] = alu(OPCODE_SUB, 0, 0);
   2692          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
   2693          break;
   2694 
   2695       case VK_QUERY_TYPE_TIMESTAMP:
   2696          emit_load_alu_reg_u64(&cmd_buffer->batch,
   2697                                CS_GPR(2), &pool->bo, slot_offset);
   2698          break;
   2699 
   2700       default:
   2701          unreachable("unhandled query type");
   2702       }
   2703 
   2704       store_query_result(&cmd_buffer->batch,
   2705                          CS_GPR(2), buffer->bo, dst_offset, flags);
   2706 
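              /* When requested, the availability word follows the result in
               * the destination buffer: at +8 for 64-bit results, +4 for
               * 32-bit results.
               */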
   2707       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
   2708          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
   2709                                &pool->bo, slot_offset + 16);
   2710          if (flags & VK_QUERY_RESULT_64_BIT)
   2711             store_query_result(&cmd_buffer->batch,
   2712                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
   2713          else
   2714             store_query_result(&cmd_buffer->batch,
   2715                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
   2716       }
   2717 
   2718       dst_offset += destStride;
   2719    }
   2720 }
   2721 
   2722 #else
   2723 void genX(CmdCopyQueryPoolResults)(
   2724     VkCommandBuffer                             commandBuffer,
   2725     VkQueryPool                                 queryPool,
   2726     uint32_t                                    firstQuery,
   2727     uint32_t                                    queryCount,
   2728     VkBuffer                                    destBuffer,
   2729     VkDeviceSize                                destOffset,
   2730     VkDeviceSize                                destStride,
   2731     VkQueryResultFlags                          flags)
   2732 {
   2733    anv_finishme("Queries not yet supported on Ivy Bridge");
   2734 }
   2735 #endif
   2736