/*
 * Copyright 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "vk_format_info.h"

#include "common/gen_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

static void
emit_lrm(struct anv_batch *batch,
         uint32_t reg, struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
}

static void
emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;

   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM. However, it seems to be
    * necessary prior to changing the surface state base address. Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DCFlushEnable = true;
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
      sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { &device->instruction_block_pool.bo, 0 };
      sba.InstructionMemoryObjectControlState = GENX(MOCS);
      sba.InstructionBaseAddressModifyEnable = true;

#  if (GEN_GEN >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields. However, since we will be growing the BOs live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize = 0xfffff;
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSize = 0xfffff;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.InstructionBufferSize = 0xfffff;
      sba.InstructionBuffersizeModifyEnable = true;
#  endif
   }

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever with
    * regard to surface state and binding tables. Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient. The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache. However, we have
    * yet to be able to actually confirm this.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
   }
}
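
/* This gets re-emitted whenever the surface state base may have moved out
 * from under already-recorded state: see genX(BeginCommandBuffer),
 * genX(CmdExecuteCommands) after the secondaries have run, and the
 * out-of-binding-table-space paths in flush_descriptor_sets() and
 * flush_compute_descriptor_set() below.
 */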

static void
add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_state state,
                        struct anv_bo *bo, uint32_t offset)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
                      state.offset + isl_dev->ss.addr_offset, bo, offset);
}

static void
add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image_view *iview,
                      enum isl_aux_usage aux_usage,
                      struct anv_state state)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
                      state.offset + isl_dev->ss.addr_offset,
                      iview->bo, iview->offset);

   if (aux_usage != ISL_AUX_USAGE_NONE) {
      uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset;

      /* On gen7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information. This should be ok, however, because
       * surface buffer addresses are always 4K page aligned.
       */
      assert((aux_offset & 0xfff) == 0);
      uint32_t *aux_addr_dw = state.map + isl_dev->ss.aux_addr_offset;
      aux_offset += *aux_addr_dw & 0xfff;

      anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
                         state.offset + isl_dev->ss.aux_addr_offset,
                         iview->bo, aux_offset);
   }
}

static bool
color_is_zero_one(VkClearColorValue value, enum isl_format format)
{
   if (isl_format_has_int_channel(format)) {
      for (unsigned i = 0; i < 4; i++) {
         if (value.int32[i] != 0 && value.int32[i] != 1)
            return false;
      }
   } else {
      for (unsigned i = 0; i < 4; i++) {
         if (value.float32[i] != 0.0f && value.float32[i] != 1.0f)
            return false;
      }
   }

   return true;
}

static void
color_attachment_compute_aux_usage(struct anv_device *device,
                                    struct anv_attachment_state *att_state,
                                    struct anv_image_view *iview,
                                    VkRect2D render_area,
                                    union isl_color_value *fast_clear_color)
{
   if (iview->image->aux_surface.isl.size == 0) {
      att_state->aux_usage = ISL_AUX_USAGE_NONE;
      att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
      att_state->fast_clear = false;
      return;
   }

   assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT);

   att_state->clear_color_is_zero_one =
      color_is_zero_one(att_state->clear_value.color, iview->isl.format);

   if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
      /* Start off assuming fast clears are possible */
      att_state->fast_clear = true;

      /* Potentially, we could do partial fast-clears but doing so has crazy
       * alignment restrictions. It's easier to just restrict to full size
       * fast clears for now.
       */
      if (render_area.offset.x != 0 ||
          render_area.offset.y != 0 ||
          render_area.extent.width != iview->extent.width ||
          render_area.extent.height != iview->extent.height)
         att_state->fast_clear = false;

      if (GEN_GEN <= 7) {
         /* On gen7, we can't do multi-LOD or multi-layer fast-clears. We
          * technically can, but it comes with crazy restrictions that we
          * don't want to deal with now.
          */
         if (iview->isl.base_level > 0 ||
             iview->isl.base_array_layer > 0 ||
             iview->isl.array_len > 1)
            att_state->fast_clear = false;
      }

      /* On Broadwell and earlier, we can only handle 0/1 clear colors */
      if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one)
         att_state->fast_clear = false;

      if (att_state->fast_clear) {
         memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32,
                sizeof(fast_clear_color->u32));
      }
   } else {
      att_state->fast_clear = false;
   }

   if (isl_format_supports_lossless_compression(&device->info,
                                                iview->isl.format)) {
      att_state->aux_usage = ISL_AUX_USAGE_CCS_E;
      att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
   } else if (att_state->fast_clear) {
      att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
      if (GEN_GEN >= 9) {
         /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
          *
          *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
          *    setting is only allowed if Surface Format supported for Fast
          *    Clear. In addition, if the surface is bound to the sampling
          *    engine, Surface Format must be supported for Render Target
          *    Compression for surfaces bound to the sampling engine."
          *
          * In other words, we can't sample from a fast-cleared image if it
          * doesn't also support color compression.
          */
         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
      } else if (GEN_GEN == 8) {
         /* Broadwell can sample from fast-cleared images */
         att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;
      } else {
         /* Ivy Bridge and Haswell cannot */
         att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
      }
   } else {
      att_state->aux_usage = ISL_AUX_USAGE_NONE;
      att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
   }
}

static bool
need_input_attachment_state(const struct anv_render_pass_attachment *att)
{
   if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
      return false;

   /* We only allocate input attachment states for color surfaces. Compression
    * is not yet enabled for depth textures and stencil doesn't allow
    * compression so we can just use the texture surface state from the view.
    */
   return vk_format_is_color(att->format);
}

static enum isl_aux_usage
layout_to_hiz_usage(VkImageLayout layout, uint8_t samples)
{
   switch (layout) {
   case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
      return ISL_AUX_USAGE_HIZ;
   case VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL:
   case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
      if (anv_can_sample_with_hiz(GEN_GEN, samples))
         return ISL_AUX_USAGE_HIZ;
      /* Fall-through */
   case VK_IMAGE_LAYOUT_GENERAL:
      /* This buffer could be used as a source or destination in a transfer
       * operation. Transfer operations currently don't perform HiZ-enabled
       * reads and writes.
       */
   default:
      return ISL_AUX_USAGE_NONE;
   }
}

/* Transitions a HiZ-enabled depth buffer from one layout to another.
Unless 336 * the initial layout is undefined, the HiZ buffer and depth buffer will 337 * represent the same data at the end of this operation. 338 */ 339 static void 340 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, 341 const struct anv_image *image, 342 VkImageLayout initial_layout, 343 VkImageLayout final_layout) 344 { 345 assert(image); 346 347 if (image->aux_usage != ISL_AUX_USAGE_HIZ || final_layout == initial_layout) 348 return; 349 350 const bool hiz_enabled = layout_to_hiz_usage(initial_layout, image->samples) == 351 ISL_AUX_USAGE_HIZ; 352 const bool enable_hiz = layout_to_hiz_usage(final_layout, image->samples) == 353 ISL_AUX_USAGE_HIZ; 354 355 enum blorp_hiz_op hiz_op; 356 if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED) { 357 /* We've already initialized the aux HiZ buffer at BindImageMemory time, 358 * so there's no need to perform a HIZ resolve or clear to avoid GPU hangs. 359 * This initial layout indicates that the user doesn't care about the data 360 * that's currently in the buffer, so resolves are not necessary except 361 * for the special case noted below. 362 */ 363 hiz_op = BLORP_HIZ_OP_NONE; 364 } else if (hiz_enabled && !enable_hiz) { 365 hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE; 366 } else if (!hiz_enabled && enable_hiz) { 367 hiz_op = BLORP_HIZ_OP_HIZ_RESOLVE; 368 } else { 369 assert(hiz_enabled == enable_hiz); 370 /* If the same buffer will be used, no resolves are necessary except for 371 * the special case noted below. 372 */ 373 hiz_op = BLORP_HIZ_OP_NONE; 374 } 375 376 if (hiz_op != BLORP_HIZ_OP_NONE) 377 anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op); 378 379 /* Images that have sampling with HiZ enabled cause all shader sampling to 380 * load data with the HiZ buffer. Therefore, in the case of transitioning to 381 * the general layout - which currently routes all writes to the depth 382 * buffer - we must ensure that the HiZ buffer remains consistent with the 383 * depth buffer by performing an additional HIZ resolve if the operation 384 * required by this transition was not already a HiZ resolve. 385 */ 386 if (final_layout == VK_IMAGE_LAYOUT_GENERAL && 387 anv_can_sample_with_hiz(GEN_GEN, image->samples) && 388 hiz_op != BLORP_HIZ_OP_HIZ_RESOLVE) { 389 anv_gen8_hiz_op_resolve(cmd_buffer, image, BLORP_HIZ_OP_HIZ_RESOLVE); 390 } 391 } 392 393 394 /** 395 * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass. 396 */ 397 static void 398 genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer, 399 struct anv_render_pass *pass, 400 const VkRenderPassBeginInfo *begin) 401 { 402 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; 403 struct anv_cmd_state *state = &cmd_buffer->state; 404 405 vk_free(&cmd_buffer->pool->alloc, state->attachments); 406 407 if (pass->attachment_count == 0) { 408 state->attachments = NULL; 409 return; 410 } 411 412 state->attachments = vk_alloc(&cmd_buffer->pool->alloc, 413 pass->attachment_count * 414 sizeof(state->attachments[0]), 415 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 416 if (state->attachments == NULL) { 417 /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */ 418 abort(); 419 } 420 421 bool need_null_state = false; 422 unsigned num_states = 0; 423 for (uint32_t i = 0; i < pass->attachment_count; ++i) { 424 if (vk_format_is_color(pass->attachments[i].format)) { 425 num_states++; 426 } else { 427 /* We need a null state for any depth-stencil-only subpasses. 
428 * Importantly, this includes depth/stencil clears so we create one 429 * whenever we have depth or stencil 430 */ 431 need_null_state = true; 432 } 433 434 if (need_input_attachment_state(&pass->attachments[i])) 435 num_states++; 436 } 437 num_states += need_null_state; 438 439 const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align); 440 state->render_pass_states = 441 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 442 num_states * ss_stride, isl_dev->ss.align); 443 444 struct anv_state next_state = state->render_pass_states; 445 next_state.alloc_size = isl_dev->ss.size; 446 447 if (need_null_state) { 448 state->null_surface_state = next_state; 449 next_state.offset += ss_stride; 450 next_state.map += ss_stride; 451 } 452 453 for (uint32_t i = 0; i < pass->attachment_count; ++i) { 454 if (vk_format_is_color(pass->attachments[i].format)) { 455 state->attachments[i].color_rt_state = next_state; 456 next_state.offset += ss_stride; 457 next_state.map += ss_stride; 458 } 459 460 if (need_input_attachment_state(&pass->attachments[i])) { 461 state->attachments[i].input_att_state = next_state; 462 next_state.offset += ss_stride; 463 next_state.map += ss_stride; 464 } 465 } 466 assert(next_state.offset == state->render_pass_states.offset + 467 state->render_pass_states.alloc_size); 468 469 if (begin) { 470 ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer); 471 assert(pass->attachment_count == framebuffer->attachment_count); 472 473 if (need_null_state) { 474 struct GENX(RENDER_SURFACE_STATE) null_ss = { 475 .SurfaceType = SURFTYPE_NULL, 476 .SurfaceArray = framebuffer->layers > 0, 477 .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM, 478 #if GEN_GEN >= 8 479 .TileMode = YMAJOR, 480 #else 481 .TiledSurface = true, 482 #endif 483 .Width = framebuffer->width - 1, 484 .Height = framebuffer->height - 1, 485 .Depth = framebuffer->layers - 1, 486 .RenderTargetViewExtent = framebuffer->layers - 1, 487 }; 488 GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map, 489 &null_ss); 490 } 491 492 for (uint32_t i = 0; i < pass->attachment_count; ++i) { 493 struct anv_render_pass_attachment *att = &pass->attachments[i]; 494 VkImageAspectFlags att_aspects = vk_format_aspects(att->format); 495 VkImageAspectFlags clear_aspects = 0; 496 497 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { 498 /* color attachment */ 499 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 500 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; 501 } 502 } else { 503 /* depthstencil attachment */ 504 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && 505 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 506 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; 507 } 508 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && 509 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { 510 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; 511 } 512 } 513 514 state->attachments[i].current_layout = att->initial_layout; 515 state->attachments[i].pending_clear_aspects = clear_aspects; 516 if (clear_aspects) 517 state->attachments[i].clear_value = begin->pClearValues[i]; 518 519 struct anv_image_view *iview = framebuffer->attachments[i]; 520 assert(iview->vk_format == att->format); 521 522 union isl_color_value clear_color = { .u32 = { 0, } }; 523 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { 524 color_attachment_compute_aux_usage(cmd_buffer->device, 525 &state->attachments[i], 526 iview, begin->renderArea, 527 &clear_color); 528 529 struct isl_view view = iview->isl; 530 view.usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; 
531 isl_surf_fill_state(isl_dev, 532 state->attachments[i].color_rt_state.map, 533 .surf = &iview->image->color_surface.isl, 534 .view = &view, 535 .aux_surf = &iview->image->aux_surface.isl, 536 .aux_usage = state->attachments[i].aux_usage, 537 .clear_color = clear_color, 538 .mocs = cmd_buffer->device->default_mocs); 539 540 add_image_view_relocs(cmd_buffer, iview, 541 state->attachments[i].aux_usage, 542 state->attachments[i].color_rt_state); 543 } else { 544 if (iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { 545 state->attachments[i].aux_usage = 546 layout_to_hiz_usage(att->initial_layout, iview->image->samples); 547 } else { 548 state->attachments[i].aux_usage = ISL_AUX_USAGE_NONE; 549 } 550 state->attachments[i].input_aux_usage = ISL_AUX_USAGE_NONE; 551 } 552 553 if (need_input_attachment_state(&pass->attachments[i])) { 554 struct isl_view view = iview->isl; 555 view.usage |= ISL_SURF_USAGE_TEXTURE_BIT; 556 isl_surf_fill_state(isl_dev, 557 state->attachments[i].input_att_state.map, 558 .surf = &iview->image->color_surface.isl, 559 .view = &view, 560 .aux_surf = &iview->image->aux_surface.isl, 561 .aux_usage = state->attachments[i].input_aux_usage, 562 .clear_color = clear_color, 563 .mocs = cmd_buffer->device->default_mocs); 564 565 add_image_view_relocs(cmd_buffer, iview, 566 state->attachments[i].input_aux_usage, 567 state->attachments[i].input_att_state); 568 } 569 } 570 571 if (!cmd_buffer->device->info.has_llc) 572 anv_state_clflush(state->render_pass_states); 573 } 574 } 575 576 VkResult 577 genX(BeginCommandBuffer)( 578 VkCommandBuffer commandBuffer, 579 const VkCommandBufferBeginInfo* pBeginInfo) 580 { 581 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 582 583 /* If this is the first vkBeginCommandBuffer, we must *initialize* the 584 * command buffer's state. Otherwise, we must *reset* its state. In both 585 * cases we reset it. 586 * 587 * From the Vulkan 1.0 spec: 588 * 589 * If a command buffer is in the executable state and the command buffer 590 * was allocated from a command pool with the 591 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then 592 * vkBeginCommandBuffer implicitly resets the command buffer, behaving 593 * as if vkResetCommandBuffer had been called with 594 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts 595 * the command buffer in the recording state. 
596 */ 597 anv_cmd_buffer_reset(cmd_buffer); 598 599 cmd_buffer->usage_flags = pBeginInfo->flags; 600 601 assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY || 602 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)); 603 604 genX(cmd_buffer_emit_state_base_address)(cmd_buffer); 605 606 if (cmd_buffer->usage_flags & 607 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { 608 cmd_buffer->state.pass = 609 anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); 610 cmd_buffer->state.subpass = 611 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; 612 cmd_buffer->state.framebuffer = NULL; 613 614 genX(cmd_buffer_setup_attachments)(cmd_buffer, cmd_buffer->state.pass, 615 NULL); 616 617 cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; 618 } 619 620 return VK_SUCCESS; 621 } 622 623 VkResult 624 genX(EndCommandBuffer)( 625 VkCommandBuffer commandBuffer) 626 { 627 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 628 629 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 630 631 anv_cmd_buffer_end_batch_buffer(cmd_buffer); 632 633 return VK_SUCCESS; 634 } 635 636 void 637 genX(CmdExecuteCommands)( 638 VkCommandBuffer commandBuffer, 639 uint32_t commandBufferCount, 640 const VkCommandBuffer* pCmdBuffers) 641 { 642 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer); 643 644 assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); 645 646 /* The secondary command buffer doesn't know which textures etc. have been 647 * flushed prior to their execution. Apply those flushes now. 648 */ 649 genX(cmd_buffer_apply_pipe_flushes)(primary); 650 651 for (uint32_t i = 0; i < commandBufferCount; i++) { 652 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); 653 654 assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); 655 656 if (secondary->usage_flags & 657 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { 658 /* If we're continuing a render pass from the primary, we need to 659 * copy the surface states for the current subpass into the storage 660 * we allocated for them in BeginCommandBuffer. 661 */ 662 struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo; 663 struct anv_state src_state = primary->state.render_pass_states; 664 struct anv_state dst_state = secondary->state.render_pass_states; 665 assert(src_state.alloc_size == dst_state.alloc_size); 666 667 genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset, 668 ss_bo, src_state.offset, 669 src_state.alloc_size); 670 } 671 672 anv_cmd_buffer_add_secondary(primary, secondary); 673 } 674 675 /* Each of the secondary command buffers will use its own state base 676 * address. We need to re-emit state base address for the primary after 677 * all of the secondaries are done. 678 * 679 * TODO: Maybe we want to make this a dirty bit to avoid extra state base 680 * address calls? 681 */ 682 genX(cmd_buffer_emit_state_base_address)(primary); 683 } 684 685 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000 686 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000 687 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000 688 689 /** 690 * Program the hardware to use the specified L3 configuration. 
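 * On gen8+ this boils down to a single L3CNTLREG write; on gen7 the
 * partitioning is spread across L3SQCREG1, L3CNTLREG2 and L3CNTLREG3 (plus
 * SCRATCH1/CHICKEN3 on Haswell to enable or disable L3 atomics), as emitted
 * below.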
691 */ 692 void 693 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, 694 const struct gen_l3_config *cfg) 695 { 696 assert(cfg); 697 if (cfg == cmd_buffer->state.current_l3_config) 698 return; 699 700 if (unlikely(INTEL_DEBUG & DEBUG_L3)) { 701 fprintf(stderr, "L3 config transition: "); 702 gen_dump_l3_config(cfg, stderr); 703 } 704 705 const bool has_slm = cfg->n[GEN_L3P_SLM]; 706 707 /* According to the hardware docs, the L3 partitioning can only be changed 708 * while the pipeline is completely drained and the caches are flushed, 709 * which involves a first PIPE_CONTROL flush which stalls the pipeline... 710 */ 711 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 712 pc.DCFlushEnable = true; 713 pc.PostSyncOperation = NoWrite; 714 pc.CommandStreamerStallEnable = true; 715 } 716 717 /* ...followed by a second pipelined PIPE_CONTROL that initiates 718 * invalidation of the relevant caches. Note that because RO invalidation 719 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL 720 * command is processed by the CS) we cannot combine it with the previous 721 * stalling flush as the hardware documentation suggests, because that 722 * would cause the CS to stall on previous rendering *after* RO 723 * invalidation and wouldn't prevent the RO caches from being polluted by 724 * concurrent rendering before the stall completes. This intentionally 725 * doesn't implement the SKL+ hardware workaround suggesting to enable CS 726 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for 727 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs 728 * already guarantee that there is no concurrent GPGPU kernel execution 729 * (see SKL HSD 2132585). 730 */ 731 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 732 pc.TextureCacheInvalidationEnable = true; 733 pc.ConstantCacheInvalidationEnable = true; 734 pc.InstructionCacheInvalidateEnable = true; 735 pc.StateCacheInvalidationEnable = true; 736 pc.PostSyncOperation = NoWrite; 737 } 738 739 /* Now send a third stalling flush to make sure that invalidation is 740 * complete when the L3 configuration registers are modified. 741 */ 742 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 743 pc.DCFlushEnable = true; 744 pc.PostSyncOperation = NoWrite; 745 pc.CommandStreamerStallEnable = true; 746 } 747 748 #if GEN_GEN >= 8 749 750 assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); 751 752 uint32_t l3cr; 753 anv_pack_struct(&l3cr, GENX(L3CNTLREG), 754 .SLMEnable = has_slm, 755 .URBAllocation = cfg->n[GEN_L3P_URB], 756 .ROAllocation = cfg->n[GEN_L3P_RO], 757 .DCAllocation = cfg->n[GEN_L3P_DC], 758 .AllAllocation = cfg->n[GEN_L3P_ALL]); 759 760 /* Set up the L3 partitioning. */ 761 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr); 762 763 #else 764 765 const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL]; 766 const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] || 767 cfg->n[GEN_L3P_ALL]; 768 const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] || 769 cfg->n[GEN_L3P_ALL]; 770 const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] || 771 cfg->n[GEN_L3P_ALL]; 772 773 assert(!cfg->n[GEN_L3P_ALL]); 774 775 /* When enabled SLM only uses a portion of the L3 on half of the banks, 776 * the matching space on the remaining banks has to be allocated to a 777 * client (URB for all validated configurations) set to the 778 * lower-bandwidth 2-bank address hashing mode. 
779 */ 780 const struct gen_device_info *devinfo = &cmd_buffer->device->info; 781 const bool urb_low_bw = has_slm && !devinfo->is_baytrail; 782 assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); 783 784 /* Minimum number of ways that can be allocated to the URB. */ 785 MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0; 786 assert(cfg->n[GEN_L3P_URB] >= n0_urb); 787 788 uint32_t l3sqcr1, l3cr2, l3cr3; 789 anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1), 790 .ConvertDC_UC = !has_dc, 791 .ConvertIS_UC = !has_is, 792 .ConvertC_UC = !has_c, 793 .ConvertT_UC = !has_t); 794 l3sqcr1 |= 795 GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT : 796 devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT : 797 IVB_L3SQCREG1_SQGHPCI_DEFAULT; 798 799 anv_pack_struct(&l3cr2, GENX(L3CNTLREG2), 800 .SLMEnable = has_slm, 801 .URBLowBandwidth = urb_low_bw, 802 .URBAllocation = cfg->n[GEN_L3P_URB], 803 #if !GEN_IS_HASWELL 804 .ALLAllocation = cfg->n[GEN_L3P_ALL], 805 #endif 806 .ROAllocation = cfg->n[GEN_L3P_RO], 807 .DCAllocation = cfg->n[GEN_L3P_DC]); 808 809 anv_pack_struct(&l3cr3, GENX(L3CNTLREG3), 810 .ISAllocation = cfg->n[GEN_L3P_IS], 811 .ISLowBandwidth = 0, 812 .CAllocation = cfg->n[GEN_L3P_C], 813 .CLowBandwidth = 0, 814 .TAllocation = cfg->n[GEN_L3P_T], 815 .TLowBandwidth = 0); 816 817 /* Set up the L3 partitioning. */ 818 emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1); 819 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2); 820 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3); 821 822 #if GEN_IS_HASWELL 823 if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) { 824 /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep 825 * them disabled to avoid crashing the system hard. 826 */ 827 uint32_t scratch1, chicken3; 828 anv_pack_struct(&scratch1, GENX(SCRATCH1), 829 .L3AtomicDisable = !has_dc); 830 anv_pack_struct(&chicken3, GENX(CHICKEN3), 831 .L3AtomicDisableMask = true, 832 .L3AtomicDisable = !has_dc); 833 emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1); 834 emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3); 835 } 836 #endif 837 838 #endif 839 840 cmd_buffer->state.current_l3_config = cfg; 841 } 842 843 void 844 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) 845 { 846 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; 847 848 /* Flushes are pipelined while invalidations are handled immediately. 849 * Therefore, if we're flushing anything then we need to schedule a stall 850 * before any invalidations can happen. 851 */ 852 if (bits & ANV_PIPE_FLUSH_BITS) 853 bits |= ANV_PIPE_NEEDS_CS_STALL_BIT; 854 855 /* If we're going to do an invalidate and we have a pending CS stall that 856 * has yet to be resolved, we do the CS stall now. 
    */
   if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
       (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
      bits |= ANV_PIPE_CS_STALL_BIT;
      bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
   }

   if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
         pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
         pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
         pipe.RenderTargetCacheFlushEnable =
            bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;

         pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
         pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
         pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;

         /*
          * According to the Broadwell documentation, any PIPE_CONTROL with the
          * "Command Streamer Stall" bit set must also have at least one of the
          * following bits set:
          *
          *  - Render Target Cache Flush
          *  - Depth Cache Flush
          *  - Stall at Pixel Scoreboard
          *  - Post-Sync Operation
          *  - Depth Stall
          *  - DC Flush Enable
          *
          * I chose "Stall at Pixel Scoreboard" since that's what we use in
          * Mesa and it seems to work fine. The choice is fairly arbitrary.
          */
         if ((bits & ANV_PIPE_CS_STALL_BIT) &&
             !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
                       ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
            pipe.StallAtPixelScoreboard = true;
      }

      bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
   }

   if (bits & ANV_PIPE_INVALIDATE_BITS) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
         pipe.StateCacheInvalidationEnable =
            bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
         pipe.ConstantCacheInvalidationEnable =
            bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
         pipe.VFCacheInvalidationEnable =
            bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
         pipe.TextureCacheInvalidationEnable =
            bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
         pipe.InstructionCacheInvalidateEnable =
            bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
      }

      bits &= ~ANV_PIPE_INVALIDATE_BITS;
   }

   cmd_buffer->state.pending_pipe_bits = bits;
}

void genX(CmdPipelineBarrier)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags                        srcStageMask,
    VkPipelineStageFlags                        destStageMask,
    VkBool32                                    byRegion,
    uint32_t                                    memoryBarrierCount,
    const VkMemoryBarrier*                      pMemoryBarriers,
    uint32_t                                    bufferMemoryBarrierCount,
    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
    uint32_t                                    imageMemoryBarrierCount,
    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   uint32_t b;

   /* XXX: Right now, we're really dumb and just flush whatever categories
    * the app asks for. One of these days we may make this a bit better
    * but right now that's all the hardware allows for in most areas.
    */
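   /* Gather the union of all source and destination access masks, then
    * translate writes into the pipe flush bits and reads into the pipe
    * invalidate bits that genX(cmd_buffer_apply_pipe_flushes) consumes.
    */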
   VkAccessFlags src_flags = 0;
   VkAccessFlags dst_flags = 0;

   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      src_flags |= pMemoryBarriers[i].srcAccessMask;
      dst_flags |= pMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
      dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      src_flags |= pImageMemoryBarriers[i].srcAccessMask;
      dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
      ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
      if (pImageMemoryBarriers[i].subresourceRange.aspectMask &
          VK_IMAGE_ASPECT_DEPTH_BIT) {
         transition_depth_buffer(cmd_buffer, image,
                                 pImageMemoryBarriers[i].oldLayout,
                                 pImageMemoryBarriers[i].newLayout);
      }
   }

   enum anv_pipe_bits pipe_bits = 0;

   for_each_bit(b, src_flags) {
      switch ((VkAccessFlagBits)(1 << b)) {
      case VK_ACCESS_SHADER_WRITE_BIT:
         pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
         break;
      case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
         pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
         break;
      case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
         pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
         break;
      case VK_ACCESS_TRANSFER_WRITE_BIT:
         pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
         pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
         break;
      default:
         break; /* Nothing to do */
      }
   }

   for_each_bit(b, dst_flags) {
      switch ((VkAccessFlagBits)(1 << b)) {
      case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
      case VK_ACCESS_INDEX_READ_BIT:
      case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
         pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
         break;
      case VK_ACCESS_UNIFORM_READ_BIT:
         pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
         pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
         break;
      case VK_ACCESS_SHADER_READ_BIT:
      case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
      case VK_ACCESS_TRANSFER_READ_BIT:
         pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
         break;
      default:
         break; /* Nothing to do */
      }
   }

   cmd_buffer->state.pending_pipe_bits |= pipe_bits;
}

static void
cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
   VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;

   /* In order to avoid thrashing, we assume that vertex and fragment stages
    * always exist. In the rare case where one is missing *and* the other
    * uses push constants, this may be suboptimal. However, avoiding stalls
    * seems more important.
    */
   stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;

   if (stages == cmd_buffer->state.push_constant_stages)
      return;

#if GEN_GEN >= 8
   const unsigned push_constant_kb = 32;
#elif GEN_IS_HASWELL
   const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
#else
   const unsigned push_constant_kb = 16;
#endif

   const unsigned num_stages =
      _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
   unsigned size_per_stage = push_constant_kb / num_stages;

   /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
    * units of 2KB. Incidentally, these are the same platforms that have
    * 32KB worth of push constant space.
    */
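   /* A quick worked example of the sizing (numbers only illustrate the code
    * above and below; they are not from the PRMs): with all five graphics
    * stages active on a 32KB part, each of VS/HS/DS/GS gets 32 / 5 = 6KB,
    * already a multiple of 2KB, and the PS allocation below picks up the
    * remaining 32 - 4 * 6 = 8KB. With only VS and FS forced on and 16KB
    * available, VS gets 16 / 2 = 8KB and PS gets the other 8KB.
    */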
   if (push_constant_kb == 32)
      size_per_stage &= ~1u;

   uint32_t kb_used = 0;
   for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
      unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
         alloc.ConstantBufferSize = push_size;
      }
      kb_used += push_size;
   }

   anv_batch_emit(&cmd_buffer->batch,
                  GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
      alloc.ConstantBufferOffset = kb_used;
      alloc.ConstantBufferSize = push_constant_kb - kb_used;
   }

   cmd_buffer->state.push_constant_stages = stages;

   /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
    *
    *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
    *    the next 3DPRIMITIVE command after programming the
    *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
    *
    * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
    * pipeline setup, we need to dirty push constants.
    */
   cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
}

static VkResult
emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                   gl_shader_stage stage,
                   struct anv_state *bt_state)
{
   struct anv_subpass *subpass = cmd_buffer->state.subpass;
   struct anv_pipeline *pipeline;
   uint32_t bias, state_offset;

   switch (stage) {
   case MESA_SHADER_COMPUTE:
      pipeline = cmd_buffer->state.compute_pipeline;
      bias = 1;
      break;
   default:
      pipeline = cmd_buffer->state.pipeline;
      bias = 0;
      break;
   }

   if (!anv_pipeline_has_stage(pipeline, stage)) {
      *bt_state = (struct anv_state) { 0, };
      return VK_SUCCESS;
   }

   struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
   if (bias + map->surface_count == 0) {
      *bt_state = (struct anv_state) { 0, };
      return VK_SUCCESS;
   }

   *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
                                                  bias + map->surface_count,
                                                  &state_offset);
   uint32_t *bt_map = bt_state->map;

   if (bt_state->map == NULL)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   if (stage == MESA_SHADER_COMPUTE &&
       get_cs_prog_data(cmd_buffer->state.compute_pipeline)->uses_num_work_groups) {
      struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
      uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;

      struct anv_state surface_state;
      surface_state =
         anv_cmd_buffer_alloc_surface_state(cmd_buffer);

      const enum isl_format format =
         anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
      anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
                                    format, bo_offset, 12, 1);

      bt_map[0] = surface_state.offset + state_offset;
      add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
   }

   if (map->surface_count == 0)
      goto out;

   if (map->image_count > 0) {
      VkResult result =
         anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
      if (result != VK_SUCCESS)
         return result;

      cmd_buffer->state.push_constants_dirty |= 1 << stage;
   }

   uint32_t image = 0;
   for (uint32_t s = 0; s < map->surface_count; s++) {
      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];

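      /* Resolve this binding to a SURFACE_STATE. Its offset (relative to the
       * surface state base address) is what lands in the binding table slot,
       * and the reloc helpers record the BO address it points at.
       */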
      struct anv_state surface_state;

      if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) {
         /* Color attachment binding */
         assert(stage == MESA_SHADER_FRAGMENT);
         assert(binding->binding == 0);
         if (binding->index < subpass->color_count) {
            const unsigned att = subpass->color_attachments[binding->index];
            surface_state = cmd_buffer->state.attachments[att].color_rt_state;
         } else {
            surface_state = cmd_buffer->state.null_surface_state;
         }

         bt_map[bias + s] = surface_state.offset + state_offset;
         continue;
      }

      struct anv_descriptor_set *set =
         cmd_buffer->state.descriptors[binding->set];
      uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
      struct anv_descriptor *desc = &set->descriptors[offset + binding->index];

      switch (desc->type) {
      case VK_DESCRIPTOR_TYPE_SAMPLER:
         /* Nothing for us to do here */
         continue;

      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         surface_state = desc->image_view->sampler_surface_state;
         assert(surface_state.alloc_size);
         add_image_view_relocs(cmd_buffer, desc->image_view,
                               desc->image_view->image->aux_usage,
                               surface_state);
         break;

      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         assert(stage == MESA_SHADER_FRAGMENT);
         if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) {
            /* For depth and stencil input attachments, we treat it like any
             * old texture that a user may have bound.
             */
            surface_state = desc->image_view->sampler_surface_state;
            assert(surface_state.alloc_size);
            add_image_view_relocs(cmd_buffer, desc->image_view,
                                  desc->image_view->image->aux_usage,
                                  surface_state);
         } else {
            /* For color input attachments, we create the surface state at
             * vkBeginRenderPass time so that we can include aux and clear
             * color information.
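             * That state is filled out in genX(cmd_buffer_setup_attachments)
             * above, so here we only have to look it up for this subpass.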
1199 */ 1200 assert(binding->input_attachment_index < subpass->input_count); 1201 const unsigned subpass_att = binding->input_attachment_index; 1202 const unsigned att = subpass->input_attachments[subpass_att]; 1203 surface_state = cmd_buffer->state.attachments[att].input_att_state; 1204 } 1205 break; 1206 1207 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { 1208 surface_state = desc->image_view->storage_surface_state; 1209 assert(surface_state.alloc_size); 1210 add_image_view_relocs(cmd_buffer, desc->image_view, 1211 desc->image_view->image->aux_usage, 1212 surface_state); 1213 1214 struct brw_image_param *image_param = 1215 &cmd_buffer->state.push_constants[stage]->images[image++]; 1216 1217 *image_param = desc->image_view->storage_image_param; 1218 image_param->surface_idx = bias + s; 1219 break; 1220 } 1221 1222 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: 1223 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 1224 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: 1225 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: 1226 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: 1227 surface_state = desc->buffer_view->surface_state; 1228 assert(surface_state.alloc_size); 1229 add_surface_state_reloc(cmd_buffer, surface_state, 1230 desc->buffer_view->bo, 1231 desc->buffer_view->offset); 1232 break; 1233 1234 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: 1235 surface_state = desc->buffer_view->storage_surface_state; 1236 assert(surface_state.alloc_size); 1237 add_surface_state_reloc(cmd_buffer, surface_state, 1238 desc->buffer_view->bo, 1239 desc->buffer_view->offset); 1240 1241 struct brw_image_param *image_param = 1242 &cmd_buffer->state.push_constants[stage]->images[image++]; 1243 1244 *image_param = desc->buffer_view->storage_image_param; 1245 image_param->surface_idx = bias + s; 1246 break; 1247 1248 default: 1249 assert(!"Invalid descriptor type"); 1250 continue; 1251 } 1252 1253 bt_map[bias + s] = surface_state.offset + state_offset; 1254 } 1255 assert(image == map->image_count); 1256 1257 out: 1258 if (!cmd_buffer->device->info.has_llc) 1259 anv_state_clflush(*bt_state); 1260 1261 return VK_SUCCESS; 1262 } 1263 1264 static VkResult 1265 emit_samplers(struct anv_cmd_buffer *cmd_buffer, 1266 gl_shader_stage stage, 1267 struct anv_state *state) 1268 { 1269 struct anv_pipeline *pipeline; 1270 1271 if (stage == MESA_SHADER_COMPUTE) 1272 pipeline = cmd_buffer->state.compute_pipeline; 1273 else 1274 pipeline = cmd_buffer->state.pipeline; 1275 1276 if (!anv_pipeline_has_stage(pipeline, stage)) { 1277 *state = (struct anv_state) { 0, }; 1278 return VK_SUCCESS; 1279 } 1280 1281 struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map; 1282 if (map->sampler_count == 0) { 1283 *state = (struct anv_state) { 0, }; 1284 return VK_SUCCESS; 1285 } 1286 1287 uint32_t size = map->sampler_count * 16; 1288 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32); 1289 1290 if (state->map == NULL) 1291 return VK_ERROR_OUT_OF_DEVICE_MEMORY; 1292 1293 for (uint32_t s = 0; s < map->sampler_count; s++) { 1294 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; 1295 struct anv_descriptor_set *set = 1296 cmd_buffer->state.descriptors[binding->set]; 1297 uint32_t offset = set->layout->binding[binding->binding].descriptor_index; 1298 struct anv_descriptor *desc = &set->descriptors[offset + binding->index]; 1299 1300 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && 1301 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) 1302 continue; 1303 1304 struct anv_sampler *sampler = desc->sampler; 1305 1306 
/* This can happen if we have an unfilled slot since TYPE_SAMPLER 1307 * happens to be zero. 1308 */ 1309 if (sampler == NULL) 1310 continue; 1311 1312 memcpy(state->map + (s * 16), 1313 sampler->state, sizeof(sampler->state)); 1314 } 1315 1316 if (!cmd_buffer->device->info.has_llc) 1317 anv_state_clflush(*state); 1318 1319 return VK_SUCCESS; 1320 } 1321 1322 static uint32_t 1323 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer) 1324 { 1325 VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty & 1326 cmd_buffer->state.pipeline->active_stages; 1327 1328 VkResult result = VK_SUCCESS; 1329 anv_foreach_stage(s, dirty) { 1330 result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]); 1331 if (result != VK_SUCCESS) 1332 break; 1333 result = emit_binding_table(cmd_buffer, s, 1334 &cmd_buffer->state.binding_tables[s]); 1335 if (result != VK_SUCCESS) 1336 break; 1337 } 1338 1339 if (result != VK_SUCCESS) { 1340 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); 1341 1342 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); 1343 assert(result == VK_SUCCESS); 1344 1345 /* Re-emit state base addresses so we get the new surface state base 1346 * address before we start emitting binding tables etc. 1347 */ 1348 genX(cmd_buffer_emit_state_base_address)(cmd_buffer); 1349 1350 /* Re-emit all active binding tables */ 1351 dirty |= cmd_buffer->state.pipeline->active_stages; 1352 anv_foreach_stage(s, dirty) { 1353 result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]); 1354 if (result != VK_SUCCESS) 1355 return result; 1356 result = emit_binding_table(cmd_buffer, s, 1357 &cmd_buffer->state.binding_tables[s]); 1358 if (result != VK_SUCCESS) 1359 return result; 1360 } 1361 } 1362 1363 cmd_buffer->state.descriptors_dirty &= ~dirty; 1364 1365 return dirty; 1366 } 1367 1368 static void 1369 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, 1370 uint32_t stages) 1371 { 1372 static const uint32_t sampler_state_opcodes[] = { 1373 [MESA_SHADER_VERTEX] = 43, 1374 [MESA_SHADER_TESS_CTRL] = 44, /* HS */ 1375 [MESA_SHADER_TESS_EVAL] = 45, /* DS */ 1376 [MESA_SHADER_GEOMETRY] = 46, 1377 [MESA_SHADER_FRAGMENT] = 47, 1378 [MESA_SHADER_COMPUTE] = 0, 1379 }; 1380 1381 static const uint32_t binding_table_opcodes[] = { 1382 [MESA_SHADER_VERTEX] = 38, 1383 [MESA_SHADER_TESS_CTRL] = 39, 1384 [MESA_SHADER_TESS_EVAL] = 40, 1385 [MESA_SHADER_GEOMETRY] = 41, 1386 [MESA_SHADER_FRAGMENT] = 42, 1387 [MESA_SHADER_COMPUTE] = 0, 1388 }; 1389 1390 anv_foreach_stage(s, stages) { 1391 if (cmd_buffer->state.samplers[s].alloc_size > 0) { 1392 anv_batch_emit(&cmd_buffer->batch, 1393 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) { 1394 ssp._3DCommandSubOpcode = sampler_state_opcodes[s]; 1395 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset; 1396 } 1397 } 1398 1399 /* Always emit binding table pointers if we're asked to, since on SKL 1400 * this is what flushes push constants. 
*/ 1401 anv_batch_emit(&cmd_buffer->batch, 1402 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) { 1403 btp._3DCommandSubOpcode = binding_table_opcodes[s]; 1404 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset; 1405 } 1406 } 1407 } 1408 1409 static uint32_t 1410 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer) 1411 { 1412 static const uint32_t push_constant_opcodes[] = { 1413 [MESA_SHADER_VERTEX] = 21, 1414 [MESA_SHADER_TESS_CTRL] = 25, /* HS */ 1415 [MESA_SHADER_TESS_EVAL] = 26, /* DS */ 1416 [MESA_SHADER_GEOMETRY] = 22, 1417 [MESA_SHADER_FRAGMENT] = 23, 1418 [MESA_SHADER_COMPUTE] = 0, 1419 }; 1420 1421 VkShaderStageFlags flushed = 0; 1422 1423 anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) { 1424 if (stage == MESA_SHADER_COMPUTE) 1425 continue; 1426 1427 struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage); 1428 1429 if (state.offset == 0) { 1430 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) 1431 c._3DCommandSubOpcode = push_constant_opcodes[stage]; 1432 } else { 1433 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { 1434 c._3DCommandSubOpcode = push_constant_opcodes[stage], 1435 c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) { 1436 #if GEN_GEN >= 9 1437 .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset }, 1438 .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32), 1439 #else 1440 .PointerToConstantBuffer0 = { .offset = state.offset }, 1441 .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32), 1442 #endif 1443 }; 1444 } 1445 } 1446 1447 flushed |= mesa_to_vk_shader_stage(stage); 1448 } 1449 1450 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS; 1451 1452 return flushed; 1453 } 1454 1455 void 1456 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) 1457 { 1458 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; 1459 uint32_t *p; 1460 1461 uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used; 1462 1463 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); 1464 1465 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); 1466 1467 genX(flush_pipeline_select_3d)(cmd_buffer); 1468 1469 if (vb_emit) { 1470 const uint32_t num_buffers = __builtin_popcount(vb_emit); 1471 const uint32_t num_dwords = 1 + num_buffers * 4; 1472 1473 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, 1474 GENX(3DSTATE_VERTEX_BUFFERS)); 1475 uint32_t vb, i = 0; 1476 for_each_bit(vb, vb_emit) { 1477 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; 1478 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset; 1479 1480 struct GENX(VERTEX_BUFFER_STATE) state = { 1481 .VertexBufferIndex = vb, 1482 1483 #if GEN_GEN >= 8 1484 .MemoryObjectControlState = GENX(MOCS), 1485 #else 1486 .BufferAccessType = pipeline->instancing_enable[vb] ? 
INSTANCEDATA : VERTEXDATA, 1487 .InstanceDataStepRate = 1, 1488 .VertexBufferMemoryObjectControlState = GENX(MOCS), 1489 #endif 1490 1491 .AddressModifyEnable = true, 1492 .BufferPitch = pipeline->binding_stride[vb], 1493 .BufferStartingAddress = { buffer->bo, buffer->offset + offset }, 1494 1495 #if GEN_GEN >= 8 1496 .BufferSize = buffer->size - offset 1497 #else 1498 .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1}, 1499 #endif 1500 }; 1501 1502 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); 1503 i++; 1504 } 1505 } 1506 1507 cmd_buffer->state.vb_dirty &= ~vb_emit; 1508 1509 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) { 1510 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); 1511 1512 /* The exact descriptor layout is pulled from the pipeline, so we need 1513 * to re-emit binding tables on every pipeline change. 1514 */ 1515 cmd_buffer->state.descriptors_dirty |= 1516 cmd_buffer->state.pipeline->active_stages; 1517 1518 /* If the pipeline changed, we may need to re-allocate push constant 1519 * space in the URB. 1520 */ 1521 cmd_buffer_alloc_push_constants(cmd_buffer); 1522 } 1523 1524 #if GEN_GEN <= 7 1525 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT || 1526 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) { 1527 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: 1528 * 1529 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth 1530 * stall needs to be sent just prior to any 3DSTATE_VS, 1531 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS, 1532 * 3DSTATE_BINDING_TABLE_POINTER_VS, 1533 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one 1534 * PIPE_CONTROL needs to be sent before any combination of VS 1535 * associated 3DSTATE." 1536 */ 1537 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 1538 pc.DepthStallEnable = true; 1539 pc.PostSyncOperation = WriteImmediateData; 1540 pc.Address = 1541 (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 }; 1542 } 1543 } 1544 #endif 1545 1546 /* Render targets live in the same binding table as fragment descriptors */ 1547 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) 1548 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; 1549 1550 /* We emit the binding tables and sampler tables first, then emit push 1551 * constants and then finally emit binding table and sampler table 1552 * pointers. It has to happen in this order, since emitting the binding 1553 * tables may change the push constants (in case of storage images). After 1554 * emitting push constants, on SKL+ we have to emit the corresponding 1555 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. 1556 */ 1557 uint32_t dirty = 0; 1558 if (cmd_buffer->state.descriptors_dirty) 1559 dirty = flush_descriptor_sets(cmd_buffer); 1560 1561 if (cmd_buffer->state.push_constants_dirty) { 1562 #if GEN_GEN >= 9 1563 /* On Sky Lake and later, the binding table pointers commands are 1564 * what actually flush the changes to push constant state so we need 1565 * to dirty them so they get re-emitted below. 
1566 */ 1567 dirty |= cmd_buffer_flush_push_constants(cmd_buffer); 1568 #else 1569 cmd_buffer_flush_push_constants(cmd_buffer); 1570 #endif 1571 } 1572 1573 if (dirty) 1574 cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty); 1575 1576 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) 1577 gen8_cmd_buffer_emit_viewport(cmd_buffer); 1578 1579 if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | 1580 ANV_CMD_DIRTY_PIPELINE)) { 1581 gen8_cmd_buffer_emit_depth_viewport(cmd_buffer, 1582 pipeline->depth_clamp_enable); 1583 } 1584 1585 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR) 1586 gen7_cmd_buffer_emit_scissor(cmd_buffer); 1587 1588 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); 1589 1590 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 1591 } 1592 1593 static void 1594 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, 1595 struct anv_bo *bo, uint32_t offset) 1596 { 1597 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, 1598 GENX(3DSTATE_VERTEX_BUFFERS)); 1599 1600 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, 1601 &(struct GENX(VERTEX_BUFFER_STATE)) { 1602 .VertexBufferIndex = 32, /* Reserved for this */ 1603 .AddressModifyEnable = true, 1604 .BufferPitch = 0, 1605 #if (GEN_GEN >= 8) 1606 .MemoryObjectControlState = GENX(MOCS), 1607 .BufferStartingAddress = { bo, offset }, 1608 .BufferSize = 8 1609 #else 1610 .VertexBufferMemoryObjectControlState = GENX(MOCS), 1611 .BufferStartingAddress = { bo, offset }, 1612 .EndAddress = { bo, offset + 8 }, 1613 #endif 1614 }); 1615 } 1616 1617 static void 1618 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, 1619 uint32_t base_vertex, uint32_t base_instance) 1620 { 1621 struct anv_state id_state = 1622 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); 1623 1624 ((uint32_t *)id_state.map)[0] = base_vertex; 1625 ((uint32_t *)id_state.map)[1] = base_instance; 1626 1627 if (!cmd_buffer->device->info.has_llc) 1628 anv_state_clflush(id_state); 1629 1630 emit_base_vertex_instance_bo(cmd_buffer, 1631 &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset); 1632 } 1633 1634 void genX(CmdDraw)( 1635 VkCommandBuffer commandBuffer, 1636 uint32_t vertexCount, 1637 uint32_t instanceCount, 1638 uint32_t firstVertex, 1639 uint32_t firstInstance) 1640 { 1641 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1642 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; 1643 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 1644 1645 genX(cmd_buffer_flush_state)(cmd_buffer); 1646 1647 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) 1648 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); 1649 1650 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 1651 prim.VertexAccessType = SEQUENTIAL; 1652 prim.PrimitiveTopologyType = pipeline->topology; 1653 prim.VertexCountPerInstance = vertexCount; 1654 prim.StartVertexLocation = firstVertex; 1655 prim.InstanceCount = instanceCount; 1656 prim.StartInstanceLocation = firstInstance; 1657 prim.BaseVertexLocation = 0; 1658 } 1659 } 1660 1661 void genX(CmdDrawIndexed)( 1662 VkCommandBuffer commandBuffer, 1663 uint32_t indexCount, 1664 uint32_t instanceCount, 1665 uint32_t firstIndex, 1666 int32_t vertexOffset, 1667 uint32_t firstInstance) 1668 { 1669 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1670 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; 1671 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 1672 
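   /* Flush any dirty pipeline, vertex buffer, descriptor and push constant
    * state (see genX(cmd_buffer_flush_state) above) before emitting the draw.
    */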
1673 genX(cmd_buffer_flush_state)(cmd_buffer); 1674 1675 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) 1676 emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance); 1677 1678 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 1679 prim.VertexAccessType = RANDOM; 1680 prim.PrimitiveTopologyType = pipeline->topology; 1681 prim.VertexCountPerInstance = indexCount; 1682 prim.StartVertexLocation = firstIndex; 1683 prim.InstanceCount = instanceCount; 1684 prim.StartInstanceLocation = firstInstance; 1685 prim.BaseVertexLocation = vertexOffset; 1686 } 1687 } 1688 1689 /* Auto-Draw / Indirect Registers */ 1690 #define GEN7_3DPRIM_END_OFFSET 0x2420 1691 #define GEN7_3DPRIM_START_VERTEX 0x2430 1692 #define GEN7_3DPRIM_VERTEX_COUNT 0x2434 1693 #define GEN7_3DPRIM_INSTANCE_COUNT 0x2438 1694 #define GEN7_3DPRIM_START_INSTANCE 0x243C 1695 #define GEN7_3DPRIM_BASE_VERTEX 0x2440 1696 1697 void genX(CmdDrawIndirect)( 1698 VkCommandBuffer commandBuffer, 1699 VkBuffer _buffer, 1700 VkDeviceSize offset, 1701 uint32_t drawCount, 1702 uint32_t stride) 1703 { 1704 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1705 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 1706 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; 1707 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 1708 struct anv_bo *bo = buffer->bo; 1709 uint32_t bo_offset = buffer->offset + offset; 1710 1711 genX(cmd_buffer_flush_state)(cmd_buffer); 1712 1713 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) 1714 emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8); 1715 1716 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); 1717 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); 1718 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); 1719 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12); 1720 emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0); 1721 1722 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 1723 prim.IndirectParameterEnable = true; 1724 prim.VertexAccessType = SEQUENTIAL; 1725 prim.PrimitiveTopologyType = pipeline->topology; 1726 } 1727 } 1728 1729 void genX(CmdDrawIndexedIndirect)( 1730 VkCommandBuffer commandBuffer, 1731 VkBuffer _buffer, 1732 VkDeviceSize offset, 1733 uint32_t drawCount, 1734 uint32_t stride) 1735 { 1736 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1737 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 1738 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; 1739 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 1740 struct anv_bo *bo = buffer->bo; 1741 uint32_t bo_offset = buffer->offset + offset; 1742 1743 genX(cmd_buffer_flush_state)(cmd_buffer); 1744 1745 /* TODO: We need to stomp base vertex to 0 somehow */ 1746 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) 1747 emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12); 1748 1749 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); 1750 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); 1751 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); 1752 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12); 1753 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16); 1754 1755 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { 1756 
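      /* Descriptive note: the draw parameters were already loaded from the
       * indirect buffer into the 3DPRIM_* registers above, so the packet
       * itself only needs the access type and topology;
       * IndirectParameterEnable tells the hardware to pull the counts from
       * those registers.
       */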
prim.IndirectParameterEnable = true; 1757 prim.VertexAccessType = RANDOM; 1758 prim.PrimitiveTopologyType = pipeline->topology; 1759 } 1760 } 1761 1762 static VkResult 1763 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) 1764 { 1765 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; 1766 struct anv_state surfaces = { 0, }, samplers = { 0, }; 1767 VkResult result; 1768 1769 result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); 1770 if (result != VK_SUCCESS) { 1771 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); 1772 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); 1773 assert(result == VK_SUCCESS); 1774 1775 /* Re-emit state base addresses so we get the new surface state base 1776 * address before we start emitting binding tables etc. 1777 */ 1778 genX(cmd_buffer_emit_state_base_address)(cmd_buffer); 1779 1780 result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); 1781 assert(result == VK_SUCCESS); 1782 } 1783 1784 result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); 1785 assert(result == VK_SUCCESS); 1786 1787 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; 1788 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { 1789 .BindingTablePointer = surfaces.offset, 1790 .SamplerStatePointer = samplers.offset, 1791 }; 1792 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); 1793 1794 struct anv_state state = 1795 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, 1796 pipeline->interface_descriptor_data, 1797 GENX(INTERFACE_DESCRIPTOR_DATA_length), 1798 64); 1799 1800 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); 1801 anv_batch_emit(&cmd_buffer->batch, 1802 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { 1803 mid.InterfaceDescriptorTotalLength = size; 1804 mid.InterfaceDescriptorDataStartAddress = state.offset; 1805 } 1806 1807 return VK_SUCCESS; 1808 } 1809 1810 void 1811 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) 1812 { 1813 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; 1814 MAYBE_UNUSED VkResult result; 1815 1816 assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT); 1817 1818 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); 1819 1820 genX(flush_pipeline_select_gpgpu)(cmd_buffer); 1821 1822 if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE) { 1823 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: 1824 * 1825 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless 1826 * the only bits that are changed are scoreboard related: Scoreboard 1827 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For 1828 * these scoreboard related states, a MEDIA_STATE_FLUSH is 1829 * sufficient." 
1830 */ 1831 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; 1832 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 1833 1834 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch); 1835 } 1836 1837 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || 1838 (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) { 1839 /* FIXME: figure out descriptors for gen7 */ 1840 result = flush_compute_descriptor_set(cmd_buffer); 1841 assert(result == VK_SUCCESS); 1842 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; 1843 } 1844 1845 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { 1846 struct anv_state push_state = 1847 anv_cmd_buffer_cs_push_constants(cmd_buffer); 1848 1849 if (push_state.alloc_size) { 1850 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { 1851 curbe.CURBETotalDataLength = push_state.alloc_size; 1852 curbe.CURBEDataStartAddress = push_state.offset; 1853 } 1854 } 1855 } 1856 1857 cmd_buffer->state.compute_dirty = 0; 1858 1859 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 1860 } 1861 1862 #if GEN_GEN == 7 1863 1864 static bool 1865 verify_cmd_parser(const struct anv_device *device, 1866 int required_version, 1867 const char *function) 1868 { 1869 if (device->instance->physicalDevice.cmd_parser_version < required_version) { 1870 vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT, 1871 "cmd parser version %d is required for %s", 1872 required_version, function); 1873 return false; 1874 } else { 1875 return true; 1876 } 1877 } 1878 1879 #endif 1880 1881 void genX(CmdDispatch)( 1882 VkCommandBuffer commandBuffer, 1883 uint32_t x, 1884 uint32_t y, 1885 uint32_t z) 1886 { 1887 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1888 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; 1889 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); 1890 1891 if (prog_data->uses_num_work_groups) { 1892 struct anv_state state = 1893 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4); 1894 uint32_t *sizes = state.map; 1895 sizes[0] = x; 1896 sizes[1] = y; 1897 sizes[2] = z; 1898 if (!cmd_buffer->device->info.has_llc) 1899 anv_state_clflush(state); 1900 cmd_buffer->state.num_workgroups_offset = state.offset; 1901 cmd_buffer->state.num_workgroups_bo = 1902 &cmd_buffer->device->dynamic_state_block_pool.bo; 1903 } 1904 1905 genX(cmd_buffer_flush_compute_state)(cmd_buffer); 1906 1907 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) { 1908 ggw.SIMDSize = prog_data->simd_size / 16; 1909 ggw.ThreadDepthCounterMaximum = 0; 1910 ggw.ThreadHeightCounterMaximum = 0; 1911 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; 1912 ggw.ThreadGroupIDXDimension = x; 1913 ggw.ThreadGroupIDYDimension = y; 1914 ggw.ThreadGroupIDZDimension = z; 1915 ggw.RightExecutionMask = pipeline->cs_right_mask; 1916 ggw.BottomExecutionMask = 0xffffffff; 1917 } 1918 1919 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf); 1920 } 1921 1922 #define GPGPU_DISPATCHDIMX 0x2500 1923 #define GPGPU_DISPATCHDIMY 0x2504 1924 #define GPGPU_DISPATCHDIMZ 0x2508 1925 1926 #define MI_PREDICATE_SRC0 0x2400 1927 #define MI_PREDICATE_SRC1 0x2408 1928 1929 void genX(CmdDispatchIndirect)( 1930 VkCommandBuffer commandBuffer, 1931 VkBuffer _buffer, 1932 VkDeviceSize offset) 1933 { 1934 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 1935 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); 1936 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline; 1937 const struct 
brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); 1938 struct anv_bo *bo = buffer->bo; 1939 uint32_t bo_offset = buffer->offset + offset; 1940 struct anv_batch *batch = &cmd_buffer->batch; 1941 1942 #if GEN_GEN == 7 1943 /* Linux 4.4 added command parser version 5 which allows the GPGPU 1944 * indirect dispatch registers to be written. 1945 */ 1946 if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect")) 1947 return; 1948 #endif 1949 1950 if (prog_data->uses_num_work_groups) { 1951 cmd_buffer->state.num_workgroups_offset = bo_offset; 1952 cmd_buffer->state.num_workgroups_bo = bo; 1953 } 1954 1955 genX(cmd_buffer_flush_compute_state)(cmd_buffer); 1956 1957 emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset); 1958 emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4); 1959 emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8); 1960 1961 #if GEN_GEN <= 7 1962 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */ 1963 emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0); 1964 emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0); 1965 emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0); 1966 1967 /* Load compute_dispatch_indirect_x_size into SRC0 */ 1968 emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0); 1969 1970 /* predicate = (compute_dispatch_indirect_x_size == 0); */ 1971 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 1972 mip.LoadOperation = LOAD_LOAD; 1973 mip.CombineOperation = COMBINE_SET; 1974 mip.CompareOperation = COMPARE_SRCS_EQUAL; 1975 } 1976 1977 /* Load compute_dispatch_indirect_y_size into SRC0 */ 1978 emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4); 1979 1980 /* predicate |= (compute_dispatch_indirect_y_size == 0); */ 1981 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 1982 mip.LoadOperation = LOAD_LOAD; 1983 mip.CombineOperation = COMBINE_OR; 1984 mip.CompareOperation = COMPARE_SRCS_EQUAL; 1985 } 1986 1987 /* Load compute_dispatch_indirect_z_size into SRC0 */ 1988 emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8); 1989 1990 /* predicate |= (compute_dispatch_indirect_z_size == 0); */ 1991 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 1992 mip.LoadOperation = LOAD_LOAD; 1993 mip.CombineOperation = COMBINE_OR; 1994 mip.CompareOperation = COMPARE_SRCS_EQUAL; 1995 } 1996 1997 /* predicate = !predicate; */ 1998 #define COMPARE_FALSE 1 1999 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { 2000 mip.LoadOperation = LOAD_LOADINV; 2001 mip.CombineOperation = COMBINE_OR; 2002 mip.CompareOperation = COMPARE_FALSE; 2003 } 2004 #endif 2005 2006 anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) { 2007 ggw.IndirectParameterEnable = true; 2008 ggw.PredicateEnable = GEN_GEN <= 7; 2009 ggw.SIMDSize = prog_data->simd_size / 16; 2010 ggw.ThreadDepthCounterMaximum = 0; 2011 ggw.ThreadHeightCounterMaximum = 0; 2012 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; 2013 ggw.RightExecutionMask = pipeline->cs_right_mask; 2014 ggw.BottomExecutionMask = 0xffffffff; 2015 } 2016 2017 anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf); 2018 } 2019 2020 static void 2021 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer, 2022 uint32_t pipeline) 2023 { 2024 #if GEN_GEN >= 8 && GEN_GEN < 10 2025 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: 2026 * 2027 * Software must clear the COLOR_CALC_STATE Valid field in 2028 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT 2029 * with Pipeline Select set to GPGPU. 2030 * 2031 * The internal hardware docs recommend the same workaround for Gen9 2032 * hardware too. 
2033 */ 2034 if (pipeline == GPGPU) 2035 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); 2036 #endif 2037 2038 /* From "BXML GT MI vol1a GPU Overview [Instruction] 2039 * PIPELINE_SELECT [DevBWR+]": 2040 * 2041 * Project: DEVSNB+ 2042 * 2043 * Software must ensure all the write caches are flushed through a 2044 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL 2045 * command to invalidate read only caches prior to programming 2046 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. 2047 */ 2048 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2049 pc.RenderTargetCacheFlushEnable = true; 2050 pc.DepthCacheFlushEnable = true; 2051 pc.DCFlushEnable = true; 2052 pc.PostSyncOperation = NoWrite; 2053 pc.CommandStreamerStallEnable = true; 2054 } 2055 2056 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2057 pc.TextureCacheInvalidationEnable = true; 2058 pc.ConstantCacheInvalidationEnable = true; 2059 pc.StateCacheInvalidationEnable = true; 2060 pc.InstructionCacheInvalidateEnable = true; 2061 pc.PostSyncOperation = NoWrite; 2062 } 2063 } 2064 2065 void 2066 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer) 2067 { 2068 if (cmd_buffer->state.current_pipeline != _3D) { 2069 flush_pipeline_before_pipeline_select(cmd_buffer, _3D); 2070 2071 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { 2072 #if GEN_GEN >= 9 2073 ps.MaskBits = 3; 2074 #endif 2075 ps.PipelineSelection = _3D; 2076 } 2077 2078 cmd_buffer->state.current_pipeline = _3D; 2079 } 2080 } 2081 2082 void 2083 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer) 2084 { 2085 if (cmd_buffer->state.current_pipeline != GPGPU) { 2086 flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU); 2087 2088 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { 2089 #if GEN_GEN >= 9 2090 ps.MaskBits = 3; 2091 #endif 2092 ps.PipelineSelection = GPGPU; 2093 } 2094 2095 cmd_buffer->state.current_pipeline = GPGPU; 2096 } 2097 } 2098 2099 void 2100 genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) 2101 { 2102 if (GEN_GEN >= 8) 2103 return; 2104 2105 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER: 2106 * 2107 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any 2108 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 2109 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first 2110 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit 2111 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with 2112 * Depth Flush Bit set, followed by another pipelined depth stall 2113 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise 2114 * guarantee that the pipeline from WM onwards is already flushed (e.g., 2115 * via a preceding MI_FLUSH)." 
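    *
    * Hence the pipelined depth stall / depth flush / depth stall sequence
    * emitted below.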
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthStallEnable = true;
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthCacheFlushEnable = true;
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
      pipe.DepthStallEnable = true;
   }
}

static uint32_t
depth_stencil_surface_type(enum isl_surf_dim dim)
{
   switch (dim) {
   case ISL_SURF_DIM_1D:
      if (GEN_GEN >= 9) {
         /* From the Sky Lake PRM, 3DSTATE_DEPTH_BUFFER::SurfaceType
          *
          *    Programming Notes:
          *    The Surface Type of the depth buffer must be the same as the
          *    Surface Type of the render target(s) (defined in
          *    SURFACE_STATE), unless either the depth buffer or render
          *    targets are SURFTYPE_NULL (see exception below for SKL). 1D
          *    surface type not allowed for depth surface and stencil surface.
          *
          *    Workaround:
          *    If depth/stencil is enabled with 1D render target,
          *    depth/stencil surface type needs to be set to 2D surface type
          *    and height set to 1. Depth will use (legacy) TileY and stencil
          *    will use TileW. For this case only, the Surface Type of the
          *    depth buffer can be 2D while the Surface Type of the render
          *    target(s) are 1D, representing an exception to a programming
          *    note above.
          */
         return SURFTYPE_2D;
      } else {
         return SURFTYPE_1D;
      }
   case ISL_SURF_DIM_2D:
      return SURFTYPE_2D;
   case ISL_SURF_DIM_3D:
      if (GEN_GEN >= 9) {
         /* The Sky Lake docs list the value for 3D as "Reserved". However,
          * they have the exact same layout as 2D arrays on gen9+, so we can
          * just use 2D here.
          */
         return SURFTYPE_2D;
      } else {
         return SURFTYPE_3D;
      }
   default:
      unreachable("Invalid surface dimension");
   }
}

static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   const struct anv_image *image = iview ? iview->image : NULL;
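   /* Work out which of depth, stencil, and HiZ are actually present for this
    * subpass so the 3DSTATE_DEPTH_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER, and
    * 3DSTATE_STENCIL_BUFFER packets below can be programmed or explicitly
    * disabled accordingly.
    */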
   const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
   const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
   const bool has_hiz = image != NULL &&
      cmd_buffer->state.attachments[ds].aux_usage == ISL_AUX_USAGE_HIZ;
   const bool has_stencil =
      image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);

   /* FIXME: Implement the PMA stall W/A */
   /* FIXME: Width and Height are wrong */

   genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);

   /* Emit 3DSTATE_DEPTH_BUFFER */
   if (has_depth) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType =
            depth_stencil_surface_type(image->depth_surface.isl.dim);
         db.DepthWriteEnable = true;
         db.StencilWriteEnable = has_stencil;
         db.HierarchicalDepthBufferEnable = has_hiz;

         db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
                                                      &image->depth_surface.isl);

         db.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->depth_surface.offset,
         };
         db.DepthBufferObjectControlState = GENX(MOCS);

         db.SurfacePitch = image->depth_surface.isl.row_pitch - 1;
         db.Height = image->extent.height - 1;
         db.Width = image->extent.width - 1;
         db.LOD = iview->isl.base_level;
         db.MinimumArrayElement = iview->isl.base_array_layer;

         assert(image->depth_surface.isl.dim != ISL_SURF_DIM_3D);
         db.Depth =
         db.RenderTargetViewExtent =
            iview->isl.array_len - iview->isl.base_array_layer - 1;

#if GEN_GEN >= 8
         db.SurfaceQPitch =
            isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
#endif
      }
   } else {
      /* Even when no depth buffer is present, the hardware requires that
       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
       *
       *    If a null depth buffer is bound, the driver must instead bind depth as:
       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
       *       3DSTATE_DEPTH.Width = 1
       *       3DSTATE_DEPTH.Height = 1
       *       3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
       *
       * The PRM is wrong, though. The width and height must be programmed to
       * the actual framebuffer's width and height, even when neither depth
       * buffer nor stencil buffer is present. Also, D16_UNORM is not allowed
       * to be combined with a stencil buffer so we use D32_FLOAT instead.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         if (has_stencil) {
            db.SurfaceType =
               depth_stencil_surface_type(image->stencil_surface.isl.dim);
         } else {
            db.SurfaceType = SURFTYPE_2D;
         }
         db.SurfaceFormat = D32_FLOAT;
         db.Width = MAX2(fb->width, 1) - 1;
         db.Height = MAX2(fb->height, 1) - 1;
         db.StencilWriteEnable = has_stencil;
      }
   }

   if (has_hiz) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
         hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
         hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1;
         hdb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->aux_surface.offset,
         };
#if GEN_GEN >= 8
         /* From the SKL PRM Vol2a:
          *
          *    The interpretation of this field is dependent on Surface Type
          *    as follows:
          *    - SURFTYPE_1D: distance in pixels between array slices
          *    - SURFTYPE_2D/CUBE: distance in rows between array slices
          *    - SURFTYPE_3D: distance in rows between R - slices
          *
          * Unfortunately, the docs aren't 100% accurate here. They fail to
          * mention that the 1-D rule only applies to linear 1-D images.
          * Since depth and HiZ buffers are always tiled, they are treated as
          * 2-D images. Prior to Sky Lake, this field is always in rows.
          */
         hdb.SurfaceQPitch =
            isl_surf_get_array_pitch_sa_rows(&image->aux_surface.isl) >> 2;
#endif
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
   }

   /* Emit 3DSTATE_STENCIL_BUFFER */
   if (has_stencil) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
#if GEN_GEN >= 8 || GEN_IS_HASWELL
         sb.StencilBufferEnable = true;
#endif
         sb.StencilBufferObjectControlState = GENX(MOCS);

         sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;

#if GEN_GEN >= 8
         sb.SurfaceQPitch =
            isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
#endif
         sb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->stencil_surface.offset,
         };
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
   }

   /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
    *
    *    3DSTATE_CLEAR_PARAMS must always be programmed along with
    *    the other Depth/Stencil state commands (i.e. 3DSTATE_DEPTH_BUFFER,
    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
    *
    * Testing also shows that some variant of this restriction may exist on
    * HSW+. On BDW+, it is not possible to emit 2 of these packets
    * consecutively when both have DepthClearValueValid set. An analysis of
    * such state programming on SKL showed that the GPU doesn't register the
    * latter packet's clear value.
2323 */ 2324 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) { 2325 if (has_hiz) { 2326 cp.DepthClearValueValid = true; 2327 cp.DepthClearValue = ANV_HZ_FC_VAL; 2328 } 2329 } 2330 } 2331 2332 static void 2333 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, 2334 struct anv_subpass *subpass) 2335 { 2336 cmd_buffer->state.subpass = subpass; 2337 2338 cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; 2339 2340 const struct anv_image_view *iview = 2341 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); 2342 2343 if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { 2344 const uint32_t ds = subpass->depth_stencil_attachment; 2345 transition_depth_buffer(cmd_buffer, iview->image, 2346 cmd_buffer->state.attachments[ds].current_layout, 2347 cmd_buffer->state.subpass->depth_stencil_layout); 2348 cmd_buffer->state.attachments[ds].current_layout = 2349 cmd_buffer->state.subpass->depth_stencil_layout; 2350 cmd_buffer->state.attachments[ds].aux_usage = 2351 layout_to_hiz_usage(cmd_buffer->state.subpass->depth_stencil_layout, 2352 iview->image->samples); 2353 } 2354 2355 cmd_buffer_emit_depth_stencil(cmd_buffer); 2356 2357 anv_cmd_buffer_clear_subpass(cmd_buffer); 2358 } 2359 2360 void genX(CmdBeginRenderPass)( 2361 VkCommandBuffer commandBuffer, 2362 const VkRenderPassBeginInfo* pRenderPassBegin, 2363 VkSubpassContents contents) 2364 { 2365 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2366 ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass); 2367 ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer); 2368 2369 cmd_buffer->state.framebuffer = framebuffer; 2370 cmd_buffer->state.pass = pass; 2371 cmd_buffer->state.render_area = pRenderPassBegin->renderArea; 2372 genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin); 2373 2374 genX(flush_pipeline_select_3d)(cmd_buffer); 2375 2376 genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses); 2377 } 2378 2379 void genX(CmdNextSubpass)( 2380 VkCommandBuffer commandBuffer, 2381 VkSubpassContents contents) 2382 { 2383 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2384 2385 assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); 2386 2387 const struct anv_image_view *iview = 2388 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); 2389 2390 if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { 2391 const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment; 2392 2393 if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses == 2394 cmd_buffer->state.pass->attachments[ds].last_subpass_idx) { 2395 transition_depth_buffer(cmd_buffer, iview->image, 2396 cmd_buffer->state.attachments[ds].current_layout, 2397 cmd_buffer->state.pass->attachments[ds].final_layout); 2398 } 2399 } 2400 2401 anv_cmd_buffer_resolve_subpass(cmd_buffer); 2402 genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1); 2403 } 2404 2405 void genX(CmdEndRenderPass)( 2406 VkCommandBuffer commandBuffer) 2407 { 2408 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2409 2410 const struct anv_image_view *iview = 2411 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer); 2412 2413 if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) { 2414 const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment; 2415 2416 if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses == 2417 cmd_buffer->state.pass->attachments[ds].last_subpass_idx) { 2418 transition_depth_buffer(cmd_buffer, iview->image, 2419 
                              cmd_buffer->state.attachments[ds].current_layout,
                              cmd_buffer->state.pass->attachments[ds].final_layout);
      }
   }

   anv_cmd_buffer_resolve_subpass(cmd_buffer);

#ifndef NDEBUG
   anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
#endif
}

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
      case VK_QUERY_TYPE_TIMESTAMP: {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
            sdm.Address = (struct anv_address) {
               .bo = &pool->bo,
               .offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot) +
                         offsetof(struct anv_query_pool_slot, available),
            };
            sdm.DataDWord0 = 0;
            sdm.DataDWord1 = 0;
         }
         break;
      }
      default:
         assert(!"Invalid query type");
      }
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leak into the first query
    * immediately after the clear. Doing a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
2502 */ 2503 if (cmd_buffer->state.need_query_wa) { 2504 cmd_buffer->state.need_query_wa = false; 2505 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2506 pc.DepthCacheFlushEnable = true; 2507 pc.DepthStallEnable = true; 2508 } 2509 } 2510 2511 switch (pool->type) { 2512 case VK_QUERY_TYPE_OCCLUSION: 2513 emit_ps_depth_count(cmd_buffer, &pool->bo, 2514 query * sizeof(struct anv_query_pool_slot)); 2515 break; 2516 2517 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 2518 default: 2519 unreachable(""); 2520 } 2521 } 2522 2523 void genX(CmdEndQuery)( 2524 VkCommandBuffer commandBuffer, 2525 VkQueryPool queryPool, 2526 uint32_t query) 2527 { 2528 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2529 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 2530 2531 switch (pool->type) { 2532 case VK_QUERY_TYPE_OCCLUSION: 2533 emit_ps_depth_count(cmd_buffer, &pool->bo, 2534 query * sizeof(struct anv_query_pool_slot) + 8); 2535 2536 emit_query_availability(cmd_buffer, &pool->bo, 2537 query * sizeof(struct anv_query_pool_slot) + 16); 2538 break; 2539 2540 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 2541 default: 2542 unreachable(""); 2543 } 2544 } 2545 2546 #define TIMESTAMP 0x2358 2547 2548 void genX(CmdWriteTimestamp)( 2549 VkCommandBuffer commandBuffer, 2550 VkPipelineStageFlagBits pipelineStage, 2551 VkQueryPool queryPool, 2552 uint32_t query) 2553 { 2554 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 2555 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 2556 uint32_t offset = query * sizeof(struct anv_query_pool_slot); 2557 2558 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); 2559 2560 switch (pipelineStage) { 2561 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: 2562 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { 2563 srm.RegisterAddress = TIMESTAMP; 2564 srm.MemoryAddress = (struct anv_address) { &pool->bo, offset }; 2565 } 2566 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { 2567 srm.RegisterAddress = TIMESTAMP + 4; 2568 srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 4 }; 2569 } 2570 break; 2571 2572 default: 2573 /* Everything else is bottom-of-pipe */ 2574 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 2575 pc.DestinationAddressType = DAT_PPGTT; 2576 pc.PostSyncOperation = WriteTimestamp; 2577 pc.Address = (struct anv_address) { &pool->bo, offset }; 2578 2579 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4) 2580 pc.CommandStreamerStallEnable = true; 2581 } 2582 break; 2583 } 2584 2585 emit_query_availability(cmd_buffer, &pool->bo, offset + 16); 2586 } 2587 2588 #if GEN_GEN > 7 || GEN_IS_HASWELL 2589 2590 #define alu_opcode(v) __gen_uint((v), 20, 31) 2591 #define alu_operand1(v) __gen_uint((v), 10, 19) 2592 #define alu_operand2(v) __gen_uint((v), 0, 9) 2593 #define alu(opcode, operand1, operand2) \ 2594 alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2) 2595 2596 #define OPCODE_NOOP 0x000 2597 #define OPCODE_LOAD 0x080 2598 #define OPCODE_LOADINV 0x480 2599 #define OPCODE_LOAD0 0x081 2600 #define OPCODE_LOAD1 0x481 2601 #define OPCODE_ADD 0x100 2602 #define OPCODE_SUB 0x101 2603 #define OPCODE_AND 0x102 2604 #define OPCODE_OR 0x103 2605 #define OPCODE_XOR 0x104 2606 #define OPCODE_STORE 0x180 2607 #define OPCODE_STOREINV 0x580 2608 2609 #define OPERAND_R0 0x00 2610 #define OPERAND_R1 0x01 2611 #define OPERAND_R2 0x02 2612 #define OPERAND_R3 0x03 2613 #define OPERAND_R4 0x04 2614 #define OPERAND_SRCA 0x20 2615 #define OPERAND_SRCB 0x21 2616 #define OPERAND_ACCU 0x31 2617 #define 
OPERAND_ZF 0x32
#define OPERAND_CF 0x33

#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
store_query_result(struct anv_batch *batch, uint32_t reg,
                   struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
{
   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) { bo, offset };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
      }
   }
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset, dst_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   dst_offset = buffer->offset + destOffset;
   for (uint32_t i = 0; i < queryCount; i++) {

      slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(0), &pool->bo, slot_offset);
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(1), &pool->bo, slot_offset + 8);

         /* FIXME: We need to clamp the result for 32 bit. */
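         /* Compute the occlusion result as (end count - begin count) with
          * MI_MATH: GPR0 holds the PS_DEPTH_COUNT written at vkCmdBeginQuery
          * time, GPR1 the value written at vkCmdEndQuery time, and the
          * difference is left in GPR2 for store_query_result() below.
          */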

         uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
         dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[3] = alu(OPCODE_SUB, 0, 0);
         dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset);
         break;

      default:
         unreachable("unhandled query type");
      }

      store_query_result(&cmd_buffer->batch,
                         CS_GPR(2), buffer->bo, dst_offset, flags);

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset + 16);
         if (flags & VK_QUERY_RESULT_64_BIT)
            store_query_result(&cmd_buffer->batch,
                               CS_GPR(0), buffer->bo, dst_offset + 8, flags);
         else
            store_query_result(&cmd_buffer->batch,
                               CS_GPR(0), buffer->bo, dst_offset + 4, flags);
      }

      dst_offset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif