      1 /*
       2  * Copyright © 2016 Red Hat.
       3  * Copyright © 2016 Bas Nieuwenhuizen
      4  *
      5  * based in part on anv driver which is:
       6  * Copyright © 2015 Intel Corporation
      7  *
      8  * Permission is hereby granted, free of charge, to any person obtaining a
      9  * copy of this software and associated documentation files (the "Software"),
     10  * to deal in the Software without restriction, including without limitation
     11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     12  * and/or sell copies of the Software, and to permit persons to whom the
     13  * Software is furnished to do so, subject to the following conditions:
     14  *
     15  * The above copyright notice and this permission notice (including the next
     16  * paragraph) shall be included in all copies or substantial portions of the
     17  * Software.
     18  *
     19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     25  * IN THE SOFTWARE.
     26  */
     27 
     28 #include "radv_private.h"
     29 #include "radv_radeon_winsys.h"
     30 #include "radv_shader.h"
     31 #include "radv_cs.h"
     32 #include "sid.h"
     33 #include "gfx9d.h"
     34 #include "vk_format.h"
     35 #include "radv_debug.h"
     36 #include "radv_meta.h"
     37 
     38 #include "ac_debug.h"
     39 
     40 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
     41 					 struct radv_image *image,
     42 					 VkImageLayout src_layout,
     43 					 VkImageLayout dst_layout,
     44 					 uint32_t src_family,
     45 					 uint32_t dst_family,
     46 					 const VkImageSubresourceRange *range,
     47 					 VkImageAspectFlags pending_clears);
     48 
     49 const struct radv_dynamic_state default_dynamic_state = {
     50 	.viewport = {
     51 		.count = 0,
     52 	},
     53 	.scissor = {
     54 		.count = 0,
     55 	},
     56 	.line_width = 1.0f,
     57 	.depth_bias = {
     58 		.bias = 0.0f,
     59 		.clamp = 0.0f,
     60 		.slope = 0.0f,
     61 	},
     62 	.blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
     63 	.depth_bounds = {
     64 		.min = 0.0f,
     65 		.max = 1.0f,
     66 	},
     67 	.stencil_compare_mask = {
     68 		.front = ~0u,
     69 		.back = ~0u,
     70 	},
     71 	.stencil_write_mask = {
     72 		.front = ~0u,
     73 		.back = ~0u,
     74 	},
     75 	.stencil_reference = {
     76 		.front = 0u,
     77 		.back = 0u,
     78 	},
     79 };
     80 
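         /* Copy the dynamic state selected in src->mask from the pipeline into
          * the command buffer, and mark every value that actually changed as
          * dirty so the corresponding registers get re-emitted.
          */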
     81 static void
     82 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
     83 			const struct radv_dynamic_state *src)
     84 {
     85 	struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
     86 	uint32_t copy_mask = src->mask;
     87 	uint32_t dest_mask = 0;
     88 
     89 	/* Make sure to copy the number of viewports/scissors because they can
     90 	 * only be specified at pipeline creation time.
     91 	 */
     92 	dest->viewport.count = src->viewport.count;
     93 	dest->scissor.count = src->scissor.count;
     94 	dest->discard_rectangle.count = src->discard_rectangle.count;
     95 
     96 	if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
     97 		if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
     98 			   src->viewport.count * sizeof(VkViewport))) {
     99 			typed_memcpy(dest->viewport.viewports,
    100 				     src->viewport.viewports,
    101 				     src->viewport.count);
    102 			dest_mask |= RADV_DYNAMIC_VIEWPORT;
    103 		}
    104 	}
    105 
    106 	if (copy_mask & RADV_DYNAMIC_SCISSOR) {
    107 		if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
    108 			   src->scissor.count * sizeof(VkRect2D))) {
    109 			typed_memcpy(dest->scissor.scissors,
    110 				     src->scissor.scissors, src->scissor.count);
    111 			dest_mask |= RADV_DYNAMIC_SCISSOR;
    112 		}
    113 	}
    114 
    115 	if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
    116 		if (dest->line_width != src->line_width) {
    117 			dest->line_width = src->line_width;
    118 			dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
    119 		}
    120 	}
    121 
    122 	if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
    123 		if (memcmp(&dest->depth_bias, &src->depth_bias,
    124 			   sizeof(src->depth_bias))) {
    125 			dest->depth_bias = src->depth_bias;
    126 			dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
    127 		}
    128 	}
    129 
    130 	if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
    131 		if (memcmp(&dest->blend_constants, &src->blend_constants,
    132 			   sizeof(src->blend_constants))) {
    133 			typed_memcpy(dest->blend_constants,
    134 				     src->blend_constants, 4);
    135 			dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
    136 		}
    137 	}
    138 
    139 	if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
    140 		if (memcmp(&dest->depth_bounds, &src->depth_bounds,
    141 			   sizeof(src->depth_bounds))) {
    142 			dest->depth_bounds = src->depth_bounds;
    143 			dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
    144 		}
    145 	}
    146 
    147 	if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
    148 		if (memcmp(&dest->stencil_compare_mask,
    149 			   &src->stencil_compare_mask,
    150 			   sizeof(src->stencil_compare_mask))) {
    151 			dest->stencil_compare_mask = src->stencil_compare_mask;
    152 			dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
    153 		}
    154 	}
    155 
    156 	if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
    157 		if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
    158 			   sizeof(src->stencil_write_mask))) {
    159 			dest->stencil_write_mask = src->stencil_write_mask;
    160 			dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
    161 		}
    162 	}
    163 
    164 	if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
    165 		if (memcmp(&dest->stencil_reference, &src->stencil_reference,
    166 			   sizeof(src->stencil_reference))) {
    167 			dest->stencil_reference = src->stencil_reference;
    168 			dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
    169 		}
    170 	}
    171 
    172 	if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
    173 		if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
    174 			   src->discard_rectangle.count * sizeof(VkRect2D))) {
    175 			typed_memcpy(dest->discard_rectangle.rectangles,
    176 				     src->discard_rectangle.rectangles,
    177 				     src->discard_rectangle.count);
    178 			dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
    179 		}
    180 	}
    181 
    182 	cmd_buffer->state.dirty |= dest_mask;
    183 }
    184 
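         /* Compute queues on CIK and newer are executed by the MEC compute
          * microengine, which needs slightly different packets for some
          * operations (cache flushes, for instance).
          */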
    185 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
    186 {
    187 	return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
    188 	       cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
    189 }
    190 
    191 enum ring_type radv_queue_family_to_ring(int f) {
    192 	switch (f) {
    193 	case RADV_QUEUE_GENERAL:
    194 		return RING_GFX;
    195 	case RADV_QUEUE_COMPUTE:
    196 		return RING_COMPUTE;
    197 	case RADV_QUEUE_TRANSFER:
    198 		return RING_DMA;
    199 	default:
    200 		unreachable("Unknown queue family");
    201 	}
    202 }
    203 
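         /* Allocate a command buffer from the pool and create the winsys CS
          * for the ring that matches the pool's queue family.
          */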
    204 static VkResult radv_create_cmd_buffer(
    205 	struct radv_device *                         device,
    206 	struct radv_cmd_pool *                       pool,
    207 	VkCommandBufferLevel                        level,
    208 	VkCommandBuffer*                            pCommandBuffer)
    209 {
    210 	struct radv_cmd_buffer *cmd_buffer;
    211 	unsigned ring;
    212 	cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
    213 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    214 	if (cmd_buffer == NULL)
    215 		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
    216 
    217 	cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
    218 	cmd_buffer->device = device;
    219 	cmd_buffer->pool = pool;
    220 	cmd_buffer->level = level;
    221 
    222 	if (pool) {
    223 		list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
    224 		cmd_buffer->queue_family_index = pool->queue_family_index;
    225 
    226 	} else {
     227 		/* Init the pool_link so we can safely call list_del when we destroy

    228 		 * the command buffer
    229 		 */
    230 		list_inithead(&cmd_buffer->pool_link);
    231 		cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
    232 	}
    233 
    234 	ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
    235 
    236 	cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
    237 	if (!cmd_buffer->cs) {
    238 		vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
    239 		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
    240 	}
    241 
    242 	*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
    243 
    244 	list_inithead(&cmd_buffer->upload.list);
    245 
    246 	return VK_SUCCESS;
    247 }
    248 
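         /* Release everything the command buffer owns: retired upload BOs, the
          * current upload BO, the winsys CS and the push descriptor storage.
          */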
    249 static void
    250 radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
    251 {
    252 	list_del(&cmd_buffer->pool_link);
    253 
    254 	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
    255 				 &cmd_buffer->upload.list, list) {
    256 		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
    257 		list_del(&up->list);
    258 		free(up);
    259 	}
    260 
    261 	if (cmd_buffer->upload.upload_bo)
    262 		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
    263 	cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
    264 	free(cmd_buffer->push_descriptors.set.mapped_ptr);
    265 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
    266 }
    267 
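         /* Return the command buffer to the initial state: free retired upload
          * BOs, clear the per-recording bookkeeping and, on GFX9, carve out a
          * small fence slot in the upload buffer (used later when emitting
          * cache flushes).
          */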
    268 static VkResult
    269 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    270 {
    271 
    272 	cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
    273 
    274 	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
    275 				 &cmd_buffer->upload.list, list) {
    276 		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
    277 		list_del(&up->list);
    278 		free(up);
    279 	}
    280 
    281 	cmd_buffer->push_constant_stages = 0;
    282 	cmd_buffer->scratch_size_needed = 0;
    283 	cmd_buffer->compute_scratch_size_needed = 0;
    284 	cmd_buffer->esgs_ring_size_needed = 0;
    285 	cmd_buffer->gsvs_ring_size_needed = 0;
    286 	cmd_buffer->tess_rings_needed = false;
    287 	cmd_buffer->sample_positions_needed = false;
    288 
    289 	if (cmd_buffer->upload.upload_bo)
    290 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
    291 				   cmd_buffer->upload.upload_bo, 8);
    292 	cmd_buffer->upload.offset = 0;
    293 
    294 	cmd_buffer->record_result = VK_SUCCESS;
    295 
    296 	cmd_buffer->ring_offsets_idx = -1;
    297 
    298 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
    299 		void *fence_ptr;
    300 		radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0,
    301 					     &cmd_buffer->gfx9_fence_offset,
    302 					     &fence_ptr);
    303 		cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo;
    304 	}
    305 
    306 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
    307 
    308 	return cmd_buffer->record_result;
    309 }
    310 
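         /* Grow the upload buffer to at least min_needed bytes (and at least
          * twice the current size). The old BO is kept on upload.list until the
          * command buffer is reset or destroyed, because data already written
          * to it may still be referenced by the CS.
          */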
    311 static bool
    312 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
    313 				  uint64_t min_needed)
    314 {
    315 	uint64_t new_size;
    316 	struct radeon_winsys_bo *bo;
    317 	struct radv_cmd_buffer_upload *upload;
    318 	struct radv_device *device = cmd_buffer->device;
    319 
    320 	new_size = MAX2(min_needed, 16 * 1024);
    321 	new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
    322 
    323 	bo = device->ws->buffer_create(device->ws,
    324 				       new_size, 4096,
    325 				       RADEON_DOMAIN_GTT,
    326 				       RADEON_FLAG_CPU_ACCESS|
    327 				       RADEON_FLAG_NO_INTERPROCESS_SHARING);
    328 
    329 	if (!bo) {
    330 		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
    331 		return false;
    332 	}
    333 
    334 	radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo, 8);
    335 	if (cmd_buffer->upload.upload_bo) {
    336 		upload = malloc(sizeof(*upload));
    337 
    338 		if (!upload) {
    339 			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
    340 			device->ws->buffer_destroy(bo);
    341 			return false;
    342 		}
    343 
    344 		memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
    345 		list_add(&upload->list, &cmd_buffer->upload.list);
    346 	}
    347 
    348 	cmd_buffer->upload.upload_bo = bo;
    349 	cmd_buffer->upload.size = new_size;
    350 	cmd_buffer->upload.offset = 0;
    351 	cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
    352 
    353 	if (!cmd_buffer->upload.map) {
    354 		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
    355 		return false;
    356 	}
    357 
    358 	return true;
    359 }
    360 
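         /* Reserve 'size' bytes in the upload buffer with the requested
          * alignment, growing the buffer if needed. Returns a GPU-visible
          * offset and a CPU pointer into the mapped BO.
          */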
    361 bool
    362 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
    363 			     unsigned size,
    364 			     unsigned alignment,
    365 			     unsigned *out_offset,
    366 			     void **ptr)
    367 {
    368 	uint64_t offset = align(cmd_buffer->upload.offset, alignment);
    369 	if (offset + size > cmd_buffer->upload.size) {
    370 		if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
    371 			return false;
    372 		offset = 0;
    373 	}
    374 
    375 	*out_offset = offset;
    376 	*ptr = cmd_buffer->upload.map + offset;
    377 
    378 	cmd_buffer->upload.offset = offset + size;
    379 	return true;
    380 }
    381 
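         /* Convenience wrapper: allocate upload space and copy 'data' into it. */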
    382 bool
    383 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
    384 			    unsigned size, unsigned alignment,
    385 			    const void *data, unsigned *out_offset)
    386 {
    387 	uint8_t *ptr;
    388 
    389 	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
    390 					  out_offset, (void **)&ptr))
    391 		return false;
    392 
    393 	if (ptr)
    394 		memcpy(ptr, data, size);
    395 
    396 	return true;
    397 }
    398 
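         /* Emit a WRITE_DATA packet that writes 'count' dwords to memory at
          * 'va' through the ME, with write confirmation.
          */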
    399 static void
    400 radv_emit_write_data_packet(struct radeon_winsys_cs *cs, uint64_t va,
    401 			    unsigned count, const uint32_t *data)
    402 {
    403 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
    404 	radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
    405 		    S_370_WR_CONFIRM(1) |
    406 		    S_370_ENGINE_SEL(V_370_ME));
    407 	radeon_emit(cs, va);
    408 	radeon_emit(cs, va >> 32);
    409 	radeon_emit_array(cs, data, count);
    410 }
    411 
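         /* Write an incrementing trace id to the trace BO (secondary command
          * buffers use the slot at +4) and mirror it in the CS as a NOP trace
          * point, so a hang can be correlated with the last id that landed in
          * memory.
          */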
    412 void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
    413 {
    414 	struct radv_device *device = cmd_buffer->device;
    415 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
    416 	uint64_t va;
    417 
    418 	va = radv_buffer_get_va(device->trace_bo);
    419 	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
    420 		va += 4;
    421 
    422 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 7);
    423 
    424 	++cmd_buffer->state.trace_id;
    425 	radv_cs_add_buffer(device->ws, cs, device->trace_bo, 8);
    426 	radv_emit_write_data_packet(cs, va, 1, &cmd_buffer->state.trace_id);
    427 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
    428 	radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
    429 }
    430 
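         /* Per-draw epilogue: with the RADV_DEBUG_SYNC_SHADERS flag, force the
          * graphics/compute engines to idle after every draw; emit a trace
          * point when tracing is enabled.
          */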
    431 static void
    432 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer)
    433 {
    434 	if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
    435 		enum radv_cmd_flush_bits flags;
    436 
    437 		/* Force wait for graphics/compute engines to be idle. */
    438 		flags = RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
    439 			RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
    440 
    441 		si_cs_emit_cache_flush(cmd_buffer->cs,
    442 				       cmd_buffer->device->physical_device->rad_info.chip_class,
    443 				       NULL, 0,
    444 				       radv_cmd_buffer_uses_mec(cmd_buffer),
    445 				       flags);
    446 	}
    447 
    448 	if (unlikely(cmd_buffer->device->trace_bo))
    449 		radv_cmd_buffer_trace_emit(cmd_buffer);
    450 }
    451 
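         /* Record the bound pipeline pointer in the trace BO (separate slots
          * for the GFX and compute rings) for post-hang inspection.
          */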
    452 static void
    453 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
    454 		   struct radv_pipeline *pipeline, enum ring_type ring)
    455 {
    456 	struct radv_device *device = cmd_buffer->device;
    457 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
    458 	uint32_t data[2];
    459 	uint64_t va;
    460 
    461 	va = radv_buffer_get_va(device->trace_bo);
    462 
    463 	switch (ring) {
    464 	case RING_GFX:
    465 		va += 8;
    466 		break;
    467 	case RING_COMPUTE:
    468 		va += 16;
    469 		break;
    470 	default:
    471 		assert(!"invalid ring type");
    472 	}
    473 
    474 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws,
    475 							   cmd_buffer->cs, 6);
    476 
    477 	data[0] = (uintptr_t)pipeline;
    478 	data[1] = (uintptr_t)pipeline >> 32;
    479 
    480 	radv_cs_add_buffer(device->ws, cs, device->trace_bo, 8);
    481 	radv_emit_write_data_packet(cs, va, 2, data);
    482 }
    483 
    484 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
    485 			     struct radv_descriptor_set *set,
    486 			     unsigned idx)
    487 {
    488 	cmd_buffer->descriptors[idx] = set;
    489 	if (set)
    490 		cmd_buffer->state.valid_descriptors |= (1u << idx);
    491 	else
    492 		cmd_buffer->state.valid_descriptors &= ~(1u << idx);
    493 	cmd_buffer->state.descriptors_dirty |= (1u << idx);
    494 
    495 }
    496 
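         /* Record the pointers of all currently valid descriptor sets in the
          * trace BO, starting at offset 24, for post-hang inspection.
          */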
    497 static void
    498 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer)
    499 {
    500 	struct radv_device *device = cmd_buffer->device;
    501 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
    502 	uint32_t data[MAX_SETS * 2] = {};
    503 	uint64_t va;
    504 	unsigned i;
    505 	va = radv_buffer_get_va(device->trace_bo) + 24;
    506 
    507 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws,
    508 							   cmd_buffer->cs, 4 + MAX_SETS * 2);
    509 
    510 	for_each_bit(i, cmd_buffer->state.valid_descriptors) {
    511 		struct radv_descriptor_set *set = cmd_buffer->descriptors[i];
    512 		data[i * 2] = (uintptr_t)set;
    513 		data[i * 2 + 1] = (uintptr_t)set >> 32;
    514 	}
    515 
    516 	radv_cs_add_buffer(device->ws, cs, device->trace_bo, 8);
    517 	radv_emit_write_data_packet(cs, va, MAX_SETS * 2, data);
    518 }
    519 
    520 static void
    521 radv_emit_graphics_blend_state(struct radv_cmd_buffer *cmd_buffer,
    522 			       struct radv_pipeline *pipeline)
    523 {
    524 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028780_CB_BLEND0_CONTROL, 8);
    525 	radeon_emit_array(cmd_buffer->cs, pipeline->graphics.blend.cb_blend_control,
    526 			  8);
    527 	radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, pipeline->graphics.blend.cb_color_control);
    528 	radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, pipeline->graphics.blend.db_alpha_to_mask);
    529 
    530 	if (cmd_buffer->device->physical_device->has_rbplus) {
    531 
    532 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, 8);
    533 		radeon_emit_array(cmd_buffer->cs, pipeline->graphics.blend.sx_mrt_blend_opt, 8);
    534 
    535 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
    536 		radeon_emit(cmd_buffer->cs, 0);	/* R_028754_SX_PS_DOWNCONVERT */
    537 		radeon_emit(cmd_buffer->cs, 0);	/* R_028758_SX_BLEND_OPT_EPSILON */
    538 		radeon_emit(cmd_buffer->cs, 0);	/* R_02875C_SX_BLEND_OPT_CONTROL */
    539 	}
    540 }
    541 
    542 static void
    543 radv_emit_graphics_depth_stencil_state(struct radv_cmd_buffer *cmd_buffer,
    544 				       struct radv_pipeline *pipeline)
    545 {
    546 	struct radv_depth_stencil_state *ds = &pipeline->graphics.ds;
    547 	radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, ds->db_depth_control);
    548 	radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, ds->db_stencil_control);
    549 
    550 	radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, ds->db_render_control);
    551 	radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2);
    552 }
    553 
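         /* Return the user SGPR location for (stage, idx). When a stage has
          * been merged into another one (VS into HS/GS, TES into GS on GFX9),
          * fall through to the shader that actually contains it.
          */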
    554 struct ac_userdata_info *
    555 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
    556 		      gl_shader_stage stage,
    557 		      int idx)
    558 {
    559 	if (stage == MESA_SHADER_VERTEX) {
    560 		if (pipeline->shaders[MESA_SHADER_VERTEX])
    561 			return &pipeline->shaders[MESA_SHADER_VERTEX]->info.user_sgprs_locs.shader_data[idx];
    562 		if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
    563 			return &pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.user_sgprs_locs.shader_data[idx];
    564 		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
    565 			return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.user_sgprs_locs.shader_data[idx];
    566 	} else if (stage == MESA_SHADER_TESS_EVAL) {
    567 		if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
    568 			return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.user_sgprs_locs.shader_data[idx];
    569 		if (pipeline->shaders[MESA_SHADER_GEOMETRY])
    570 			return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.user_sgprs_locs.shader_data[idx];
    571 	}
    572 	return &pipeline->shaders[stage]->info.user_sgprs_locs.shader_data[idx];
    573 }
    574 
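         /* Write a 64-bit address into the two consecutive user SGPRs reserved
          * for the given user-data slot, if the shader actually uses it.
          */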
    575 static void
    576 radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
    577 			   struct radv_pipeline *pipeline,
    578 			   gl_shader_stage stage,
    579 			   int idx, uint64_t va)
    580 {
    581 	struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
    582 	uint32_t base_reg = pipeline->user_data_0[stage];
    583 	if (loc->sgpr_idx == -1)
    584 		return;
    585 	assert(loc->num_sgprs == 2);
    586 	assert(!loc->indirect);
    587 	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 2);
    588 	radeon_emit(cmd_buffer->cs, va);
    589 	radeon_emit(cmd_buffer->cs, va >> 32);
    590 }
    591 
    592 static void
    593 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
    594 			      struct radv_pipeline *pipeline)
    595 {
    596 	int num_samples = pipeline->graphics.ms.num_samples;
    597 	struct radv_multisample_state *ms = &pipeline->graphics.ms;
    598 	struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
    599 
    600 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
    601 	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[0]);
    602 	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[1]);
    603 
    604 	radeon_set_context_reg(cmd_buffer->cs, R_028804_DB_EQAA, ms->db_eqaa);
    605 	radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
    606 
    607 	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) {
    608 		uint32_t offset;
    609 		struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET);
    610 		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_FRAGMENT];
    611 		if (loc->sgpr_idx == -1)
    612 			return;
    613 		assert(loc->num_sgprs == 1);
    614 		assert(!loc->indirect);
    615 		switch (num_samples) {
    616 		default:
    617 			offset = 0;
    618 			break;
    619 		case 2:
    620 			offset = 1;
    621 			break;
    622 		case 4:
    623 			offset = 3;
    624 			break;
    625 		case 8:
    626 			offset = 7;
    627 			break;
    628 		case 16:
    629 			offset = 15;
    630 			break;
    631 		}
    632 
    633 		radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, offset);
    634 		cmd_buffer->sample_positions_needed = true;
    635 	}
    636 
    637 	if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
    638 		return;
    639 
    640 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
    641 	radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
    642 	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);
    643 
    644 	radeon_set_context_reg(cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);
    645 
    646 	radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);
    647 
    648 	/* GFX9: Flush DFSM when the AA mode changes. */
    649 	if (cmd_buffer->device->dfsm_allowed) {
    650 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
    651 		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
    652 	}
    653 }
    654 
    655 static void
    656 radv_emit_graphics_raster_state(struct radv_cmd_buffer *cmd_buffer,
    657 				struct radv_pipeline *pipeline)
    658 {
    659 	struct radv_raster_state *raster = &pipeline->graphics.raster;
    660 
    661 	radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL,
    662 			       raster->pa_cl_clip_cntl);
    663 	radeon_set_context_reg(cmd_buffer->cs, R_0286D4_SPI_INTERP_CONTROL_0,
    664 			       raster->spi_interp_control);
    665 	radeon_set_context_reg(cmd_buffer->cs, R_028BE4_PA_SU_VTX_CNTL,
    666 			       raster->pa_su_vtx_cntl);
    667 	radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL,
    668 			       raster->pa_su_sc_mode_cntl);
    669 }
    670 
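         /* Prefetch a VA range into L2 using CP DMA; the prefetch path only
          * exists on CIK and newer.
          */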
    671 static inline void
    672 radv_emit_prefetch_TC_L2_async(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
    673 			       unsigned size)
    674 {
    675 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK)
    676 		si_cp_dma_prefetch(cmd_buffer, va, size);
    677 }
    678 
    679 static void
    680 radv_emit_VBO_descriptors_prefetch(struct radv_cmd_buffer *cmd_buffer)
    681 {
    682 	if (cmd_buffer->state.vb_prefetch_dirty) {
    683 		radv_emit_prefetch_TC_L2_async(cmd_buffer,
    684 					       cmd_buffer->state.vb_va,
    685 					       cmd_buffer->state.vb_size);
    686 		cmd_buffer->state.vb_prefetch_dirty = false;
    687 	}
    688 }
    689 
    690 static void
    691 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
    692 			  struct radv_shader_variant *shader)
    693 {
    694 	struct radeon_winsys *ws = cmd_buffer->device->ws;
    695 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
    696 	uint64_t va;
    697 
    698 	if (!shader)
    699 		return;
    700 
    701 	va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
    702 
    703 	radv_cs_add_buffer(ws, cs, shader->bo, 8);
    704 	radv_emit_prefetch_TC_L2_async(cmd_buffer, va, shader->code_size);
    705 }
    706 
    707 static void
    708 radv_emit_prefetch(struct radv_cmd_buffer *cmd_buffer,
    709 		   struct radv_pipeline *pipeline)
    710 {
    711 	radv_emit_shader_prefetch(cmd_buffer,
    712 				  pipeline->shaders[MESA_SHADER_VERTEX]);
    713 	radv_emit_VBO_descriptors_prefetch(cmd_buffer);
    714 	radv_emit_shader_prefetch(cmd_buffer,
    715 				  pipeline->shaders[MESA_SHADER_TESS_CTRL]);
    716 	radv_emit_shader_prefetch(cmd_buffer,
    717 				  pipeline->shaders[MESA_SHADER_TESS_EVAL]);
    718 	radv_emit_shader_prefetch(cmd_buffer,
    719 				  pipeline->shaders[MESA_SHADER_GEOMETRY]);
    720 	radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
    721 	radv_emit_shader_prefetch(cmd_buffer,
    722 				  pipeline->shaders[MESA_SHADER_FRAGMENT]);
    723 }
    724 
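         /* The radv_emit_hw_{vs,es,ls,hs} helpers program the SPI_SHADER_PGM_*
          * registers (plus related context state) for the hardware stage a
          * shader variant was compiled to run as; which helper applies depends
          * on whether tessellation and/or geometry shading are enabled.
          */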
    725 static void
    726 radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer,
    727 		struct radv_pipeline *pipeline,
    728 		struct radv_shader_variant *shader)
    729 {
    730 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
    731 
    732 	radeon_set_context_reg(cmd_buffer->cs, R_0286C4_SPI_VS_OUT_CONFIG,
    733 			       pipeline->graphics.vs.spi_vs_out_config);
    734 
    735 	radeon_set_context_reg(cmd_buffer->cs, R_02870C_SPI_SHADER_POS_FORMAT,
    736 			       pipeline->graphics.vs.spi_shader_pos_format);
    737 
    738 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
    739 	radeon_emit(cmd_buffer->cs, va >> 8);
    740 	radeon_emit(cmd_buffer->cs, va >> 40);
    741 	radeon_emit(cmd_buffer->cs, shader->rsrc1);
    742 	radeon_emit(cmd_buffer->cs, shader->rsrc2);
    743 
    744 	radeon_set_context_reg(cmd_buffer->cs, R_028818_PA_CL_VTE_CNTL,
    745 			       S_028818_VTX_W0_FMT(1) |
    746 			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
    747 			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
    748 			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
    749 
    750 
    751 	radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_CL_VS_OUT_CNTL,
    752 			       pipeline->graphics.vs.pa_cl_vs_out_cntl);
    753 
    754 	if (cmd_buffer->device->physical_device->rad_info.chip_class <= VI)
    755 		radeon_set_context_reg(cmd_buffer->cs, R_028AB4_VGT_REUSE_OFF,
    756 				       pipeline->graphics.vs.vgt_reuse_off);
    757 }
    758 
    759 static void
    760 radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer,
    761 		struct radv_pipeline *pipeline,
    762 		struct radv_shader_variant *shader)
    763 {
    764 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
    765 
    766 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
    767 	radeon_emit(cmd_buffer->cs, va >> 8);
    768 	radeon_emit(cmd_buffer->cs, va >> 40);
    769 	radeon_emit(cmd_buffer->cs, shader->rsrc1);
    770 	radeon_emit(cmd_buffer->cs, shader->rsrc2);
    771 }
    772 
    773 static void
    774 radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer,
    775 		struct radv_shader_variant *shader)
    776 {
    777 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
    778 	uint32_t rsrc2 = shader->rsrc2;
    779 
    780 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
    781 	radeon_emit(cmd_buffer->cs, va >> 8);
    782 	radeon_emit(cmd_buffer->cs, va >> 40);
    783 
    784 	rsrc2 |= S_00B52C_LDS_SIZE(cmd_buffer->state.pipeline->graphics.tess.lds_size);
    785 	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK &&
    786 	    cmd_buffer->device->physical_device->rad_info.family != CHIP_HAWAII)
    787 		radeon_set_sh_reg(cmd_buffer->cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
    788 
    789 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
    790 	radeon_emit(cmd_buffer->cs, shader->rsrc1);
    791 	radeon_emit(cmd_buffer->cs, rsrc2);
    792 }
    793 
    794 static void
    795 radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer,
    796 		struct radv_shader_variant *shader)
    797 {
    798 	uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
    799 
    800 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
    801 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B410_SPI_SHADER_PGM_LO_LS, 2);
    802 		radeon_emit(cmd_buffer->cs, va >> 8);
    803 		radeon_emit(cmd_buffer->cs, va >> 40);
    804 
    805 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
    806 		radeon_emit(cmd_buffer->cs, shader->rsrc1);
    807 		radeon_emit(cmd_buffer->cs, shader->rsrc2 |
    808 		                            S_00B42C_LDS_SIZE(cmd_buffer->state.pipeline->graphics.tess.lds_size));
    809 	} else {
    810 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
    811 		radeon_emit(cmd_buffer->cs, va >> 8);
    812 		radeon_emit(cmd_buffer->cs, va >> 40);
    813 		radeon_emit(cmd_buffer->cs, shader->rsrc1);
    814 		radeon_emit(cmd_buffer->cs, shader->rsrc2);
    815 	}
    816 }
    817 
    818 static void
    819 radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer,
    820 			struct radv_pipeline *pipeline)
    821 {
    822 	struct radv_shader_variant *vs;
    823 
    824 	radeon_set_context_reg(cmd_buffer->cs, R_028A84_VGT_PRIMITIVEID_EN, pipeline->graphics.vgt_primitiveid_en);
    825 
    826 	/* Skip shaders merged into HS/GS */
    827 	vs = pipeline->shaders[MESA_SHADER_VERTEX];
    828 	if (!vs)
    829 		return;
    830 
    831 	if (vs->info.vs.as_ls)
    832 		radv_emit_hw_ls(cmd_buffer, vs);
    833 	else if (vs->info.vs.as_es)
    834 		radv_emit_hw_es(cmd_buffer, pipeline, vs);
    835 	else
    836 		radv_emit_hw_vs(cmd_buffer, pipeline, vs);
    837 }
    838 
    839 
    840 static void
    841 radv_emit_tess_shaders(struct radv_cmd_buffer *cmd_buffer,
    842 		       struct radv_pipeline *pipeline)
    843 {
    844 	if (!radv_pipeline_has_tess(pipeline))
    845 		return;
    846 
    847 	struct radv_shader_variant *tes, *tcs;
    848 
    849 	tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL];
    850 	tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
    851 
    852 	if (tes) {
    853 		if (tes->info.tes.as_es)
    854 			radv_emit_hw_es(cmd_buffer, pipeline, tes);
    855 		else
    856 			radv_emit_hw_vs(cmd_buffer, pipeline, tes);
    857 	}
    858 
    859 	radv_emit_hw_hs(cmd_buffer, tcs);
    860 
    861 	radeon_set_context_reg(cmd_buffer->cs, R_028B6C_VGT_TF_PARAM,
    862 			       pipeline->graphics.tess.tf_param);
    863 
    864 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK)
    865 		radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2,
    866 					   pipeline->graphics.tess.ls_hs_config);
    867 	else
    868 		radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG,
    869 				       pipeline->graphics.tess.ls_hs_config);
    870 
    871 	struct ac_userdata_info *loc;
    872 
    873 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_CTRL, AC_UD_TCS_OFFCHIP_LAYOUT);
    874 	if (loc->sgpr_idx != -1) {
    875 		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_TESS_CTRL];
    876 		assert(loc->num_sgprs == 4);
    877 		assert(!loc->indirect);
    878 		radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 4);
    879 		radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.offchip_layout);
    880 		radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.tcs_out_offsets);
    881 		radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.tcs_out_layout |
    882 			    pipeline->graphics.tess.num_tcs_input_cp << 26);
    883 		radeon_emit(cmd_buffer->cs, pipeline->graphics.tess.tcs_in_layout);
    884 	}
    885 
    886 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_EVAL, AC_UD_TES_OFFCHIP_LAYOUT);
    887 	if (loc->sgpr_idx != -1) {
    888 		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_TESS_EVAL];
    889 		assert(loc->num_sgprs == 1);
    890 		assert(!loc->indirect);
    891 
    892 		radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
    893 				  pipeline->graphics.tess.offchip_layout);
    894 	}
    895 
    896 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_LS_TCS_IN_LAYOUT);
    897 	if (loc->sgpr_idx != -1) {
    898 		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_VERTEX];
    899 		assert(loc->num_sgprs == 1);
    900 		assert(!loc->indirect);
    901 
    902 		radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
    903 				  pipeline->graphics.tess.tcs_in_layout);
    904 	}
    905 }
    906 
    907 static void
    908 radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer,
    909 			  struct radv_pipeline *pipeline)
    910 {
    911 	struct radv_shader_variant *gs;
    912 	uint64_t va;
    913 
    914 	radeon_set_context_reg(cmd_buffer->cs, R_028A40_VGT_GS_MODE, pipeline->graphics.vgt_gs_mode);
    915 
    916 	gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
    917 	if (!gs)
    918 		return;
    919 
    920 	uint32_t gsvs_itemsize = gs->info.gs.max_gsvs_emit_size >> 2;
    921 
    922 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
    923 	radeon_emit(cmd_buffer->cs, gsvs_itemsize);
    924 	radeon_emit(cmd_buffer->cs, gsvs_itemsize);
    925 	radeon_emit(cmd_buffer->cs, gsvs_itemsize);
    926 
    927 	radeon_set_context_reg(cmd_buffer->cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);
    928 
    929 	radeon_set_context_reg(cmd_buffer->cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
    930 
    931 	uint32_t gs_vert_itemsize = gs->info.gs.gsvs_vertex_size;
    932 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
    933 	radeon_emit(cmd_buffer->cs, gs_vert_itemsize >> 2);
    934 	radeon_emit(cmd_buffer->cs, 0);
    935 	radeon_emit(cmd_buffer->cs, 0);
    936 	radeon_emit(cmd_buffer->cs, 0);
    937 
    938 	uint32_t gs_num_invocations = gs->info.gs.invocations;
    939 	radeon_set_context_reg(cmd_buffer->cs, R_028B90_VGT_GS_INSTANCE_CNT,
    940 			       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
    941 			       S_028B90_ENABLE(gs_num_invocations > 0));
    942 
    943 	radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
    944 			       pipeline->graphics.gs.vgt_esgs_ring_itemsize);
    945 
    946 	va = radv_buffer_get_va(gs->bo) + gs->bo_offset;
    947 
    948 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
    949 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2);
    950 		radeon_emit(cmd_buffer->cs, va >> 8);
    951 		radeon_emit(cmd_buffer->cs, va >> 40);
    952 
    953 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
    954 		radeon_emit(cmd_buffer->cs, gs->rsrc1);
    955 		radeon_emit(cmd_buffer->cs, gs->rsrc2 |
    956 		                            S_00B22C_LDS_SIZE(pipeline->graphics.gs.lds_size));
    957 
    958 		radeon_set_context_reg(cmd_buffer->cs, R_028A44_VGT_GS_ONCHIP_CNTL, pipeline->graphics.gs.vgt_gs_onchip_cntl);
    959 		radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup);
    960 	} else {
    961 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
    962 		radeon_emit(cmd_buffer->cs, va >> 8);
    963 		radeon_emit(cmd_buffer->cs, va >> 40);
    964 		radeon_emit(cmd_buffer->cs, gs->rsrc1);
    965 		radeon_emit(cmd_buffer->cs, gs->rsrc2);
    966 	}
    967 
    968 	radv_emit_hw_vs(cmd_buffer, pipeline, pipeline->gs_copy_shader);
    969 
    970 	struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
    971 							     AC_UD_GS_VS_RING_STRIDE_ENTRIES);
    972 	if (loc->sgpr_idx != -1) {
    973 		uint32_t stride = gs->info.gs.max_gsvs_emit_size;
    974 		uint32_t num_entries = 64;
    975 		bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI;
    976 
    977 		if (is_vi)
    978 			num_entries *= stride;
    979 
    980 		stride = S_008F04_STRIDE(stride);
    981 		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + loc->sgpr_idx * 4, 2);
    982 		radeon_emit(cmd_buffer->cs, stride);
    983 		radeon_emit(cmd_buffer->cs, num_entries);
    984 	}
    985 }
    986 
    987 static void
    988 radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
    989 			  struct radv_pipeline *pipeline)
    990 {
    991 	struct radv_shader_variant *ps;
    992 	uint64_t va;
    993 	struct radv_blend_state *blend = &pipeline->graphics.blend;
    994 	assert (pipeline->shaders[MESA_SHADER_FRAGMENT]);
    995 
    996 	ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
    997 	va = radv_buffer_get_va(ps->bo) + ps->bo_offset;
    998 
    999 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
   1000 	radeon_emit(cmd_buffer->cs, va >> 8);
   1001 	radeon_emit(cmd_buffer->cs, va >> 40);
   1002 	radeon_emit(cmd_buffer->cs, ps->rsrc1);
   1003 	radeon_emit(cmd_buffer->cs, ps->rsrc2);
   1004 
   1005 	radeon_set_context_reg(cmd_buffer->cs, R_02880C_DB_SHADER_CONTROL,
   1006 			       pipeline->graphics.db_shader_control);
   1007 
   1008 	radeon_set_context_reg(cmd_buffer->cs, R_0286CC_SPI_PS_INPUT_ENA,
   1009 			       ps->config.spi_ps_input_ena);
   1010 
   1011 	radeon_set_context_reg(cmd_buffer->cs, R_0286D0_SPI_PS_INPUT_ADDR,
   1012 			       ps->config.spi_ps_input_addr);
   1013 
   1014 	radeon_set_context_reg(cmd_buffer->cs, R_0286D8_SPI_PS_IN_CONTROL,
   1015 			       S_0286D8_NUM_INTERP(ps->info.fs.num_interp));
   1016 
   1017 	radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
   1018 
   1019 	radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT,
   1020 			       pipeline->graphics.shader_z_format);
   1021 
   1022 	radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
   1023 
   1024 	radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask);
   1025 	radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
   1026 
   1027 	if (cmd_buffer->device->dfsm_allowed) {
   1028 		/* optimise this? */
   1029 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   1030 		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
   1031 	}
   1032 
   1033 	if (pipeline->graphics.ps_input_cntl_num) {
   1034 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028644_SPI_PS_INPUT_CNTL_0, pipeline->graphics.ps_input_cntl_num);
   1035 		for (unsigned i = 0; i < pipeline->graphics.ps_input_cntl_num; i++) {
   1036 			radeon_emit(cmd_buffer->cs, pipeline->graphics.ps_input_cntl[i]);
   1037 		}
   1038 	}
   1039 }
   1040 
   1041 static void
   1042 radv_emit_vgt_vertex_reuse(struct radv_cmd_buffer *cmd_buffer,
   1043 			   struct radv_pipeline *pipeline)
   1044 {
   1045 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   1046 
   1047 	if (cmd_buffer->device->physical_device->rad_info.family < CHIP_POLARIS10)
   1048 		return;
   1049 
   1050 	radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
   1051 			       pipeline->graphics.vtx_reuse_depth);
   1052 }
   1053 
   1054 static void
   1055 radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer,
   1056 			   struct radv_pipeline *pipeline)
   1057 {
   1058 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   1059 
   1060 	if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
   1061 		return;
   1062 
   1063 	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
   1064 			       pipeline->graphics.bin.pa_sc_binner_cntl_0);
   1065 	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
   1066 			       pipeline->graphics.bin.db_dfsm_control);
   1067 }
   1068 
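         /* Emit all state that depends only on the bound graphics pipeline.
          * Skipped entirely if the same pipeline has already been emitted;
          * scissors are re-dirtied when the guardband setting changes because
          * they depend on it.
          */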
   1069 static void
   1070 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
   1071 {
   1072 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   1073 
   1074 	if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
   1075 		return;
   1076 
   1077 	radv_emit_graphics_depth_stencil_state(cmd_buffer, pipeline);
   1078 	radv_emit_graphics_blend_state(cmd_buffer, pipeline);
   1079 	radv_emit_graphics_raster_state(cmd_buffer, pipeline);
   1080 	radv_update_multisample_state(cmd_buffer, pipeline);
   1081 	radv_emit_vertex_shader(cmd_buffer, pipeline);
   1082 	radv_emit_tess_shaders(cmd_buffer, pipeline);
   1083 	radv_emit_geometry_shader(cmd_buffer, pipeline);
   1084 	radv_emit_fragment_shader(cmd_buffer, pipeline);
   1085 	radv_emit_vgt_vertex_reuse(cmd_buffer, pipeline);
   1086 	radv_emit_binning_state(cmd_buffer, pipeline);
   1087 
   1088 	cmd_buffer->scratch_size_needed =
   1089 	                          MAX2(cmd_buffer->scratch_size_needed,
   1090 	                               pipeline->max_waves * pipeline->scratch_bytes_per_wave);
   1091 
   1092 	radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
   1093 			       S_0286E8_WAVES(pipeline->max_waves) |
   1094 			       S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
   1095 
   1096 	if (!cmd_buffer->state.emitted_pipeline ||
   1097 	    cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
   1098 	     pipeline->graphics.can_use_guardband)
   1099 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
   1100 
   1101 	radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, pipeline->graphics.vgt_shader_stages_en);
   1102 
   1103 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
   1104 		radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, pipeline->graphics.prim);
   1105 	} else {
   1106 		radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, pipeline->graphics.prim);
   1107 	}
   1108 	radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, pipeline->graphics.gs_out);
   1109 
   1110 	radeon_set_context_reg(cmd_buffer->cs, R_02820C_PA_SC_CLIPRECT_RULE, pipeline->graphics.pa_sc_cliprect_rule);
   1111 
   1112 	if (unlikely(cmd_buffer->device->trace_bo))
   1113 		radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);
   1114 
   1115 	cmd_buffer->state.emitted_pipeline = pipeline;
   1116 
   1117 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
   1118 }
   1119 
   1120 static void
   1121 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
   1122 {
   1123 	si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
   1124 			  cmd_buffer->state.dynamic.viewport.viewports);
   1125 }
   1126 
   1127 static void
   1128 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
   1129 {
   1130 	uint32_t count = cmd_buffer->state.dynamic.scissor.count;
   1131 
   1132 	/* Vega10/Raven scissor bug workaround. This must be done before VPORT
   1133 	 * scissor registers are changed. There is also a more efficient but
   1134 	 * more involved alternative workaround.
   1135 	 */
   1136 	if (cmd_buffer->device->physical_device->has_scissor_bug) {
   1137 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
   1138 		si_emit_cache_flush(cmd_buffer);
   1139 	}
   1140 	si_write_scissors(cmd_buffer->cs, 0, count,
   1141 			  cmd_buffer->state.dynamic.scissor.scissors,
   1142 			  cmd_buffer->state.dynamic.viewport.viewports,
   1143 			  cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
   1144 }
   1145 
   1146 static void
   1147 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
   1148 {
   1149 	if (!cmd_buffer->state.dynamic.discard_rectangle.count)
   1150 		return;
   1151 
   1152 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
   1153 	                           cmd_buffer->state.dynamic.discard_rectangle.count * 2);
   1154 	for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
   1155 		VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
   1156 		radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
   1157 		radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
   1158 		                            S_028214_BR_Y(rect.offset.y + rect.extent.height));
   1159 	}
   1160 }
   1161 
   1162 static void
   1163 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
   1164 {
   1165 	unsigned width = cmd_buffer->state.dynamic.line_width * 8;
   1166 
   1167 	radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
   1168 			       S_028A08_WIDTH(CLAMP(width, 0, 0xFFF)));
   1169 }
   1170 
   1171 static void
   1172 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
   1173 {
   1174 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   1175 
   1176 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
   1177 	radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
   1178 }
   1179 
   1180 static void
   1181 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
   1182 {
   1183 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   1184 
   1185 	radeon_set_context_reg_seq(cmd_buffer->cs,
   1186 				   R_028430_DB_STENCILREFMASK, 2);
   1187 	radeon_emit(cmd_buffer->cs,
   1188 		    S_028430_STENCILTESTVAL(d->stencil_reference.front) |
   1189 		    S_028430_STENCILMASK(d->stencil_compare_mask.front) |
   1190 		    S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
   1191 		    S_028430_STENCILOPVAL(1));
   1192 	radeon_emit(cmd_buffer->cs,
   1193 		    S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
   1194 		    S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
   1195 		    S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
   1196 		    S_028434_STENCILOPVAL_BF(1));
   1197 }
   1198 
   1199 static void
   1200 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
   1201 {
   1202 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   1203 
   1204 	radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
   1205 			       fui(d->depth_bounds.min));
   1206 	radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
   1207 			       fui(d->depth_bounds.max));
   1208 }
   1209 
   1210 static void
   1211 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
   1212 {
   1213 	struct radv_raster_state *raster = &cmd_buffer->state.pipeline->graphics.raster;
   1214 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   1215 	unsigned slope = fui(d->depth_bias.slope * 16.0f);
   1216 	unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);
   1217 
   1218 	if (G_028814_POLY_OFFSET_FRONT_ENABLE(raster->pa_su_sc_mode_cntl)) {
   1219 		radeon_set_context_reg_seq(cmd_buffer->cs,
   1220 					   R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
   1221 		radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
   1222 		radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
   1223 		radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
   1224 		radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
   1225 		radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
   1226 	}
   1227 }
   1228 
   1229 static void
   1230 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
   1231 			 int index,
   1232 			 struct radv_attachment_info *att,
   1233 			 struct radv_image *image,
   1234 			 VkImageLayout layout)
   1235 {
   1236 	bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= VI;
   1237 	struct radv_color_buffer_info *cb = &att->cb;
   1238 	uint32_t cb_color_info = cb->cb_color_info;
   1239 
   1240 	if (!radv_layout_dcc_compressed(image, layout,
   1241 	                                radv_image_queue_family_mask(image,
   1242 	                                                             cmd_buffer->queue_family_index,
   1243 	                                                             cmd_buffer->queue_family_index))) {
   1244 		cb_color_info &= C_028C70_DCC_ENABLE;
   1245 	}
   1246 
   1247 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
   1248 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
   1249 		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
   1250 		radeon_emit(cmd_buffer->cs, cb->cb_color_base >> 32);
   1251 		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
   1252 		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
   1253 		radeon_emit(cmd_buffer->cs, cb_color_info);
   1254 		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
   1255 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
   1256 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
   1257 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask >> 32);
   1258 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
   1259 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask >> 32);
   1260 
   1261 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
   1262 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
   1263 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base >> 32);
   1264 
   1265 		radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
   1266 				       S_0287A0_EPITCH(att->attachment->image->surface.u.gfx9.surf.epitch));
   1267 	} else {
   1268 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
   1269 		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
   1270 		radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
   1271 		radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
   1272 		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
   1273 		radeon_emit(cmd_buffer->cs, cb_color_info);
   1274 		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
   1275 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
   1276 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
   1277 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
   1278 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
   1279 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
   1280 
   1281 		if (is_vi) { /* DCC BASE */
   1282 			radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
   1283 		}
   1284 	}
   1285 }
   1286 
   1287 static void
   1288 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
   1289 		      struct radv_ds_buffer_info *ds,
   1290 		      struct radv_image *image,
   1291 		      VkImageLayout layout)
   1292 {
   1293 	uint32_t db_z_info = ds->db_z_info;
   1294 	uint32_t db_stencil_info = ds->db_stencil_info;
   1295 
   1296 	if (!radv_layout_has_htile(image, layout,
   1297 	                           radv_image_queue_family_mask(image,
   1298 	                                                        cmd_buffer->queue_family_index,
   1299 	                                                        cmd_buffer->queue_family_index))) {
   1300 		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
   1301 		db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
   1302 	}
   1303 
   1304 	radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
   1305 	radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
   1306 
   1307 
   1308 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
   1309 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
   1310 		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
   1311 		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
   1312 		radeon_emit(cmd_buffer->cs, ds->db_depth_size);
   1313 
   1314 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
   1315 		radeon_emit(cmd_buffer->cs, db_z_info);			/* DB_Z_INFO */
   1316 		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* DB_STENCIL_INFO */
   1317 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* DB_Z_READ_BASE */
   1318 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);	/* DB_Z_READ_BASE_HI */
   1319 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* DB_STENCIL_READ_BASE */
   1320 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32); /* DB_STENCIL_READ_BASE_HI */
   1321 		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* DB_Z_WRITE_BASE */
   1322 		radeon_emit(cmd_buffer->cs, ds->db_z_write_base >> 32);	/* DB_Z_WRITE_BASE_HI */
   1323 		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* DB_STENCIL_WRITE_BASE */
   1324 		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
   1325 
   1326 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
   1327 		radeon_emit(cmd_buffer->cs, ds->db_z_info2);
   1328 		radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
   1329 	} else {
   1330 		radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
   1331 
   1332 		radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
   1333 		radeon_emit(cmd_buffer->cs, ds->db_depth_info);	/* R_02803C_DB_DEPTH_INFO */
   1334 		radeon_emit(cmd_buffer->cs, db_z_info);			/* R_028040_DB_Z_INFO */
   1335 		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* R_028044_DB_STENCIL_INFO */
   1336 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* R_028048_DB_Z_READ_BASE */
   1337 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* R_02804C_DB_STENCIL_READ_BASE */
   1338 		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* R_028050_DB_Z_WRITE_BASE */
   1339 		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* R_028054_DB_STENCIL_WRITE_BASE */
   1340 		radeon_emit(cmd_buffer->cs, ds->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
   1341 		radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */
   1342 
   1343 	}
   1344 
   1345 	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
   1346 			       ds->pa_su_poly_offset_db_fmt_cntl);
   1347 }
   1348 
   1349 void
   1350 radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
   1351 			  struct radv_image *image,
   1352 			  VkClearDepthStencilValue ds_clear_value,
   1353 			  VkImageAspectFlags aspects)
   1354 {
   1355 	uint64_t va = radv_buffer_get_va(image->bo);
   1356 	va += image->offset + image->clear_value_offset;
   1357 	unsigned reg_offset = 0, reg_count = 0;
   1358 
   1359 	assert(image->surface.htile_size);
   1360 
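         	/* The clear value lives in the image's metadata as two consecutive
         	 * dwords (stencil, then depth); when the stencil aspect is not
         	 * written, skip that slot both in memory and in the register pair.
         	 */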
   1361 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
   1362 		++reg_count;
   1363 	} else {
   1364 		++reg_offset;
   1365 		va += 4;
   1366 	}
   1367 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
   1368 		++reg_count;
   1369 
   1370 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0));
   1371 	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
   1372 				    S_370_WR_CONFIRM(1) |
   1373 				    S_370_ENGINE_SEL(V_370_PFP));
   1374 	radeon_emit(cmd_buffer->cs, va);
   1375 	radeon_emit(cmd_buffer->cs, va >> 32);
   1376 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
   1377 		radeon_emit(cmd_buffer->cs, ds_clear_value.stencil);
   1378 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
   1379 		radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth));
   1380 
   1381 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028028_DB_STENCIL_CLEAR + 4 * reg_offset, reg_count);
   1382 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
   1383 		radeon_emit(cmd_buffer->cs, ds_clear_value.stencil); /* R_028028_DB_STENCIL_CLEAR */
   1384 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
   1385 		radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth)); /* R_02802C_DB_DEPTH_CLEAR */
   1386 }
   1387 
   1388 static void
   1389 radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
   1390 			   struct radv_image *image)
   1391 {
   1392 	VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
   1393 	uint64_t va = radv_buffer_get_va(image->bo);
   1394 	va += image->offset + image->clear_value_offset;
   1395 	unsigned reg_offset = 0, reg_count = 0;
   1396 
   1397 	if (!image->surface.htile_size)
   1398 		return;
   1399 
   1400 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
   1401 		++reg_count;
   1402 	} else {
   1403 		++reg_offset;
   1404 		va += 4;
   1405 	}
   1406 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
   1407 		++reg_count;
   1408 
   1409 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
   1410 	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
   1411 				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
   1412 				    (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
   1413 	radeon_emit(cmd_buffer->cs, va);
   1414 	radeon_emit(cmd_buffer->cs, va >> 32);
   1415 	radeon_emit(cmd_buffer->cs, (R_028028_DB_STENCIL_CLEAR + 4 * reg_offset) >> 2);
   1416 	radeon_emit(cmd_buffer->cs, 0);
   1417 
   1418 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   1419 	radeon_emit(cmd_buffer->cs, 0);
   1420 }
   1421 
   1422 /*
    1423  * With DCC, some colors don't require CMASK elimination before being
    1424  * used as a texture. This sets a predicate value to determine whether
    1425  * the CMASK eliminate pass is required.
   1426  */
   1427 void
   1428 radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
   1429 				  struct radv_image *image,
   1430 				  bool value)
   1431 {
   1432 	uint64_t pred_val = value;
   1433 	uint64_t va = radv_buffer_get_va(image->bo);
   1434 	va += image->offset + image->dcc_pred_offset;
   1435 
   1436 	assert(image->surface.dcc_size);
   1437 
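         	/* The predicate is a 64-bit value written through the PFP into the
         	 * image's metadata at dcc_pred_offset.
         	 */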
   1438 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
   1439 	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
   1440 				    S_370_WR_CONFIRM(1) |
   1441 				    S_370_ENGINE_SEL(V_370_PFP));
   1442 	radeon_emit(cmd_buffer->cs, va);
   1443 	radeon_emit(cmd_buffer->cs, va >> 32);
   1444 	radeon_emit(cmd_buffer->cs, pred_val);
   1445 	radeon_emit(cmd_buffer->cs, pred_val >> 32);
   1446 }
   1447 
   1448 void
   1449 radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
   1450 			  struct radv_image *image,
   1451 			  int idx,
   1452 			  uint32_t color_values[2])
   1453 {
   1454 	uint64_t va = radv_buffer_get_va(image->bo);
   1455 	va += image->offset + image->clear_value_offset;
   1456 
   1457 	assert(image->cmask.size || image->surface.dcc_size);
   1458 
   1459 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
   1460 	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
   1461 				    S_370_WR_CONFIRM(1) |
   1462 				    S_370_ENGINE_SEL(V_370_PFP));
   1463 	radeon_emit(cmd_buffer->cs, va);
   1464 	radeon_emit(cmd_buffer->cs, va >> 32);
   1465 	radeon_emit(cmd_buffer->cs, color_values[0]);
   1466 	radeon_emit(cmd_buffer->cs, color_values[1]);
   1467 
   1468 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c, 2);
   1469 	radeon_emit(cmd_buffer->cs, color_values[0]);
   1470 	radeon_emit(cmd_buffer->cs, color_values[1]);
   1471 }
   1472 
   1473 static void
   1474 radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
   1475 			   struct radv_image *image,
   1476 			   int idx)
   1477 {
   1478 	uint64_t va = radv_buffer_get_va(image->bo);
   1479 	va += image->offset + image->clear_value_offset;
   1480 
   1481 	if (!image->cmask.size && !image->surface.dcc_size)
   1482 		return;
   1483 
   1484 	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c;
   1485 
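         	/* Copy both clear words from the image's metadata straight into
         	 * CB_COLOR0_CLEAR_WORD0/1 with the CP, then stall the PFP until the
         	 * ME has finished the copy.
         	 */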
   1486 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
   1487 	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
   1488 				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
   1489 				    COPY_DATA_COUNT_SEL);
   1490 	radeon_emit(cmd_buffer->cs, va);
   1491 	radeon_emit(cmd_buffer->cs, va >> 32);
   1492 	radeon_emit(cmd_buffer->cs, reg >> 2);
   1493 	radeon_emit(cmd_buffer->cs, 0);
   1494 
   1495 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
   1496 	radeon_emit(cmd_buffer->cs, 0);
   1497 }
   1498 
   1499 static void
   1500 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
   1501 {
   1502 	int i;
   1503 	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   1504 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   1505 
    1506 	/* This may happen when recording an inherited secondary command buffer. */
   1507 	if (!framebuffer)
   1508 		return;
   1509 
   1510 	for (i = 0; i < 8; ++i) {
   1511 		if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
   1512 			radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
   1513 				       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
   1514 			continue;
   1515 		}
   1516 
   1517 		int idx = subpass->color_attachments[i].attachment;
   1518 		struct radv_attachment_info *att = &framebuffer->attachments[idx];
   1519 		struct radv_image *image = att->attachment->image;
   1520 		VkImageLayout layout = subpass->color_attachments[i].layout;
   1521 
   1522 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo, 8);
   1523 
   1524 		assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT);
   1525 		radv_emit_fb_color_state(cmd_buffer, i, att, image, layout);
   1526 
   1527 		radv_load_color_clear_regs(cmd_buffer, image, i);
   1528 	}
   1529 
   1530 	if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
   1531 		int idx = subpass->depth_stencil_attachment.attachment;
   1532 		VkImageLayout layout = subpass->depth_stencil_attachment.layout;
   1533 		struct radv_attachment_info *att = &framebuffer->attachments[idx];
   1534 		struct radv_image *image = att->attachment->image;
   1535 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo, 8);
   1536 		MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image,
   1537 										cmd_buffer->queue_family_index,
   1538 										cmd_buffer->queue_family_index);
   1539 		/* We currently don't support writing decompressed HTILE */
   1540 		assert(radv_layout_has_htile(image, layout, queue_mask) ==
   1541 		       radv_layout_is_htile_compressed(image, layout, queue_mask));
   1542 
   1543 		radv_emit_fb_ds_state(cmd_buffer, &att->ds, image, layout);
   1544 
   1545 		if (att->ds.offset_scale != cmd_buffer->state.offset_scale) {
   1546 			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
   1547 			cmd_buffer->state.offset_scale = att->ds.offset_scale;
   1548 		}
   1549 		radv_load_depth_clear_regs(cmd_buffer, image);
   1550 	} else {
   1551 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
   1552 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
   1553 		else
   1554 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
   1555 
   1556 		radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
   1557 		radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
   1558 	}
   1559 	radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
   1560 			       S_028208_BR_X(framebuffer->width) |
   1561 			       S_028208_BR_Y(framebuffer->height));
   1562 
   1563 	if (cmd_buffer->device->dfsm_allowed) {
   1564 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   1565 		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
   1566 	}
   1567 
   1568 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
   1569 }
   1570 
   1571 static void
   1572 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
   1573 {
   1574 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   1575 	struct radv_cmd_state *state = &cmd_buffer->state;
   1576 
   1577 	if (state->index_type != state->last_index_type) {
   1578 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
   1579 			radeon_set_uconfig_reg_idx(cs, R_03090C_VGT_INDEX_TYPE,
   1580 						   2, state->index_type);
   1581 		} else {
   1582 			radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
   1583 			radeon_emit(cs, state->index_type);
   1584 		}
   1585 
   1586 		state->last_index_type = state->index_type;
   1587 	}
   1588 
   1589 	radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
   1590 	radeon_emit(cs, state->index_va);
   1591 	radeon_emit(cs, state->index_va >> 32);
   1592 
   1593 	radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
   1594 	radeon_emit(cs, state->max_index_count);
   1595 
   1596 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
   1597 }
   1598 
   1599 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
   1600 {
   1601 	uint32_t db_count_control;
   1602 
   1603 	if(!cmd_buffer->state.active_occlusion_queries) {
   1604 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
   1605 			db_count_control = 0;
   1606 		} else {
   1607 			db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
   1608 		}
   1609 	} else {
   1610 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
   1611 			db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
   1612 				S_028004_SAMPLE_RATE(0) | /* TODO: set this to the number of samples of the current framebuffer */
   1613 				S_028004_ZPASS_ENABLE(1) |
   1614 				S_028004_SLICE_EVEN_ENABLE(1) |
   1615 				S_028004_SLICE_ODD_ENABLE(1);
   1616 		} else {
   1617 			db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
   1618 				S_028004_SAMPLE_RATE(0); /* TODO: set this to the number of samples of the current framebuffer */
   1619 		}
   1620 	}
   1621 
   1622 	radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
   1623 }
   1624 
   1625 static void
   1626 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
   1627 {
   1628 	if (G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.raster.pa_cl_clip_cntl))
   1629 		return;
   1630 
   1631 	if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
   1632 		radv_emit_viewport(cmd_buffer);
   1633 
   1634 	if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
   1635 		radv_emit_scissor(cmd_buffer);
   1636 
   1637 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
   1638 		radv_emit_line_width(cmd_buffer);
   1639 
   1640 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
   1641 		radv_emit_blend_constants(cmd_buffer);
   1642 
   1643 	if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
   1644 				       RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
   1645 				       RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
   1646 		radv_emit_stencil(cmd_buffer);
   1647 
   1648 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
   1649 		radv_emit_depth_bounds(cmd_buffer);
   1650 
   1651 	if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_PIPELINE |
   1652 				       RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS))
   1653 		radv_emit_depth_bias(cmd_buffer);
   1654 
   1655 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
   1656 		radv_emit_discard_rectangle(cmd_buffer);
   1657 
   1658 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DYNAMIC_ALL;
   1659 }
   1660 
   1661 static void
   1662 emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
   1663 				   struct radv_pipeline *pipeline,
   1664 				   int idx,
   1665 				   uint64_t va,
   1666 				   gl_shader_stage stage)
   1667 {
   1668 	struct ac_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx];
   1669 	uint32_t base_reg = pipeline->user_data_0[stage];
   1670 
   1671 	if (desc_set_loc->sgpr_idx == -1 || desc_set_loc->indirect)
   1672 		return;
   1673 
   1674 	assert(!desc_set_loc->indirect);
   1675 	assert(desc_set_loc->num_sgprs == 2);
   1676 	radeon_set_sh_reg_seq(cmd_buffer->cs,
   1677 			      base_reg + desc_set_loc->sgpr_idx * 4, 2);
   1678 	radeon_emit(cmd_buffer->cs, va);
   1679 	radeon_emit(cmd_buffer->cs, va >> 32);
   1680 }
   1681 
   1682 static void
   1683 radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
   1684 				  VkShaderStageFlags stages,
   1685 				  struct radv_descriptor_set *set,
   1686 				  unsigned idx)
   1687 {
   1688 	if (cmd_buffer->state.pipeline) {
   1689 		radv_foreach_stage(stage, stages) {
   1690 			if (cmd_buffer->state.pipeline->shaders[stage])
   1691 				emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
   1692 								   idx, set->va,
   1693 								   stage);
   1694 		}
   1695 	}
   1696 
   1697 	if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT))
   1698 		emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.compute_pipeline,
   1699 						   idx, set->va,
   1700 						   MESA_SHADER_COMPUTE);
   1701 }
   1702 
   1703 static void
   1704 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer)
   1705 {
   1706 	struct radv_descriptor_set *set = &cmd_buffer->push_descriptors.set;
   1707 	unsigned bo_offset;
   1708 
   1709 	if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32,
   1710 					 set->mapped_ptr,
   1711 					 &bo_offset))
   1712 		return;
   1713 
   1714 	set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   1715 	set->va += bo_offset;
   1716 }
   1717 
   1718 static void
   1719 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer)
   1720 {
   1721 	uint32_t size = MAX_SETS * 2 * 4;
   1722 	uint32_t offset;
   1723 	void *ptr;
   1724 
   1725 	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
   1726 					  256, &offset, &ptr))
   1727 		return;
   1728 
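         	/* Upload a table with one 64-bit GPU address per descriptor set;
         	 * sets that are not currently valid get a null address.
         	 */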
   1729 	for (unsigned i = 0; i < MAX_SETS; i++) {
   1730 		uint32_t *uptr = ((uint32_t *)ptr) + i * 2;
   1731 		uint64_t set_va = 0;
   1732 		struct radv_descriptor_set *set = cmd_buffer->descriptors[i];
   1733 		if (cmd_buffer->state.valid_descriptors & (1u << i))
   1734 			set_va = set->va;
   1735 		uptr[0] = set_va & 0xffffffff;
   1736 		uptr[1] = set_va >> 32;
   1737 	}
   1738 
   1739 	uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   1740 	va += offset;
   1741 
   1742 	if (cmd_buffer->state.pipeline) {
   1743 		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
   1744 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
   1745 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   1746 
   1747 		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
   1748 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
   1749 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   1750 
   1751 		if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
   1752 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
   1753 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   1754 
   1755 		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
   1756 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
   1757 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   1758 
   1759 		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
   1760 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
   1761 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   1762 	}
   1763 
   1764 	if (cmd_buffer->state.compute_pipeline)
   1765 		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
   1766 					   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   1767 }
   1768 
   1769 static void
   1770 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
   1771 		       VkShaderStageFlags stages)
   1772 {
   1773 	unsigned i;
   1774 
   1775 	if (!cmd_buffer->state.descriptors_dirty)
   1776 		return;
   1777 
   1778 	if (cmd_buffer->state.push_descriptors_dirty)
   1779 		radv_flush_push_descriptors(cmd_buffer);
   1780 
   1781 	if ((cmd_buffer->state.pipeline && cmd_buffer->state.pipeline->need_indirect_descriptor_sets) ||
   1782 	    (cmd_buffer->state.compute_pipeline && cmd_buffer->state.compute_pipeline->need_indirect_descriptor_sets)) {
   1783 		radv_flush_indirect_descriptor_sets(cmd_buffer);
   1784 	}
   1785 
   1786 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
   1787 	                                                   cmd_buffer->cs,
   1788 	                                                   MAX_SETS * MESA_SHADER_STAGES * 4);
   1789 
   1790 	for_each_bit(i, cmd_buffer->state.descriptors_dirty) {
   1791 		struct radv_descriptor_set *set = cmd_buffer->descriptors[i];
   1792 		if (!(cmd_buffer->state.valid_descriptors & (1u << i)))
   1793 			continue;
   1794 
   1795 		radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i);
   1796 	}
   1797 	cmd_buffer->state.descriptors_dirty = 0;
   1798 	cmd_buffer->state.push_descriptors_dirty = false;
   1799 
   1800 	if (unlikely(cmd_buffer->device->trace_bo))
   1801 		radv_save_descriptors(cmd_buffer);
   1802 
   1803 	assert(cmd_buffer->cs->cdw <= cdw_max);
   1804 }
   1805 
   1806 static void
   1807 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
   1808 		     struct radv_pipeline *pipeline,
   1809 		     VkShaderStageFlags stages)
   1810 {
   1811 	struct radv_pipeline_layout *layout = pipeline->layout;
   1812 	unsigned offset;
   1813 	void *ptr;
   1814 	uint64_t va;
   1815 
   1816 	stages &= cmd_buffer->push_constant_stages;
   1817 	if (!stages ||
   1818 	    (!layout->push_constant_size && !layout->dynamic_offset_count))
   1819 		return;
   1820 
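         	/* Upload the push constants followed by the dynamic buffer
         	 * descriptors (16 bytes each), then point each active stage's
         	 * AC_UD_PUSH_CONSTANTS user SGPR at the copy.
         	 */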
   1821 	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
   1822 					  16 * layout->dynamic_offset_count,
   1823 					  256, &offset, &ptr))
   1824 		return;
   1825 
   1826 	memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
   1827 	memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
   1828 	       16 * layout->dynamic_offset_count);
   1829 
   1830 	va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   1831 	va += offset;
   1832 
   1833 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
   1834 	                                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
   1835 
   1836 	radv_foreach_stage(stage, stages) {
   1837 		if (pipeline->shaders[stage]) {
   1838 			radv_emit_userdata_address(cmd_buffer, pipeline, stage,
   1839 						   AC_UD_PUSH_CONSTANTS, va);
   1840 		}
   1841 	}
   1842 
   1843 	cmd_buffer->push_constant_stages &= ~stages;
   1844 	assert(cmd_buffer->cs->cdw <= cdw_max);
   1845 }
   1846 
   1847 static bool
   1848 radv_cmd_buffer_update_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
   1849 {
   1850 	if ((pipeline_is_dirty ||
   1851 	    (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
   1852 	    cmd_buffer->state.pipeline->vertex_elements.count &&
   1853 	    radv_get_vertex_shader(cmd_buffer->state.pipeline)->info.info.vs.has_vertex_buffers) {
   1854 		struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
   1855 		unsigned vb_offset;
   1856 		void *vb_ptr;
   1857 		uint32_t i = 0;
   1858 		uint32_t count = velems->count;
   1859 		uint64_t va;
   1860 
   1861 		/* allocate some descriptor state for vertex buffers */
   1862 		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
   1863 						  &vb_offset, &vb_ptr))
   1864 			return false;
   1865 
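         		/* Build a 4-dword buffer descriptor per vertex binding: base
         		 * address, stride, size/record count and the format word
         		 * precomputed at pipeline creation.
         		 */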
   1866 		for (i = 0; i < count; i++) {
   1867 			uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
   1868 			uint32_t offset;
   1869 			int vb = velems->binding[i];
   1870 			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[vb].buffer;
   1871 			uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb];
   1872 
   1873 			va = radv_buffer_get_va(buffer->bo);
   1874 
   1875 			offset = cmd_buffer->vertex_bindings[vb].offset + velems->offset[i];
   1876 			va += offset + buffer->offset;
   1877 			desc[0] = va;
   1878 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
   1879 			if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
   1880 				desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
   1881 			else
   1882 				desc[2] = buffer->size - offset;
   1883 			desc[3] = velems->rsrc_word3[i];
   1884 		}
   1885 
   1886 		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   1887 		va += vb_offset;
   1888 
   1889 		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
   1890 					   AC_UD_VS_VERTEX_BUFFERS, va);
   1891 
   1892 		cmd_buffer->state.vb_va = va;
   1893 		cmd_buffer->state.vb_size = count * 16;
   1894 		cmd_buffer->state.vb_prefetch_dirty = true;
   1895 	}
   1896 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
   1897 
   1898 	return true;
   1899 }
   1900 
   1901 static bool
   1902 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
   1903 {
   1904 	if (!radv_cmd_buffer_update_vertex_descriptors(cmd_buffer, pipeline_is_dirty))
   1905 		return false;
   1906 
   1907 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
   1908 	radv_flush_constants(cmd_buffer, cmd_buffer->state.pipeline,
   1909 			     VK_SHADER_STAGE_ALL_GRAPHICS);
   1910 
   1911 	return true;
   1912 }
   1913 
   1914 static void
   1915 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw,
   1916 			 bool instanced_draw, bool indirect_draw,
   1917 			 uint32_t draw_vertex_count)
   1918 {
   1919 	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
   1920 	struct radv_cmd_state *state = &cmd_buffer->state;
   1921 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   1922 	uint32_t ia_multi_vgt_param;
   1923 	int32_t primitive_reset_en;
   1924 
   1925 	/* Draw state. */
   1926 	ia_multi_vgt_param =
   1927 		si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
   1928 					  indirect_draw, draw_vertex_count);
   1929 
   1930 	if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
   1931 		if (info->chip_class >= GFX9) {
   1932 			radeon_set_uconfig_reg_idx(cs,
   1933 						   R_030960_IA_MULTI_VGT_PARAM,
   1934 						   4, ia_multi_vgt_param);
   1935 		} else if (info->chip_class >= CIK) {
   1936 			radeon_set_context_reg_idx(cs,
   1937 						   R_028AA8_IA_MULTI_VGT_PARAM,
   1938 						   1, ia_multi_vgt_param);
   1939 		} else {
   1940 			radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
   1941 					       ia_multi_vgt_param);
   1942 		}
   1943 		state->last_ia_multi_vgt_param = ia_multi_vgt_param;
   1944 	}
   1945 
   1946 	/* Primitive restart. */
   1947 	primitive_reset_en =
   1948 		indexed_draw && state->pipeline->graphics.prim_restart_enable;
   1949 
   1950 	if (primitive_reset_en != state->last_primitive_reset_en) {
   1951 		state->last_primitive_reset_en = primitive_reset_en;
   1952 		if (info->chip_class >= GFX9) {
   1953 			radeon_set_uconfig_reg(cs,
   1954 					       R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
   1955 					       primitive_reset_en);
   1956 		} else {
   1957 			radeon_set_context_reg(cs,
   1958 					       R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
   1959 					       primitive_reset_en);
   1960 		}
   1961 	}
   1962 
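         	/* The restart index depends on the index size: 0xffff for 16-bit
         	 * indices, 0xffffffff for 32-bit indices.
         	 */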
   1963 	if (primitive_reset_en) {
   1964 		uint32_t primitive_reset_index =
   1965 			state->index_type ? 0xffffffffu : 0xffffu;
   1966 
   1967 		if (primitive_reset_index != state->last_primitive_reset_index) {
   1968 			radeon_set_context_reg(cs,
   1969 					       R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
   1970 					       primitive_reset_index);
   1971 			state->last_primitive_reset_index = primitive_reset_index;
   1972 		}
   1973 	}
   1974 }
   1975 
   1976 static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
   1977 			     VkPipelineStageFlags src_stage_mask)
   1978 {
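         	/* Translate the source stage mask into the corresponding shader
         	 * flushes: CS, PS and/or VS partial flush.
         	 */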
   1979 	if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
   1980 	                      VK_PIPELINE_STAGE_TRANSFER_BIT |
   1981 	                      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
   1982 	                      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
   1983 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
   1984 	}
   1985 
   1986 	if (src_stage_mask & (VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
   1987 			      VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
   1988 			      VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
   1989 			      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
   1990 			      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
   1991 			      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
   1992 			      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
   1993 			      VK_PIPELINE_STAGE_TRANSFER_BIT |
   1994 			      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
   1995 			      VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
   1996 			      VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
   1997 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
   1998 	} else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
   1999 	                             VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
   2000 	                             VK_PIPELINE_STAGE_VERTEX_SHADER_BIT)) {
   2001 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
   2002 	}
   2003 }
   2004 
   2005 static enum radv_cmd_flush_bits
   2006 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
   2007 				  VkAccessFlags src_flags)
   2008 {
   2009 	enum radv_cmd_flush_bits flush_bits = 0;
   2010 	uint32_t b;
   2011 	for_each_bit(b, src_flags) {
   2012 		switch ((VkAccessFlagBits)(1 << b)) {
   2013 		case VK_ACCESS_SHADER_WRITE_BIT:
   2014 			flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
   2015 			break;
   2016 		case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
   2017 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   2018 			              RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   2019 			break;
   2020 		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
   2021 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   2022 			              RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   2023 			break;
   2024 		case VK_ACCESS_TRANSFER_WRITE_BIT:
   2025 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   2026 			              RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
   2027 			              RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   2028 			              RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
   2029 			              RADV_CMD_FLAG_INV_GLOBAL_L2;
   2030 			break;
   2031 		default:
   2032 			break;
   2033 		}
   2034 	}
   2035 	return flush_bits;
   2036 }
   2037 
   2038 static enum radv_cmd_flush_bits
   2039 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
   2040                       VkAccessFlags dst_flags,
   2041                       struct radv_image *image)
   2042 {
   2043 	enum radv_cmd_flush_bits flush_bits = 0;
   2044 	uint32_t b;
   2045 	for_each_bit(b, dst_flags) {
   2046 		switch ((VkAccessFlagBits)(1 << b)) {
   2047 		case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
   2048 		case VK_ACCESS_INDEX_READ_BIT:
   2049 			break;
   2050 		case VK_ACCESS_UNIFORM_READ_BIT:
   2051 			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;
   2052 			break;
   2053 		case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
   2054 		case VK_ACCESS_SHADER_READ_BIT:
   2055 		case VK_ACCESS_TRANSFER_READ_BIT:
   2056 		case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
   2057 			flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 |
   2058 			              RADV_CMD_FLAG_INV_GLOBAL_L2;
   2059 			break;
   2060 		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
   2061 			/* TODO: change to image && when the image gets passed
   2062 			 * through from the subpass. */
   2063 			if (!image || (image->usage & VK_IMAGE_USAGE_STORAGE_BIT))
   2064 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   2065 				              RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   2066 			break;
   2067 		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
   2068 			if (!image || (image->usage & VK_IMAGE_USAGE_STORAGE_BIT))
   2069 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   2070 				              RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   2071 			break;
   2072 		default:
   2073 			break;
   2074 		}
   2075 	}
   2076 	return flush_bits;
   2077 }
   2078 
   2079 static void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
   2080 {
   2081 	cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask);
   2082 	radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
   2083 	cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
   2084 	                                                      NULL);
   2085 }
   2086 
   2087 static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
   2088 						 VkAttachmentReference att)
   2089 {
   2090 	unsigned idx = att.attachment;
   2091 	struct radv_image_view *view = cmd_buffer->state.framebuffer->attachments[idx].attachment;
   2092 	VkImageSubresourceRange range;
   2093 	range.aspectMask = 0;
   2094 	range.baseMipLevel = view->base_mip;
   2095 	range.levelCount = 1;
   2096 	range.baseArrayLayer = view->base_layer;
   2097 	range.layerCount = cmd_buffer->state.framebuffer->layers;
   2098 
   2099 	radv_handle_image_transition(cmd_buffer,
   2100 				     view->image,
   2101 				     cmd_buffer->state.attachments[idx].current_layout,
   2102 				     att.layout, 0, 0, &range,
   2103 				     cmd_buffer->state.attachments[idx].pending_clear_aspects);
   2104 
   2105 	cmd_buffer->state.attachments[idx].current_layout = att.layout;
   2106 
   2107 
   2108 }
   2109 
   2110 void
   2111 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
   2112 			    const struct radv_subpass *subpass, bool transitions)
   2113 {
   2114 	if (transitions) {
   2115 		radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
   2116 
   2117 		for (unsigned i = 0; i < subpass->color_count; ++i) {
   2118 			if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
   2119 				radv_handle_subpass_image_transition(cmd_buffer,
   2120 				                                     subpass->color_attachments[i]);
   2121 		}
   2122 
   2123 		for (unsigned i = 0; i < subpass->input_count; ++i) {
   2124 			radv_handle_subpass_image_transition(cmd_buffer,
   2125 							subpass->input_attachments[i]);
   2126 		}
   2127 
   2128 		if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
   2129 			radv_handle_subpass_image_transition(cmd_buffer,
   2130 							subpass->depth_stencil_attachment);
   2131 		}
   2132 	}
   2133 
   2134 	cmd_buffer->state.subpass = subpass;
   2135 
   2136 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
   2137 }
   2138 
   2139 static VkResult
   2140 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
   2141 				 struct radv_render_pass *pass,
   2142 				 const VkRenderPassBeginInfo *info)
   2143 {
   2144 	struct radv_cmd_state *state = &cmd_buffer->state;
   2145 
   2146 	if (pass->attachment_count == 0) {
   2147 		state->attachments = NULL;
   2148 		return VK_SUCCESS;
   2149 	}
   2150 
   2151 	state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
   2152 					pass->attachment_count *
   2153 					sizeof(state->attachments[0]),
   2154 					8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   2155 	if (state->attachments == NULL) {
   2156 		cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
   2157 		return cmd_buffer->record_result;
   2158 	}
   2159 
   2160 	for (uint32_t i = 0; i < pass->attachment_count; ++i) {
   2161 		struct radv_render_pass_attachment *att = &pass->attachments[i];
   2162 		VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
   2163 		VkImageAspectFlags clear_aspects = 0;
   2164 
   2165 		if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
   2166 			/* color attachment */
   2167 			if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
   2168 				clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
   2169 			}
   2170 		} else {
   2171 			/* depthstencil attachment */
   2172 			if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
   2173 			    att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
   2174 				clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
   2175 				if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
   2176 				    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
   2177 					clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
   2178 			}
   2179 			if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
   2180 			    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
   2181 				clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
   2182 			}
   2183 		}
   2184 
   2185 		state->attachments[i].pending_clear_aspects = clear_aspects;
   2186 		state->attachments[i].cleared_views = 0;
   2187 		if (clear_aspects && info) {
   2188 			assert(info->clearValueCount > i);
   2189 			state->attachments[i].clear_value = info->pClearValues[i];
   2190 		}
   2191 
   2192 		state->attachments[i].current_layout = att->initial_layout;
   2193 	}
   2194 
   2195 	return VK_SUCCESS;
   2196 }
   2197 
   2198 VkResult radv_AllocateCommandBuffers(
   2199 	VkDevice _device,
   2200 	const VkCommandBufferAllocateInfo *pAllocateInfo,
   2201 	VkCommandBuffer *pCommandBuffers)
   2202 {
   2203 	RADV_FROM_HANDLE(radv_device, device, _device);
   2204 	RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
   2205 
   2206 	VkResult result = VK_SUCCESS;
   2207 	uint32_t i;
   2208 
   2209 	for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
   2210 
   2211 		if (!list_empty(&pool->free_cmd_buffers)) {
   2212 			struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
   2213 
   2214 			list_del(&cmd_buffer->pool_link);
   2215 			list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   2216 
   2217 			result = radv_reset_cmd_buffer(cmd_buffer);
   2218 			cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
   2219 			cmd_buffer->level = pAllocateInfo->level;
   2220 
   2221 			pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
   2222 		} else {
   2223 			result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
   2224 			                                &pCommandBuffers[i]);
   2225 		}
   2226 		if (result != VK_SUCCESS)
   2227 			break;
   2228 	}
   2229 
   2230 	if (result != VK_SUCCESS) {
   2231 		radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
   2232 					i, pCommandBuffers);
   2233 
   2234 		/* From the Vulkan 1.0.66 spec:
   2235 		 *
   2236 		 * "vkAllocateCommandBuffers can be used to create multiple
   2237 		 *  command buffers. If the creation of any of those command
   2238 		 *  buffers fails, the implementation must destroy all
   2239 		 *  successfully created command buffer objects from this
   2240 		 *  command, set all entries of the pCommandBuffers array to
   2241 		 *  NULL and return the error."
   2242 		 */
   2243 		memset(pCommandBuffers, 0,
   2244 		       sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   2245 	}
   2246 
   2247 	return result;
   2248 }
   2249 
   2250 void radv_FreeCommandBuffers(
   2251 	VkDevice device,
   2252 	VkCommandPool commandPool,
   2253 	uint32_t commandBufferCount,
   2254 	const VkCommandBuffer *pCommandBuffers)
   2255 {
   2256 	for (uint32_t i = 0; i < commandBufferCount; i++) {
   2257 		RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
   2258 
   2259 		if (cmd_buffer) {
   2260 			if (cmd_buffer->pool) {
   2261 				list_del(&cmd_buffer->pool_link);
   2262 				list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
   2263 			} else
   2264 				radv_cmd_buffer_destroy(cmd_buffer);
   2265 
   2266 		}
   2267 	}
   2268 }
   2269 
   2270 VkResult radv_ResetCommandBuffer(
   2271 	VkCommandBuffer commandBuffer,
   2272 	VkCommandBufferResetFlags flags)
   2273 {
   2274 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2275 	return radv_reset_cmd_buffer(cmd_buffer);
   2276 }
   2277 
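         /* Either call the device's pre-built gfx initialization IB with an
          * INDIRECT_BUFFER packet, or emit the default config inline.
          */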
   2278 static void emit_gfx_buffer_state(struct radv_cmd_buffer *cmd_buffer)
   2279 {
   2280 	struct radv_device *device = cmd_buffer->device;
   2281 	if (device->gfx_init) {
   2282 		uint64_t va = radv_buffer_get_va(device->gfx_init);
   2283 		radv_cs_add_buffer(device->ws, cmd_buffer->cs, device->gfx_init, 8);
   2284 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
   2285 		radeon_emit(cmd_buffer->cs, va);
   2286 		radeon_emit(cmd_buffer->cs, va >> 32);
   2287 		radeon_emit(cmd_buffer->cs, device->gfx_init_size_dw & 0xffff);
   2288 	} else
   2289 		si_init_config(cmd_buffer);
   2290 }
   2291 
   2292 VkResult radv_BeginCommandBuffer(
   2293 	VkCommandBuffer commandBuffer,
   2294 	const VkCommandBufferBeginInfo *pBeginInfo)
   2295 {
   2296 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2297 	VkResult result = VK_SUCCESS;
   2298 
   2299 	if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
    2300 		/* If the command buffer has already been reset with
   2301 		 * vkResetCommandBuffer, no need to do it again.
   2302 		 */
   2303 		result = radv_reset_cmd_buffer(cmd_buffer);
   2304 		if (result != VK_SUCCESS)
   2305 			return result;
   2306 	}
   2307 
   2308 	memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   2309 	cmd_buffer->state.last_primitive_reset_en = -1;
   2310 	cmd_buffer->state.last_index_type = -1;
   2311 	cmd_buffer->state.last_num_instances = -1;
   2312 	cmd_buffer->state.last_vertex_offset = -1;
   2313 	cmd_buffer->state.last_first_instance = -1;
   2314 	cmd_buffer->usage_flags = pBeginInfo->flags;
   2315 
    2316 	/* Set up the initial configuration for the command buffer. */
   2317 	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
   2318 		switch (cmd_buffer->queue_family_index) {
   2319 		case RADV_QUEUE_GENERAL:
   2320 			emit_gfx_buffer_state(cmd_buffer);
   2321 			break;
   2322 		case RADV_QUEUE_COMPUTE:
   2323 			si_init_compute(cmd_buffer);
   2324 			break;
   2325 		case RADV_QUEUE_TRANSFER:
   2326 		default:
   2327 			break;
   2328 		}
   2329 	}
   2330 
   2331 	if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
   2332 		assert(pBeginInfo->pInheritanceInfo);
   2333 		cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
   2334 		cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
   2335 
   2336 		struct radv_subpass *subpass =
   2337 			&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
   2338 
   2339 		result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
   2340 		if (result != VK_SUCCESS)
   2341 			return result;
   2342 
   2343 		radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false);
   2344 	}
   2345 
   2346 	if (unlikely(cmd_buffer->device->trace_bo))
   2347 		radv_cmd_buffer_trace_emit(cmd_buffer);
   2348 
   2349 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
   2350 
   2351 	return result;
   2352 }
   2353 
   2354 void radv_CmdBindVertexBuffers(
   2355 	VkCommandBuffer                             commandBuffer,
   2356 	uint32_t                                    firstBinding,
   2357 	uint32_t                                    bindingCount,
   2358 	const VkBuffer*                             pBuffers,
   2359 	const VkDeviceSize*                         pOffsets)
   2360 {
   2361 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2362 	struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
   2363 	bool changed = false;
   2364 
    2365 	/* We have to defer setting up vertex buffers since we need the buffer
   2366 	 * stride from the pipeline. */
   2367 
   2368 	assert(firstBinding + bindingCount <= MAX_VBS);
   2369 	for (uint32_t i = 0; i < bindingCount; i++) {
   2370 		uint32_t idx = firstBinding + i;
   2371 
   2372 		if (!changed &&
   2373 		    (vb[idx].buffer != radv_buffer_from_handle(pBuffers[i]) ||
   2374 		     vb[idx].offset != pOffsets[i])) {
   2375 			changed = true;
   2376 		}
   2377 
   2378 		vb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
   2379 		vb[idx].offset = pOffsets[i];
   2380 
   2381 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
   2382 				   vb[idx].buffer->bo, 8);
   2383 	}
   2384 
   2385 	if (!changed) {
   2386 		/* No state changes. */
   2387 		return;
   2388 	}
   2389 
   2390 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
   2391 }
   2392 
   2393 void radv_CmdBindIndexBuffer(
   2394 	VkCommandBuffer                             commandBuffer,
   2395 	VkBuffer buffer,
   2396 	VkDeviceSize offset,
   2397 	VkIndexType indexType)
   2398 {
   2399 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2400 	RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
   2401 
   2402 	if (cmd_buffer->state.index_buffer == index_buffer &&
   2403 	    cmd_buffer->state.index_offset == offset &&
   2404 	    cmd_buffer->state.index_type == indexType) {
   2405 		/* No state changes. */
   2406 		return;
   2407 	}
   2408 
   2409 	cmd_buffer->state.index_buffer = index_buffer;
   2410 	cmd_buffer->state.index_offset = offset;
   2411 	cmd_buffer->state.index_type = indexType; /* vk matches hw */
   2412 	cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
   2413 	cmd_buffer->state.index_va += index_buffer->offset + offset;
   2414 
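         	/* index_type matches the hardware encoding (0 = 16-bit, 1 = 32-bit),
         	 * so an index is 2 or 4 bytes wide.
         	 */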
   2415 	int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
   2416 	cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift;
   2417 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
   2418 	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo, 8);
   2419 }
   2420 
   2421 
   2422 static void
   2423 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
   2424 			 struct radv_descriptor_set *set, unsigned idx)
   2425 {
   2426 	struct radeon_winsys *ws = cmd_buffer->device->ws;
   2427 
   2428 	radv_set_descriptor_set(cmd_buffer, set, idx);
   2429 	if (!set)
   2430 		return;
   2431 
   2432 	assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
   2433 
   2434 	for (unsigned j = 0; j < set->layout->buffer_count; ++j)
   2435 		if (set->descriptors[j])
   2436 			radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j], 7);
   2437 
   2438 	if(set->bo)
   2439 		radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo, 8);
   2440 }
   2441 
   2442 void radv_CmdBindDescriptorSets(
   2443 	VkCommandBuffer                             commandBuffer,
   2444 	VkPipelineBindPoint                         pipelineBindPoint,
   2445 	VkPipelineLayout                            _layout,
   2446 	uint32_t                                    firstSet,
   2447 	uint32_t                                    descriptorSetCount,
   2448 	const VkDescriptorSet*                      pDescriptorSets,
   2449 	uint32_t                                    dynamicOffsetCount,
   2450 	const uint32_t*                             pDynamicOffsets)
   2451 {
   2452 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2453 	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   2454 	unsigned dyn_idx = 0;
   2455 
   2456 	for (unsigned i = 0; i < descriptorSetCount; ++i) {
   2457 		unsigned idx = i + firstSet;
   2458 		RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
   2459 		radv_bind_descriptor_set(cmd_buffer, set, idx);
   2460 
   2461 		for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
   2462 			unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
   2463 			uint32_t *dst = cmd_buffer->dynamic_buffers + idx * 4;
   2464 			assert(dyn_idx < dynamicOffsetCount);
   2465 
   2466 			struct radv_descriptor_range *range = set->dynamic_descriptors + j;
   2467 			uint64_t va = range->va + pDynamicOffsets[dyn_idx];
   2468 			dst[0] = va;
   2469 			dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
   2470 			dst[2] = range->size;
   2471 			dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
   2472 			         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
   2473 			         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
   2474 			         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
   2475 			         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
   2476 			         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   2477 			cmd_buffer->push_constant_stages |=
   2478 			                     set->layout->dynamic_shader_stages;
   2479 		}
   2480 	}
   2481 }
   2482 
   2483 static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
   2484                                           struct radv_descriptor_set *set,
   2485                                           struct radv_descriptor_set_layout *layout)
   2486 {
   2487 	set->size = layout->size;
   2488 	set->layout = layout;
   2489 
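         	/* Grow the host-side storage geometrically (at least doubling, with
         	 * a 1024-byte minimum), up to the fixed push descriptor cap.
         	 */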
   2490 	if (cmd_buffer->push_descriptors.capacity < set->size) {
   2491 		size_t new_size = MAX2(set->size, 1024);
   2492 		new_size = MAX2(new_size, 2 * cmd_buffer->push_descriptors.capacity);
   2493 		new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
   2494 
   2495 		free(set->mapped_ptr);
   2496 		set->mapped_ptr = malloc(new_size);
   2497 
   2498 		if (!set->mapped_ptr) {
   2499 			cmd_buffer->push_descriptors.capacity = 0;
   2500 			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
   2501 			return false;
   2502 		}
   2503 
   2504 		cmd_buffer->push_descriptors.capacity = new_size;
   2505 	}
   2506 
   2507 	return true;
   2508 }
   2509 
   2510 void radv_meta_push_descriptor_set(
   2511 	struct radv_cmd_buffer*              cmd_buffer,
   2512 	VkPipelineBindPoint                  pipelineBindPoint,
   2513 	VkPipelineLayout                     _layout,
   2514 	uint32_t                             set,
   2515 	uint32_t                             descriptorWriteCount,
   2516 	const VkWriteDescriptorSet*          pDescriptorWrites)
   2517 {
   2518 	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   2519 	struct radv_descriptor_set *push_set = &cmd_buffer->meta_push_descriptors;
   2520 	unsigned bo_offset;
   2521 
   2522 	assert(set == 0);
   2523 	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
   2524 
   2525 	push_set->size = layout->set[set].layout->size;
   2526 	push_set->layout = layout->set[set].layout;
   2527 
   2528 	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->size, 32,
   2529 	                                  &bo_offset,
   2530 	                                  (void**) &push_set->mapped_ptr))
   2531 		return;
   2532 
   2533 	push_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   2534 	push_set->va += bo_offset;
   2535 
   2536 	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
   2537 	                            radv_descriptor_set_to_handle(push_set),
   2538 	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);
   2539 
   2540 	radv_set_descriptor_set(cmd_buffer, push_set, set);
   2541 }
   2542 
   2543 void radv_CmdPushDescriptorSetKHR(
   2544 	VkCommandBuffer                             commandBuffer,
   2545 	VkPipelineBindPoint                         pipelineBindPoint,
   2546 	VkPipelineLayout                            _layout,
   2547 	uint32_t                                    set,
   2548 	uint32_t                                    descriptorWriteCount,
   2549 	const VkWriteDescriptorSet*                 pDescriptorWrites)
   2550 {
   2551 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2552 	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   2553 	struct radv_descriptor_set *push_set = &cmd_buffer->push_descriptors.set;
   2554 
   2555 	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
   2556 
   2557 	if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout))
   2558 		return;
   2559 
   2560 	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
   2561 	                            radv_descriptor_set_to_handle(push_set),
   2562 	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);
   2563 
   2564 	radv_set_descriptor_set(cmd_buffer, push_set, set);
   2565 	cmd_buffer->state.push_descriptors_dirty = true;
   2566 }
   2567 
   2568 void radv_CmdPushDescriptorSetWithTemplateKHR(
   2569 	VkCommandBuffer                             commandBuffer,
   2570 	VkDescriptorUpdateTemplateKHR               descriptorUpdateTemplate,
   2571 	VkPipelineLayout                            _layout,
   2572 	uint32_t                                    set,
   2573 	const void*                                 pData)
   2574 {
   2575 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2576 	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   2577 	struct radv_descriptor_set *push_set = &cmd_buffer->push_descriptors.set;
   2578 
   2579 	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
   2580 
   2581 	if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout))
   2582 		return;
   2583 
   2584 	radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
   2585 						 descriptorUpdateTemplate, pData);
   2586 
   2587 	radv_set_descriptor_set(cmd_buffer, push_set, set);
   2588 	cmd_buffer->state.push_descriptors_dirty = true;
   2589 }
   2590 
   2591 void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
   2592 			   VkPipelineLayout layout,
   2593 			   VkShaderStageFlags stageFlags,
   2594 			   uint32_t offset,
   2595 			   uint32_t size,
   2596 			   const void* pValues)
   2597 {
   2598 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2599 	memcpy(cmd_buffer->push_constants + offset, pValues, size);
   2600 	cmd_buffer->push_constant_stages |= stageFlags;
   2601 }
   2602 
   2603 VkResult radv_EndCommandBuffer(
   2604 	VkCommandBuffer                             commandBuffer)
   2605 {
   2606 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2607 
   2608 	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
   2609 		if (cmd_buffer->device->physical_device->rad_info.chip_class == SI)
   2610 			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
   2611 		si_emit_cache_flush(cmd_buffer);
   2612 	}
   2613 
   2614 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
   2615 
   2616 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
   2617 		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
   2618 
   2619 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
   2620 
   2621 	return cmd_buffer->record_result;
   2622 }
   2623 
   2624 static void
   2625 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
   2626 {
   2627 	struct radv_shader_variant *compute_shader;
   2628 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   2629 	struct radv_device *device = cmd_buffer->device;
   2630 	unsigned compute_resource_limits;
   2631 	unsigned waves_per_threadgroup;
   2632 	uint64_t va;
   2633 
   2634 	if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
   2635 		return;
   2636 
   2637 	cmd_buffer->state.emitted_compute_pipeline = pipeline;
   2638 
   2639 	compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
   2640 	va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
   2641 
   2642 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
   2643 							   cmd_buffer->cs, 19);
   2644 
   2645 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B830_COMPUTE_PGM_LO, 2);
   2646 	radeon_emit(cmd_buffer->cs, va >> 8);
   2647 	radeon_emit(cmd_buffer->cs, va >> 40);
   2648 
   2649 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
   2650 	radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
   2651 	radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
   2652 
   2653 
   2654 	cmd_buffer->compute_scratch_size_needed =
   2655 	                          MAX2(cmd_buffer->compute_scratch_size_needed,
   2656 	                               pipeline->max_waves * pipeline->scratch_bytes_per_wave);
   2657 
   2658 	/* change these once we have scratch support */
   2659 	radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
   2660 			  S_00B860_WAVES(pipeline->max_waves) |
   2661 			  S_00B860_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
   2662 
   2663 	/* Calculate best compute resource limits. */
   2664 	waves_per_threadgroup =
   2665 		DIV_ROUND_UP(compute_shader->info.cs.block_size[0] *
   2666 			     compute_shader->info.cs.block_size[1] *
   2667 			     compute_shader->info.cs.block_size[2], 64);
   2668 	compute_resource_limits =
   2669 		S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
   2670 
   2671 	if (device->physical_device->rad_info.chip_class >= CIK) {
   2672 		unsigned num_cu_per_se =
   2673 			device->physical_device->rad_info.num_good_compute_units /
   2674 			device->physical_device->rad_info.max_se;
   2675 
    2676 		/* Force an even distribution across all SIMDs in the CU if the
    2677 		 * workgroup fits in a single wave (<= 64 threads). This has shown
    2678 		 * good improvements when the number of CUs per SE is not a
    2679 		 * multiple of 4. */
   2680 		if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
   2681 			compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
   2682 	}
   2683 
   2684 	radeon_set_sh_reg(cmd_buffer->cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
   2685 			  compute_resource_limits);
   2686 
   2687 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
   2688 	radeon_emit(cmd_buffer->cs,
   2689 		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
   2690 	radeon_emit(cmd_buffer->cs,
   2691 		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
   2692 	radeon_emit(cmd_buffer->cs,
   2693 		    S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
   2694 
   2695 	assert(cmd_buffer->cs->cdw <= cdw_max);
   2696 
   2697 	if (unlikely(cmd_buffer->device->trace_bo))
   2698 		radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
   2699 }
   2700 
   2701 static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer)
   2702 {
   2703 	cmd_buffer->state.descriptors_dirty |= cmd_buffer->state.valid_descriptors;
   2704 }
   2705 
   2706 void radv_CmdBindPipeline(
   2707 	VkCommandBuffer                             commandBuffer,
   2708 	VkPipelineBindPoint                         pipelineBindPoint,
   2709 	VkPipeline                                  _pipeline)
   2710 {
   2711 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2712 	RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
   2713 
   2714 	switch (pipelineBindPoint) {
   2715 	case VK_PIPELINE_BIND_POINT_COMPUTE:
   2716 		if (cmd_buffer->state.compute_pipeline == pipeline)
   2717 			return;
   2718 		radv_mark_descriptor_sets_dirty(cmd_buffer);
   2719 
   2720 		cmd_buffer->state.compute_pipeline = pipeline;
   2721 		cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
   2722 		break;
   2723 	case VK_PIPELINE_BIND_POINT_GRAPHICS:
   2724 		if (cmd_buffer->state.pipeline == pipeline)
   2725 			return;
   2726 		radv_mark_descriptor_sets_dirty(cmd_buffer);
   2727 
   2728 		cmd_buffer->state.pipeline = pipeline;
   2729 		if (!pipeline)
   2730 			break;
   2731 
   2732 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
   2733 		cmd_buffer->push_constant_stages |= pipeline->active_stages;
   2734 
    2735 		/* The new pipeline's vertex shader may use different user SGPRs, so invalidate the cached draw parameters. */
   2736 		cmd_buffer->state.last_first_instance = -1;
   2737 		cmd_buffer->state.last_vertex_offset = -1;
   2738 
   2739 		radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
   2740 
   2741 		if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
   2742 			cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
   2743 		if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
   2744 			cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
   2745 
   2746 		if (radv_pipeline_has_tess(pipeline))
   2747 			cmd_buffer->tess_rings_needed = true;
   2748 
   2749 		if (radv_pipeline_has_gs(pipeline)) {
   2750 			struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
   2751 									     AC_UD_SCRATCH_RING_OFFSETS);
   2752 			if (cmd_buffer->ring_offsets_idx == -1)
   2753 				cmd_buffer->ring_offsets_idx = loc->sgpr_idx;
   2754 			else if (loc->sgpr_idx != -1)
   2755 				assert(loc->sgpr_idx == cmd_buffer->ring_offsets_idx);
   2756 		}
   2757 		break;
   2758 	default:
   2759 		assert(!"invalid bind point");
   2760 		break;
   2761 	}
   2762 }
   2763 
   2764 void radv_CmdSetViewport(
   2765 	VkCommandBuffer                             commandBuffer,
   2766 	uint32_t                                    firstViewport,
   2767 	uint32_t                                    viewportCount,
   2768 	const VkViewport*                           pViewports)
   2769 {
   2770 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2771 	struct radv_cmd_state *state = &cmd_buffer->state;
   2772 	MAYBE_UNUSED const uint32_t total_count = firstViewport + viewportCount;
   2773 
   2774 	assert(firstViewport < MAX_VIEWPORTS);
   2775 	assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
   2776 
   2777 	if (cmd_buffer->device->physical_device->has_scissor_bug) {
   2778 		/* Try to skip unnecessary PS partial flushes when the viewports
   2779 		 * don't change.
   2780 		 */
   2781 		if (!(state->dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
   2782 				      RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) &&
   2783 		    !memcmp(state->dynamic.viewport.viewports + firstViewport,
   2784 			    pViewports, viewportCount * sizeof(*pViewports))) {
   2785 			return;
   2786 		}
   2787 	}
   2788 
   2789 	memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
   2790 	       viewportCount * sizeof(*pViewports));
   2791 
   2792 	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
   2793 }
   2794 
   2795 void radv_CmdSetScissor(
   2796 	VkCommandBuffer                             commandBuffer,
   2797 	uint32_t                                    firstScissor,
   2798 	uint32_t                                    scissorCount,
   2799 	const VkRect2D*                             pScissors)
   2800 {
   2801 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2802 	struct radv_cmd_state *state = &cmd_buffer->state;
   2803 	MAYBE_UNUSED const uint32_t total_count = firstScissor + scissorCount;
   2804 
   2805 	assert(firstScissor < MAX_SCISSORS);
   2806 	assert(total_count >= 1 && total_count <= MAX_SCISSORS);
   2807 
   2808 	if (cmd_buffer->device->physical_device->has_scissor_bug) {
   2809 		/* Try to skip unnecessary PS partial flushes when the scissors
   2810 		 * don't change.
   2811 		 */
   2812 		if (!(state->dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
   2813 				      RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) &&
   2814 		    !memcmp(state->dynamic.scissor.scissors + firstScissor,
   2815 			    pScissors, scissorCount * sizeof(*pScissors))) {
   2816 			return;
   2817 		}
   2818 	}
   2819 
   2820 	memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
   2821 	       scissorCount * sizeof(*pScissors));
   2822 
   2823 	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
   2824 }
   2825 
   2826 void radv_CmdSetLineWidth(
   2827 	VkCommandBuffer                             commandBuffer,
   2828 	float                                       lineWidth)
   2829 {
   2830 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2831 	cmd_buffer->state.dynamic.line_width = lineWidth;
   2832 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
   2833 }
   2834 
   2835 void radv_CmdSetDepthBias(
   2836 	VkCommandBuffer                             commandBuffer,
   2837 	float                                       depthBiasConstantFactor,
   2838 	float                                       depthBiasClamp,
   2839 	float                                       depthBiasSlopeFactor)
   2840 {
   2841 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2842 
   2843 	cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor;
   2844 	cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp;
   2845 	cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor;
   2846 
   2847 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
   2848 }
   2849 
   2850 void radv_CmdSetBlendConstants(
   2851 	VkCommandBuffer                             commandBuffer,
   2852 	const float                                 blendConstants[4])
   2853 {
   2854 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2855 
   2856 	memcpy(cmd_buffer->state.dynamic.blend_constants,
   2857 	       blendConstants, sizeof(float) * 4);
   2858 
   2859 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
   2860 }
   2861 
   2862 void radv_CmdSetDepthBounds(
   2863 	VkCommandBuffer                             commandBuffer,
   2864 	float                                       minDepthBounds,
   2865 	float                                       maxDepthBounds)
   2866 {
   2867 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2868 
   2869 	cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
   2870 	cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
   2871 
   2872 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
   2873 }
   2874 
   2875 void radv_CmdSetStencilCompareMask(
   2876 	VkCommandBuffer                             commandBuffer,
   2877 	VkStencilFaceFlags                          faceMask,
   2878 	uint32_t                                    compareMask)
   2879 {
   2880 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2881 
   2882 	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
   2883 		cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask;
   2884 	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
   2885 		cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask;
   2886 
   2887 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
   2888 }
   2889 
   2890 void radv_CmdSetStencilWriteMask(
   2891 	VkCommandBuffer                             commandBuffer,
   2892 	VkStencilFaceFlags                          faceMask,
   2893 	uint32_t                                    writeMask)
   2894 {
   2895 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2896 
   2897 	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
   2898 		cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask;
   2899 	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
   2900 		cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask;
   2901 
   2902 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
   2903 }
   2904 
   2905 void radv_CmdSetStencilReference(
   2906 	VkCommandBuffer                             commandBuffer,
   2907 	VkStencilFaceFlags                          faceMask,
   2908 	uint32_t                                    reference)
   2909 {
   2910 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2911 
   2912 	if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
   2913 		cmd_buffer->state.dynamic.stencil_reference.front = reference;
   2914 	if (faceMask & VK_STENCIL_FACE_BACK_BIT)
   2915 		cmd_buffer->state.dynamic.stencil_reference.back = reference;
   2916 
   2917 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
   2918 }
   2919 
   2920 void radv_CmdSetDiscardRectangleEXT(
   2921 	VkCommandBuffer                             commandBuffer,
   2922 	uint32_t                                    firstDiscardRectangle,
   2923 	uint32_t                                    discardRectangleCount,
   2924 	const VkRect2D*                             pDiscardRectangles)
   2925 {
   2926 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   2927 	struct radv_cmd_state *state = &cmd_buffer->state;
   2928 	MAYBE_UNUSED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
   2929 
   2930 	assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
   2931 	assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
   2932 
   2933 	typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
   2934 	             pDiscardRectangles, discardRectangleCount);
   2935 
   2936 	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
   2937 }
   2938 
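         /*
          * Execute secondary command buffers on the primary: flush pending caches,
          * merge the secondaries' scratch/ring requirements into the primary,
          * execute their command streams, and inherit the tracked draw state so
          * the primary knows what has already been programmed.
          */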
   2939 void radv_CmdExecuteCommands(
   2940 	VkCommandBuffer                             commandBuffer,
   2941 	uint32_t                                    commandBufferCount,
   2942 	const VkCommandBuffer*                      pCmdBuffers)
   2943 {
   2944 	RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
   2945 
   2946 	assert(commandBufferCount > 0);
   2947 
    2948 	/* Emit pending cache flushes on the primary before executing the secondaries. */
   2949 	si_emit_cache_flush(primary);
   2950 
   2951 	for (uint32_t i = 0; i < commandBufferCount; i++) {
   2952 		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
   2953 
   2954 		primary->scratch_size_needed = MAX2(primary->scratch_size_needed,
   2955 		                                    secondary->scratch_size_needed);
   2956 		primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
   2957 		                                            secondary->compute_scratch_size_needed);
   2958 
   2959 		if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
   2960 			primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
   2961 		if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
   2962 			primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
   2963 		if (secondary->tess_rings_needed)
   2964 			primary->tess_rings_needed = true;
   2965 		if (secondary->sample_positions_needed)
   2966 			primary->sample_positions_needed = true;
   2967 
   2968 		if (secondary->ring_offsets_idx != -1) {
   2969 			if (primary->ring_offsets_idx == -1)
   2970 				primary->ring_offsets_idx = secondary->ring_offsets_idx;
   2971 			else
   2972 				assert(secondary->ring_offsets_idx == primary->ring_offsets_idx);
   2973 		}
   2974 		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
   2975 
   2976 
   2977 		/* When the secondary command buffer is compute only we don't
   2978 		 * need to re-emit the current graphics pipeline.
   2979 		 */
   2980 		if (secondary->state.emitted_pipeline) {
   2981 			primary->state.emitted_pipeline =
   2982 				secondary->state.emitted_pipeline;
   2983 		}
   2984 
   2985 		/* When the secondary command buffer is graphics only we don't
   2986 		 * need to re-emit the current compute pipeline.
   2987 		 */
   2988 		if (secondary->state.emitted_compute_pipeline) {
   2989 			primary->state.emitted_compute_pipeline =
   2990 				secondary->state.emitted_compute_pipeline;
   2991 		}
   2992 
    2993 		/* Inherit the tracked draw state so later draws only re-emit registers that actually change. */
   2994 		if (secondary->state.last_primitive_reset_en != -1) {
   2995 			primary->state.last_primitive_reset_en =
   2996 				secondary->state.last_primitive_reset_en;
   2997 		}
   2998 
   2999 		if (secondary->state.last_primitive_reset_index) {
   3000 			primary->state.last_primitive_reset_index =
   3001 				secondary->state.last_primitive_reset_index;
   3002 		}
   3003 
   3004 		if (secondary->state.last_ia_multi_vgt_param) {
   3005 			primary->state.last_ia_multi_vgt_param =
   3006 				secondary->state.last_ia_multi_vgt_param;
   3007 		}
   3008 
   3009 		primary->state.last_first_instance = secondary->state.last_first_instance;
   3010 		primary->state.last_num_instances = secondary->state.last_num_instances;
   3011 		primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
   3012 
   3013 		if (secondary->state.last_index_type != -1) {
   3014 			primary->state.last_index_type =
   3015 				secondary->state.last_index_type;
   3016 		}
   3017 	}
   3018 
    3019 	/* After executing the secondary command buffers, the primary can no
    3020 	 * longer trust this state, so mark it dirty to force re-emission.
    3021 	 */
   3022 	primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE |
   3023 				RADV_CMD_DIRTY_INDEX_BUFFER |
   3024 				RADV_CMD_DIRTY_DYNAMIC_ALL;
   3025 	radv_mark_descriptor_sets_dirty(primary);
   3026 }
   3027 
   3028 VkResult radv_CreateCommandPool(
   3029 	VkDevice                                    _device,
   3030 	const VkCommandPoolCreateInfo*              pCreateInfo,
   3031 	const VkAllocationCallbacks*                pAllocator,
   3032 	VkCommandPool*                              pCmdPool)
   3033 {
   3034 	RADV_FROM_HANDLE(radv_device, device, _device);
   3035 	struct radv_cmd_pool *pool;
   3036 
   3037 	pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
   3038 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   3039 	if (pool == NULL)
   3040 		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   3041 
   3042 	if (pAllocator)
   3043 		pool->alloc = *pAllocator;
   3044 	else
   3045 		pool->alloc = device->alloc;
   3046 
   3047 	list_inithead(&pool->cmd_buffers);
   3048 	list_inithead(&pool->free_cmd_buffers);
   3049 
   3050 	pool->queue_family_index = pCreateInfo->queueFamilyIndex;
   3051 
   3052 	*pCmdPool = radv_cmd_pool_to_handle(pool);
   3053 
   3054 	return VK_SUCCESS;
   3055 
   3056 }
   3057 
   3058 void radv_DestroyCommandPool(
   3059 	VkDevice                                    _device,
   3060 	VkCommandPool                               commandPool,
   3061 	const VkAllocationCallbacks*                pAllocator)
   3062 {
   3063 	RADV_FROM_HANDLE(radv_device, device, _device);
   3064 	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
   3065 
   3066 	if (!pool)
   3067 		return;
   3068 
   3069 	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
   3070 				 &pool->cmd_buffers, pool_link) {
   3071 		radv_cmd_buffer_destroy(cmd_buffer);
   3072 	}
   3073 
   3074 	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
   3075 				 &pool->free_cmd_buffers, pool_link) {
   3076 		radv_cmd_buffer_destroy(cmd_buffer);
   3077 	}
   3078 
   3079 	vk_free2(&device->alloc, pAllocator, pool);
   3080 }
   3081 
   3082 VkResult radv_ResetCommandPool(
   3083 	VkDevice                                    device,
   3084 	VkCommandPool                               commandPool,
   3085 	VkCommandPoolResetFlags                     flags)
   3086 {
   3087 	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
   3088 	VkResult result;
   3089 
   3090 	list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
   3091 			    &pool->cmd_buffers, pool_link) {
   3092 		result = radv_reset_cmd_buffer(cmd_buffer);
   3093 		if (result != VK_SUCCESS)
   3094 			return result;
   3095 	}
   3096 
   3097 	return VK_SUCCESS;
   3098 }
   3099 
   3100 void radv_TrimCommandPoolKHR(
   3101     VkDevice                                    device,
   3102     VkCommandPool                               commandPool,
   3103     VkCommandPoolTrimFlagsKHR                   flags)
   3104 {
   3105 	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
   3106 
   3107 	if (!pool)
   3108 		return;
   3109 
   3110 	list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer,
   3111 				 &pool->free_cmd_buffers, pool_link) {
   3112 		radv_cmd_buffer_destroy(cmd_buffer);
   3113 	}
   3114 }
   3115 
   3116 void radv_CmdBeginRenderPass(
   3117 	VkCommandBuffer                             commandBuffer,
   3118 	const VkRenderPassBeginInfo*                pRenderPassBegin,
   3119 	VkSubpassContents                           contents)
   3120 {
   3121 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3122 	RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
   3123 	RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
   3124 
   3125 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
   3126 							   cmd_buffer->cs, 2048);
    3127 	VkResult result;
   3128 
   3129 	cmd_buffer->state.framebuffer = framebuffer;
   3130 	cmd_buffer->state.pass = pass;
   3131 	cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
   3132 
   3133 	result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
   3134 	if (result != VK_SUCCESS)
   3135 		return;
   3136 
   3137 	radv_cmd_buffer_set_subpass(cmd_buffer, pass->subpasses, true);
   3138 	assert(cmd_buffer->cs->cdw <= cdw_max);
   3139 
   3140 	radv_cmd_buffer_clear_subpass(cmd_buffer);
   3141 }
   3142 
   3143 void radv_CmdNextSubpass(
   3144     VkCommandBuffer                             commandBuffer,
   3145     VkSubpassContents                           contents)
   3146 {
   3147 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3148 
   3149 	radv_cmd_buffer_resolve_subpass(cmd_buffer);
   3150 
    3151 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2048);
   3153 
   3154 	radv_cmd_buffer_set_subpass(cmd_buffer, cmd_buffer->state.subpass + 1, true);
   3155 	radv_cmd_buffer_clear_subpass(cmd_buffer);
   3156 }
   3157 
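         /*
          * Write the multiview view index into the AC_UD_VIEW_INDEX user SGPR of
          * every active shader stage (and of the GS copy shader, if present).
          */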
   3158 static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
   3159 {
   3160 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   3161 	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
   3162 		if (!pipeline->shaders[stage])
   3163 			continue;
   3164 		struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
   3165 		if (loc->sgpr_idx == -1)
   3166 			continue;
   3167 		uint32_t base_reg = pipeline->user_data_0[stage];
   3168 		radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
   3169 
   3170 	}
   3171 	if (pipeline->gs_copy_shader) {
   3172 		struct ac_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
   3173 		if (loc->sgpr_idx != -1) {
   3174 			uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
   3175 			radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
   3176 		}
   3177 	}
   3178 }
   3179 
   3180 static void
   3181 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
   3182                          uint32_t vertex_count)
   3183 {
   3184 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
   3185 	radeon_emit(cmd_buffer->cs, vertex_count);
   3186 	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
   3187 	                            S_0287F0_USE_OPAQUE(0));
   3188 }
   3189 
   3190 static void
   3191 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
   3192                                  uint64_t index_va,
   3193                                  uint32_t index_count)
   3194 {
   3195 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false));
   3196 	radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
   3197 	radeon_emit(cmd_buffer->cs, index_va);
   3198 	radeon_emit(cmd_buffer->cs, index_va >> 32);
   3199 	radeon_emit(cmd_buffer->cs, index_count);
   3200 	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
   3201 }
   3202 
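         /*
          * Emit an indirect draw. A single draw without a count buffer and without
          * a shader that needs the draw id uses the plain DRAW_(INDEX_)INDIRECT
          * packet; everything else uses the *_MULTI variant, which can also source
          * the draw count from memory.
          */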
   3203 static void
   3204 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
   3205                                   bool indexed,
   3206                                   uint32_t draw_count,
   3207                                   uint64_t count_va,
   3208                                   uint32_t stride)
   3209 {
   3210 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   3211 	unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
   3212 	                              : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
   3213 	bool draw_id_enable = radv_get_vertex_shader(cmd_buffer->state.pipeline)->info.info.vs.needs_draw_id;
   3214 	uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
   3215 	assert(base_reg);
   3216 
    3217 	/* Reset the tracked draw state; indirect draws update these registers from the GPU, so the cached values are stale. */
   3218 	cmd_buffer->state.last_first_instance = -1;
   3219 	cmd_buffer->state.last_num_instances = -1;
   3220 	cmd_buffer->state.last_vertex_offset = -1;
   3221 
   3222 	if (draw_count == 1 && !count_va && !draw_id_enable) {
   3223 		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
   3224 				     PKT3_DRAW_INDIRECT, 3, false));
   3225 		radeon_emit(cs, 0);
   3226 		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
   3227 		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
   3228 		radeon_emit(cs, di_src_sel);
   3229 	} else {
   3230 		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
   3231 				     PKT3_DRAW_INDIRECT_MULTI,
   3232 				     8, false));
   3233 		radeon_emit(cs, 0);
   3234 		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
   3235 		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
   3236 		radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
   3237 			    S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
   3238 			    S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
   3239 		radeon_emit(cs, draw_count); /* count */
   3240 		radeon_emit(cs, count_va); /* count_addr */
   3241 		radeon_emit(cs, count_va >> 32);
   3242 		radeon_emit(cs, stride); /* stride */
   3243 		radeon_emit(cs, di_src_sel);
   3244 	}
   3245 }
   3246 
   3247 struct radv_draw_info {
   3248 	/**
   3249 	 * Number of vertices.
   3250 	 */
   3251 	uint32_t count;
   3252 
   3253 	/**
   3254 	 * Index of the first vertex.
   3255 	 */
   3256 	int32_t vertex_offset;
   3257 
   3258 	/**
   3259 	 * First instance id.
   3260 	 */
   3261 	uint32_t first_instance;
   3262 
   3263 	/**
   3264 	 * Number of instances.
   3265 	 */
   3266 	uint32_t instance_count;
   3267 
   3268 	/**
   3269 	 * First index (indexed draws only).
   3270 	 */
   3271 	uint32_t first_index;
   3272 
   3273 	/**
   3274 	 * Whether it's an indexed draw.
   3275 	 */
   3276 	bool indexed;
   3277 
   3278 	/**
   3279 	 * Indirect draw parameters resource.
   3280 	 */
   3281 	struct radv_buffer *indirect;
   3282 	uint64_t indirect_offset;
   3283 	uint32_t stride;
   3284 
   3285 	/**
   3286 	 * Draw count parameters resource.
   3287 	 */
   3288 	struct radv_buffer *count_buffer;
   3289 	uint64_t count_buffer_offset;
   3290 };
   3291 
   3292 static void
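         /*
          * Emit the actual draw packets for a direct or indirect draw, replaying
          * the draw once per view when the subpass uses multiview (view_mask != 0).
          */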
   3293 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
   3294 		       const struct radv_draw_info *info)
   3295 {
   3296 	struct radv_cmd_state *state = &cmd_buffer->state;
   3297 	struct radeon_winsys *ws = cmd_buffer->device->ws;
   3298 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   3299 
   3300 	if (info->indirect) {
   3301 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
   3302 		uint64_t count_va = 0;
   3303 
   3304 		va += info->indirect->offset + info->indirect_offset;
   3305 
   3306 		radv_cs_add_buffer(ws, cs, info->indirect->bo, 8);
   3307 
   3308 		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
   3309 		radeon_emit(cs, 1);
   3310 		radeon_emit(cs, va);
   3311 		radeon_emit(cs, va >> 32);
   3312 
   3313 		if (info->count_buffer) {
   3314 			count_va = radv_buffer_get_va(info->count_buffer->bo);
   3315 			count_va += info->count_buffer->offset +
   3316 				    info->count_buffer_offset;
   3317 
   3318 			radv_cs_add_buffer(ws, cs, info->count_buffer->bo, 8);
   3319 		}
   3320 
   3321 		if (!state->subpass->view_mask) {
   3322 			radv_cs_emit_indirect_draw_packet(cmd_buffer,
   3323 							  info->indexed,
   3324 							  info->count,
   3325 							  count_va,
   3326 							  info->stride);
   3327 		} else {
   3328 			unsigned i;
   3329 			for_each_bit(i, state->subpass->view_mask) {
   3330 				radv_emit_view_index(cmd_buffer, i);
   3331 
   3332 				radv_cs_emit_indirect_draw_packet(cmd_buffer,
   3333 								  info->indexed,
   3334 								  info->count,
   3335 								  count_va,
   3336 								  info->stride);
   3337 			}
   3338 		}
   3339 	} else {
   3340 		assert(state->pipeline->graphics.vtx_base_sgpr);
   3341 
   3342 		if (info->vertex_offset != state->last_vertex_offset ||
   3343 		    info->first_instance != state->last_first_instance) {
   3344 			radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
   3345 					      state->pipeline->graphics.vtx_emit_num);
   3346 
   3347 			radeon_emit(cs, info->vertex_offset);
   3348 			radeon_emit(cs, info->first_instance);
   3349 			if (state->pipeline->graphics.vtx_emit_num == 3)
   3350 				radeon_emit(cs, 0);
   3351 			state->last_first_instance = info->first_instance;
   3352 			state->last_vertex_offset = info->vertex_offset;
   3353 		}
   3354 
   3355 		if (state->last_num_instances != info->instance_count) {
   3356 			radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
   3357 			radeon_emit(cs, info->instance_count);
   3358 			state->last_num_instances = info->instance_count;
   3359 		}
   3360 
   3361 		if (info->indexed) {
   3362 			int index_size = state->index_type ? 4 : 2;
   3363 			uint64_t index_va;
   3364 
   3365 			index_va = state->index_va;
   3366 			index_va += info->first_index * index_size;
   3367 
   3368 			if (!state->subpass->view_mask) {
   3369 				radv_cs_emit_draw_indexed_packet(cmd_buffer,
   3370 								 index_va,
   3371 								 info->count);
   3372 			} else {
   3373 				unsigned i;
   3374 				for_each_bit(i, state->subpass->view_mask) {
   3375 					radv_emit_view_index(cmd_buffer, i);
   3376 
   3377 					radv_cs_emit_draw_indexed_packet(cmd_buffer,
   3378 									 index_va,
   3379 									 info->count);
   3380 				}
   3381 			}
   3382 		} else {
   3383 			if (!state->subpass->view_mask) {
   3384 				radv_cs_emit_draw_packet(cmd_buffer, info->count);
   3385 			} else {
   3386 				unsigned i;
   3387 				for_each_bit(i, state->subpass->view_mask) {
   3388 					radv_emit_view_index(cmd_buffer, i);
   3389 
   3390 					radv_cs_emit_draw_packet(cmd_buffer,
   3391 								 info->count);
   3392 				}
   3393 			}
   3394 		}
   3395 	}
   3396 }
   3397 
   3398 static void
   3399 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
   3400 			      const struct radv_draw_info *info)
   3401 {
   3402 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
   3403 		radv_emit_graphics_pipeline(cmd_buffer);
   3404 
   3405 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
   3406 		radv_emit_framebuffer_state(cmd_buffer);
   3407 
   3408 	if (info->indexed) {
   3409 		if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
   3410 			radv_emit_index_buffer(cmd_buffer);
   3411 	} else {
   3412 		/* On CI and later, non-indexed draws overwrite VGT_INDEX_TYPE,
   3413 		 * so the state must be re-emitted before the next indexed
   3414 		 * draw.
   3415 		 */
   3416 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
   3417 			cmd_buffer->state.last_index_type = -1;
   3418 			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
   3419 		}
   3420 	}
   3421 
   3422 	radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
   3423 
   3424 	radv_emit_draw_registers(cmd_buffer, info->indexed,
   3425 				 info->instance_count > 1, info->indirect,
   3426 				 info->indirect ? 0 : info->count);
   3427 }
   3428 
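         /*
          * Common draw path: emit dirty state, flush caches, upload descriptors and
          * emit the draw packets, ordering the work so the CUs stay idle for as
          * short a time as possible (see the comments below).
          */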
   3429 static void
   3430 radv_draw(struct radv_cmd_buffer *cmd_buffer,
   3431 	  const struct radv_draw_info *info)
   3432 {
   3433 	bool pipeline_is_dirty =
   3434 		(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
   3435 		cmd_buffer->state.pipeline &&
   3436 		cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
   3437 
   3438 	MAYBE_UNUSED unsigned cdw_max =
   3439 		radeon_check_space(cmd_buffer->device->ws,
   3440 				   cmd_buffer->cs, 4096);
   3441 
   3442 	/* Use optimal packet order based on whether we need to sync the
   3443 	 * pipeline.
   3444 	 */
   3445 	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   3446 					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   3447 					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
   3448 					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
   3449 		/* If we have to wait for idle, set all states first, so that
   3450 		 * all SET packets are processed in parallel with previous draw
   3451 		 * calls. Then upload descriptors, set shader pointers, and
   3452 		 * draw, and prefetch at the end. This ensures that the time
   3453 		 * the CUs are idle is very short. (there are only SET_SH
   3454 		 * packets between the wait and the draw)
   3455 		 */
   3456 		radv_emit_all_graphics_states(cmd_buffer, info);
   3457 		si_emit_cache_flush(cmd_buffer);
   3458 		/* <-- CUs are idle here --> */
   3459 
   3460 		if (!radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty))
   3461 			return;
   3462 
   3463 		radv_emit_draw_packets(cmd_buffer, info);
   3464 		/* <-- CUs are busy here --> */
   3465 
   3466 		/* Start prefetches after the draw has been started. Both will
   3467 		 * run in parallel, but starting the draw first is more
   3468 		 * important.
   3469 		 */
   3470 		if (pipeline_is_dirty) {
   3471 			radv_emit_prefetch(cmd_buffer,
   3472 					   cmd_buffer->state.pipeline);
   3473 		}
   3474 	} else {
   3475 		/* If we don't wait for idle, start prefetches first, then set
   3476 		 * states, and draw at the end.
   3477 		 */
   3478 		si_emit_cache_flush(cmd_buffer);
   3479 
   3480 		if (pipeline_is_dirty) {
   3481 			radv_emit_prefetch(cmd_buffer,
   3482 					   cmd_buffer->state.pipeline);
   3483 		}
   3484 
   3485 		if (!radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty))
   3486 			return;
   3487 
   3488 		radv_emit_all_graphics_states(cmd_buffer, info);
   3489 		radv_emit_draw_packets(cmd_buffer, info);
   3490 	}
   3491 
   3492 	assert(cmd_buffer->cs->cdw <= cdw_max);
   3493 	radv_cmd_buffer_after_draw(cmd_buffer);
   3494 }
   3495 
   3496 void radv_CmdDraw(
   3497 	VkCommandBuffer                             commandBuffer,
   3498 	uint32_t                                    vertexCount,
   3499 	uint32_t                                    instanceCount,
   3500 	uint32_t                                    firstVertex,
   3501 	uint32_t                                    firstInstance)
   3502 {
   3503 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3504 	struct radv_draw_info info = {};
   3505 
   3506 	info.count = vertexCount;
   3507 	info.instance_count = instanceCount;
   3508 	info.first_instance = firstInstance;
   3509 	info.vertex_offset = firstVertex;
   3510 
   3511 	radv_draw(cmd_buffer, &info);
   3512 }
   3513 
   3514 void radv_CmdDrawIndexed(
   3515 	VkCommandBuffer                             commandBuffer,
   3516 	uint32_t                                    indexCount,
   3517 	uint32_t                                    instanceCount,
   3518 	uint32_t                                    firstIndex,
   3519 	int32_t                                     vertexOffset,
   3520 	uint32_t                                    firstInstance)
   3521 {
   3522 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3523 	struct radv_draw_info info = {};
   3524 
   3525 	info.indexed = true;
   3526 	info.count = indexCount;
   3527 	info.instance_count = instanceCount;
   3528 	info.first_index = firstIndex;
   3529 	info.vertex_offset = vertexOffset;
   3530 	info.first_instance = firstInstance;
   3531 
   3532 	radv_draw(cmd_buffer, &info);
   3533 }
   3534 
   3535 void radv_CmdDrawIndirect(
   3536 	VkCommandBuffer                             commandBuffer,
   3537 	VkBuffer                                    _buffer,
   3538 	VkDeviceSize                                offset,
   3539 	uint32_t                                    drawCount,
   3540 	uint32_t                                    stride)
   3541 {
   3542 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3543 	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   3544 	struct radv_draw_info info = {};
   3545 
   3546 	info.count = drawCount;
   3547 	info.indirect = buffer;
   3548 	info.indirect_offset = offset;
   3549 	info.stride = stride;
   3550 
   3551 	radv_draw(cmd_buffer, &info);
   3552 }
   3553 
   3554 void radv_CmdDrawIndexedIndirect(
   3555 	VkCommandBuffer                             commandBuffer,
   3556 	VkBuffer                                    _buffer,
   3557 	VkDeviceSize                                offset,
   3558 	uint32_t                                    drawCount,
   3559 	uint32_t                                    stride)
   3560 {
   3561 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3562 	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   3563 	struct radv_draw_info info = {};
   3564 
   3565 	info.indexed = true;
   3566 	info.count = drawCount;
   3567 	info.indirect = buffer;
   3568 	info.indirect_offset = offset;
   3569 	info.stride = stride;
   3570 
   3571 	radv_draw(cmd_buffer, &info);
   3572 }
   3573 
   3574 void radv_CmdDrawIndirectCountAMD(
   3575 	VkCommandBuffer                             commandBuffer,
   3576 	VkBuffer                                    _buffer,
   3577 	VkDeviceSize                                offset,
   3578 	VkBuffer                                    _countBuffer,
   3579 	VkDeviceSize                                countBufferOffset,
   3580 	uint32_t                                    maxDrawCount,
   3581 	uint32_t                                    stride)
   3582 {
   3583 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3584 	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   3585 	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
   3586 	struct radv_draw_info info = {};
   3587 
   3588 	info.count = maxDrawCount;
   3589 	info.indirect = buffer;
   3590 	info.indirect_offset = offset;
   3591 	info.count_buffer = count_buffer;
   3592 	info.count_buffer_offset = countBufferOffset;
   3593 	info.stride = stride;
   3594 
   3595 	radv_draw(cmd_buffer, &info);
   3596 }
   3597 
   3598 void radv_CmdDrawIndexedIndirectCountAMD(
   3599 	VkCommandBuffer                             commandBuffer,
   3600 	VkBuffer                                    _buffer,
   3601 	VkDeviceSize                                offset,
   3602 	VkBuffer                                    _countBuffer,
   3603 	VkDeviceSize                                countBufferOffset,
   3604 	uint32_t                                    maxDrawCount,
   3605 	uint32_t                                    stride)
   3606 {
   3607 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3608 	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   3609 	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
   3610 	struct radv_draw_info info = {};
   3611 
   3612 	info.indexed = true;
   3613 	info.count = maxDrawCount;
   3614 	info.indirect = buffer;
   3615 	info.indirect_offset = offset;
   3616 	info.count_buffer = count_buffer;
   3617 	info.count_buffer_offset = countBufferOffset;
   3618 	info.stride = stride;
   3619 
   3620 	radv_draw(cmd_buffer, &info);
   3621 }
   3622 
   3623 struct radv_dispatch_info {
   3624 	/**
   3625 	 * Determine the layout of the grid (in block units) to be used.
   3626 	 */
   3627 	uint32_t blocks[3];
   3628 
   3629 	/**
   3630 	 * Whether it's an unaligned compute dispatch.
   3631 	 */
   3632 	bool unaligned;
   3633 
   3634 	/**
   3635 	 * Indirect compute parameters resource.
   3636 	 */
   3637 	struct radv_buffer *indirect;
   3638 	uint64_t indirect_offset;
   3639 };
   3640 
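         /*
          * Emit a direct or indirect compute dispatch, including the grid size
          * user SGPRs and, for unaligned dispatches, the partial threadgroup setup.
          */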
   3641 static void
   3642 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
   3643 			   const struct radv_dispatch_info *info)
   3644 {
   3645 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   3646 	struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
   3647 	unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
   3648 	struct radeon_winsys *ws = cmd_buffer->device->ws;
   3649 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
   3650 	struct ac_userdata_info *loc;
   3651 
   3652 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
   3653 				    AC_UD_CS_GRID_SIZE);
   3654 
   3655 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(ws, cs, 25);
   3656 
   3657 	if (info->indirect) {
   3658 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
   3659 
   3660 		va += info->indirect->offset + info->indirect_offset;
   3661 
   3662 		radv_cs_add_buffer(ws, cs, info->indirect->bo, 8);
   3663 
   3664 		if (loc->sgpr_idx != -1) {
   3665 			for (unsigned i = 0; i < 3; ++i) {
   3666 				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
   3667 				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
   3668 						COPY_DATA_DST_SEL(COPY_DATA_REG));
   3669 				radeon_emit(cs, (va +  4 * i));
   3670 				radeon_emit(cs, (va + 4 * i) >> 32);
   3671 				radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
   3672 						 + loc->sgpr_idx * 4) >> 2) + i);
   3673 				radeon_emit(cs, 0);
   3674 			}
   3675 		}
   3676 
   3677 		if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
   3678 			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
   3679 					PKT3_SHADER_TYPE_S(1));
   3680 			radeon_emit(cs, va);
   3681 			radeon_emit(cs, va >> 32);
   3682 			radeon_emit(cs, dispatch_initiator);
   3683 		} else {
   3684 			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
   3685 					PKT3_SHADER_TYPE_S(1));
   3686 			radeon_emit(cs, 1);
   3687 			radeon_emit(cs, va);
   3688 			radeon_emit(cs, va >> 32);
   3689 
   3690 			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
   3691 					PKT3_SHADER_TYPE_S(1));
   3692 			radeon_emit(cs, 0);
   3693 			radeon_emit(cs, dispatch_initiator);
   3694 		}
   3695 	} else {
   3696 		unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
   3697 
   3698 		if (info->unaligned) {
   3699 			unsigned *cs_block_size = compute_shader->info.cs.block_size;
   3700 			unsigned remainder[3];
   3701 
    3702 			/* If the grid is already aligned to the block size, the
    3703 			 * partial count comes out as a full block rather than 0.
    3704 			 */
   3705 			remainder[0] = blocks[0] + cs_block_size[0] -
   3706 				       align_u32_npot(blocks[0], cs_block_size[0]);
   3707 			remainder[1] = blocks[1] + cs_block_size[1] -
   3708 				       align_u32_npot(blocks[1], cs_block_size[1]);
   3709 			remainder[2] = blocks[2] + cs_block_size[2] -
   3710 				       align_u32_npot(blocks[2], cs_block_size[2]);
   3711 
   3712 			blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
   3713 			blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
   3714 			blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
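         			/* For example, blocks[0] = 100 with a block size of 64
         			 * gives remainder[0] = 100 + 64 - 128 = 36 threads in the
         			 * last (partial) group and rounds blocks[0] up to 2 groups.
         			 */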
   3715 
   3716 			radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
   3717 			radeon_emit(cs,
   3718 				    S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
   3719 				    S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
   3720 			radeon_emit(cs,
   3721 				    S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
   3722 				    S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
   3723 			radeon_emit(cs,
   3724 				    S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
   3725 				    S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
   3726 
   3727 			dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
   3728 		}
   3729 
   3730 		if (loc->sgpr_idx != -1) {
   3731 			assert(!loc->indirect);
   3732 			assert(loc->num_sgprs == 3);
   3733 
   3734 			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
   3735 						  loc->sgpr_idx * 4, 3);
   3736 			radeon_emit(cs, blocks[0]);
   3737 			radeon_emit(cs, blocks[1]);
   3738 			radeon_emit(cs, blocks[2]);
   3739 		}
   3740 
   3741 		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
   3742 				PKT3_SHADER_TYPE_S(1));
   3743 		radeon_emit(cs, blocks[0]);
   3744 		radeon_emit(cs, blocks[1]);
   3745 		radeon_emit(cs, blocks[2]);
   3746 		radeon_emit(cs, dispatch_initiator);
   3747 	}
   3748 
   3749 	assert(cmd_buffer->cs->cdw <= cdw_max);
   3750 }
   3751 
   3752 static void
   3753 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
   3754 {
   3755 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
   3756 	radv_flush_constants(cmd_buffer, cmd_buffer->state.compute_pipeline,
   3757 			     VK_SHADER_STAGE_COMPUTE_BIT);
   3758 }
   3759 
   3760 static void
   3761 radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
   3762 	      const struct radv_dispatch_info *info)
   3763 {
   3764 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   3765 	bool pipeline_is_dirty = pipeline &&
   3766 				 pipeline != cmd_buffer->state.emitted_compute_pipeline;
   3767 
   3768 	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   3769 					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   3770 					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
   3771 					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
   3772 		/* If we have to wait for idle, set all states first, so that
   3773 		 * all SET packets are processed in parallel with previous draw
   3774 		 * calls. Then upload descriptors, set shader pointers, and
   3775 		 * dispatch, and prefetch at the end. This ensures that the
   3776 		 * time the CUs are idle is very short. (there are only SET_SH
   3777 		 * packets between the wait and the draw)
   3778 		 */
   3779 		radv_emit_compute_pipeline(cmd_buffer);
   3780 		si_emit_cache_flush(cmd_buffer);
   3781 		/* <-- CUs are idle here --> */
   3782 
   3783 		radv_upload_compute_shader_descriptors(cmd_buffer);
   3784 
   3785 		radv_emit_dispatch_packets(cmd_buffer, info);
   3786 		/* <-- CUs are busy here --> */
   3787 
   3788 		/* Start prefetches after the dispatch has been started. Both
   3789 		 * will run in parallel, but starting the dispatch first is
   3790 		 * more important.
   3791 		 */
   3792 		if (pipeline_is_dirty) {
   3793 			radv_emit_shader_prefetch(cmd_buffer,
   3794 						  pipeline->shaders[MESA_SHADER_COMPUTE]);
   3795 		}
   3796 	} else {
   3797 		/* If we don't wait for idle, start prefetches first, then set
   3798 		 * states, and dispatch at the end.
   3799 		 */
   3800 		si_emit_cache_flush(cmd_buffer);
   3801 
   3802 		if (pipeline_is_dirty) {
   3803 			radv_emit_shader_prefetch(cmd_buffer,
   3804 						  pipeline->shaders[MESA_SHADER_COMPUTE]);
   3805 		}
   3806 
   3807 		radv_upload_compute_shader_descriptors(cmd_buffer);
   3808 
   3809 		radv_emit_compute_pipeline(cmd_buffer);
   3810 		radv_emit_dispatch_packets(cmd_buffer, info);
   3811 	}
   3812 
   3813 	radv_cmd_buffer_after_draw(cmd_buffer);
   3814 }
   3815 
   3816 void radv_CmdDispatch(
   3817 	VkCommandBuffer                             commandBuffer,
   3818 	uint32_t                                    x,
   3819 	uint32_t                                    y,
   3820 	uint32_t                                    z)
   3821 {
   3822 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3823 	struct radv_dispatch_info info = {};
   3824 
   3825 	info.blocks[0] = x;
   3826 	info.blocks[1] = y;
   3827 	info.blocks[2] = z;
   3828 
   3829 	radv_dispatch(cmd_buffer, &info);
   3830 }
   3831 
   3832 void radv_CmdDispatchIndirect(
   3833 	VkCommandBuffer                             commandBuffer,
   3834 	VkBuffer                                    _buffer,
   3835 	VkDeviceSize                                offset)
   3836 {
   3837 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3838 	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   3839 	struct radv_dispatch_info info = {};
   3840 
   3841 	info.indirect = buffer;
   3842 	info.indirect_offset = offset;
   3843 
   3844 	radv_dispatch(cmd_buffer, &info);
   3845 }
   3846 
   3847 void radv_unaligned_dispatch(
   3848 	struct radv_cmd_buffer                      *cmd_buffer,
   3849 	uint32_t                                    x,
   3850 	uint32_t                                    y,
   3851 	uint32_t                                    z)
   3852 {
   3853 	struct radv_dispatch_info info = {};
   3854 
   3855 	info.blocks[0] = x;
   3856 	info.blocks[1] = y;
   3857 	info.blocks[2] = z;
   3858 	info.unaligned = 1;
   3859 
   3860 	radv_dispatch(cmd_buffer, &info);
   3861 }
   3862 
   3863 void radv_CmdEndRenderPass(
   3864 	VkCommandBuffer                             commandBuffer)
   3865 {
   3866 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   3867 
   3868 	radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
   3869 
   3870 	radv_cmd_buffer_resolve_subpass(cmd_buffer);
   3871 
   3872 	for (unsigned i = 0; i < cmd_buffer->state.framebuffer->attachment_count; ++i) {
   3873 		VkImageLayout layout = cmd_buffer->state.pass->attachments[i].final_layout;
   3874 		radv_handle_subpass_image_transition(cmd_buffer,
   3875 		                      (VkAttachmentReference){i, layout});
   3876 	}
   3877 
   3878 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
   3879 
   3880 	cmd_buffer->state.pass = NULL;
   3881 	cmd_buffer->state.subpass = NULL;
   3882 	cmd_buffer->state.attachments = NULL;
   3883 	cmd_buffer->state.framebuffer = NULL;
   3884 }
   3885 
   3886 /*
   3887  * For HTILE we have the following interesting clear words:
   3888  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
   3889  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
   3890  *   0xfffffff0: Clear depth to 1.0
   3891  *   0x00000000: Clear depth to 0.0
   3892  */
   3893 static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
   3894                                   struct radv_image *image,
   3895                                   const VkImageSubresourceRange *range,
   3896                                   uint32_t clear_word)
   3897 {
   3898 	assert(range->baseMipLevel == 0);
    3899 	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_MIP_LEVELS);
   3900 	unsigned layer_count = radv_get_layerCount(image, range);
   3901 	uint64_t size = image->surface.htile_slice_size * layer_count;
   3902 	uint64_t offset = image->offset + image->htile_offset +
   3903 	                  image->surface.htile_slice_size * range->baseArrayLayer;
   3904 	struct radv_cmd_state *state = &cmd_buffer->state;
   3905 
   3906 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   3907 			     RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   3908 
   3909 	state->flush_bits |= radv_fill_buffer(cmd_buffer, image->bo, offset,
   3910 					      size, clear_word);
   3911 
   3912 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   3913 }
   3914 
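         /*
          * Transition HTILE state for a depth/stencil image: initialize it when
          * coming from UNDEFINED or when entering a compressed layout, and
          * decompress in place when leaving a compressed layout.
          */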
   3915 static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
   3916 					       struct radv_image *image,
   3917 					       VkImageLayout src_layout,
   3918 					       VkImageLayout dst_layout,
   3919 					       unsigned src_queue_mask,
   3920 					       unsigned dst_queue_mask,
   3921 					       const VkImageSubresourceRange *range,
   3922 					       VkImageAspectFlags pending_clears)
   3923 {
   3924 	if (dst_layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL &&
   3925 	    (pending_clears & vk_format_aspects(image->vk_format)) == vk_format_aspects(image->vk_format) &&
   3926 	    cmd_buffer->state.render_area.offset.x == 0 && cmd_buffer->state.render_area.offset.y == 0 &&
   3927 	    cmd_buffer->state.render_area.extent.width == image->info.width &&
   3928 	    cmd_buffer->state.render_area.extent.height == image->info.height) {
   3929 		/* The clear will initialize htile. */
   3930 		return;
   3931 	} else if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED &&
   3932 	           radv_layout_has_htile(image, dst_layout, dst_queue_mask)) {
   3933 		/* TODO: merge with the clear if applicable */
   3934 		radv_initialize_htile(cmd_buffer, image, range, 0);
   3935 	} else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
   3936 	           radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
   3937 		uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
   3938 		radv_initialize_htile(cmd_buffer, image, range, clear_value);
   3939 	} else if (radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
   3940 	           !radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
   3941 		VkImageSubresourceRange local_range = *range;
   3942 		local_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
   3943 		local_range.baseMipLevel = 0;
   3944 		local_range.levelCount = 1;
   3945 
   3946 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   3947 		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   3948 
   3949 		radv_decompress_depth_image_inplace(cmd_buffer, image, &local_range);
   3950 
   3951 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
   3952 		                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   3953 	}
   3954 }
   3955 
   3956 void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
   3957 			   struct radv_image *image, uint32_t value)
   3958 {
   3959 	struct radv_cmd_state *state = &cmd_buffer->state;
   3960 
   3961 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   3962 			    RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   3963 
   3964 	state->flush_bits |= radv_fill_buffer(cmd_buffer, image->bo,
   3965 					      image->offset + image->cmask.offset,
   3966 					      image->cmask.size, value);
   3967 
   3968 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   3969 }
   3970 
   3971 static void radv_handle_cmask_image_transition(struct radv_cmd_buffer *cmd_buffer,
   3972 					       struct radv_image *image,
   3973 					       VkImageLayout src_layout,
   3974 					       VkImageLayout dst_layout,
   3975 					       unsigned src_queue_mask,
   3976 					       unsigned dst_queue_mask,
   3977 					       const VkImageSubresourceRange *range)
   3978 {
   3979 	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
   3980 		if (image->fmask.size)
   3981 			radv_initialise_cmask(cmd_buffer, image, 0xccccccccu);
   3982 		else
   3983 			radv_initialise_cmask(cmd_buffer, image, 0xffffffffu);
   3984 	} else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
   3985 		   !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
   3986 		radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
   3987 	}
   3988 }
   3989 
   3990 void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
   3991 			 struct radv_image *image, uint32_t value)
   3992 {
   3993 	struct radv_cmd_state *state = &cmd_buffer->state;
   3994 
   3995 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   3996 			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   3997 
   3998 	state->flush_bits |= radv_fill_buffer(cmd_buffer, image->bo,
   3999 					      image->offset + image->dcc_offset,
   4000 					      image->surface.dcc_size, value);
   4001 
   4002 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
   4003 			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   4004 }
   4005 
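         /*
          * Transition DCC state: (re)initialize the DCC metadata when coming from
          * PREINITIALIZED/UNDEFINED, decompress when leaving a DCC-compressed
          * layout, and flush fast clears when leaving a fast-clearable layout.
          */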
   4006 static void radv_handle_dcc_image_transition(struct radv_cmd_buffer *cmd_buffer,
   4007 					     struct radv_image *image,
   4008 					     VkImageLayout src_layout,
   4009 					     VkImageLayout dst_layout,
   4010 					     unsigned src_queue_mask,
   4011 					     unsigned dst_queue_mask,
   4012 					     const VkImageSubresourceRange *range)
   4013 {
   4014 	if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
   4015 		radv_initialize_dcc(cmd_buffer, image, 0xffffffffu);
   4016 	} else if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
   4017 		radv_initialize_dcc(cmd_buffer, image,
   4018 		                    radv_layout_dcc_compressed(image, dst_layout, dst_queue_mask) ?
   4019 		                         0x20202020u : 0xffffffffu);
   4020 	} else if (radv_layout_dcc_compressed(image, src_layout, src_queue_mask) &&
   4021 	           !radv_layout_dcc_compressed(image, dst_layout, dst_queue_mask)) {
   4022 		radv_decompress_dcc(cmd_buffer, image, range);
   4023 	} else if (radv_layout_can_fast_clear(image, src_layout, src_queue_mask) &&
   4024 		   !radv_layout_can_fast_clear(image, dst_layout, dst_queue_mask)) {
   4025 		radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
   4026 	}
   4027 }
   4028 
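         /*
          * Handle an image layout transition: skip transitions that the other
          * queue in a queue family ownership transfer will perform, then update
          * HTILE, CMASK/FMASK and DCC metadata as required by the source and
          * destination layouts.
          */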
   4029 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
   4030 					 struct radv_image *image,
   4031 					 VkImageLayout src_layout,
   4032 					 VkImageLayout dst_layout,
   4033 					 uint32_t src_family,
   4034 					 uint32_t dst_family,
   4035 					 const VkImageSubresourceRange *range,
   4036 					 VkImageAspectFlags pending_clears)
   4037 {
   4038 	if (image->exclusive && src_family != dst_family) {
    4039 		/* This is an acquire or a release operation and there will be
    4040 		 * a corresponding release/acquire. Do the transition on the most
    4041 		 * capable of the two queues (GENERAL over COMPUTE over TRANSFER). */
   4042 
   4043 		assert(src_family == cmd_buffer->queue_family_index ||
   4044 		       dst_family == cmd_buffer->queue_family_index);
   4045 
   4046 		if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
   4047 			return;
   4048 
   4049 		if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
   4050 		    (src_family == RADV_QUEUE_GENERAL ||
   4051 		     dst_family == RADV_QUEUE_GENERAL))
   4052 			return;
   4053 	}
   4054 
   4055 	unsigned src_queue_mask = radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index);
   4056 	unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index);
   4057 
   4058 	if (image->surface.htile_size)
   4059 		radv_handle_depth_image_transition(cmd_buffer, image, src_layout,
   4060 						   dst_layout, src_queue_mask,
   4061 						   dst_queue_mask, range,
   4062 						   pending_clears);
   4063 
   4064 	if (image->cmask.size || image->fmask.size)
   4065 		radv_handle_cmask_image_transition(cmd_buffer, image, src_layout,
   4066 						   dst_layout, src_queue_mask,
   4067 						   dst_queue_mask, range);
   4068 
   4069 	if (image->surface.dcc_size)
   4070 		radv_handle_dcc_image_transition(cmd_buffer, image, src_layout,
   4071 						 dst_layout, src_queue_mask,
   4072 						 dst_queue_mask, range);
   4073 }
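
/* For a VK_SHARING_MODE_EXCLUSIVE image, the application expresses a queue
 * family ownership transfer as a matched release/acquire barrier pair, and
 * the early returns above make sure the metadata work is done only once, on
 * the most flexible of the two queues. A minimal application-side sketch of
 * such a pair is shown below; it is illustrative only, and gfx_cmd, comp_cmd,
 * gfx_family, comp_family and image are assumed to exist in the application:
 *
 *	VkImageMemoryBarrier qfot = {
 *		.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
 *		.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
 *		.dstAccessMask = 0,
 *		.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *		.newLayout = VK_IMAGE_LAYOUT_GENERAL,
 *		.srcQueueFamilyIndex = gfx_family,
 *		.dstQueueFamilyIndex = comp_family,
 *		.image = image,
 *		.subresourceRange = {
 *			.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
 *			.levelCount = 1, .layerCount = 1,
 *		},
 *	};
 *
 *	// Release on the graphics queue...
 *	vkCmdPipelineBarrier(gfx_cmd,
 *			     VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
 *			     VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
 *			     0, 0, NULL, 0, NULL, 1, &qfot);
 *
 *	// ...and the matching acquire on the compute queue, ordered by a
 *	// semaphore between the two submissions.
 *	qfot.srcAccessMask = 0;
 *	qfot.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
 *	vkCmdPipelineBarrier(comp_cmd,
 *			     VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
 *			     VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 *			     0, 0, NULL, 0, NULL, 1, &qfot);
 *
 * When radv records these two barriers, radv_handle_image_transition() runs
 * once per command buffer, but only the graphics (RADV_QUEUE_GENERAL) side
 * actually touches the image metadata; the compute side returns early.
 */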

void radv_CmdPipelineBarrier(
	VkCommandBuffer                             commandBuffer,
	VkPipelineStageFlags                        srcStageMask,
	VkPipelineStageFlags                        destStageMask,
	VkBool32                                    byRegion,
	uint32_t                                    memoryBarrierCount,
	const VkMemoryBarrier*                      pMemoryBarriers,
	uint32_t                                    bufferMemoryBarrierCount,
	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
	uint32_t                                    imageMemoryBarrierCount,
	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	enum radv_cmd_flush_bits src_flush_bits = 0;
	enum radv_cmd_flush_bits dst_flush_bits = 0;

	for (uint32_t i = 0; i < memoryBarrierCount; i++) {
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
		                                        NULL);
	}

	for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
		                                        NULL);
	}

	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
		src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask);
		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
		                                        image);
	}

	radv_stage_flush(cmd_buffer, srcStageMask);
	cmd_buffer->state.flush_bits |= src_flush_bits;

	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
		radv_handle_image_transition(cmd_buffer, image,
					     pImageMemoryBarriers[i].oldLayout,
					     pImageMemoryBarriers[i].newLayout,
					     pImageMemoryBarriers[i].srcQueueFamilyIndex,
					     pImageMemoryBarriers[i].dstQueueFamilyIndex,
					     &pImageMemoryBarriers[i].subresourceRange,
					     0);
	}

	cmd_buffer->state.flush_bits |= dst_flush_bits;
}
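
/* The barrier is recorded in three steps: source access masks are folded into
 * "src" flush bits applied before the transitions, layout transitions are
 * handled per image, and destination access masks become "dst" flush bits
 * applied afterwards. A typical call that exercises the metadata
 * initialization paths above is an UNDEFINED -> COLOR_ATTACHMENT_OPTIMAL
 * transition before first rendering to an image. A minimal application-side
 * sketch (illustrative only; cmd and image are assumed to exist):
 *
 *	VkImageMemoryBarrier barrier = {
 *		.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
 *		.srcAccessMask = 0,
 *		.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
 *		.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
 *		.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *		.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *		.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *		.image = image,
 *		.subresourceRange = {
 *			.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
 *			.levelCount = 1, .layerCount = 1,
 *		},
 *	};
 *	vkCmdPipelineBarrier(cmd,
 *			     VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
 *			     VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
 *			     0, 0, NULL, 0, NULL, 1, &barrier);
 *
 * Because oldLayout is UNDEFINED, the CMASK/DCC handlers above seed the
 * image's compression metadata (if it has any) rather than decompress
 * anything.
 */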


static void write_event(struct radv_cmd_buffer *cmd_buffer,
			struct radv_event *event,
			VkPipelineStageFlags stageMask,
			unsigned value)
{
	struct radeon_winsys_cs *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(event->bo);

	radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo, 8);

	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18);

	/* TODO: this is overkill. Probably should figure something out from
	 * the stage mask. */

	si_cs_emit_write_event_eop(cs,
				   cmd_buffer->state.predicating,
				   cmd_buffer->device->physical_device->rad_info.chip_class,
				   radv_cmd_buffer_uses_mec(cmd_buffer),
				   V_028A90_BOTTOM_OF_PIPE_TS, 0,
				   1, va, 2, value);

	assert(cmd_buffer->cs->cdw <= cdw_max);
}
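
/* VkEvent is backed by a single dword in the event's buffer object:
 * write_event() emits a bottom-of-pipe event-write-EOP that stores `value` to
 * that dword once all prior work has drained (the stageMask is currently
 * ignored, see the TODO above). vkCmdSetEvent stores 1, vkCmdResetEvent
 * stores 0, and vkCmdWaitEvents below makes the command processor wait on the
 * same dword until it reads 1.
 */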

void radv_CmdSetEvent(VkCommandBuffer commandBuffer,
		      VkEvent _event,
		      VkPipelineStageFlags stageMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_event, event, _event);

	write_event(cmd_buffer, event, stageMask, 1);
}

void radv_CmdResetEvent(VkCommandBuffer commandBuffer,
			VkEvent _event,
			VkPipelineStageFlags stageMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_event, event, _event);

	write_event(cmd_buffer, event, stageMask, 0);
}

void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
			uint32_t eventCount,
			const VkEvent* pEvents,
			VkPipelineStageFlags srcStageMask,
			VkPipelineStageFlags dstStageMask,
			uint32_t memoryBarrierCount,
			const VkMemoryBarrier* pMemoryBarriers,
			uint32_t bufferMemoryBarrierCount,
			const VkBufferMemoryBarrier* pBufferMemoryBarriers,
			uint32_t imageMemoryBarrierCount,
			const VkImageMemoryBarrier* pImageMemoryBarriers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radeon_winsys_cs *cs = cmd_buffer->cs;

	for (unsigned i = 0; i < eventCount; ++i) {
		RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
		uint64_t va = radv_buffer_get_va(event->bo);

		radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo, 8);

		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);

		si_emit_wait_fence(cs, false, va, 1, 0xffffffff);
		assert(cmd_buffer->cs->cdw <= cdw_max);
	}

	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);

		radv_handle_image_transition(cmd_buffer, image,
					     pImageMemoryBarriers[i].oldLayout,
					     pImageMemoryBarriers[i].newLayout,
					     pImageMemoryBarriers[i].srcQueueFamilyIndex,
					     pImageMemoryBarriers[i].dstQueueFamilyIndex,
					     &pImageMemoryBarriers[i].subresourceRange,
					     0);
	}

	/* TODO: figure out how to do memory barriers without waiting */
	cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER |
					RADV_CMD_FLAG_INV_GLOBAL_L2 |
					RADV_CMD_FLAG_INV_VMEM_L1 |
					RADV_CMD_FLAG_INV_SMEM_L1;
}
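
/* A minimal application-side use of the event path above (illustrative only;
 * cmd and event are assumed to exist): the set records the EOP write, the
 * wait records the fence poll plus the conservative cache flushes.
 *
 *	vkCmdSetEvent(cmd, event, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
 *	// ... work that does not depend on the event ...
 *	vkCmdWaitEvents(cmd, 1, &event,
 *			VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
 *			VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
 *			0, NULL, 0, NULL, 0, NULL);
 */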