      1 /*
      2  * Copyright 2013 Advanced Micro Devices, Inc.
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  *
     23  * Authors:
      24  *      Marek Olšák <marek.olsak@amd.com>
     25  */
     26 
     27 /* Resource binding slots and sampler states (each described with 8 or
     28  * 4 dwords) are stored in lists in memory which is accessed by shaders
     29  * using scalar load instructions.
     30  *
     31  * This file is responsible for managing such lists. It keeps a copy of all
     32  * descriptors in CPU memory and re-uploads a whole list if some slots have
     33  * been changed.
     34  *
      35  * This code is also responsible for updating shader pointers to those lists.
     36  *
     37  * Note that CP DMA can't be used for updating the lists, because a GPU hang
     38  * could leave the list in a mid-IB state and the next IB would get wrong
     39  * descriptors and the whole context would be unusable at that point.
      40  * (Note: register shadowing can't be used for the same reason.)
     41  *
     42  * Also, uploading descriptors to newly allocated memory doesn't require
     43  * a KCACHE flush.
     44  *
     45  *
     46  * Possible scenarios for one 16 dword image+sampler slot:
     47  *
     48  *       | Image        | w/ FMASK   | Buffer       | NULL
     49  * [ 0: 3] Image[0:3]   | Image[0:3] | Null[0:3]    | Null[0:3]
     50  * [ 4: 7] Image[4:7]   | Image[4:7] | Buffer[0:3]  | 0
     51  * [ 8:11] Null[0:3]    | Fmask[0:3] | Null[0:3]    | Null[0:3]
     52  * [12:15] Sampler[0:3] | Fmask[4:7] | Sampler[0:3] | Sampler[0:3]
     53  *
     54  * FMASK implies MSAA, therefore no sampler state.
     55  * Sampler states are never unbound except when FMASK is bound.
     56  */
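         /* Illustrative sketch (not part of the driver): a shader typically
          * fetches one 8-dword image descriptor from such a list with a single
          * scalar load, roughly:
          *
          *   s_load_dwordx8 s[8:15], s[0:1], <slot offset>
          *
          * where s[0:1] holds the list address passed in user-data SGPRs. That
          * user-data SGPR pair is the "shader pointer" this file keeps updated.
          */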
     57 
     58 #include "radeon/r600_cs.h"
     59 #include "si_pipe.h"
     60 #include "sid.h"
     61 
     62 #include "util/u_format.h"
     63 #include "util/u_memory.h"
     64 #include "util/u_upload_mgr.h"
     65 
     66 
     67 /* NULL image and buffer descriptor for textures (alpha = 1) and images
     68  * (alpha = 0).
     69  *
     70  * For images, all fields must be zero except for the swizzle, which
     71  * supports arbitrary combinations of 0s and 1s. The texture type must be
     72  * any valid type (e.g. 1D). If the texture type isn't set, the hw hangs.
     73  *
     74  * For buffers, all fields must be zero. If they are not, the hw hangs.
     75  *
     76  * This is the only reason why the buffer descriptor must be in words [4:7].
     77  */
     78 static uint32_t null_texture_descriptor[8] = {
     79 	0,
     80 	0,
     81 	0,
     82 	S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
     83 	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
      84 	/* the rest must contain zeros; this zeroed region is also where the
      85 	 * buffer descriptor lives (words [4:7] of the slot) */
     86 };
     87 
     88 static uint32_t null_image_descriptor[8] = {
     89 	0,
     90 	0,
     91 	0,
     92 	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
      93 	/* the rest must contain zeros; this zeroed region is also where the
      94 	 * buffer descriptor lives (words [4:7] of the slot) */
     95 };
     96 
     97 static void si_init_descriptors(struct si_descriptors *desc,
     98 				unsigned shader_userdata_index,
     99 				unsigned element_dw_size,
    100 				unsigned num_elements,
    101 				const uint32_t *null_descriptor,
    102 				unsigned *ce_offset)
    103 {
    104 	int i;
    105 
    106 	assert(num_elements <= sizeof(desc->dirty_mask)*8);
    107 
    108 	desc->list = CALLOC(num_elements, element_dw_size * 4);
    109 	desc->element_dw_size = element_dw_size;
    110 	desc->num_elements = num_elements;
    111 	desc->dirty_mask = num_elements == 32 ? ~0u : (1u << num_elements) - 1;
    112 	desc->shader_userdata_offset = shader_userdata_index * 4;
    113 
    114 	if (ce_offset) {
    115 		desc->ce_offset = *ce_offset;
    116 
    117 		/* make sure that ce_offset stays 32 byte aligned */
    118 		*ce_offset += align(element_dw_size * num_elements * 4, 32);
    119 	}
    120 
     121 	/* Initialize the list to NULL descriptors (the element size must be a multiple of 8). */
    122 	if (null_descriptor) {
    123 		assert(element_dw_size % 8 == 0);
    124 		for (i = 0; i < num_elements * element_dw_size / 8; i++)
    125 			memcpy(desc->list + i * 8, null_descriptor,
    126 			       8 * 4);
    127 	}
    128 }
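         /* Usage sketch (see si_init_buffer_resources below): buffer lists use
          * 4-dword elements and no NULL template, e.g.
          *
          *   si_init_descriptors(descs, shader_userdata_index, 4,
          *                       num_buffers, NULL, ce_offset);
          *
          * while sampler view lists use 16-dword slots (see the layout table at
          * the top) and are typically seeded from null_texture_descriptor.
          */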
    129 
    130 static void si_release_descriptors(struct si_descriptors *desc)
    131 {
    132 	r600_resource_reference(&desc->buffer, NULL);
    133 	FREE(desc->list);
    134 }
    135 
    136 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
    137 			 unsigned *out_offset, struct r600_resource **out_buf) {
    138 	uint64_t va;
    139 
    140 	u_suballocator_alloc(sctx->ce_suballocator, size, 64, out_offset,
    141 			     (struct pipe_resource**)out_buf);
     142 	if (!*out_buf)
     143 		return false;
    144 
    145 	va = (*out_buf)->gpu_address + *out_offset;
    146 
    147 	radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
    148 	radeon_emit(sctx->ce_ib, ce_offset);
    149 	radeon_emit(sctx->ce_ib, size / 4);
    150 	radeon_emit(sctx->ce_ib, va);
    151 	radeon_emit(sctx->ce_ib, va >> 32);
    152 
    153 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
    154 	                       RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
    155 
    156 	sctx->ce_need_synchronization = true;
    157 	return true;
    158 }
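         /* Note: the DUMP_CONST_RAM packet above copies "size" bytes from CE RAM
          * at ce_offset into the freshly suballocated buffer at "va". Setting
          * ce_need_synchronization makes the driver synchronize the draw engine
          * with the constant engine, so the DE doesn't dereference the new
          * descriptor pointer before the dump has completed.
          */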
    159 
    160 static void si_ce_reinitialize_descriptors(struct si_context *sctx,
    161                                            struct si_descriptors *desc)
    162 {
    163 	if (desc->buffer) {
    164 		struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
    165 		unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
    166 		uint64_t va = buffer->gpu_address + desc->buffer_offset;
    167 		struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
    168 
    169 		if (!ib)
    170 			ib = sctx->ce_ib;
    171 
    172 		list_size = align(list_size, 32);
    173 
    174 		radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
    175 		radeon_emit(ib, va);
    176 		radeon_emit(ib, va >> 32);
    177 		radeon_emit(ib, list_size / 4);
    178 		radeon_emit(ib, desc->ce_offset);
    179 
    180 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
    181 		                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
    182 	}
    183 	desc->ce_ram_dirty = false;
    184 }
    185 
    186 void si_ce_reinitialize_all_descriptors(struct si_context *sctx)
    187 {
    188 	int i;
    189 
    190 	for (i = 0; i < SI_NUM_DESCS; ++i)
    191 		si_ce_reinitialize_descriptors(sctx, &sctx->descriptors[i]);
    192 }
    193 
    194 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
    195 {
    196 	radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
    197 	radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
    198 	                CONTEXT_CONTROL_LOAD_CE_RAM(1));
    199 	radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
    200 }
    201 
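         /* Two upload paths follow: with a CE IB, only the dirty slot ranges are
          * written into CE RAM (WRITE_CONST_RAM) and then dumped into a
          * suballocated buffer via si_ce_upload(); without CE, the whole list is
          * copied into a fresh u_upload_mgr allocation. Either way the previous
          * buffer is left untouched, which is why no KCACHE flush is needed
          * (see the comment at the top of the file).
          */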
    202 static bool si_upload_descriptors(struct si_context *sctx,
    203 				  struct si_descriptors *desc,
    204 				  struct r600_atom * atom)
    205 {
    206 	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
    207 
    208 	if (!desc->dirty_mask)
    209 		return true;
    210 
    211 	if (sctx->ce_ib) {
    212 		uint32_t const* list = (uint32_t const*)desc->list;
    213 
    214 		if (desc->ce_ram_dirty)
    215 			si_ce_reinitialize_descriptors(sctx, desc);
    216 
    217 		while(desc->dirty_mask) {
    218 			int begin, count;
    219 			u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
    220 						     &count);
    221 
    222 			begin *= desc->element_dw_size;
    223 			count *= desc->element_dw_size;
    224 
    225 			radeon_emit(sctx->ce_ib,
    226 			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
    227 			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
    228 			radeon_emit_array(sctx->ce_ib, list + begin, count);
    229 		}
    230 
    231 		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
    232 		                           &desc->buffer_offset, &desc->buffer))
    233 			return false;
    234 	} else {
    235 		void *ptr;
    236 
    237 		u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
    238 			&desc->buffer_offset,
    239 			(struct pipe_resource**)&desc->buffer, &ptr);
    240 		if (!desc->buffer)
    241 			return false; /* skip the draw call */
    242 
    243 		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
    244 		desc->gpu_list = ptr;
    245 
    246 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
    247 	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
    248 	}
    249 	desc->dirty_mask = 0;
    250 
    251 	if (atom)
    252 		si_mark_atom_dirty(sctx, atom);
    253 
    254 	return true;
    255 }
    256 
    257 static void
    258 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
    259 {
    260 	desc->ce_ram_dirty = true;
    261 
    262 	if (!desc->buffer)
    263 		return;
    264 
    265 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
    266 				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
    267 }
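         /* ce_ram_dirty is set at the start of every new command stream because
          * CE RAM contents can't be assumed to survive across IBs; the first CE
          * update then reloads the RAM from the last uploaded buffer via
          * si_ce_reinitialize_descriptors() (LOAD_CONST_RAM) before any partial
          * WRITE_CONST_RAM update.
          */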
    268 
    269 /* SAMPLER VIEWS */
    270 
    271 static unsigned
    272 si_sampler_descriptors_idx(unsigned shader)
    273 {
    274 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
    275 	       SI_SHADER_DESCS_SAMPLERS;
    276 }
    277 
    278 static struct si_descriptors *
    279 si_sampler_descriptors(struct si_context *sctx, unsigned shader)
    280 {
    281 	return &sctx->descriptors[si_sampler_descriptors_idx(shader)];
    282 }
    283 
    284 static void si_release_sampler_views(struct si_sampler_views *views)
    285 {
    286 	int i;
    287 
    288 	for (i = 0; i < ARRAY_SIZE(views->views); i++) {
    289 		pipe_sampler_view_reference(&views->views[i], NULL);
    290 	}
    291 }
    292 
    293 static void si_sampler_view_add_buffer(struct si_context *sctx,
    294 				       struct pipe_resource *resource,
    295 				       enum radeon_bo_usage usage,
    296 				       bool is_stencil_sampler,
    297 				       bool check_mem)
    298 {
    299 	struct r600_resource *rres;
    300 	struct r600_texture *rtex;
    301 	enum radeon_bo_priority priority;
    302 
    303 	if (!resource)
    304 		return;
    305 
    306 	if (resource->target != PIPE_BUFFER) {
    307 		struct r600_texture *tex = (struct r600_texture*)resource;
    308 
    309 		if (tex->is_depth && !r600_can_sample_zs(tex, is_stencil_sampler))
    310 			resource = &tex->flushed_depth_texture->resource.b.b;
    311 	}
    312 
    313 	rres = (struct r600_resource*)resource;
    314 	priority = r600_get_sampler_view_priority(rres);
    315 
    316 	radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
    317 					    rres, usage, priority,
    318 					    check_mem);
    319 
    320 	if (resource->target == PIPE_BUFFER)
    321 		return;
    322 
    323 	/* Now add separate DCC or HTILE. */
    324 	rtex = (struct r600_texture*)resource;
    325 	if (rtex->dcc_separate_buffer) {
    326 		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
    327 						    rtex->dcc_separate_buffer, usage,
    328 						    RADEON_PRIO_DCC, check_mem);
    329 	}
    330 
    331 	if (rtex->htile_buffer &&
    332 	    rtex->tc_compatible_htile &&
    333 	    !is_stencil_sampler) {
    334 		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
    335 						    rtex->htile_buffer, usage,
    336 						    RADEON_PRIO_HTILE, check_mem);
    337 	}
    338 }
    339 
    340 static void si_sampler_views_begin_new_cs(struct si_context *sctx,
    341 					  struct si_sampler_views *views)
    342 {
    343 	unsigned mask = views->enabled_mask;
    344 
    345 	/* Add buffers to the CS. */
    346 	while (mask) {
    347 		int i = u_bit_scan(&mask);
    348 		struct si_sampler_view *sview = (struct si_sampler_view *)views->views[i];
    349 
    350 		si_sampler_view_add_buffer(sctx, sview->base.texture,
    351 					   RADEON_USAGE_READ,
    352 					   sview->is_stencil_sampler, false);
    353 	}
    354 }
    355 
    356 /* Set buffer descriptor fields that can be changed by reallocations. */
    357 static void si_set_buf_desc_address(struct r600_resource *buf,
    358 				    uint64_t offset, uint32_t *state)
    359 {
    360 	uint64_t va = buf->gpu_address + offset;
    361 
    362 	state[0] = va;
    363 	state[1] &= C_008F04_BASE_ADDRESS_HI;
    364 	state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32);
    365 }
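         /* Worked example (illustrative): for va = 0x123456789, state[0] becomes
          * 0x23456789 and the BASE_ADDRESS_HI field of state[1] becomes 0x1; the
          * remaining fields of state[1] (stride, swizzle, ...) are preserved by
          * the mask above.
          */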
    366 
    367 /* Set texture descriptor fields that can be changed by reallocations.
    368  *
    369  * \param tex			texture
    370  * \param base_level_info	information of the level of BASE_ADDRESS
    371  * \param base_level		the level of BASE_ADDRESS
    372  * \param first_level		pipe_sampler_view.u.tex.first_level
    373  * \param block_width		util_format_get_blockwidth()
    374  * \param is_stencil		select between separate Z & Stencil
    375  * \param state			descriptor to update
    376  */
    377 void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
    378 				    const struct radeon_surf_level *base_level_info,
    379 				    unsigned base_level, unsigned first_level,
    380 				    unsigned block_width, bool is_stencil,
    381 				    uint32_t *state)
    382 {
    383 	uint64_t va;
    384 	unsigned pitch = base_level_info->nblk_x * block_width;
    385 
    386 	if (tex->is_depth && !r600_can_sample_zs(tex, is_stencil)) {
    387 		tex = tex->flushed_depth_texture;
    388 		is_stencil = false;
    389 	}
    390 
    391 	va = tex->resource.gpu_address + base_level_info->offset;
    392 
    393 	state[1] &= C_008F14_BASE_ADDRESS_HI;
    394 	state[3] &= C_008F1C_TILING_INDEX;
    395 	state[4] &= C_008F20_PITCH;
    396 	state[6] &= C_008F28_COMPRESSION_EN;
    397 
    398 	state[0] = va >> 8;
    399 	state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
    400 	state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level,
    401 							     is_stencil));
    402 	state[4] |= S_008F20_PITCH(pitch - 1);
    403 
    404 	if (tex->dcc_offset && first_level < tex->surface.num_dcc_levels) {
    405 		state[6] |= S_008F28_COMPRESSION_EN(1);
    406 		state[7] = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
    407 			    tex->dcc_offset +
    408 			    base_level_info->dcc_offset) >> 8;
    409 	} else if (tex->tc_compatible_htile) {
    410 		state[6] |= S_008F28_COMPRESSION_EN(1);
    411 		state[7] = tex->htile_buffer->gpu_address >> 8;
    412 	}
    413 }
    414 
    415 static void si_set_sampler_view(struct si_context *sctx,
    416 				unsigned shader,
    417 				unsigned slot, struct pipe_sampler_view *view,
    418 				bool disallow_early_out)
    419 {
    420 	struct si_sampler_views *views = &sctx->samplers[shader].views;
    421 	struct si_sampler_view *rview = (struct si_sampler_view*)view;
    422 	struct si_descriptors *descs = si_sampler_descriptors(sctx, shader);
    423 	uint32_t *desc = descs->list + slot * 16;
    424 
    425 	if (views->views[slot] == view && !disallow_early_out)
    426 		return;
    427 
    428 	if (view) {
    429 		struct r600_texture *rtex = (struct r600_texture *)view->texture;
    430 
    431 		assert(rtex); /* views with texture == NULL aren't supported */
    432 		pipe_sampler_view_reference(&views->views[slot], view);
    433 		memcpy(desc, rview->state, 8*4);
    434 
    435 		if (rtex->resource.b.b.target == PIPE_BUFFER) {
    436 			rtex->resource.bind_history |= PIPE_BIND_SAMPLER_VIEW;
    437 
    438 			si_set_buf_desc_address(&rtex->resource,
    439 						view->u.buf.offset,
    440 						desc + 4);
    441 		} else {
    442 			bool is_separate_stencil =
    443 				rtex->db_compatible &&
    444 				rview->is_stencil_sampler;
    445 
    446 			si_set_mutable_tex_desc_fields(rtex,
    447 						       rview->base_level_info,
    448 						       rview->base_level,
    449 						       rview->base.u.tex.first_level,
    450 						       rview->block_width,
    451 						       is_separate_stencil,
    452 						       desc);
    453 		}
    454 
    455 		if (rtex->resource.b.b.target != PIPE_BUFFER &&
    456 		    rtex->fmask.size) {
    457 			memcpy(desc + 8,
    458 			       rview->fmask_state, 8*4);
    459 		} else {
    460 			/* Disable FMASK and bind sampler state in [12:15]. */
    461 			memcpy(desc + 8,
    462 			       null_texture_descriptor, 4*4);
    463 
    464 			if (views->sampler_states[slot])
    465 				memcpy(desc + 12,
    466 				       views->sampler_states[slot]->val, 4*4);
    467 		}
    468 
    469 		views->enabled_mask |= 1u << slot;
    470 
    471 		/* Since this can flush, it must be done after enabled_mask is
    472 		 * updated. */
    473 		si_sampler_view_add_buffer(sctx, view->texture,
    474 					   RADEON_USAGE_READ,
    475 					   rview->is_stencil_sampler, true);
    476 	} else {
    477 		pipe_sampler_view_reference(&views->views[slot], NULL);
    478 		memcpy(desc, null_texture_descriptor, 8*4);
    479 		/* Only clear the lower dwords of FMASK. */
    480 		memcpy(desc + 8, null_texture_descriptor, 4*4);
    481 		/* Re-set the sampler state if we are transitioning from FMASK. */
    482 		if (views->sampler_states[slot])
    483 			memcpy(desc + 12,
    484 			       views->sampler_states[slot]->val, 4*4);
    485 
    486 		views->enabled_mask &= ~(1u << slot);
    487 	}
    488 
    489 	descs->dirty_mask |= 1u << slot;
    490 	sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader);
    491 }
    492 
    493 static bool is_compressed_colortex(struct r600_texture *rtex)
    494 {
    495 	return rtex->cmask.size || rtex->fmask.size ||
    496 	       (rtex->dcc_offset && rtex->dirty_level_mask);
    497 }
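         /* A color texture counts as "compressed" here if it still carries CMASK
          * or FMASK metadata, or has DCC with levels whose compressed data hasn't
          * been flushed yet (dirty_level_mask). Such textures may need a
          * decompress pass before they can be sampled or read as images.
          */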
    498 
    499 static void si_update_compressed_tex_shader_mask(struct si_context *sctx,
    500 						 unsigned shader)
    501 {
    502 	struct si_textures_info *samplers = &sctx->samplers[shader];
    503 	unsigned shader_bit = 1 << shader;
    504 
    505 	if (samplers->depth_texture_mask ||
    506 	    samplers->compressed_colortex_mask ||
    507 	    sctx->images[shader].compressed_colortex_mask)
    508 		sctx->compressed_tex_shader_mask |= shader_bit;
    509 	else
    510 		sctx->compressed_tex_shader_mask &= ~shader_bit;
    511 }
    512 
    513 static void si_set_sampler_views(struct pipe_context *ctx,
    514 				 enum pipe_shader_type shader, unsigned start,
    515                                  unsigned count,
    516 				 struct pipe_sampler_view **views)
    517 {
    518 	struct si_context *sctx = (struct si_context *)ctx;
    519 	struct si_textures_info *samplers = &sctx->samplers[shader];
    520 	int i;
    521 
    522 	if (!count || shader >= SI_NUM_SHADERS)
    523 		return;
    524 
    525 	for (i = 0; i < count; i++) {
    526 		unsigned slot = start + i;
    527 
    528 		if (!views || !views[i]) {
    529 			samplers->depth_texture_mask &= ~(1u << slot);
    530 			samplers->compressed_colortex_mask &= ~(1u << slot);
    531 			si_set_sampler_view(sctx, shader, slot, NULL, false);
    532 			continue;
    533 		}
    534 
    535 		si_set_sampler_view(sctx, shader, slot, views[i], false);
    536 
    537 		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
    538 			struct r600_texture *rtex =
    539 				(struct r600_texture*)views[i]->texture;
    540 			struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
    541 
    542 			if (rtex->db_compatible &&
    543 			    (!rtex->tc_compatible_htile || rview->is_stencil_sampler)) {
    544 				samplers->depth_texture_mask |= 1u << slot;
    545 			} else {
    546 				samplers->depth_texture_mask &= ~(1u << slot);
    547 			}
    548 			if (is_compressed_colortex(rtex)) {
    549 				samplers->compressed_colortex_mask |= 1u << slot;
    550 			} else {
    551 				samplers->compressed_colortex_mask &= ~(1u << slot);
    552 			}
    553 
    554 			if (rtex->dcc_offset &&
    555 			    p_atomic_read(&rtex->framebuffers_bound))
    556 				sctx->need_check_render_feedback = true;
    557 		} else {
    558 			samplers->depth_texture_mask &= ~(1u << slot);
    559 			samplers->compressed_colortex_mask &= ~(1u << slot);
    560 		}
    561 	}
    562 
    563 	si_update_compressed_tex_shader_mask(sctx, shader);
    564 }
    565 
    566 static void
    567 si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
    568 {
    569 	unsigned mask = samplers->views.enabled_mask;
    570 
    571 	while (mask) {
    572 		int i = u_bit_scan(&mask);
    573 		struct pipe_resource *res = samplers->views.views[i]->texture;
    574 
    575 		if (res && res->target != PIPE_BUFFER) {
    576 			struct r600_texture *rtex = (struct r600_texture *)res;
    577 
    578 			if (is_compressed_colortex(rtex)) {
    579 				samplers->compressed_colortex_mask |= 1u << i;
    580 			} else {
    581 				samplers->compressed_colortex_mask &= ~(1u << i);
    582 			}
    583 		}
    584 	}
    585 }
    586 
    587 /* IMAGE VIEWS */
    588 
    589 static unsigned
    590 si_image_descriptors_idx(unsigned shader)
    591 {
    592 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
    593 	       SI_SHADER_DESCS_IMAGES;
    594 }
    595 
    596 static struct si_descriptors*
    597 si_image_descriptors(struct si_context *sctx, unsigned shader)
    598 {
    599 	return &sctx->descriptors[si_image_descriptors_idx(shader)];
    600 }
    601 
    602 static void
    603 si_release_image_views(struct si_images_info *images)
    604 {
    605 	unsigned i;
    606 
    607 	for (i = 0; i < SI_NUM_IMAGES; ++i) {
    608 		struct pipe_image_view *view = &images->views[i];
    609 
    610 		pipe_resource_reference(&view->resource, NULL);
    611 	}
    612 }
    613 
    614 static void
    615 si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
    616 {
    617 	uint mask = images->enabled_mask;
    618 
    619 	/* Add buffers to the CS. */
    620 	while (mask) {
    621 		int i = u_bit_scan(&mask);
    622 		struct pipe_image_view *view = &images->views[i];
    623 
    624 		assert(view->resource);
    625 
    626 		si_sampler_view_add_buffer(sctx, view->resource,
    627 					   RADEON_USAGE_READWRITE, false, false);
    628 	}
    629 }
    630 
    631 static void
    632 si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot)
    633 {
    634 	struct si_images_info *images = &ctx->images[shader];
    635 
    636 	if (images->enabled_mask & (1u << slot)) {
    637 		struct si_descriptors *descs = si_image_descriptors(ctx, shader);
    638 
    639 		pipe_resource_reference(&images->views[slot].resource, NULL);
    640 		images->compressed_colortex_mask &= ~(1 << slot);
    641 
    642 		memcpy(descs->list + slot*8, null_image_descriptor, 8*4);
    643 		images->enabled_mask &= ~(1u << slot);
    644 		descs->dirty_mask |= 1u << slot;
    645 		ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader);
    646 	}
    647 }
    648 
    649 static void
    650 si_mark_image_range_valid(const struct pipe_image_view *view)
    651 {
    652 	struct r600_resource *res = (struct r600_resource *)view->resource;
    653 
    654 	assert(res && res->b.b.target == PIPE_BUFFER);
    655 
    656 	util_range_add(&res->valid_buffer_range,
    657 		       view->u.buf.offset,
    658 		       view->u.buf.offset + view->u.buf.size);
    659 }
    660 
    661 static void si_set_shader_image(struct si_context *ctx,
    662 				unsigned shader,
    663 				unsigned slot, const struct pipe_image_view *view,
    664 				bool skip_decompress)
    665 {
    666 	struct si_screen *screen = ctx->screen;
    667 	struct si_images_info *images = &ctx->images[shader];
    668 	struct si_descriptors *descs = si_image_descriptors(ctx, shader);
    669 	struct r600_resource *res;
    670 	uint32_t *desc = descs->list + slot * 8;
    671 
    672 	if (!view || !view->resource) {
    673 		si_disable_shader_image(ctx, shader, slot);
    674 		return;
    675 	}
    676 
    677 	res = (struct r600_resource *)view->resource;
    678 
    679 	if (&images->views[slot] != view)
    680 		util_copy_image_view(&images->views[slot], view);
    681 
    682 	if (res->b.b.target == PIPE_BUFFER) {
    683 		if (view->access & PIPE_IMAGE_ACCESS_WRITE)
    684 			si_mark_image_range_valid(view);
    685 
    686 		si_make_buffer_descriptor(screen, res,
    687 					  view->format,
    688 					  view->u.buf.offset,
    689 					  view->u.buf.size,
    690 					  descs->list + slot * 8);
    691 		si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
    692 
    693 		images->compressed_colortex_mask &= ~(1 << slot);
    694 		res->bind_history |= PIPE_BIND_SHADER_IMAGE;
    695 	} else {
    696 		static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
    697 		struct r600_texture *tex = (struct r600_texture *)res;
    698 		unsigned level = view->u.tex.level;
    699 		unsigned width, height, depth;
    700 		bool uses_dcc = tex->dcc_offset &&
    701 				level < tex->surface.num_dcc_levels;
    702 
    703 		assert(!tex->is_depth);
    704 		assert(tex->fmask.size == 0);
    705 
    706 		if (uses_dcc && !skip_decompress &&
    707 		    (view->access & PIPE_IMAGE_ACCESS_WRITE ||
    708 		     !vi_dcc_formats_compatible(res->b.b.format, view->format))) {
    709 			/* If DCC can't be disabled, at least decompress it.
    710 			 * The decompression is relatively cheap if the surface
    711 			 * has been decompressed already.
    712 			 */
    713 			if (r600_texture_disable_dcc(&ctx->b, tex))
    714 				uses_dcc = false;
    715 			else
    716 				ctx->b.decompress_dcc(&ctx->b.b, tex);
    717 		}
    718 
    719 		if (is_compressed_colortex(tex)) {
    720 			images->compressed_colortex_mask |= 1 << slot;
    721 		} else {
    722 			images->compressed_colortex_mask &= ~(1 << slot);
    723 		}
    724 
    725 		if (uses_dcc &&
    726 		    p_atomic_read(&tex->framebuffers_bound))
    727 			ctx->need_check_render_feedback = true;
    728 
    729 		/* Always force the base level to the selected level.
    730 		 *
    731 		 * This is required for 3D textures, where otherwise
    732 		 * selecting a single slice for non-layered bindings
    733 		 * fails. It doesn't hurt the other targets.
    734 		 */
    735 		width = u_minify(res->b.b.width0, level);
    736 		height = u_minify(res->b.b.height0, level);
    737 		depth = u_minify(res->b.b.depth0, level);
    738 
    739 		si_make_texture_descriptor(screen, tex,
    740 					   false, res->b.b.target,
    741 					   view->format, swizzle,
    742 					   0, 0,
    743 					   view->u.tex.first_layer,
    744 					   view->u.tex.last_layer,
    745 					   width, height, depth,
    746 					   desc, NULL);
    747 		si_set_mutable_tex_desc_fields(tex, &tex->surface.level[level],
    748 					       level, level,
    749 					       util_format_get_blockwidth(view->format),
    750 					       false, desc);
    751 	}
    752 
    753 	images->enabled_mask |= 1u << slot;
    754 	descs->dirty_mask |= 1u << slot;
    755 	ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader);
    756 
    757 	/* Since this can flush, it must be done after enabled_mask is updated. */
    758 	si_sampler_view_add_buffer(ctx, &res->b.b,
    759 				   RADEON_USAGE_READWRITE, false, true);
    760 }
    761 
    762 static void
    763 si_set_shader_images(struct pipe_context *pipe,
    764 		     enum pipe_shader_type shader,
    765 		     unsigned start_slot, unsigned count,
    766 		     const struct pipe_image_view *views)
    767 {
    768 	struct si_context *ctx = (struct si_context *)pipe;
    769 	unsigned i, slot;
    770 
    771 	assert(shader < SI_NUM_SHADERS);
    772 
    773 	if (!count)
    774 		return;
    775 
    776 	assert(start_slot + count <= SI_NUM_IMAGES);
    777 
    778 	if (views) {
    779 		for (i = 0, slot = start_slot; i < count; ++i, ++slot)
    780 			si_set_shader_image(ctx, shader, slot, &views[i], false);
    781 	} else {
    782 		for (i = 0, slot = start_slot; i < count; ++i, ++slot)
    783 			si_set_shader_image(ctx, shader, slot, NULL, false);
    784 	}
    785 
    786 	si_update_compressed_tex_shader_mask(ctx, shader);
    787 }
    788 
    789 static void
    790 si_images_update_compressed_colortex_mask(struct si_images_info *images)
    791 {
    792 	unsigned mask = images->enabled_mask;
    793 
    794 	while (mask) {
    795 		int i = u_bit_scan(&mask);
    796 		struct pipe_resource *res = images->views[i].resource;
    797 
    798 		if (res && res->target != PIPE_BUFFER) {
    799 			struct r600_texture *rtex = (struct r600_texture *)res;
    800 
    801 			if (is_compressed_colortex(rtex)) {
    802 				images->compressed_colortex_mask |= 1 << i;
    803 			} else {
    804 				images->compressed_colortex_mask &= ~(1 << i);
    805 			}
    806 		}
    807 	}
    808 }
    809 
    810 /* SAMPLER STATES */
    811 
    812 static void si_bind_sampler_states(struct pipe_context *ctx,
    813                                    enum pipe_shader_type shader,
    814                                    unsigned start, unsigned count, void **states)
    815 {
    816 	struct si_context *sctx = (struct si_context *)ctx;
    817 	struct si_textures_info *samplers = &sctx->samplers[shader];
    818 	struct si_descriptors *desc = si_sampler_descriptors(sctx, shader);
    819 	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
    820 	int i;
    821 
    822 	if (!count || shader >= SI_NUM_SHADERS)
    823 		return;
    824 
    825 	for (i = 0; i < count; i++) {
    826 		unsigned slot = start + i;
    827 
    828 		if (!sstates[i] ||
    829 		    sstates[i] == samplers->views.sampler_states[slot])
    830 			continue;
    831 
    832 #ifdef DEBUG
    833 		assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC);
    834 #endif
    835 		samplers->views.sampler_states[slot] = sstates[i];
    836 
    837 		/* If FMASK is bound, don't overwrite it.
    838 		 * The sampler state will be set after FMASK is unbound.
    839 		 */
    840 		if (samplers->views.views[slot] &&
    841 		    samplers->views.views[slot]->texture &&
    842 		    samplers->views.views[slot]->texture->target != PIPE_BUFFER &&
    843 		    ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size)
    844 			continue;
    845 
    846 		memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4);
    847 		desc->dirty_mask |= 1u << slot;
    848 		sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader);
    849 	}
    850 }
    851 
    852 /* BUFFER RESOURCES */
    853 
    854 static void si_init_buffer_resources(struct si_buffer_resources *buffers,
    855 				     struct si_descriptors *descs,
    856 				     unsigned num_buffers,
    857 				     unsigned shader_userdata_index,
    858 				     enum radeon_bo_usage shader_usage,
    859 				     enum radeon_bo_priority priority,
    860 				     unsigned *ce_offset)
    861 {
    862 	buffers->shader_usage = shader_usage;
    863 	buffers->priority = priority;
    864 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
    865 
    866 	si_init_descriptors(descs, shader_userdata_index, 4,
    867 			    num_buffers, NULL, ce_offset);
    868 }
    869 
    870 static void si_release_buffer_resources(struct si_buffer_resources *buffers,
    871 					struct si_descriptors *descs)
    872 {
    873 	int i;
    874 
    875 	for (i = 0; i < descs->num_elements; i++) {
    876 		pipe_resource_reference(&buffers->buffers[i], NULL);
    877 	}
    878 
    879 	FREE(buffers->buffers);
    880 }
    881 
    882 static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
    883 					     struct si_buffer_resources *buffers)
    884 {
    885 	unsigned mask = buffers->enabled_mask;
    886 
    887 	/* Add buffers to the CS. */
    888 	while (mask) {
    889 		int i = u_bit_scan(&mask);
    890 
    891 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
    892 				      (struct r600_resource*)buffers->buffers[i],
    893 				      buffers->shader_usage, buffers->priority);
    894 	}
    895 }
    896 
    897 static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers,
    898 					   struct si_descriptors *descs,
    899 					   unsigned idx, struct pipe_resource **buf,
    900 					   unsigned *offset, unsigned *size)
    901 {
    902 	pipe_resource_reference(buf, buffers->buffers[idx]);
    903 	if (*buf) {
    904 		struct r600_resource *res = r600_resource(*buf);
    905 		const uint32_t *desc = descs->list + idx * 4;
    906 		uint64_t va;
    907 
    908 		*size = desc[2];
    909 
    910 		assert(G_008F04_STRIDE(desc[1]) == 0);
    911 		va = ((uint64_t)desc[1] << 32) | desc[0];
    912 
    913 		assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size);
    914 		*offset = va - res->gpu_address;
    915 	}
    916 }
    917 
    918 /* VERTEX BUFFERS */
    919 
    920 static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
    921 {
    922 	struct si_descriptors *desc = &sctx->vertex_buffers;
    923 	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
    924 	int i;
    925 
    926 	for (i = 0; i < count; i++) {
    927 		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
    928 
    929 		if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
    930 			continue;
    931 		if (!sctx->vertex_buffer[vb].buffer)
    932 			continue;
    933 
    934 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
    935 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
    936 				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
    937 	}
    938 
    939 	if (!desc->buffer)
    940 		return;
    941 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
    942 			      desc->buffer, RADEON_USAGE_READ,
    943 			      RADEON_PRIO_DESCRIPTORS);
    944 }
    945 
    946 bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
    947 {
    948 	struct si_vertex_element *velems = sctx->vertex_elements;
    949 	struct si_descriptors *desc = &sctx->vertex_buffers;
     950 	unsigned i, count = velems ? velems->count : 0;
    951 	uint64_t va;
    952 	uint32_t *ptr;
    953 
     954 	if (!sctx->vertex_buffers_dirty || !count)
    955 		return true;
    956 
    957 	unsigned fix_size3 = velems->fix_size3;
    958 	unsigned first_vb_use_mask = velems->first_vb_use_mask;
    959 
    960 	/* Vertex buffer descriptors are the only ones which are uploaded
    961 	 * directly through a staging buffer and don't go through
    962 	 * the fine-grained upload path.
    963 	 */
    964 	u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
    965 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
    966 	if (!desc->buffer)
    967 		return false;
    968 
    969 	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
    970 			      desc->buffer, RADEON_USAGE_READ,
    971 			      RADEON_PRIO_DESCRIPTORS);
    972 
    973 	assert(count <= SI_NUM_VERTEX_BUFFERS);
    974 
    975 	for (i = 0; i < count; i++) {
    976 		struct pipe_vertex_element *ve = &velems->elements[i];
    977 		struct pipe_vertex_buffer *vb;
    978 		struct r600_resource *rbuffer;
    979 		unsigned offset;
    980 		unsigned vbo_index = ve->vertex_buffer_index;
    981 		uint32_t *desc = &ptr[i*4];
    982 
    983 		vb = &sctx->vertex_buffer[vbo_index];
    984 		rbuffer = (struct r600_resource*)vb->buffer;
    985 		if (!rbuffer) {
    986 			memset(desc, 0, 16);
    987 			continue;
    988 		}
    989 
    990 		offset = vb->buffer_offset + ve->src_offset;
    991 		va = rbuffer->gpu_address + offset;
    992 
    993 		/* Fill in T# buffer resource description */
    994 		desc[0] = va;
    995 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
    996 			  S_008F04_STRIDE(vb->stride);
    997 
    998 		if (sctx->b.chip_class <= CIK && vb->stride) {
    999 			/* Round up by rounding down and adding 1 */
   1000 			desc[2] = (vb->buffer->width0 - offset -
   1001 				   velems->format_size[i]) /
   1002 				  vb->stride + 1;
   1003 		} else {
   1004 			uint32_t size3;
   1005 
   1006 			desc[2] = vb->buffer->width0 - offset;
   1007 
   1008 			/* For attributes of size 3 with byte or short
   1009 			 * components, we use a 4-component data format.
   1010 			 *
   1011 			 * As a consequence, we have to round the buffer size
   1012 			 * up so that the hardware sees four components as
   1013 			 * being inside the buffer if and only if the first
   1014 			 * three components are in the buffer.
   1015 			 *
   1016 			 * Since the offset and stride are guaranteed to be
   1017 			 * 4-byte aligned, this alignment will never cross the
   1018 			 * winsys buffer boundary.
   1019 			 */
   1020 			size3 = (fix_size3 >> (2 * i)) & 3;
   1021 			if (vb->stride && size3) {
   1022 				assert(offset % 4 == 0 && vb->stride % 4 == 0);
   1023 				assert(size3 <= 2);
   1024 				desc[2] = align(desc[2], size3 * 2);
   1025 			}
   1026 		}
   1027 
   1028 		desc[3] = velems->rsrc_word3[i];
   1029 
   1030 		if (first_vb_use_mask & (1 << i)) {
   1031 			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
   1032 					      (struct r600_resource*)vb->buffer,
   1033 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
   1034 		}
   1035 	}
   1036 
   1037 	/* Don't flush the const cache. It would have a very negative effect
   1038 	 * on performance (confirmed by testing). New descriptors are always
   1039 	 * uploaded to a fresh new buffer, so I don't think flushing the const
   1040 	 * cache is needed. */
   1041 	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
   1042 	sctx->vertex_buffers_dirty = false;
   1043 	sctx->vertex_buffer_pointer_dirty = true;
   1044 	return true;
   1045 }
   1046 
   1047 
   1048 /* CONSTANT BUFFERS */
   1049 
   1050 static unsigned
   1051 si_const_buffer_descriptors_idx(unsigned shader)
   1052 {
   1053 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
   1054 	       SI_SHADER_DESCS_CONST_BUFFERS;
   1055 }
   1056 
   1057 static struct si_descriptors *
   1058 si_const_buffer_descriptors(struct si_context *sctx, unsigned shader)
   1059 {
   1060 	return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)];
   1061 }
   1062 
   1063 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
   1064 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset)
   1065 {
   1066 	void *tmp;
   1067 
   1068 	u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
   1069 		       (struct pipe_resource**)rbuffer, &tmp);
   1070 	if (*rbuffer)
   1071 		util_memcpy_cpu_to_le32(tmp, ptr, size);
   1072 }
   1073 
   1074 static void si_set_constant_buffer(struct si_context *sctx,
   1075 				   struct si_buffer_resources *buffers,
   1076 				   unsigned descriptors_idx,
   1077 				   uint slot, const struct pipe_constant_buffer *input)
   1078 {
   1079 	struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
   1080 	assert(slot < descs->num_elements);
   1081 	pipe_resource_reference(&buffers->buffers[slot], NULL);
   1082 
   1083 	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
   1084 	 * with a NULL buffer). We need to use a dummy buffer instead. */
   1085 	if (sctx->b.chip_class == CIK &&
   1086 	    (!input || (!input->buffer && !input->user_buffer)))
   1087 		input = &sctx->null_const_buf;
   1088 
   1089 	if (input && (input->buffer || input->user_buffer)) {
   1090 		struct pipe_resource *buffer = NULL;
   1091 		uint64_t va;
   1092 
   1093 		/* Upload the user buffer if needed. */
   1094 		if (input->user_buffer) {
   1095 			unsigned buffer_offset;
   1096 
   1097 			si_upload_const_buffer(sctx,
   1098 					       (struct r600_resource**)&buffer, input->user_buffer,
   1099 					       input->buffer_size, &buffer_offset);
   1100 			if (!buffer) {
   1101 				/* Just unbind on failure. */
   1102 				si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
   1103 				return;
   1104 			}
   1105 			va = r600_resource(buffer)->gpu_address + buffer_offset;
   1106 		} else {
   1107 			pipe_resource_reference(&buffer, input->buffer);
   1108 			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
   1109 			/* Only track usage for non-user buffers. */
   1110 			r600_resource(buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
   1111 		}
   1112 
   1113 		/* Set the descriptor. */
   1114 		uint32_t *desc = descs->list + slot*4;
   1115 		desc[0] = va;
   1116 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
   1117 			  S_008F04_STRIDE(0);
   1118 		desc[2] = input->buffer_size;
   1119 		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
   1120 			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
   1121 			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
   1122 			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
   1123 			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
   1124 			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   1125 
   1126 		buffers->buffers[slot] = buffer;
   1127 		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
   1128 						    (struct r600_resource*)buffer,
   1129 						    buffers->shader_usage,
   1130 						    buffers->priority, true);
   1131 		buffers->enabled_mask |= 1u << slot;
   1132 	} else {
   1133 		/* Clear the descriptor. */
   1134 		memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
   1135 		buffers->enabled_mask &= ~(1u << slot);
   1136 	}
   1137 
   1138 	descs->dirty_mask |= 1u << slot;
   1139 	sctx->descriptors_dirty |= 1u << descriptors_idx;
   1140 }
   1141 
   1142 void si_set_rw_buffer(struct si_context *sctx,
   1143 		      uint slot, const struct pipe_constant_buffer *input)
   1144 {
   1145 	si_set_constant_buffer(sctx, &sctx->rw_buffers,
   1146 			                        SI_DESCS_RW_BUFFERS, slot, input);
   1147 }
   1148 
   1149 static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
   1150 					uint shader, uint slot,
   1151 					const struct pipe_constant_buffer *input)
   1152 {
   1153 	struct si_context *sctx = (struct si_context *)ctx;
   1154 
   1155 	if (shader >= SI_NUM_SHADERS)
   1156 		return;
   1157 
   1158 	si_set_constant_buffer(sctx, &sctx->const_buffers[shader],
   1159 			       si_const_buffer_descriptors_idx(shader),
   1160 			       slot, input);
   1161 }
   1162 
   1163 void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
   1164 				 uint slot, struct pipe_constant_buffer *cbuf)
   1165 {
   1166 	cbuf->user_buffer = NULL;
   1167 	si_get_buffer_from_descriptors(
   1168 		&sctx->const_buffers[shader],
   1169 		si_const_buffer_descriptors(sctx, shader),
   1170 		slot, &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
   1171 }
   1172 
   1173 /* SHADER BUFFERS */
   1174 
   1175 static unsigned
   1176 si_shader_buffer_descriptors_idx(enum pipe_shader_type shader)
   1177 {
   1178 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
   1179 	       SI_SHADER_DESCS_SHADER_BUFFERS;
   1180 }
   1181 
   1182 static struct si_descriptors *
   1183 si_shader_buffer_descriptors(struct si_context *sctx,
   1184 				  enum pipe_shader_type shader)
   1185 {
   1186 	return &sctx->descriptors[si_shader_buffer_descriptors_idx(shader)];
   1187 }
   1188 
   1189 static void si_set_shader_buffers(struct pipe_context *ctx,
   1190 				  enum pipe_shader_type shader,
   1191 				  unsigned start_slot, unsigned count,
   1192 				  const struct pipe_shader_buffer *sbuffers)
   1193 {
   1194 	struct si_context *sctx = (struct si_context *)ctx;
   1195 	struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
   1196 	struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader);
   1197 	unsigned i;
   1198 
   1199 	assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
   1200 
   1201 	for (i = 0; i < count; ++i) {
   1202 		const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
   1203 		struct r600_resource *buf;
   1204 		unsigned slot = start_slot + i;
   1205 		uint32_t *desc = descs->list + slot * 4;
   1206 		uint64_t va;
   1207 
   1208 		if (!sbuffer || !sbuffer->buffer) {
   1209 			pipe_resource_reference(&buffers->buffers[slot], NULL);
   1210 			memset(desc, 0, sizeof(uint32_t) * 4);
   1211 			buffers->enabled_mask &= ~(1u << slot);
   1212 			descs->dirty_mask |= 1u << slot;
   1213 			sctx->descriptors_dirty |=
   1214 				1u << si_shader_buffer_descriptors_idx(shader);
   1215 			continue;
   1216 		}
   1217 
   1218 		buf = (struct r600_resource *)sbuffer->buffer;
   1219 		va = buf->gpu_address + sbuffer->buffer_offset;
   1220 
   1221 		desc[0] = va;
   1222 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
   1223 			  S_008F04_STRIDE(0);
   1224 		desc[2] = sbuffer->buffer_size;
   1225 		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
   1226 			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
   1227 			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
   1228 			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
   1229 			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
   1230 			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   1231 
   1232 		pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
   1233 		radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, buf,
   1234 						    buffers->shader_usage,
   1235 						    buffers->priority, true);
   1236 		buf->bind_history |= PIPE_BIND_SHADER_BUFFER;
   1237 
   1238 		buffers->enabled_mask |= 1u << slot;
   1239 		descs->dirty_mask |= 1u << slot;
   1240 		sctx->descriptors_dirty |=
   1241 			1u << si_shader_buffer_descriptors_idx(shader);
   1242 
   1243 		util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset,
   1244 			       sbuffer->buffer_offset + sbuffer->buffer_size);
   1245 	}
   1246 }
   1247 
   1248 void si_get_shader_buffers(struct si_context *sctx, uint shader,
   1249 			   uint start_slot, uint count,
   1250 			   struct pipe_shader_buffer *sbuf)
   1251 {
   1252 	struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
   1253 	struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader);
   1254 
   1255 	for (unsigned i = 0; i < count; ++i) {
   1256 		si_get_buffer_from_descriptors(
   1257 			buffers, descs, start_slot + i,
   1258 			&sbuf[i].buffer, &sbuf[i].buffer_offset,
   1259 			&sbuf[i].buffer_size);
   1260 	}
   1261 }
   1262 
   1263 /* RING BUFFERS */
   1264 
   1265 void si_set_ring_buffer(struct pipe_context *ctx, uint slot,
   1266 			struct pipe_resource *buffer,
   1267 			unsigned stride, unsigned num_records,
   1268 			bool add_tid, bool swizzle,
   1269 			unsigned element_size, unsigned index_stride, uint64_t offset)
   1270 {
   1271 	struct si_context *sctx = (struct si_context *)ctx;
   1272 	struct si_buffer_resources *buffers = &sctx->rw_buffers;
   1273 	struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
   1274 
   1275 	/* The stride field in the resource descriptor has 14 bits */
   1276 	assert(stride < (1 << 14));
   1277 
   1278 	assert(slot < descs->num_elements);
   1279 	pipe_resource_reference(&buffers->buffers[slot], NULL);
   1280 
   1281 	if (buffer) {
   1282 		uint64_t va;
   1283 
   1284 		va = r600_resource(buffer)->gpu_address + offset;
   1285 
   1286 		switch (element_size) {
   1287 		default:
   1288 			assert(!"Unsupported ring buffer element size");
   1289 		case 0:
   1290 		case 2:
   1291 			element_size = 0;
   1292 			break;
   1293 		case 4:
   1294 			element_size = 1;
   1295 			break;
   1296 		case 8:
   1297 			element_size = 2;
   1298 			break;
   1299 		case 16:
   1300 			element_size = 3;
   1301 			break;
   1302 		}
   1303 
   1304 		switch (index_stride) {
   1305 		default:
   1306 			assert(!"Unsupported ring buffer index stride");
   1307 		case 0:
   1308 		case 8:
   1309 			index_stride = 0;
   1310 			break;
   1311 		case 16:
   1312 			index_stride = 1;
   1313 			break;
   1314 		case 32:
   1315 			index_stride = 2;
   1316 			break;
   1317 		case 64:
   1318 			index_stride = 3;
   1319 			break;
   1320 		}
   1321 
   1322 		if (sctx->b.chip_class >= VI && stride)
   1323 			num_records *= stride;
   1324 
   1325 		/* Set the descriptor. */
   1326 		uint32_t *desc = descs->list + slot*4;
   1327 		desc[0] = va;
   1328 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
   1329 			  S_008F04_STRIDE(stride) |
   1330 			  S_008F04_SWIZZLE_ENABLE(swizzle);
   1331 		desc[2] = num_records;
   1332 		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
   1333 			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
   1334 			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
   1335 			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
   1336 			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
   1337 			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
   1338 			  S_008F0C_ELEMENT_SIZE(element_size) |
   1339 			  S_008F0C_INDEX_STRIDE(index_stride) |
   1340 			  S_008F0C_ADD_TID_ENABLE(add_tid);
   1341 
   1342 		pipe_resource_reference(&buffers->buffers[slot], buffer);
   1343 		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
   1344 				      (struct r600_resource*)buffer,
   1345 				      buffers->shader_usage, buffers->priority);
   1346 		buffers->enabled_mask |= 1u << slot;
   1347 	} else {
   1348 		/* Clear the descriptor. */
   1349 		memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
   1350 		buffers->enabled_mask &= ~(1u << slot);
   1351 	}
   1352 
   1353 	descs->dirty_mask |= 1u << slot;
   1354 	sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
   1355 }
   1356 
   1357 /* STREAMOUT BUFFERS */
   1358 
   1359 static void si_set_streamout_targets(struct pipe_context *ctx,
   1360 				     unsigned num_targets,
   1361 				     struct pipe_stream_output_target **targets,
   1362 				     const unsigned *offsets)
   1363 {
   1364 	struct si_context *sctx = (struct si_context *)ctx;
   1365 	struct si_buffer_resources *buffers = &sctx->rw_buffers;
   1366 	struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
   1367 	unsigned old_num_targets = sctx->b.streamout.num_targets;
   1368 	unsigned i, bufidx;
   1369 
   1370 	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
   1371 	if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
   1372 		/* Since streamout uses vector writes which go through TC L2
   1373 		 * and most other clients can use TC L2 as well, we don't need
   1374 		 * to flush it.
   1375 		 *
    1376 		 * The only cases that require flushing it are VGT DMA index
    1377 		 * fetching (on <= CIK) and indirect draw data, which are rare.
    1378 		 * Thus, flag the TC L2 dirtiness in the resource and
   1379 		 * handle it at draw call time.
   1380 		 */
   1381 		for (i = 0; i < sctx->b.streamout.num_targets; i++)
   1382 			if (sctx->b.streamout.targets[i])
   1383 				r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
   1384 
   1385 		/* Invalidate the scalar cache in case a streamout buffer is
   1386 		 * going to be used as a constant buffer.
   1387 		 *
   1388 		 * Invalidate TC L1, because streamout bypasses it (done by
   1389 		 * setting GLC=1 in the store instruction), but it can contain
   1390 		 * outdated data of streamout buffers.
   1391 		 *
   1392 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
   1393 		 * used as an input immediately.
   1394 		 */
   1395 		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
   1396 				 SI_CONTEXT_INV_VMEM_L1 |
   1397 				 SI_CONTEXT_VS_PARTIAL_FLUSH;
   1398 	}
   1399 
   1400 	/* All readers of the streamout targets need to be finished before we can
   1401 	 * start writing to the targets.
   1402 	 */
   1403 	if (num_targets)
   1404 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
   1405 		                 SI_CONTEXT_CS_PARTIAL_FLUSH;
   1406 
   1407 	/* Streamout buffers must be bound in 2 places:
   1408 	 * 1) in VGT by setting the VGT_STRMOUT registers
   1409 	 * 2) as shader resources
   1410 	 */
   1411 
   1412 	/* Set the VGT regs. */
   1413 	r600_set_streamout_targets(ctx, num_targets, targets, offsets);
   1414 
    1415 	/* Set the shader resources. */
   1416 	for (i = 0; i < num_targets; i++) {
   1417 		bufidx = SI_VS_STREAMOUT_BUF0 + i;
   1418 
   1419 		if (targets[i]) {
   1420 			struct pipe_resource *buffer = targets[i]->buffer;
   1421 			uint64_t va = r600_resource(buffer)->gpu_address;
   1422 
   1423 			/* Set the descriptor.
   1424 			 *
   1425 			 * On VI, the format must be non-INVALID, otherwise
   1426 			 * the buffer will be considered not bound and store
   1427 			 * instructions will be no-ops.
   1428 			 */
   1429 			uint32_t *desc = descs->list + bufidx*4;
   1430 			desc[0] = va;
   1431 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
   1432 			desc[2] = 0xffffffff;
   1433 			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
   1434 				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
   1435 				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
   1436 				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
   1437 				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   1438 
   1439 			/* Set the resource. */
   1440 			pipe_resource_reference(&buffers->buffers[bufidx],
   1441 						buffer);
   1442 			radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
   1443 							    (struct r600_resource*)buffer,
   1444 							    buffers->shader_usage,
   1445 							    RADEON_PRIO_SHADER_RW_BUFFER,
   1446 							    true);
   1447 			r600_resource(buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
   1448 
   1449 			buffers->enabled_mask |= 1u << bufidx;
   1450 		} else {
   1451 			/* Clear the descriptor and unset the resource. */
   1452 			memset(descs->list + bufidx*4, 0,
   1453 			       sizeof(uint32_t) * 4);
   1454 			pipe_resource_reference(&buffers->buffers[bufidx],
   1455 						NULL);
   1456 			buffers->enabled_mask &= ~(1u << bufidx);
   1457 		}
   1458 		descs->dirty_mask |= 1u << bufidx;
   1459 	}
   1460 	for (; i < old_num_targets; i++) {
   1461 		bufidx = SI_VS_STREAMOUT_BUF0 + i;
   1462 		/* Clear the descriptor and unset the resource. */
   1463 		memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4);
   1464 		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
   1465 		buffers->enabled_mask &= ~(1u << bufidx);
   1466 		descs->dirty_mask |= 1u << bufidx;
   1467 	}
   1468 
   1469 	sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
   1470 }
   1471 
   1472 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
   1473 					uint32_t *desc, uint64_t old_buf_va,
   1474 					struct pipe_resource *new_buf)
   1475 {
   1476 	/* Retrieve the buffer offset from the descriptor. */
   1477 	uint64_t old_desc_va =
   1478 		desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
   1479 
   1480 	assert(old_buf_va <= old_desc_va);
   1481 	uint64_t offset_within_buffer = old_desc_va - old_buf_va;
   1482 
   1483 	/* Update the descriptor. */
   1484 	si_set_buf_desc_address(r600_resource(new_buf), offset_within_buffer,
   1485 				desc);
   1486 }
   1487 
   1488 /* INTERNAL CONST BUFFERS */
   1489 
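         /* Upload the 32x32 polygon stipple pattern to its internal constant buffer
          * slot; each 32-bit row is bit-reversed before the upload.
          */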
   1490 static void si_set_polygon_stipple(struct pipe_context *ctx,
   1491 				   const struct pipe_poly_stipple *state)
   1492 {
   1493 	struct si_context *sctx = (struct si_context *)ctx;
   1494 	struct pipe_constant_buffer cb = {};
   1495 	unsigned stipple[32];
   1496 	int i;
   1497 
   1498 	for (i = 0; i < 32; i++)
   1499 		stipple[i] = util_bitreverse(state->stipple[i]);
   1500 
   1501 	cb.user_buffer = stipple;
   1502 	cb.buffer_size = sizeof(stipple);
   1503 
   1504 	si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
   1505 }
   1506 
   1507 /* TEXTURE METADATA ENABLE/DISABLE */
   1508 
   1509 /* CMASK can be enabled (for fast clear) and disabled (for texture export)
   1510  * while the texture is bound, possibly by a different context. In that case,
   1511  * call this function to update compressed_colortex_masks.
   1512  */
   1513 void si_update_compressed_colortex_masks(struct si_context *sctx)
   1514 {
   1515 	for (int i = 0; i < SI_NUM_SHADERS; ++i) {
   1516 		si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
   1517 		si_images_update_compressed_colortex_mask(&sctx->images[i]);
   1518 		si_update_compressed_tex_shader_mask(sctx, i);
   1519 	}
   1520 }
   1521 
   1522 /* BUFFER DISCARD/INVALIDATION */
   1523 
   1524 /** Reset descriptors of buffer resources after \p buf has been invalidated. */
   1525 static void si_reset_buffer_resources(struct si_context *sctx,
   1526 				      struct si_buffer_resources *buffers,
   1527 				      unsigned descriptors_idx,
   1528 				      struct pipe_resource *buf,
   1529 				      uint64_t old_va)
   1530 {
   1531 	struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
   1532 	unsigned mask = buffers->enabled_mask;
   1533 
   1534 	while (mask) {
   1535 		unsigned i = u_bit_scan(&mask);
   1536 		if (buffers->buffers[i] == buf) {
   1537 			si_desc_reset_buffer_offset(&sctx->b.b,
   1538 						    descs->list + i*4,
   1539 						    old_va, buf);
   1540 			descs->dirty_mask |= 1u << i;
   1541 			sctx->descriptors_dirty |= 1u << descriptors_idx;
   1542 
   1543 			radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
   1544 							    (struct r600_resource *)buf,
   1545 							    buffers->shader_usage,
   1546 							    buffers->priority, true);
   1547 		}
   1548 	}
   1549 }
   1550 
    1551 /* Reallocate a buffer and update all resource bindings where the buffer is
   1552  * bound.
   1553  *
   1554  * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
   1555  * idle by discarding its contents. Apps usually tell us when to do this using
   1556  * map_buffer flags, for example.
   1557  */
   1558 static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
   1559 {
   1560 	struct si_context *sctx = (struct si_context*)ctx;
   1561 	struct r600_resource *rbuffer = r600_resource(buf);
   1562 	unsigned i, shader;
   1563 	uint64_t old_va = rbuffer->gpu_address;
   1564 	unsigned num_elems = sctx->vertex_elements ?
   1565 				       sctx->vertex_elements->count : 0;
   1566 
   1567 	/* Reallocate the buffer in the same pipe_resource. */
   1568 	r600_alloc_resource(&sctx->screen->b, rbuffer);
   1569 
    1570 	/* We changed the buffer, so now we need to rebind it where the old one
   1571 	 * was bound. This consists of 2 things:
   1572 	 *   1) Updating the resource descriptor and dirtying it.
   1573 	 *   2) Adding a relocation to the CS, so that it's usable.
   1574 	 */
   1575 
   1576 	/* Vertex buffers. */
   1577 	if (rbuffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
   1578 		for (i = 0; i < num_elems; i++) {
   1579 			int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
   1580 
   1581 			if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
   1582 				continue;
   1583 			if (!sctx->vertex_buffer[vb].buffer)
   1584 				continue;
   1585 
   1586 			if (sctx->vertex_buffer[vb].buffer == buf) {
   1587 				sctx->vertex_buffers_dirty = true;
   1588 				break;
   1589 			}
   1590 		}
   1591 	}
   1592 
   1593 	/* Streamout buffers. (other internal buffers can't be invalidated) */
   1594 	if (rbuffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
   1595 		for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
   1596 			struct si_buffer_resources *buffers = &sctx->rw_buffers;
   1597 			struct si_descriptors *descs =
   1598 				&sctx->descriptors[SI_DESCS_RW_BUFFERS];
   1599 
   1600 			if (buffers->buffers[i] != buf)
   1601 				continue;
   1602 
   1603 			si_desc_reset_buffer_offset(ctx, descs->list + i*4,
   1604 						    old_va, buf);
   1605 			descs->dirty_mask |= 1u << i;
   1606 			sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
   1607 
   1608 			radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
   1609 							    rbuffer, buffers->shader_usage,
   1610 							    RADEON_PRIO_SHADER_RW_BUFFER,
   1611 							    true);
   1612 
   1613 			/* Update the streamout state. */
   1614 			if (sctx->b.streamout.begin_emitted)
   1615 				r600_emit_streamout_end(&sctx->b);
   1616 			sctx->b.streamout.append_bitmask =
   1617 					sctx->b.streamout.enabled_mask;
   1618 			r600_streamout_buffers_dirty(&sctx->b);
   1619 		}
   1620 	}
   1621 
   1622 	/* Constant and shader buffers. */
   1623 	if (rbuffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
   1624 		for (shader = 0; shader < SI_NUM_SHADERS; shader++)
   1625 			si_reset_buffer_resources(sctx, &sctx->const_buffers[shader],
   1626 						  si_const_buffer_descriptors_idx(shader),
   1627 						  buf, old_va);
   1628 	}
   1629 
   1630 	if (rbuffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
   1631 		for (shader = 0; shader < SI_NUM_SHADERS; shader++)
   1632 			si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader],
   1633 						  si_shader_buffer_descriptors_idx(shader),
   1634 						  buf, old_va);
   1635 	}
   1636 
   1637 	if (rbuffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
   1638 		/* Texture buffers - update bindings. */
   1639 		for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
   1640 			struct si_sampler_views *views = &sctx->samplers[shader].views;
   1641 			struct si_descriptors *descs =
   1642 				si_sampler_descriptors(sctx, shader);
   1643 			unsigned mask = views->enabled_mask;
   1644 
   1645 			while (mask) {
   1646 				unsigned i = u_bit_scan(&mask);
   1647 				if (views->views[i]->texture == buf) {
   1648 					si_desc_reset_buffer_offset(ctx,
   1649 								    descs->list +
   1650 								    i * 16 + 4,
   1651 								    old_va, buf);
   1652 					descs->dirty_mask |= 1u << i;
   1653 					sctx->descriptors_dirty |=
   1654 						1u << si_sampler_descriptors_idx(shader);
   1655 
   1656 					radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx,
   1657 									    rbuffer, RADEON_USAGE_READ,
   1658 									    RADEON_PRIO_SAMPLER_BUFFER,
   1659 									    true);
   1660 				}
   1661 			}
   1662 		}
   1663 	}
   1664 
   1665 	/* Shader images */
   1666 	if (rbuffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
   1667 		for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
   1668 			struct si_images_info *images = &sctx->images[shader];
   1669 			struct si_descriptors *descs =
   1670 				si_image_descriptors(sctx, shader);
   1671 			unsigned mask = images->enabled_mask;
   1672 
   1673 			while (mask) {
   1674 				unsigned i = u_bit_scan(&mask);
   1675 
   1676 				if (images->views[i].resource == buf) {
   1677 					if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
   1678 						si_mark_image_range_valid(&images->views[i]);
   1679 
   1680 					si_desc_reset_buffer_offset(
   1681 						ctx, descs->list + i * 8 + 4,
   1682 						old_va, buf);
   1683 					descs->dirty_mask |= 1u << i;
   1684 					sctx->descriptors_dirty |=
   1685 						1u << si_image_descriptors_idx(shader);
   1686 
   1687 					radeon_add_to_buffer_list_check_mem(
   1688 						&sctx->b, &sctx->b.gfx, rbuffer,
   1689 						RADEON_USAGE_READWRITE,
   1690 						RADEON_PRIO_SAMPLER_BUFFER, true);
   1691 				}
   1692 			}
   1693 		}
   1694 	}
   1695 }
   1696 
   1697 /* Update mutable image descriptor fields of all bound textures. */
   1698 void si_update_all_texture_descriptors(struct si_context *sctx)
   1699 {
   1700 	unsigned shader;
   1701 
   1702 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
   1703 		struct si_sampler_views *samplers = &sctx->samplers[shader].views;
   1704 		struct si_images_info *images = &sctx->images[shader];
   1705 		unsigned mask;
   1706 
   1707 		/* Images. */
   1708 		mask = images->enabled_mask;
   1709 		while (mask) {
   1710 			unsigned i = u_bit_scan(&mask);
   1711 			struct pipe_image_view *view = &images->views[i];
   1712 
   1713 			if (!view->resource ||
   1714 			    view->resource->target == PIPE_BUFFER)
   1715 				continue;
   1716 
   1717 			si_set_shader_image(sctx, shader, i, view, true);
   1718 		}
   1719 
   1720 		/* Sampler views. */
   1721 		mask = samplers->enabled_mask;
   1722 		while (mask) {
   1723 			unsigned i = u_bit_scan(&mask);
   1724 			struct pipe_sampler_view *view = samplers->views[i];
   1725 
   1726 			if (!view ||
   1727 			    !view->texture ||
   1728 			    view->texture->target == PIPE_BUFFER)
   1729 				continue;
   1730 
   1731 			si_set_sampler_view(sctx, shader, i,
   1732 					    samplers->views[i], true);
   1733 		}
   1734 
   1735 		si_update_compressed_tex_shader_mask(sctx, shader);
   1736 	}
   1737 }
   1738 
   1739 /* SHADER USER DATA */
   1740 
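         /* Mark all descriptor-list pointers of one shader stage as dirty, so that
          * the shader_userdata atom re-emits them on the next draw.
          */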
   1741 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
   1742 					  unsigned shader)
   1743 {
   1744 	sctx->shader_pointers_dirty |=
   1745 		u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
   1746 				  SI_NUM_SHADER_DESCS);
   1747 
   1748 	if (shader == PIPE_SHADER_VERTEX)
   1749 		sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL;
   1750 
   1751 	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
   1752 }
   1753 
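         /* Reset the dirty flags at the beginning of a new command stream, so that
          * all descriptor pointers are emitted again.
          */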
   1754 static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
   1755 {
   1756 	sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
   1757 	sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL;
   1758 	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
   1759 }
   1760 
   1761 /* Set a base register address for user data constants in the given shader.
   1762  * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
   1763  */
   1764 static void si_set_user_data_base(struct si_context *sctx,
   1765 				  unsigned shader, uint32_t new_base)
   1766 {
   1767 	uint32_t *base = &sctx->shader_userdata.sh_base[shader];
   1768 
   1769 	if (*base != new_base) {
   1770 		*base = new_base;
   1771 
   1772 		if (new_base)
   1773 			si_mark_shader_pointers_dirty(sctx, shader);
   1774 	}
   1775 }
   1776 
   1777 /* This must be called when these shaders are changed from non-NULL to NULL
   1778  * and vice versa:
   1779  * - geometry shader
   1780  * - tessellation control shader
   1781  * - tessellation evaluation shader
   1782  */
   1783 void si_shader_change_notify(struct si_context *sctx)
   1784 {
   1785 	/* VS can be bound as VS, ES, or LS. */
   1786 	if (sctx->tes_shader.cso)
   1787 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
   1788 				      R_00B530_SPI_SHADER_USER_DATA_LS_0);
   1789 	else if (sctx->gs_shader.cso)
   1790 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
   1791 				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
   1792 	else
   1793 		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
   1794 				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
   1795 
   1796 	/* TES can be bound as ES, VS, or not bound. */
   1797 	if (sctx->tes_shader.cso) {
   1798 		if (sctx->gs_shader.cso)
   1799 			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
   1800 					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
   1801 		else
   1802 			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
   1803 					      R_00B130_SPI_SHADER_USER_DATA_VS_0);
   1804 	} else {
   1805 		si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
   1806 	}
   1807 }
   1808 
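         /* Write the 64-bit address of one descriptor list into a pair of user data
          * SGPRs with a SET_SH_REG packet.
          */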
   1809 static void si_emit_shader_pointer(struct si_context *sctx,
   1810 				   struct si_descriptors *desc,
   1811 				   unsigned sh_base)
   1812 {
   1813 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
   1814 	uint64_t va;
   1815 
   1816 	assert(desc->buffer);
   1817 
   1818 	va = desc->buffer->gpu_address +
   1819 	     desc->buffer_offset;
   1820 
   1821 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
   1822 	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
   1823 	radeon_emit(cs, va);
   1824 	radeon_emit(cs, va >> 32);
   1825 }
   1826 
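         /* Emit all dirty descriptor-list pointers for the graphics stages. The
          * shared RW buffer list is written to every stage's user data registers.
          */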
   1827 void si_emit_graphics_shader_userdata(struct si_context *sctx,
   1828                                       struct r600_atom *atom)
   1829 {
   1830 	unsigned mask;
   1831 	uint32_t *sh_base = sctx->shader_userdata.sh_base;
   1832 	struct si_descriptors *descs;
   1833 
   1834 	descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
   1835 
   1836 	if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
   1837 		si_emit_shader_pointer(sctx, descs,
   1838 				       R_00B030_SPI_SHADER_USER_DATA_PS_0);
   1839 		si_emit_shader_pointer(sctx, descs,
   1840 				       R_00B130_SPI_SHADER_USER_DATA_VS_0);
   1841 		si_emit_shader_pointer(sctx, descs,
   1842 				       R_00B230_SPI_SHADER_USER_DATA_GS_0);
   1843 		si_emit_shader_pointer(sctx, descs,
   1844 				       R_00B330_SPI_SHADER_USER_DATA_ES_0);
   1845 		si_emit_shader_pointer(sctx, descs,
   1846 				       R_00B430_SPI_SHADER_USER_DATA_HS_0);
   1847 	}
   1848 
   1849 	mask = sctx->shader_pointers_dirty &
   1850 	       u_bit_consecutive(SI_DESCS_FIRST_SHADER,
   1851 				 SI_DESCS_FIRST_COMPUTE - SI_DESCS_FIRST_SHADER);
   1852 
   1853 	while (mask) {
   1854 		unsigned i = u_bit_scan(&mask);
   1855 		unsigned shader = (i - SI_DESCS_FIRST_SHADER) / SI_NUM_SHADER_DESCS;
   1856 		unsigned base = sh_base[shader];
   1857 
   1858 		if (base)
   1859 			si_emit_shader_pointer(sctx, descs + i, base);
   1860 	}
   1861 	sctx->shader_pointers_dirty &=
   1862 		~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
   1863 
   1864 	if (sctx->vertex_buffer_pointer_dirty) {
   1865 		si_emit_shader_pointer(sctx, &sctx->vertex_buffers,
   1866 				       sh_base[PIPE_SHADER_VERTEX]);
   1867 		sctx->vertex_buffer_pointer_dirty = false;
   1868 	}
   1869 }
   1870 
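         /* Emit dirty descriptor-list pointers for the compute stage, all relative
          * to the COMPUTE_USER_DATA_0 register base.
          */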
   1871 void si_emit_compute_shader_userdata(struct si_context *sctx)
   1872 {
   1873 	unsigned base = R_00B900_COMPUTE_USER_DATA_0;
   1874 	struct si_descriptors *descs = sctx->descriptors;
   1875 	unsigned compute_mask =
   1876 		u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_SHADER_DESCS);
   1877 	unsigned mask = sctx->shader_pointers_dirty & compute_mask;
   1878 
   1879 	while (mask) {
   1880 		unsigned i = u_bit_scan(&mask);
   1881 
   1882 		si_emit_shader_pointer(sctx, descs + i, base);
   1883 	}
   1884 	sctx->shader_pointers_dirty &= ~compute_mask;
   1885 }
   1886 
   1887 /* INIT/DEINIT/UPLOAD */
   1888 
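         /* Create all descriptor lists, install the pipe_context functions that
          * modify them, and set up the initial user data register mappings.
          */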
   1889 void si_init_all_descriptors(struct si_context *sctx)
   1890 {
   1891 	int i;
   1892 	unsigned ce_offset = 0;
   1893 
   1894 	for (i = 0; i < SI_NUM_SHADERS; i++) {
   1895 		si_init_buffer_resources(&sctx->const_buffers[i],
   1896 					 si_const_buffer_descriptors(sctx, i),
   1897 					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS,
   1898 					 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER,
   1899 					 &ce_offset);
   1900 		si_init_buffer_resources(&sctx->shader_buffers[i],
   1901 					 si_shader_buffer_descriptors(sctx, i),
   1902 					 SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS,
   1903 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER,
   1904 					 &ce_offset);
   1905 
   1906 		si_init_descriptors(si_sampler_descriptors(sctx, i),
   1907 				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
   1908 				    null_texture_descriptor, &ce_offset);
   1909 
   1910 		si_init_descriptors(si_image_descriptors(sctx, i),
   1911 				    SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
   1912 				    null_image_descriptor, &ce_offset);
   1913 	}
   1914 
   1915 	si_init_buffer_resources(&sctx->rw_buffers,
   1916 				 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
   1917 				 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
   1918 				 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS,
   1919 				 &ce_offset);
   1920 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
   1921 			    4, SI_NUM_VERTEX_BUFFERS, NULL, NULL);
   1922 
   1923 	sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
   1924 
   1925 	assert(ce_offset <= 32768);
   1926 
   1927 	/* Set pipe_context functions. */
   1928 	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
   1929 	sctx->b.b.set_shader_images = si_set_shader_images;
   1930 	sctx->b.b.set_constant_buffer = si_pipe_set_constant_buffer;
   1931 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
   1932 	sctx->b.b.set_shader_buffers = si_set_shader_buffers;
   1933 	sctx->b.b.set_sampler_views = si_set_sampler_views;
   1934 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
   1935 	sctx->b.invalidate_buffer = si_invalidate_buffer;
   1936 
   1937 	/* Shader user data. */
   1938 	si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata,
   1939 		     si_emit_graphics_shader_userdata);
   1940 
   1941 	/* Set default and immutable mappings. */
   1942 	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
   1943 	si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
   1944 	si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
   1945 	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
   1946 }
   1947 
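         /* Upload all dirty graphics descriptor lists. Returns false on failure. */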
   1948 bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
   1949 {
   1950 	const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
   1951 	unsigned dirty = sctx->descriptors_dirty & mask;
   1952 
   1953 	/* Assume nothing will go wrong: */
   1954 	sctx->shader_pointers_dirty |= dirty;
   1955 
   1956 	while (dirty) {
   1957 		unsigned i = u_bit_scan(&dirty);
   1958 
   1959 		if (!si_upload_descriptors(sctx, &sctx->descriptors[i],
   1960 					   &sctx->shader_userdata.atom))
   1961 			return false;
   1962 	}
   1963 
   1964 	sctx->descriptors_dirty &= ~mask;
   1965 	return true;
   1966 }
   1967 
   1968 bool si_upload_compute_shader_descriptors(struct si_context *sctx)
   1969 {
    1970 	/* Does not update rw_buffers, because compute shaders don't need them
    1971 	 * and the input buffer uses the same SGPRs anyway.
   1972 	 */
   1973 	const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE,
   1974 						SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
   1975 	unsigned dirty = sctx->descriptors_dirty & mask;
   1976 
   1977 	/* Assume nothing will go wrong: */
   1978 	sctx->shader_pointers_dirty |= dirty;
   1979 
   1980 	while (dirty) {
   1981 		unsigned i = u_bit_scan(&dirty);
   1982 
   1983 		if (!si_upload_descriptors(sctx, &sctx->descriptors[i], NULL))
   1984 			return false;
   1985 	}
   1986 
   1987 	sctx->descriptors_dirty &= ~mask;
   1988 
   1989 	return true;
   1990 }
   1991 
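         /* Unreference all bound buffers, sampler views, and image views, and free
          * every descriptor list.
          */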
   1992 void si_release_all_descriptors(struct si_context *sctx)
   1993 {
   1994 	int i;
   1995 
   1996 	for (i = 0; i < SI_NUM_SHADERS; i++) {
   1997 		si_release_buffer_resources(&sctx->const_buffers[i],
   1998 					    si_const_buffer_descriptors(sctx, i));
   1999 		si_release_buffer_resources(&sctx->shader_buffers[i],
   2000 					    si_shader_buffer_descriptors(sctx, i));
   2001 		si_release_sampler_views(&sctx->samplers[i].views);
   2002 		si_release_image_views(&sctx->images[i]);
   2003 	}
   2004 	si_release_buffer_resources(&sctx->rw_buffers,
   2005 				    &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
   2006 
   2007 	for (i = 0; i < SI_NUM_DESCS; ++i)
   2008 		si_release_descriptors(&sctx->descriptors[i]);
   2009 	si_release_descriptors(&sctx->vertex_buffers);
   2010 }
   2011 
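         /* Re-add all referenced buffers to the buffer list of the new command
          * stream and mark all descriptor pointers for re-emission.
          */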
   2012 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
   2013 {
   2014 	int i;
   2015 
   2016 	for (i = 0; i < SI_NUM_SHADERS; i++) {
   2017 		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
   2018 		si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]);
   2019 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
   2020 		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
   2021 	}
   2022 	si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
   2023 	si_vertex_buffers_begin_new_cs(sctx);
   2024 
   2025 	for (i = 0; i < SI_NUM_DESCS; ++i)
   2026 		si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
   2027 
   2028 	si_shader_userdata_begin_new_cs(sctx);
   2029 }
   2030