Home | History | Annotate | Download | only in radeonsi
      1 /*
      2  * Copyright 2017 Advanced Micro Devices, Inc.
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  */
     23 
     24 /* This file handles register programming of primitive binning. */
     25 
     26 #include "si_pipe.h"
     27 #include "sid.h"
     28 #include "gfx9d.h"
     29 #include "radeon/r600_cs.h"
     30 
     31 struct uvec2 {
     32 	unsigned x, y;
     33 };
     34 
     35 struct si_bin_size_map {
     36 	unsigned start;
     37 	unsigned bin_size_x;
     38 	unsigned bin_size_y;
     39 };
     40 
     41 typedef struct si_bin_size_map si_bin_size_subtable[3][9];
     42 
     43 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
     44 static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
     45 				     const si_bin_size_subtable table[],
     46 				     unsigned sum)
     47 {
     48 	unsigned log_num_rb_per_se =
     49 		util_logbase2_ceil(sscreen->info.num_render_backends /
     50 				   sscreen->info.max_se);
     51 	unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
     52 	unsigned i;
     53 
     54 	/* Get the chip-specific subtable. */
     55 	const struct si_bin_size_map *subtable =
     56 		&table[log_num_rb_per_se][log_num_se][0];
     57 
     58 	for (i = 0; subtable[i].start != UINT_MAX; i++) {
     59 		if (sum >= subtable[i].start && sum < subtable[i + 1].start)
     60 			break;
     61 	}
     62 
     63 	struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
     64 	return size;
     65 }
     66 
     67 static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
     68 					  unsigned cb_target_enabled_4bit)
     69 {
     70 	unsigned nr_samples = sctx->framebuffer.nr_samples;
     71 	unsigned sum = 0;
     72 
     73 	/* Compute the sum of all Bpp. */
     74 	for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
     75 		if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
     76 			continue;
     77 
     78 		struct r600_texture *rtex =
     79 			(struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
     80 		sum += rtex->surface.bpe;
     81 	}
     82 
     83 	/* Multiply the sum by some function of the number of samples. */
     84 	if (nr_samples >= 2) {
     85 		if (sctx->ps_iter_samples >= 2)
     86 			sum *= nr_samples;
     87 		else
     88 			sum *= 2;
     89 	}
     90 
     91 	static const si_bin_size_subtable table[] = {
     92 		{
     93 			/* One RB / SE */
     94 			{
     95 				/* One shader engine */
     96 				{        0,  128,  128 },
     97 				{        1,   64,  128 },
     98 				{        2,   32,  128 },
     99 				{        3,   16,  128 },
    100 				{       17,    0,    0 },
    101 				{ UINT_MAX,    0,    0 },
    102 			},
    103 			{
    104 				/* Two shader engines */
    105 				{        0,  128,  128 },
    106 				{        2,   64,  128 },
    107 				{        3,   32,  128 },
    108 				{        5,   16,  128 },
    109 				{       17,    0,    0 },
    110 				{ UINT_MAX,    0,    0 },
    111 			},
    112 			{
    113 				/* Four shader engines */
    114 				{        0,  128,  128 },
    115 				{        3,   64,  128 },
    116 				{        5,   16,  128 },
    117 				{       17,    0,    0 },
    118 				{ UINT_MAX,    0,    0 },
    119 			},
    120 		},
    121 		{
    122 			/* Two RB / SE */
    123 			{
    124 				/* One shader engine */
    125 				{        0,  128,  128 },
    126 				{        2,   64,  128 },
    127 				{        3,   32,  128 },
    128 				{        5,   16,  128 },
    129 				{       33,    0,    0 },
    130 				{ UINT_MAX,    0,    0 },
    131 			},
    132 			{
    133 				/* Two shader engines */
    134 				{        0,  128,  128 },
    135 				{        3,   64,  128 },
    136 				{        5,   32,  128 },
    137 				{        9,   16,  128 },
    138 				{       33,    0,    0 },
    139 				{ UINT_MAX,    0,    0 },
    140 			},
    141 			{
    142 				/* Four shader engines */
    143 				{        0,  256,  256 },
    144 				{        2,  128,  256 },
    145 				{        3,  128,  128 },
    146 				{        5,   64,  128 },
    147 				{        9,   16,  128 },
    148 				{       33,    0,    0 },
    149 				{ UINT_MAX,    0,    0 },
    150 			},
    151 		},
    152 		{
    153 			/* Four RB / SE */
    154 			{
    155 				/* One shader engine */
    156 				{        0,  128,  256 },
    157 				{        2,  128,  128 },
    158 				{        3,   64,  128 },
    159 				{        5,   32,  128 },
    160 				{        9,   16,  128 },
    161 				{       33,    0,    0 },
    162 				{ UINT_MAX,    0,    0 },
    163 			},
    164 			{
    165 				/* Two shader engines */
    166 				{        0,  256,  256 },
    167 				{        2,  128,  256 },
    168 				{        3,  128,  128 },
    169 				{        5,   64,  128 },
    170 				{        9,   32,  128 },
    171 				{       17,   16,  128 },
    172 				{       33,    0,    0 },
    173 				{ UINT_MAX,    0,    0 },
    174 			},
    175 			{
    176 				/* Four shader engines */
    177 				{        0,  256,  512 },
    178 				{        2,  256,  256 },
    179 				{        3,  128,  256 },
    180 				{        5,  128,  128 },
    181 				{        9,   64,  128 },
    182 				{       17,   16,  128 },
    183 				{       33,    0,    0 },
    184 				{ UINT_MAX,    0,    0 },
    185 			},
    186 		},
    187 	};
    188 
    189 	return si_find_bin_size(sctx->screen, table, sum);
    190 }
    191 
    192 static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
    193 {
    194 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
    195 
    196 	if (!sctx->framebuffer.state.zsbuf ||
    197 	    (!dsa->depth_enabled && !dsa->stencil_enabled)) {
    198 		/* Return the max size. */
    199 		struct uvec2 size = {512, 512};
    200 		return size;
    201 	}
    202 
    203 	struct r600_texture *rtex =
    204 		(struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
    205 	unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
    206 	unsigned stencil_coeff = rtex->surface.has_stencil &&
    207 				 dsa->stencil_enabled ? 1 : 0;
    208 	unsigned sum = 4 * (depth_coeff + stencil_coeff) *
    209 		       sctx->framebuffer.nr_samples;
    210 
    211 	static const si_bin_size_subtable table[] = {
    212 		{
    213 			// One RB / SE
    214 			{
    215 				// One shader engine
    216 				{        0,  128,  256 },
    217 				{        2,  128,  128 },
    218 				{        4,   64,  128 },
    219 				{        7,   32,  128 },
    220 				{       13,   16,  128 },
    221 				{       49,    0,    0 },
    222 				{ UINT_MAX,    0,    0 },
    223 			},
    224 			{
    225 				// Two shader engines
    226 				{        0,  256,  256 },
    227 				{        2,  128,  256 },
    228 				{        4,  128,  128 },
    229 				{        7,   64,  128 },
    230 				{       13,   32,  128 },
    231 				{       25,   16,  128 },
    232 				{       49,    0,    0 },
    233 				{ UINT_MAX,    0,    0 },
    234 			},
    235 			{
    236 				// Four shader engines
    237 				{        0,  256,  512 },
    238 				{        2,  256,  256 },
    239 				{        4,  128,  256 },
    240 				{        7,  128,  128 },
    241 				{       13,   64,  128 },
    242 				{       25,   16,  128 },
    243 				{       49,    0,    0 },
    244 				{ UINT_MAX,    0,    0 },
    245 			},
    246 		},
    247 		{
    248 			// Two RB / SE
    249 			{
    250 				// One shader engine
    251 				{        0,  256,  256 },
    252 				{        2,  128,  256 },
    253 				{        4,  128,  128 },
    254 				{        7,   64,  128 },
    255 				{       13,   32,  128 },
    256 				{       25,   16,  128 },
    257 				{       97,    0,    0 },
    258 				{ UINT_MAX,    0,    0 },
    259 			},
    260 			{
    261 				// Two shader engines
    262 				{        0,  256,  512 },
    263 				{        2,  256,  256 },
    264 				{        4,  128,  256 },
    265 				{        7,  128,  128 },
    266 				{       13,   64,  128 },
    267 				{       25,   32,  128 },
    268 				{       49,   16,  128 },
    269 				{       97,    0,    0 },
    270 				{ UINT_MAX,    0,    0 },
    271 			},
    272 			{
    273 				// Four shader engines
    274 				{        0,  512,  512 },
    275 				{        2,  256,  512 },
    276 				{        4,  256,  256 },
    277 				{        7,  128,  256 },
    278 				{       13,  128,  128 },
    279 				{       25,   64,  128 },
    280 				{       49,   16,  128 },
    281 				{       97,    0,    0 },
    282 				{ UINT_MAX,    0,    0 },
    283 			},
    284 		},
    285 		{
    286 			// Four RB / SE
    287 			{
    288 				// One shader engine
    289 				{        0,  256,  512 },
    290 				{        2,  256,  256 },
    291 				{        4,  128,  256 },
    292 				{        7,  128,  128 },
    293 				{       13,   64,  128 },
    294 				{       25,   32,  128 },
    295 				{       49,   16,  128 },
    296 				{ UINT_MAX,    0,    0 },
    297 			},
    298 			{
    299 				// Two shader engines
    300 				{        0,  512,  512 },
    301 				{        2,  256,  512 },
    302 				{        4,  256,  256 },
    303 				{        7,  128,  256 },
    304 				{       13,  128,  128 },
    305 				{       25,   64,  128 },
    306 				{       49,   32,  128 },
    307 				{       97,   16,  128 },
    308 				{ UINT_MAX,    0,    0 },
    309 			},
    310 			{
    311 				// Four shader engines
    312 				{        0,  512,  512 },
    313 				{        4,  256,  512 },
    314 				{        7,  256,  256 },
    315 				{       13,  128,  256 },
    316 				{       25,  128,  128 },
    317 				{       49,   64,  128 },
    318 				{       97,   16,  128 },
    319 				{ UINT_MAX,    0,    0 },
    320 			},
    321 		},
    322 	};
    323 
    324 	return si_find_bin_size(sctx->screen, table, sum);
    325 }
    326 
    327 static void si_emit_dpbb_disable(struct si_context *sctx)
    328 {
    329 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
    330 
    331 	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
    332 			       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
    333 			       S_028C44_DISABLE_START_OF_PRIM(1));
    334 	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
    335 			       S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
    336 }
    337 
    338 void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
    339 {
    340 	struct si_screen *sscreen = sctx->screen;
    341 	struct si_state_blend *blend = sctx->queued.named.blend;
    342 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
    343 	unsigned db_shader_control = sctx->ps_db_shader_control;
    344 
    345 	assert(sctx->b.chip_class >= GFX9);
    346 
    347 	if (!sscreen->dpbb_allowed || !blend || !dsa) {
    348 		si_emit_dpbb_disable(sctx);
    349 		return;
    350 	}
    351 
    352 	bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) ||
    353 			   G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
    354 			   G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
    355 			   blend->alpha_to_coverage;
    356 
    357 	/* This is ported from Vulkan, but it doesn't make much sense to me.
    358 	 * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this.
    359 	 */
    360 	bool ps_can_reject_z_trivially =
    361 		!G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
    362 		G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control);
    363 
    364 	/* Disable binning if PS can kill trivially with DB writes.
    365 	 * Ported from Vulkan. (heuristic?)
    366 	 */
    367 	if (ps_can_kill &&
    368 	    ps_can_reject_z_trivially &&
    369 	    sctx->framebuffer.state.zsbuf &&
    370 	    dsa->db_can_write) {
    371 		si_emit_dpbb_disable(sctx);
    372 		return;
    373 	}
    374 
    375 	/* Compute the bin size. */
    376 	/* TODO: We could also look at enabled pixel shader outputs. */
    377 	unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit &
    378 					  blend->cb_target_enabled_4bit;
    379 	struct uvec2 color_bin_size =
    380 		si_get_color_bin_size(sctx, cb_target_enabled_4bit);
    381 	struct uvec2 depth_bin_size = si_get_depth_bin_size(sctx);
    382 
    383 	unsigned color_area = color_bin_size.x * color_bin_size.y;
    384 	unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
    385 
    386 	struct uvec2 bin_size = color_area < depth_area ? color_bin_size
    387 							: depth_bin_size;
    388 
    389 	if (!bin_size.x || !bin_size.y) {
    390 		si_emit_dpbb_disable(sctx);
    391 		return;
    392 	}
    393 
    394 	/* Enable DFSM if it's preferred. */
    395 	unsigned punchout_mode = V_028060_FORCE_OFF;
    396 	bool disable_start_of_prim = true;
    397 
    398 	if (sscreen->dfsm_allowed &&
    399 	    cb_target_enabled_4bit &&
    400 	    !G_02880C_KILL_ENABLE(db_shader_control) &&
    401 	    /* These two also imply that DFSM is disabled when PS writes to memory. */
    402 	    !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
    403 	    !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
    404 	    G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
    405 		punchout_mode = V_028060_AUTO;
    406 		disable_start_of_prim = (cb_target_enabled_4bit &
    407 					 blend->blend_enable_4bit) != 0;
    408 	}
    409 
    410 	/* Tunable parameters. Also test with DFSM enabled/disabled. */
    411 	unsigned context_states_per_bin; /* allowed range: [0, 5] */
    412 	unsigned persistent_states_per_bin; /* allowed range: [0, 31] */
    413 	unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
    414 
    415 	switch (sctx->b.family) {
    416 	case CHIP_VEGA10:
    417 	case CHIP_RAVEN:
    418 		/* Tuned for Raven. Vega might need different values. */
    419 		context_states_per_bin = 5;
    420 		persistent_states_per_bin = 31;
    421 		fpovs_per_batch = 63;
    422 		break;
    423 	default:
    424 		assert(0);
    425 	}
    426 
    427 	/* Emit registers. */
    428 	struct uvec2 bin_size_extend = {};
    429 	if (bin_size.x >= 32)
    430 		bin_size_extend.x = util_logbase2(bin_size.x) - 5;
    431 	if (bin_size.y >= 32)
    432 		bin_size_extend.y = util_logbase2(bin_size.y) - 5;
    433 
    434 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
    435 	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
    436 			       S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
    437 			       S_028C44_BIN_SIZE_X(bin_size.x == 16) |
    438 			       S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
    439 			       S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
    440 			       S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
    441 			       S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
    442 			       S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
    443 			       S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
    444 			       S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
    445 			       S_028C44_OPTIMAL_BIN_SELECTION(1));
    446 	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
    447 			       S_028060_PUNCHOUT_MODE(punchout_mode));
    448 }
    449