1 /* 2 * Copyright 2017 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 /* This file handles register programming of primitive binning. */ 25 26 #include "si_pipe.h" 27 #include "sid.h" 28 #include "gfx9d.h" 29 #include "radeon/r600_cs.h" 30 31 struct uvec2 { 32 unsigned x, y; 33 }; 34 35 struct si_bin_size_map { 36 unsigned start; 37 unsigned bin_size_x; 38 unsigned bin_size_y; 39 }; 40 41 typedef struct si_bin_size_map si_bin_size_subtable[3][9]; 42 43 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */ 44 static struct uvec2 si_find_bin_size(struct si_screen *sscreen, 45 const si_bin_size_subtable table[], 46 unsigned sum) 47 { 48 unsigned log_num_rb_per_se = 49 util_logbase2_ceil(sscreen->info.num_render_backends / 50 sscreen->info.max_se); 51 unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se); 52 unsigned i; 53 54 /* Get the chip-specific subtable. */ 55 const struct si_bin_size_map *subtable = 56 &table[log_num_rb_per_se][log_num_se][0]; 57 58 for (i = 0; subtable[i].start != UINT_MAX; i++) { 59 if (sum >= subtable[i].start && sum < subtable[i + 1].start) 60 break; 61 } 62 63 struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y}; 64 return size; 65 } 66 67 static struct uvec2 si_get_color_bin_size(struct si_context *sctx, 68 unsigned cb_target_enabled_4bit) 69 { 70 unsigned nr_samples = sctx->framebuffer.nr_samples; 71 unsigned sum = 0; 72 73 /* Compute the sum of all Bpp. */ 74 for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { 75 if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) 76 continue; 77 78 struct r600_texture *rtex = 79 (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture; 80 sum += rtex->surface.bpe; 81 } 82 83 /* Multiply the sum by some function of the number of samples. */ 84 if (nr_samples >= 2) { 85 if (sctx->ps_iter_samples >= 2) 86 sum *= nr_samples; 87 else 88 sum *= 2; 89 } 90 91 static const si_bin_size_subtable table[] = { 92 { 93 /* One RB / SE */ 94 { 95 /* One shader engine */ 96 { 0, 128, 128 }, 97 { 1, 64, 128 }, 98 { 2, 32, 128 }, 99 { 3, 16, 128 }, 100 { 17, 0, 0 }, 101 { UINT_MAX, 0, 0 }, 102 }, 103 { 104 /* Two shader engines */ 105 { 0, 128, 128 }, 106 { 2, 64, 128 }, 107 { 3, 32, 128 }, 108 { 5, 16, 128 }, 109 { 17, 0, 0 }, 110 { UINT_MAX, 0, 0 }, 111 }, 112 { 113 /* Four shader engines */ 114 { 0, 128, 128 }, 115 { 3, 64, 128 }, 116 { 5, 16, 128 }, 117 { 17, 0, 0 }, 118 { UINT_MAX, 0, 0 }, 119 }, 120 }, 121 { 122 /* Two RB / SE */ 123 { 124 /* One shader engine */ 125 { 0, 128, 128 }, 126 { 2, 64, 128 }, 127 { 3, 32, 128 }, 128 { 5, 16, 128 }, 129 { 33, 0, 0 }, 130 { UINT_MAX, 0, 0 }, 131 }, 132 { 133 /* Two shader engines */ 134 { 0, 128, 128 }, 135 { 3, 64, 128 }, 136 { 5, 32, 128 }, 137 { 9, 16, 128 }, 138 { 33, 0, 0 }, 139 { UINT_MAX, 0, 0 }, 140 }, 141 { 142 /* Four shader engines */ 143 { 0, 256, 256 }, 144 { 2, 128, 256 }, 145 { 3, 128, 128 }, 146 { 5, 64, 128 }, 147 { 9, 16, 128 }, 148 { 33, 0, 0 }, 149 { UINT_MAX, 0, 0 }, 150 }, 151 }, 152 { 153 /* Four RB / SE */ 154 { 155 /* One shader engine */ 156 { 0, 128, 256 }, 157 { 2, 128, 128 }, 158 { 3, 64, 128 }, 159 { 5, 32, 128 }, 160 { 9, 16, 128 }, 161 { 33, 0, 0 }, 162 { UINT_MAX, 0, 0 }, 163 }, 164 { 165 /* Two shader engines */ 166 { 0, 256, 256 }, 167 { 2, 128, 256 }, 168 { 3, 128, 128 }, 169 { 5, 64, 128 }, 170 { 9, 32, 128 }, 171 { 17, 16, 128 }, 172 { 33, 0, 0 }, 173 { UINT_MAX, 0, 0 }, 174 }, 175 { 176 /* Four shader engines */ 177 { 0, 256, 512 }, 178 { 2, 256, 256 }, 179 { 3, 128, 256 }, 180 { 5, 128, 128 }, 181 { 9, 64, 128 }, 182 { 17, 16, 128 }, 183 { 33, 0, 0 }, 184 { UINT_MAX, 0, 0 }, 185 }, 186 }, 187 }; 188 189 return si_find_bin_size(sctx->screen, table, sum); 190 } 191 192 static struct uvec2 si_get_depth_bin_size(struct si_context *sctx) 193 { 194 struct si_state_dsa *dsa = sctx->queued.named.dsa; 195 196 if (!sctx->framebuffer.state.zsbuf || 197 (!dsa->depth_enabled && !dsa->stencil_enabled)) { 198 /* Return the max size. */ 199 struct uvec2 size = {512, 512}; 200 return size; 201 } 202 203 struct r600_texture *rtex = 204 (struct r600_texture*)sctx->framebuffer.state.zsbuf->texture; 205 unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; 206 unsigned stencil_coeff = rtex->surface.has_stencil && 207 dsa->stencil_enabled ? 1 : 0; 208 unsigned sum = 4 * (depth_coeff + stencil_coeff) * 209 sctx->framebuffer.nr_samples; 210 211 static const si_bin_size_subtable table[] = { 212 { 213 // One RB / SE 214 { 215 // One shader engine 216 { 0, 128, 256 }, 217 { 2, 128, 128 }, 218 { 4, 64, 128 }, 219 { 7, 32, 128 }, 220 { 13, 16, 128 }, 221 { 49, 0, 0 }, 222 { UINT_MAX, 0, 0 }, 223 }, 224 { 225 // Two shader engines 226 { 0, 256, 256 }, 227 { 2, 128, 256 }, 228 { 4, 128, 128 }, 229 { 7, 64, 128 }, 230 { 13, 32, 128 }, 231 { 25, 16, 128 }, 232 { 49, 0, 0 }, 233 { UINT_MAX, 0, 0 }, 234 }, 235 { 236 // Four shader engines 237 { 0, 256, 512 }, 238 { 2, 256, 256 }, 239 { 4, 128, 256 }, 240 { 7, 128, 128 }, 241 { 13, 64, 128 }, 242 { 25, 16, 128 }, 243 { 49, 0, 0 }, 244 { UINT_MAX, 0, 0 }, 245 }, 246 }, 247 { 248 // Two RB / SE 249 { 250 // One shader engine 251 { 0, 256, 256 }, 252 { 2, 128, 256 }, 253 { 4, 128, 128 }, 254 { 7, 64, 128 }, 255 { 13, 32, 128 }, 256 { 25, 16, 128 }, 257 { 97, 0, 0 }, 258 { UINT_MAX, 0, 0 }, 259 }, 260 { 261 // Two shader engines 262 { 0, 256, 512 }, 263 { 2, 256, 256 }, 264 { 4, 128, 256 }, 265 { 7, 128, 128 }, 266 { 13, 64, 128 }, 267 { 25, 32, 128 }, 268 { 49, 16, 128 }, 269 { 97, 0, 0 }, 270 { UINT_MAX, 0, 0 }, 271 }, 272 { 273 // Four shader engines 274 { 0, 512, 512 }, 275 { 2, 256, 512 }, 276 { 4, 256, 256 }, 277 { 7, 128, 256 }, 278 { 13, 128, 128 }, 279 { 25, 64, 128 }, 280 { 49, 16, 128 }, 281 { 97, 0, 0 }, 282 { UINT_MAX, 0, 0 }, 283 }, 284 }, 285 { 286 // Four RB / SE 287 { 288 // One shader engine 289 { 0, 256, 512 }, 290 { 2, 256, 256 }, 291 { 4, 128, 256 }, 292 { 7, 128, 128 }, 293 { 13, 64, 128 }, 294 { 25, 32, 128 }, 295 { 49, 16, 128 }, 296 { UINT_MAX, 0, 0 }, 297 }, 298 { 299 // Two shader engines 300 { 0, 512, 512 }, 301 { 2, 256, 512 }, 302 { 4, 256, 256 }, 303 { 7, 128, 256 }, 304 { 13, 128, 128 }, 305 { 25, 64, 128 }, 306 { 49, 32, 128 }, 307 { 97, 16, 128 }, 308 { UINT_MAX, 0, 0 }, 309 }, 310 { 311 // Four shader engines 312 { 0, 512, 512 }, 313 { 4, 256, 512 }, 314 { 7, 256, 256 }, 315 { 13, 128, 256 }, 316 { 25, 128, 128 }, 317 { 49, 64, 128 }, 318 { 97, 16, 128 }, 319 { UINT_MAX, 0, 0 }, 320 }, 321 }, 322 }; 323 324 return si_find_bin_size(sctx->screen, table, sum); 325 } 326 327 static void si_emit_dpbb_disable(struct si_context *sctx) 328 { 329 struct radeon_winsys_cs *cs = sctx->b.gfx.cs; 330 331 radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0, 332 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | 333 S_028C44_DISABLE_START_OF_PRIM(1)); 334 radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL, 335 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF)); 336 } 337 338 void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state) 339 { 340 struct si_screen *sscreen = sctx->screen; 341 struct si_state_blend *blend = sctx->queued.named.blend; 342 struct si_state_dsa *dsa = sctx->queued.named.dsa; 343 unsigned db_shader_control = sctx->ps_db_shader_control; 344 345 assert(sctx->b.chip_class >= GFX9); 346 347 if (!sscreen->dpbb_allowed || !blend || !dsa) { 348 si_emit_dpbb_disable(sctx); 349 return; 350 } 351 352 bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) || 353 G_02880C_MASK_EXPORT_ENABLE(db_shader_control) || 354 G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || 355 blend->alpha_to_coverage; 356 357 /* This is ported from Vulkan, but it doesn't make much sense to me. 358 * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this. 359 */ 360 bool ps_can_reject_z_trivially = 361 !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || 362 G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control); 363 364 /* Disable binning if PS can kill trivially with DB writes. 365 * Ported from Vulkan. (heuristic?) 366 */ 367 if (ps_can_kill && 368 ps_can_reject_z_trivially && 369 sctx->framebuffer.state.zsbuf && 370 dsa->db_can_write) { 371 si_emit_dpbb_disable(sctx); 372 return; 373 } 374 375 /* Compute the bin size. */ 376 /* TODO: We could also look at enabled pixel shader outputs. */ 377 unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit & 378 blend->cb_target_enabled_4bit; 379 struct uvec2 color_bin_size = 380 si_get_color_bin_size(sctx, cb_target_enabled_4bit); 381 struct uvec2 depth_bin_size = si_get_depth_bin_size(sctx); 382 383 unsigned color_area = color_bin_size.x * color_bin_size.y; 384 unsigned depth_area = depth_bin_size.x * depth_bin_size.y; 385 386 struct uvec2 bin_size = color_area < depth_area ? color_bin_size 387 : depth_bin_size; 388 389 if (!bin_size.x || !bin_size.y) { 390 si_emit_dpbb_disable(sctx); 391 return; 392 } 393 394 /* Enable DFSM if it's preferred. */ 395 unsigned punchout_mode = V_028060_FORCE_OFF; 396 bool disable_start_of_prim = true; 397 398 if (sscreen->dfsm_allowed && 399 cb_target_enabled_4bit && 400 !G_02880C_KILL_ENABLE(db_shader_control) && 401 /* These two also imply that DFSM is disabled when PS writes to memory. */ 402 !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) && 403 !G_02880C_EXEC_ON_NOOP(db_shader_control) && 404 G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) { 405 punchout_mode = V_028060_AUTO; 406 disable_start_of_prim = (cb_target_enabled_4bit & 407 blend->blend_enable_4bit) != 0; 408 } 409 410 /* Tunable parameters. Also test with DFSM enabled/disabled. */ 411 unsigned context_states_per_bin; /* allowed range: [0, 5] */ 412 unsigned persistent_states_per_bin; /* allowed range: [0, 31] */ 413 unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ 414 415 switch (sctx->b.family) { 416 case CHIP_VEGA10: 417 case CHIP_RAVEN: 418 /* Tuned for Raven. Vega might need different values. */ 419 context_states_per_bin = 5; 420 persistent_states_per_bin = 31; 421 fpovs_per_batch = 63; 422 break; 423 default: 424 assert(0); 425 } 426 427 /* Emit registers. */ 428 struct uvec2 bin_size_extend = {}; 429 if (bin_size.x >= 32) 430 bin_size_extend.x = util_logbase2(bin_size.x) - 5; 431 if (bin_size.y >= 32) 432 bin_size_extend.y = util_logbase2(bin_size.y) - 5; 433 434 struct radeon_winsys_cs *cs = sctx->b.gfx.cs; 435 radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0, 436 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | 437 S_028C44_BIN_SIZE_X(bin_size.x == 16) | 438 S_028C44_BIN_SIZE_Y(bin_size.y == 16) | 439 S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | 440 S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | 441 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) | 442 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) | 443 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | 444 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | 445 S_028C44_OPTIMAL_BIN_SELECTION(1)); 446 radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL, 447 S_028060_PUNCHOUT_MODE(punchout_mode)); 448 } 449