/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "r600_query.h"
#include "r600_pipe.h"
#include "r600_cs.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "util/os_time.h"
#include "tgsi/tgsi_text.h"

/* Maximum number of streamout streams handled per query. */
#define R600_MAX_STREAMS 4

/* Byte offsets/strides describing where a HW query's start/end/fence words
 * live inside its result buffer (used when reading results on the GPU). */
struct r600_hw_query_params {
	unsigned start_offset;  /* offset of the "begin" snapshot */
	unsigned end_offset;    /* offset of the "end" snapshot */
	unsigned fence_offset;  /* offset of the results-ready fence dword */
	unsigned pair_stride;   /* stride between (start,end) pairs */
	unsigned pair_count;    /* number of (start,end) pairs to sum */
};

/* Queries without buffer handling or suspend/resume. */
struct r600_query_sw {
	struct r600_query b;

	/* Counter snapshots taken at begin_query/end_query. */
	uint64_t begin_result;
	uint64_t end_result;

	/* Wall-clock snapshots for rate-style queries (busy %, etc.). */
	uint64_t begin_time;
	uint64_t end_time;

	/* Fence for GPU_FINISHED. */
	struct pipe_fence_handle *fence;
};

/* Destroy a software query, releasing the GPU_FINISHED fence if any. */
static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
				  struct r600_query *rquery)
{
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;

	rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
	FREE(query);
}

/* Map a driver-specific query type onto the winsys counter that backs it. */
static enum radeon_value_id winsys_id_from_type(unsigned type)
{
	switch (type) {
	case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
	case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
	case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
	case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
	case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
	case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
	case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
	case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
	case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
	case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
	case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
	case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
	case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
	case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
	default: unreachable("query type does not correspond to winsys id");
	}
}

/* Begin a software query: snapshot the relevant CPU-side counter (and a
 * timestamp for rate-style queries) into begin_result/begin_time. */
static bool r600_query_sw_begin(struct r600_common_context *rctx,
				struct r600_query *rquery)
{
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
	enum radeon_value_id ws_id;

	switch(query->b.type) {
	case PIPE_QUERY_TIMESTAMP_DISJOINT:
	case PIPE_QUERY_GPU_FINISHED:
		break;
	case R600_QUERY_DRAW_CALLS:
		query->begin_result = rctx->num_draw_calls;
		break;
	case R600_QUERY_DECOMPRESS_CALLS:
		query->begin_result = rctx->num_decompress_calls;
		break;
	case R600_QUERY_MRT_DRAW_CALLS:
		query->begin_result = rctx->num_mrt_draw_calls;
		break;
	case R600_QUERY_PRIM_RESTART_CALLS:
		query->begin_result = rctx->num_prim_restart_calls;
		break;
	case R600_QUERY_SPILL_DRAW_CALLS:
		query->begin_result = rctx->num_spill_draw_calls;
		break;
	case R600_QUERY_COMPUTE_CALLS:
		query->begin_result = rctx->num_compute_calls;
		break;
	case R600_QUERY_SPILL_COMPUTE_CALLS:
		query->begin_result = rctx->num_spill_compute_calls;
		break;
	case R600_QUERY_DMA_CALLS:
		query->begin_result = rctx->num_dma_calls;
		break;
	case R600_QUERY_CP_DMA_CALLS:
		query->begin_result = rctx->num_cp_dma_calls;
		break;
	case R600_QUERY_NUM_VS_FLUSHES:
		query->begin_result = rctx->num_vs_flushes;
		break;
	case R600_QUERY_NUM_PS_FLUSHES:
		query->begin_result = rctx->num_ps_flushes;
		break;
	case R600_QUERY_NUM_CS_FLUSHES:
		query->begin_result = rctx->num_cs_flushes;
		break;
	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
		query->begin_result = rctx->num_cb_cache_flushes;
		break;
	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
		query->begin_result = rctx->num_db_cache_flushes;
		break;
	case R600_QUERY_NUM_RESIDENT_HANDLES:
		query->begin_result = rctx->num_resident_handles;
		break;
	case R600_QUERY_TC_OFFLOADED_SLOTS:
		/* The threaded context (tc) is optional; report 0 without it. */
		query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
		break;
	case R600_QUERY_TC_DIRECT_SLOTS:
		query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
		break;
	case R600_QUERY_TC_NUM_SYNCS:
		query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
		break;
	case R600_QUERY_REQUESTED_VRAM:
	case R600_QUERY_REQUESTED_GTT:
	case R600_QUERY_MAPPED_VRAM:
	case R600_QUERY_MAPPED_GTT:
	case R600_QUERY_VRAM_USAGE:
	case R600_QUERY_VRAM_VIS_USAGE:
	case R600_QUERY_GTT_USAGE:
	case R600_QUERY_GPU_TEMPERATURE:
	case R600_QUERY_CURRENT_GPU_SCLK:
	case R600_QUERY_CURRENT_GPU_MCLK:
	case R600_QUERY_NUM_MAPPED_BUFFERS:
		/* These report the current winsys value at end_query;
		 * the begin snapshot is simply 0. */
		query->begin_result = 0;
		break;
	case R600_QUERY_BUFFER_WAIT_TIME:
	case R600_QUERY_NUM_GFX_IBS:
	case R600_QUERY_NUM_SDMA_IBS:
	case R600_QUERY_NUM_BYTES_MOVED:
	case R600_QUERY_NUM_EVICTIONS:
	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
		break;
	}
	case R600_QUERY_GFX_BO_LIST_SIZE:
		/* Averaged per gfx IB: also snapshot the IB counter. */
		ws_id = winsys_id_from_type(query->b.type);
		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
		query->begin_time = rctx->ws->query_value(rctx->ws,
							  RADEON_NUM_GFX_IBS);
		break;
	case R600_QUERY_CS_THREAD_BUSY:
		ws_id = winsys_id_from_type(query->b.type);
		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
		query->begin_time = os_time_get_nano();
		break;
	case R600_QUERY_GALLIUM_THREAD_BUSY:
		query->begin_result =
			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
		query->begin_time = os_time_get_nano();
		break;
	case R600_QUERY_GPU_LOAD:
	case R600_QUERY_GPU_SHADERS_BUSY:
	case R600_QUERY_GPU_TA_BUSY:
	case R600_QUERY_GPU_GDS_BUSY:
	case R600_QUERY_GPU_VGT_BUSY:
	case R600_QUERY_GPU_IA_BUSY:
	case R600_QUERY_GPU_SX_BUSY:
	case R600_QUERY_GPU_WD_BUSY:
	case R600_QUERY_GPU_BCI_BUSY:
	case R600_QUERY_GPU_SC_BUSY:
	case R600_QUERY_GPU_PA_BUSY:
	case R600_QUERY_GPU_DB_BUSY:
	case R600_QUERY_GPU_CP_BUSY:
	case R600_QUERY_GPU_CB_BUSY:
	case R600_QUERY_GPU_SDMA_BUSY:
	case R600_QUERY_GPU_PFP_BUSY:
	case R600_QUERY_GPU_MEQ_BUSY:
	case R600_QUERY_GPU_ME_BUSY:
	case R600_QUERY_GPU_SURF_SYNC_BUSY:
	case R600_QUERY_GPU_CP_DMA_BUSY:
	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
		/* GPU-load style counters are sampled by a helper thread. */
		query->begin_result = r600_begin_counter(rctx->screen,
							 query->b.type);
		break;
	case R600_QUERY_NUM_COMPILATIONS:
		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
		break;
	case R600_QUERY_NUM_SHADERS_CREATED:
		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
		break;
	case R600_QUERY_NUM_SHADER_CACHE_HITS:
		query->begin_result =
			p_atomic_read(&rctx->screen->num_shader_cache_hits);
		break;
	case R600_QUERY_GPIN_ASIC_ID:
	case R600_QUERY_GPIN_NUM_SIMD:
	case R600_QUERY_GPIN_NUM_RB:
	case R600_QUERY_GPIN_NUM_SPI:
	case R600_QUERY_GPIN_NUM_SE:
		/* Static chip information; nothing to snapshot. */
		break;
	default:
		unreachable("r600_query_sw_begin: bad query type");
	}

	return true;
}

/* End a software query: snapshot the counter into end_result/end_time.
 * For GPU_FINISHED, a deferred flush produces the fence that
 * get_result later waits on. */
static bool r600_query_sw_end(struct r600_common_context *rctx,
			      struct r600_query *rquery)
{
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
	enum radeon_value_id ws_id;

	switch(query->b.type) {
	case PIPE_QUERY_TIMESTAMP_DISJOINT:
		break;
	case PIPE_QUERY_GPU_FINISHED:
		rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
		break;
case R600_QUERY_DRAW_CALLS: 255 query->end_result = rctx->num_draw_calls; 256 break; 257 case R600_QUERY_DECOMPRESS_CALLS: 258 query->end_result = rctx->num_decompress_calls; 259 break; 260 case R600_QUERY_MRT_DRAW_CALLS: 261 query->end_result = rctx->num_mrt_draw_calls; 262 break; 263 case R600_QUERY_PRIM_RESTART_CALLS: 264 query->end_result = rctx->num_prim_restart_calls; 265 break; 266 case R600_QUERY_SPILL_DRAW_CALLS: 267 query->end_result = rctx->num_spill_draw_calls; 268 break; 269 case R600_QUERY_COMPUTE_CALLS: 270 query->end_result = rctx->num_compute_calls; 271 break; 272 case R600_QUERY_SPILL_COMPUTE_CALLS: 273 query->end_result = rctx->num_spill_compute_calls; 274 break; 275 case R600_QUERY_DMA_CALLS: 276 query->end_result = rctx->num_dma_calls; 277 break; 278 case R600_QUERY_CP_DMA_CALLS: 279 query->end_result = rctx->num_cp_dma_calls; 280 break; 281 case R600_QUERY_NUM_VS_FLUSHES: 282 query->end_result = rctx->num_vs_flushes; 283 break; 284 case R600_QUERY_NUM_PS_FLUSHES: 285 query->end_result = rctx->num_ps_flushes; 286 break; 287 case R600_QUERY_NUM_CS_FLUSHES: 288 query->end_result = rctx->num_cs_flushes; 289 break; 290 case R600_QUERY_NUM_CB_CACHE_FLUSHES: 291 query->end_result = rctx->num_cb_cache_flushes; 292 break; 293 case R600_QUERY_NUM_DB_CACHE_FLUSHES: 294 query->end_result = rctx->num_db_cache_flushes; 295 break; 296 case R600_QUERY_NUM_RESIDENT_HANDLES: 297 query->end_result = rctx->num_resident_handles; 298 break; 299 case R600_QUERY_TC_OFFLOADED_SLOTS: 300 query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0; 301 break; 302 case R600_QUERY_TC_DIRECT_SLOTS: 303 query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0; 304 break; 305 case R600_QUERY_TC_NUM_SYNCS: 306 query->end_result = rctx->tc ? 
rctx->tc->num_syncs : 0; 307 break; 308 case R600_QUERY_REQUESTED_VRAM: 309 case R600_QUERY_REQUESTED_GTT: 310 case R600_QUERY_MAPPED_VRAM: 311 case R600_QUERY_MAPPED_GTT: 312 case R600_QUERY_VRAM_USAGE: 313 case R600_QUERY_VRAM_VIS_USAGE: 314 case R600_QUERY_GTT_USAGE: 315 case R600_QUERY_GPU_TEMPERATURE: 316 case R600_QUERY_CURRENT_GPU_SCLK: 317 case R600_QUERY_CURRENT_GPU_MCLK: 318 case R600_QUERY_BUFFER_WAIT_TIME: 319 case R600_QUERY_NUM_MAPPED_BUFFERS: 320 case R600_QUERY_NUM_GFX_IBS: 321 case R600_QUERY_NUM_SDMA_IBS: 322 case R600_QUERY_NUM_BYTES_MOVED: 323 case R600_QUERY_NUM_EVICTIONS: 324 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { 325 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); 326 query->end_result = rctx->ws->query_value(rctx->ws, ws_id); 327 break; 328 } 329 case R600_QUERY_GFX_BO_LIST_SIZE: 330 ws_id = winsys_id_from_type(query->b.type); 331 query->end_result = rctx->ws->query_value(rctx->ws, ws_id); 332 query->end_time = rctx->ws->query_value(rctx->ws, 333 RADEON_NUM_GFX_IBS); 334 break; 335 case R600_QUERY_CS_THREAD_BUSY: 336 ws_id = winsys_id_from_type(query->b.type); 337 query->end_result = rctx->ws->query_value(rctx->ws, ws_id); 338 query->end_time = os_time_get_nano(); 339 break; 340 case R600_QUERY_GALLIUM_THREAD_BUSY: 341 query->end_result = 342 rctx->tc ? 
util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0; 343 query->end_time = os_time_get_nano(); 344 break; 345 case R600_QUERY_GPU_LOAD: 346 case R600_QUERY_GPU_SHADERS_BUSY: 347 case R600_QUERY_GPU_TA_BUSY: 348 case R600_QUERY_GPU_GDS_BUSY: 349 case R600_QUERY_GPU_VGT_BUSY: 350 case R600_QUERY_GPU_IA_BUSY: 351 case R600_QUERY_GPU_SX_BUSY: 352 case R600_QUERY_GPU_WD_BUSY: 353 case R600_QUERY_GPU_BCI_BUSY: 354 case R600_QUERY_GPU_SC_BUSY: 355 case R600_QUERY_GPU_PA_BUSY: 356 case R600_QUERY_GPU_DB_BUSY: 357 case R600_QUERY_GPU_CP_BUSY: 358 case R600_QUERY_GPU_CB_BUSY: 359 case R600_QUERY_GPU_SDMA_BUSY: 360 case R600_QUERY_GPU_PFP_BUSY: 361 case R600_QUERY_GPU_MEQ_BUSY: 362 case R600_QUERY_GPU_ME_BUSY: 363 case R600_QUERY_GPU_SURF_SYNC_BUSY: 364 case R600_QUERY_GPU_CP_DMA_BUSY: 365 case R600_QUERY_GPU_SCRATCH_RAM_BUSY: 366 query->end_result = r600_end_counter(rctx->screen, 367 query->b.type, 368 query->begin_result); 369 query->begin_result = 0; 370 break; 371 case R600_QUERY_NUM_COMPILATIONS: 372 query->end_result = p_atomic_read(&rctx->screen->num_compilations); 373 break; 374 case R600_QUERY_NUM_SHADERS_CREATED: 375 query->end_result = p_atomic_read(&rctx->screen->num_shaders_created); 376 break; 377 case R600_QUERY_NUM_SHADER_CACHE_HITS: 378 query->end_result = 379 p_atomic_read(&rctx->screen->num_shader_cache_hits); 380 break; 381 case R600_QUERY_GPIN_ASIC_ID: 382 case R600_QUERY_GPIN_NUM_SIMD: 383 case R600_QUERY_GPIN_NUM_RB: 384 case R600_QUERY_GPIN_NUM_SPI: 385 case R600_QUERY_GPIN_NUM_SE: 386 break; 387 default: 388 unreachable("r600_query_sw_end: bad query type"); 389 } 390 391 return true; 392 } 393 394 static bool r600_query_sw_get_result(struct r600_common_context *rctx, 395 struct r600_query *rquery, 396 bool wait, 397 union pipe_query_result *result) 398 { 399 struct r600_query_sw *query = (struct r600_query_sw *)rquery; 400 401 switch (query->b.type) { 402 case PIPE_QUERY_TIMESTAMP_DISJOINT: 403 /* Convert from cycles per millisecond to cycles per 
second (Hz). */ 404 result->timestamp_disjoint.frequency = 405 (uint64_t)rctx->screen->info.clock_crystal_freq * 1000; 406 result->timestamp_disjoint.disjoint = false; 407 return true; 408 case PIPE_QUERY_GPU_FINISHED: { 409 struct pipe_screen *screen = rctx->b.screen; 410 struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b; 411 412 result->b = screen->fence_finish(screen, ctx, query->fence, 413 wait ? PIPE_TIMEOUT_INFINITE : 0); 414 return result->b; 415 } 416 417 case R600_QUERY_GFX_BO_LIST_SIZE: 418 result->u64 = (query->end_result - query->begin_result) / 419 (query->end_time - query->begin_time); 420 return true; 421 case R600_QUERY_CS_THREAD_BUSY: 422 case R600_QUERY_GALLIUM_THREAD_BUSY: 423 result->u64 = (query->end_result - query->begin_result) * 100 / 424 (query->end_time - query->begin_time); 425 return true; 426 case R600_QUERY_GPIN_ASIC_ID: 427 result->u32 = 0; 428 return true; 429 case R600_QUERY_GPIN_NUM_SIMD: 430 result->u32 = rctx->screen->info.num_good_compute_units; 431 return true; 432 case R600_QUERY_GPIN_NUM_RB: 433 result->u32 = rctx->screen->info.num_render_backends; 434 return true; 435 case R600_QUERY_GPIN_NUM_SPI: 436 result->u32 = 1; /* all supported chips have one SPI per SE */ 437 return true; 438 case R600_QUERY_GPIN_NUM_SE: 439 result->u32 = rctx->screen->info.max_se; 440 return true; 441 } 442 443 result->u64 = query->end_result - query->begin_result; 444 445 switch (query->b.type) { 446 case R600_QUERY_BUFFER_WAIT_TIME: 447 case R600_QUERY_GPU_TEMPERATURE: 448 result->u64 /= 1000; 449 break; 450 case R600_QUERY_CURRENT_GPU_SCLK: 451 case R600_QUERY_CURRENT_GPU_MCLK: 452 result->u64 *= 1000000; 453 break; 454 } 455 456 return true; 457 } 458 459 460 static struct r600_query_ops sw_query_ops = { 461 .destroy = r600_query_sw_destroy, 462 .begin = r600_query_sw_begin, 463 .end = r600_query_sw_end, 464 .get_result = r600_query_sw_get_result, 465 .get_result_resource = NULL 466 }; 467 468 static struct pipe_query 
*r600_query_sw_create(unsigned query_type) 469 { 470 struct r600_query_sw *query; 471 472 query = CALLOC_STRUCT(r600_query_sw); 473 if (!query) 474 return NULL; 475 476 query->b.type = query_type; 477 query->b.ops = &sw_query_ops; 478 479 return (struct pipe_query *)query; 480 } 481 482 void r600_query_hw_destroy(struct r600_common_screen *rscreen, 483 struct r600_query *rquery) 484 { 485 struct r600_query_hw *query = (struct r600_query_hw *)rquery; 486 struct r600_query_buffer *prev = query->buffer.previous; 487 488 /* Release all query buffers. */ 489 while (prev) { 490 struct r600_query_buffer *qbuf = prev; 491 prev = prev->previous; 492 r600_resource_reference(&qbuf->buf, NULL); 493 FREE(qbuf); 494 } 495 496 r600_resource_reference(&query->buffer.buf, NULL); 497 FREE(rquery); 498 } 499 500 static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen, 501 struct r600_query_hw *query) 502 { 503 unsigned buf_size = MAX2(query->result_size, 504 rscreen->info.min_alloc_size); 505 506 /* Queries are normally read by the CPU after 507 * being written by the gpu, hence staging is probably a good 508 * usage pattern. 509 */ 510 struct r600_resource *buf = (struct r600_resource*) 511 pipe_buffer_create(&rscreen->b, 0, 512 PIPE_USAGE_STAGING, buf_size); 513 if (!buf) 514 return NULL; 515 516 if (!query->ops->prepare_buffer(rscreen, query, buf)) { 517 r600_resource_reference(&buf, NULL); 518 return NULL; 519 } 520 521 return buf; 522 } 523 524 static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen, 525 struct r600_query_hw *query, 526 struct r600_resource *buffer) 527 { 528 /* Callers ensure that the buffer is currently unused by the GPU. 
*/ 529 uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL, 530 PIPE_TRANSFER_WRITE | 531 PIPE_TRANSFER_UNSYNCHRONIZED); 532 if (!results) 533 return false; 534 535 memset(results, 0, buffer->b.b.width0); 536 537 if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || 538 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) { 539 unsigned max_rbs = rscreen->info.num_render_backends; 540 unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask; 541 unsigned num_results; 542 unsigned i, j; 543 544 /* Set top bits for unused backends. */ 545 num_results = buffer->b.b.width0 / query->result_size; 546 for (j = 0; j < num_results; j++) { 547 for (i = 0; i < max_rbs; i++) { 548 if (!(enabled_rb_mask & (1<<i))) { 549 results[(i * 4)+1] = 0x80000000; 550 results[(i * 4)+3] = 0x80000000; 551 } 552 } 553 results += 4 * max_rbs; 554 } 555 } 556 557 return true; 558 } 559 560 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, 561 struct r600_query *rquery, 562 bool wait, 563 enum pipe_query_value_type result_type, 564 int index, 565 struct pipe_resource *resource, 566 unsigned offset); 567 568 static struct r600_query_ops query_hw_ops = { 569 .destroy = r600_query_hw_destroy, 570 .begin = r600_query_hw_begin, 571 .end = r600_query_hw_end, 572 .get_result = r600_query_hw_get_result, 573 .get_result_resource = r600_query_hw_get_result_resource, 574 }; 575 576 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, 577 struct r600_query_hw *query, 578 struct r600_resource *buffer, 579 uint64_t va); 580 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, 581 struct r600_query_hw *query, 582 struct r600_resource *buffer, 583 uint64_t va); 584 static void r600_query_hw_add_result(struct r600_common_screen *rscreen, 585 struct r600_query_hw *, void *buffer, 586 union pipe_query_result *result); 587 static void r600_query_hw_clear_result(struct r600_query_hw *, 588 union pipe_query_result *); 589 590 static struct 
r600_query_hw_ops query_hw_default_hw_ops = { 591 .prepare_buffer = r600_query_hw_prepare_buffer, 592 .emit_start = r600_query_hw_do_emit_start, 593 .emit_stop = r600_query_hw_do_emit_stop, 594 .clear_result = r600_query_hw_clear_result, 595 .add_result = r600_query_hw_add_result, 596 }; 597 598 bool r600_query_hw_init(struct r600_common_screen *rscreen, 599 struct r600_query_hw *query) 600 { 601 query->buffer.buf = r600_new_query_buffer(rscreen, query); 602 if (!query->buffer.buf) 603 return false; 604 605 return true; 606 } 607 608 static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen, 609 unsigned query_type, 610 unsigned index) 611 { 612 struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw); 613 if (!query) 614 return NULL; 615 616 query->b.type = query_type; 617 query->b.ops = &query_hw_ops; 618 query->ops = &query_hw_default_hw_ops; 619 620 switch (query_type) { 621 case PIPE_QUERY_OCCLUSION_COUNTER: 622 case PIPE_QUERY_OCCLUSION_PREDICATE: 623 query->result_size = 16 * rscreen->info.num_render_backends; 624 query->result_size += 16; /* for the fence + alignment */ 625 query->num_cs_dw_begin = 6; 626 query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen); 627 break; 628 case PIPE_QUERY_TIME_ELAPSED: 629 query->result_size = 24; 630 query->num_cs_dw_begin = 8; 631 query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen); 632 break; 633 case PIPE_QUERY_TIMESTAMP: 634 query->result_size = 16; 635 query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen); 636 query->flags = R600_QUERY_HW_FLAG_NO_START; 637 break; 638 case PIPE_QUERY_PRIMITIVES_EMITTED: 639 case PIPE_QUERY_PRIMITIVES_GENERATED: 640 case PIPE_QUERY_SO_STATISTICS: 641 case PIPE_QUERY_SO_OVERFLOW_PREDICATE: 642 /* NumPrimitivesWritten, PrimitiveStorageNeeded. 
*/ 643 query->result_size = 32; 644 query->num_cs_dw_begin = 6; 645 query->num_cs_dw_end = 6; 646 query->stream = index; 647 break; 648 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: 649 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ 650 query->result_size = 32 * R600_MAX_STREAMS; 651 query->num_cs_dw_begin = 6 * R600_MAX_STREAMS; 652 query->num_cs_dw_end = 6 * R600_MAX_STREAMS; 653 break; 654 case PIPE_QUERY_PIPELINE_STATISTICS: 655 /* 11 values on EG, 8 on R600. */ 656 query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16; 657 query->result_size += 8; /* for the fence + alignment */ 658 query->num_cs_dw_begin = 6; 659 query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen); 660 break; 661 default: 662 assert(0); 663 FREE(query); 664 return NULL; 665 } 666 667 if (!r600_query_hw_init(rscreen, query)) { 668 FREE(query); 669 return NULL; 670 } 671 672 return (struct pipe_query *)query; 673 } 674 675 static void r600_update_occlusion_query_state(struct r600_common_context *rctx, 676 unsigned type, int diff) 677 { 678 if (type == PIPE_QUERY_OCCLUSION_COUNTER || 679 type == PIPE_QUERY_OCCLUSION_PREDICATE) { 680 bool old_enable = rctx->num_occlusion_queries != 0; 681 bool old_perfect_enable = 682 rctx->num_perfect_occlusion_queries != 0; 683 bool enable, perfect_enable; 684 685 rctx->num_occlusion_queries += diff; 686 assert(rctx->num_occlusion_queries >= 0); 687 688 if (type == PIPE_QUERY_OCCLUSION_COUNTER) { 689 rctx->num_perfect_occlusion_queries += diff; 690 assert(rctx->num_perfect_occlusion_queries >= 0); 691 } 692 693 enable = rctx->num_occlusion_queries != 0; 694 perfect_enable = rctx->num_perfect_occlusion_queries != 0; 695 696 if (enable != old_enable || perfect_enable != old_perfect_enable) { 697 struct r600_context *ctx = (struct r600_context*)rctx; 698 r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom); 699 } 700 } 701 } 702 703 static unsigned event_type_for_stream(unsigned stream) 704 { 705 switch (stream) { 706 default: 707 case 
0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS; 708 case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1; 709 case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2; 710 case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3; 711 } 712 } 713 714 static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va, 715 unsigned stream) 716 { 717 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 718 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); 719 radeon_emit(cs, va); 720 radeon_emit(cs, va >> 32); 721 } 722 723 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, 724 struct r600_query_hw *query, 725 struct r600_resource *buffer, 726 uint64_t va) 727 { 728 struct radeon_winsys_cs *cs = ctx->gfx.cs; 729 730 switch (query->b.type) { 731 case PIPE_QUERY_OCCLUSION_COUNTER: 732 case PIPE_QUERY_OCCLUSION_PREDICATE: 733 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 734 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); 735 radeon_emit(cs, va); 736 radeon_emit(cs, va >> 32); 737 break; 738 case PIPE_QUERY_PRIMITIVES_EMITTED: 739 case PIPE_QUERY_PRIMITIVES_GENERATED: 740 case PIPE_QUERY_SO_STATISTICS: 741 case PIPE_QUERY_SO_OVERFLOW_PREDICATE: 742 emit_sample_streamout(cs, va, query->stream); 743 break; 744 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: 745 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) 746 emit_sample_streamout(cs, va + 32 * stream, stream); 747 break; 748 case PIPE_QUERY_TIME_ELAPSED: 749 /* Write the timestamp after the last draw is done. 
750 * (bottom-of-pipe) 751 */ 752 r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 753 0, EOP_DATA_SEL_TIMESTAMP, 754 NULL, va, 0, query->b.type); 755 break; 756 case PIPE_QUERY_PIPELINE_STATISTICS: 757 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 758 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); 759 radeon_emit(cs, va); 760 radeon_emit(cs, va >> 32); 761 break; 762 default: 763 assert(0); 764 } 765 r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, 766 RADEON_PRIO_QUERY); 767 } 768 769 static void r600_query_hw_emit_start(struct r600_common_context *ctx, 770 struct r600_query_hw *query) 771 { 772 uint64_t va; 773 774 if (!query->buffer.buf) 775 return; // previous buffer allocation failure 776 777 r600_update_occlusion_query_state(ctx, query->b.type, 1); 778 r600_update_prims_generated_query_state(ctx, query->b.type, 1); 779 780 ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end, 781 true); 782 783 /* Get a new query buffer if needed. 
*/ 784 if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { 785 struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer); 786 *qbuf = query->buffer; 787 query->buffer.results_end = 0; 788 query->buffer.previous = qbuf; 789 query->buffer.buf = r600_new_query_buffer(ctx->screen, query); 790 if (!query->buffer.buf) 791 return; 792 } 793 794 /* emit begin query */ 795 va = query->buffer.buf->gpu_address + query->buffer.results_end; 796 797 query->ops->emit_start(ctx, query, query->buffer.buf, va); 798 799 ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end; 800 } 801 802 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, 803 struct r600_query_hw *query, 804 struct r600_resource *buffer, 805 uint64_t va) 806 { 807 struct radeon_winsys_cs *cs = ctx->gfx.cs; 808 uint64_t fence_va = 0; 809 810 switch (query->b.type) { 811 case PIPE_QUERY_OCCLUSION_COUNTER: 812 case PIPE_QUERY_OCCLUSION_PREDICATE: 813 va += 8; 814 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 815 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); 816 radeon_emit(cs, va); 817 radeon_emit(cs, va >> 32); 818 819 fence_va = va + ctx->screen->info.num_render_backends * 16 - 8; 820 break; 821 case PIPE_QUERY_PRIMITIVES_EMITTED: 822 case PIPE_QUERY_PRIMITIVES_GENERATED: 823 case PIPE_QUERY_SO_STATISTICS: 824 case PIPE_QUERY_SO_OVERFLOW_PREDICATE: 825 va += 16; 826 emit_sample_streamout(cs, va, query->stream); 827 break; 828 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: 829 va += 16; 830 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) 831 emit_sample_streamout(cs, va + 32 * stream, stream); 832 break; 833 case PIPE_QUERY_TIME_ELAPSED: 834 va += 8; 835 /* fall through */ 836 case PIPE_QUERY_TIMESTAMP: 837 r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 838 0, EOP_DATA_SEL_TIMESTAMP, NULL, va, 839 0, query->b.type); 840 fence_va = va + 8; 841 break; 842 case PIPE_QUERY_PIPELINE_STATISTICS: { 843 unsigned 
sample_size = (query->result_size - 8) / 2; 844 845 va += sample_size; 846 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 847 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); 848 radeon_emit(cs, va); 849 radeon_emit(cs, va >> 32); 850 851 fence_va = va + sample_size; 852 break; 853 } 854 default: 855 assert(0); 856 } 857 r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, 858 RADEON_PRIO_QUERY); 859 860 if (fence_va) 861 r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 862 EOP_DATA_SEL_VALUE_32BIT, 863 query->buffer.buf, fence_va, 0x80000000, 864 query->b.type); 865 } 866 867 static void r600_query_hw_emit_stop(struct r600_common_context *ctx, 868 struct r600_query_hw *query) 869 { 870 uint64_t va; 871 872 if (!query->buffer.buf) 873 return; // previous buffer allocation failure 874 875 /* The queries which need begin already called this in begin_query. */ 876 if (query->flags & R600_QUERY_HW_FLAG_NO_START) { 877 ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false); 878 } 879 880 /* emit end query */ 881 va = query->buffer.buf->gpu_address + query->buffer.results_end; 882 883 query->ops->emit_stop(ctx, query, query->buffer.buf, va); 884 885 query->buffer.results_end += query->result_size; 886 887 if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) 888 ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end; 889 890 r600_update_occlusion_query_state(ctx, query->b.type, -1); 891 r600_update_prims_generated_query_state(ctx, query->b.type, -1); 892 } 893 894 static void emit_set_predicate(struct r600_common_context *ctx, 895 struct r600_resource *buf, uint64_t va, 896 uint32_t op) 897 { 898 struct radeon_winsys_cs *cs = ctx->gfx.cs; 899 900 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); 901 radeon_emit(cs, va); 902 radeon_emit(cs, op | ((va >> 32) & 0xFF)); 903 r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ, 904 RADEON_PRIO_QUERY); 905 } 906 907 static void 
r600_emit_query_predication(struct r600_common_context *ctx, 908 struct r600_atom *atom) 909 { 910 struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond; 911 struct r600_query_buffer *qbuf; 912 uint32_t op; 913 bool flag_wait, invert; 914 915 if (!query) 916 return; 917 918 invert = ctx->render_cond_invert; 919 flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || 920 ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; 921 922 switch (query->b.type) { 923 case PIPE_QUERY_OCCLUSION_COUNTER: 924 case PIPE_QUERY_OCCLUSION_PREDICATE: 925 op = PRED_OP(PREDICATION_OP_ZPASS); 926 break; 927 case PIPE_QUERY_SO_OVERFLOW_PREDICATE: 928 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: 929 op = PRED_OP(PREDICATION_OP_PRIMCOUNT); 930 invert = !invert; 931 break; 932 default: 933 assert(0); 934 return; 935 } 936 937 /* if true then invert, see GL_ARB_conditional_render_inverted */ 938 if (invert) 939 op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ 940 else 941 op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ 942 943 op |= flag_wait ? 
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; 944 945 /* emit predicate packets for all data blocks */ 946 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { 947 unsigned results_base = 0; 948 uint64_t va_base = qbuf->buf->gpu_address; 949 950 while (results_base < qbuf->results_end) { 951 uint64_t va = va_base + results_base; 952 953 if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { 954 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) { 955 emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); 956 957 /* set CONTINUE bit for all packets except the first */ 958 op |= PREDICATION_CONTINUE; 959 } 960 } else { 961 emit_set_predicate(ctx, qbuf->buf, va, op); 962 op |= PREDICATION_CONTINUE; 963 } 964 965 results_base += query->result_size; 966 } 967 } 968 } 969 970 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) 971 { 972 struct r600_common_screen *rscreen = 973 (struct r600_common_screen *)ctx->screen; 974 975 if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || 976 query_type == PIPE_QUERY_GPU_FINISHED || 977 query_type >= PIPE_QUERY_DRIVER_SPECIFIC) 978 return r600_query_sw_create(query_type); 979 980 return r600_query_hw_create(rscreen, query_type, index); 981 } 982 983 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) 984 { 985 struct r600_common_context *rctx = (struct r600_common_context *)ctx; 986 struct r600_query *rquery = (struct r600_query *)query; 987 988 rquery->ops->destroy(rctx->screen, rquery); 989 } 990 991 static boolean r600_begin_query(struct pipe_context *ctx, 992 struct pipe_query *query) 993 { 994 struct r600_common_context *rctx = (struct r600_common_context *)ctx; 995 struct r600_query *rquery = (struct r600_query *)query; 996 997 return rquery->ops->begin(rctx, rquery); 998 } 999 1000 void r600_query_hw_reset_buffers(struct r600_common_context *rctx, 1001 struct r600_query_hw *query) 1002 { 1003 struct 
r600_query_buffer *prev = query->buffer.previous;

	/* Discard the old query buffers. */
	while (prev) {
		struct r600_query_buffer *qbuf = prev;
		prev = prev->previous;
		r600_resource_reference(&qbuf->buf, NULL);
		FREE(qbuf);
	}

	query->buffer.results_end = 0;
	query->buffer.previous = NULL;

	/* Obtain a new buffer if the current one can't be mapped without a stall. */
	if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
		r600_resource_reference(&query->buffer.buf, NULL);
		query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
	} else {
		/* Reuse the buffer; drop it if re-initialization fails. */
		if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
			r600_resource_reference(&query->buffer.buf, NULL);
	}
}

/* Begin a HW query: reset the buffers (unless begin resumes a prior state),
 * emit the start event, and add the query to the active list so it can be
 * suspended/resumed around CS flushes. Returns false on buffer failure. */
bool r600_query_hw_begin(struct r600_common_context *rctx,
			 struct r600_query *rquery)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;

	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
		/* This query type has no begin (e.g. it is end-only);
		 * reaching here indicates a caller bug. */
		assert(0);
		return false;
	}

	if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
		r600_query_hw_reset_buffers(rctx, query);

	r600_query_hw_emit_start(rctx, query);
	if (!query->buffer.buf)
		return false;

	LIST_ADDTAIL(&query->list, &rctx->active_queries);
	return true;
}

/* pipe_context::end_query - dispatch through the query's ops vtable. */
static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	return rquery->ops->end(rctx, rquery);
}

/* End a HW query: emit the stop event and unlink it from active_queries.
 * Returns false if the result buffer could not be (re)allocated. */
bool r600_query_hw_end(struct r600_common_context *rctx,
		       struct r600_query *rquery)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;

	/* Queries without a begin reset their buffers at end time instead. */
	if (query->flags & R600_QUERY_HW_FLAG_NO_START)
		r600_query_hw_reset_buffers(rctx, query);

	r600_query_hw_emit_stop(rctx, query);

	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
		LIST_DELINIT(&query->list);

	if (!query->buffer.buf)
		return false;

	return true;
}

/* Fill *params with the result-buffer layout for one HW query type:
 * begin/end value offsets, the fence (availability) offset, and the
 * per-RB/per-stream pair geometry consumed by the result compute shader. */
static void r600_get_hw_query_params(struct r600_common_context *rctx,
				     struct r600_query_hw *rquery, int index,
				     struct r600_hw_query_params *params)
{
	unsigned max_rbs = rctx->screen->info.num_render_backends;

	params->pair_stride = 0;
	params->pair_count = 1;

	switch (rquery->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		/* One 16-byte begin/end pair per render backend; the fence
		 * follows all pairs. */
		params->start_offset = 0;
		params->end_offset = 8;
		params->fence_offset = max_rbs * 16;
		params->pair_stride = 16;
		params->pair_count = max_rbs;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		params->start_offset = 0;
		params->end_offset = 8;
		params->fence_offset = 16;
		break;
	case PIPE_QUERY_TIMESTAMP:
		/* start == end: timestamps are a single value, read specially
		 * by the result readers. */
		params->start_offset = 0;
		params->end_offset = 0;
		params->fence_offset = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		params->start_offset = 8;
		params->end_offset = 24;
		params->fence_offset = params->end_offset + 4;
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		params->start_offset = 0;
		params->end_offset = 16;
		params->fence_offset = params->end_offset + 4;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		params->start_offset = 8 - index * 8;
		params->end_offset = 24 - index * 8;
		params->fence_offset = params->end_offset + 4;
		break;
	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
		/* One pair per stream, 32 bytes apart, then fall through to
		 * the common offsets. */
		params->pair_count = R600_MAX_STREAMS;
		params->pair_stride = 32;
		/* fallthrough */
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		params->start_offset = 0;
		params->end_offset = 16;

		/* We can re-use the high dword of the last 64-bit value as a
		 * fence: it is initialized as 0,
and the high bit is set by 1127 * the write of the streamout stats event. 1128 */ 1129 params->fence_offset = rquery->result_size - 4; 1130 break; 1131 case PIPE_QUERY_PIPELINE_STATISTICS: 1132 { 1133 /* Offsets apply to EG+ */ 1134 static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; 1135 params->start_offset = offsets[index]; 1136 params->end_offset = 88 + offsets[index]; 1137 params->fence_offset = 2 * 88; 1138 break; 1139 } 1140 default: 1141 unreachable("r600_get_hw_query_params unsupported"); 1142 } 1143 } 1144 1145 static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index, 1146 bool test_status_bit) 1147 { 1148 uint32_t *current_result = (uint32_t*)map; 1149 uint64_t start, end; 1150 1151 start = (uint64_t)current_result[start_index] | 1152 (uint64_t)current_result[start_index+1] << 32; 1153 end = (uint64_t)current_result[end_index] | 1154 (uint64_t)current_result[end_index+1] << 32; 1155 1156 if (!test_status_bit || 1157 ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { 1158 return end - start; 1159 } 1160 return 0; 1161 } 1162 1163 static void r600_query_hw_add_result(struct r600_common_screen *rscreen, 1164 struct r600_query_hw *query, 1165 void *buffer, 1166 union pipe_query_result *result) 1167 { 1168 unsigned max_rbs = rscreen->info.num_render_backends; 1169 1170 switch (query->b.type) { 1171 case PIPE_QUERY_OCCLUSION_COUNTER: { 1172 for (unsigned i = 0; i < max_rbs; ++i) { 1173 unsigned results_base = i * 16; 1174 result->u64 += 1175 r600_query_read_result(buffer + results_base, 0, 2, true); 1176 } 1177 break; 1178 } 1179 case PIPE_QUERY_OCCLUSION_PREDICATE: { 1180 for (unsigned i = 0; i < max_rbs; ++i) { 1181 unsigned results_base = i * 16; 1182 result->b = result->b || 1183 r600_query_read_result(buffer + results_base, 0, 2, true) != 0; 1184 } 1185 break; 1186 } 1187 case PIPE_QUERY_TIME_ELAPSED: 1188 result->u64 += r600_query_read_result(buffer, 0, 2, false); 1189 break; 
	case PIPE_QUERY_TIMESTAMP:
		/* Timestamps are a single 64-bit value, not a begin/end pair. */
		result->u64 = *(uint64_t*)buffer;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		result->u64 += r600_query_read_result(buffer, 2, 6, true);
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		result->u64 += r600_query_read_result(buffer, 0, 4, true);
		break;
	case PIPE_QUERY_SO_STATISTICS:
		result->so_statistics.num_primitives_written +=
			r600_query_read_result(buffer, 2, 6, true);
		result->so_statistics.primitives_storage_needed +=
			r600_query_read_result(buffer, 0, 4, true);
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* Overflow occurred if fewer primitives were written than
		 * generated (written != storage needed). */
		result->b = result->b ||
			r600_query_read_result(buffer, 2, 6, true) !=
			r600_query_read_result(buffer, 0, 4, true);
		break;
	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
		/* Same check as above, repeated for every stream; per-stream
		 * results are 32 bytes apart. */
		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
			result->b = result->b ||
				r600_query_read_result(buffer, 2, 6, true) !=
				r600_query_read_result(buffer, 0, 4, true);
			buffer = (char *)buffer + 32;
		}
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS:
		if (rscreen->chip_class >= EVERGREEN) {
			result->pipeline_statistics.ps_invocations +=
				r600_query_read_result(buffer, 0, 22, false);
			result->pipeline_statistics.c_primitives +=
				r600_query_read_result(buffer, 2, 24, false);
			result->pipeline_statistics.c_invocations +=
				r600_query_read_result(buffer, 4, 26, false);
			result->pipeline_statistics.vs_invocations +=
				r600_query_read_result(buffer, 6, 28, false);
			result->pipeline_statistics.gs_invocations +=
				r600_query_read_result(buffer, 8, 30, false);
			result->pipeline_statistics.gs_primitives +=
				r600_query_read_result(buffer, 10, 32, false);
			result->pipeline_statistics.ia_primitives +=
				r600_query_read_result(buffer, 12, 34, false);
			result->pipeline_statistics.ia_vertices +=
				r600_query_read_result(buffer, 14, 36, false);
			result->pipeline_statistics.hs_invocations +=
				r600_query_read_result(buffer, 16, 38, false);
			result->pipeline_statistics.ds_invocations +=
				r600_query_read_result(buffer, 18, 40, false);
			result->pipeline_statistics.cs_invocations +=
				r600_query_read_result(buffer, 20, 42, false);
		} else {
			/* Pre-Evergreen layout has no HS/DS/CS counters. */
			result->pipeline_statistics.ps_invocations +=
				r600_query_read_result(buffer, 0, 16, false);
			result->pipeline_statistics.c_primitives +=
				r600_query_read_result(buffer, 2, 18, false);
			result->pipeline_statistics.c_invocations +=
				r600_query_read_result(buffer, 4, 20, false);
			result->pipeline_statistics.vs_invocations +=
				r600_query_read_result(buffer, 6, 22, false);
			result->pipeline_statistics.gs_invocations +=
				r600_query_read_result(buffer, 8, 24, false);
			result->pipeline_statistics.gs_primitives +=
				r600_query_read_result(buffer, 10, 26, false);
			result->pipeline_statistics.ia_primitives +=
				r600_query_read_result(buffer, 12, 28, false);
			result->pipeline_statistics.ia_vertices +=
				r600_query_read_result(buffer, 14, 30, false);
		}
#if 0 /* for testing */
		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
		       result->pipeline_statistics.ia_vertices,
		       result->pipeline_statistics.ia_primitives,
		       result->pipeline_statistics.vs_invocations,
		       result->pipeline_statistics.hs_invocations,
		       result->pipeline_statistics.ds_invocations,
		       result->pipeline_statistics.gs_invocations,
		       result->pipeline_statistics.gs_primitives,
		       result->pipeline_statistics.c_invocations,
		       result->pipeline_statistics.c_primitives,
		       result->pipeline_statistics.ps_invocations,
		       result->pipeline_statistics.cs_invocations);
#endif
		break;
	default:
		assert(0);
	}
}

/* pipe_context::get_query_result - dispatch through the query's ops vtable. */
static boolean r600_get_query_result(struct pipe_context *ctx,
				     struct pipe_query *query, boolean wait,
				     union pipe_query_result *result)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	return rquery->ops->get_result(rctx, rquery, wait, result);
}

/* pipe_context::get_query_result_resource - write the query result into a
 * GPU buffer instead of reading it back on the CPU. */
static void r600_get_query_result_resource(struct pipe_context *ctx,
                                           struct pipe_query *query,
                                           boolean wait,
                                           enum pipe_query_value_type result_type,
                                           int index,
                                           struct pipe_resource *resource,
                                           unsigned offset)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
					 resource, offset);
}

/* Zero-initialize *result according to the query type. */
static void r600_query_hw_clear_result(struct r600_query_hw *query,
				       union pipe_query_result *result)
{
	util_query_clear_result(result, query->b.type);
}

/* CPU result path: map every chained result buffer and accumulate all of its
 * result slices into *result. Returns false when a non-blocking map would
 * stall (wait == false) or mapping fails. */
bool r600_query_hw_get_result(struct r600_common_context *rctx,
			      struct r600_query *rquery,
			      bool wait, union pipe_query_result *result)
{
	struct r600_common_screen *rscreen = rctx->screen;
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
	struct r600_query_buffer *qbuf;

	query->ops->clear_result(query, result);

	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
		unsigned usage = PIPE_TRANSFER_READ |
				 (wait ?
0 : PIPE_TRANSFER_DONTBLOCK);
		unsigned results_base = 0;
		void *map;

		/* Already-flushed queries can map directly; otherwise the
		 * map must be synchronized with the rings first. */
		if (rquery->b.flushed)
			map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
		else
			map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);

		if (!map)
			return false;

		while (results_base != qbuf->results_end) {
			query->ops->add_result(rscreen, query, map + results_base,
					       result);
			results_base += query->result_size;
		}
	}

	/* Convert the time to expected units. */
	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
	    rquery->type == PIPE_QUERY_TIMESTAMP) {
		result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
	}
	return true;
}

/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *     1: read previously accumulated values
 *     2: write accumulated values for chaining
 *     4: write result available
 *     8: convert result to boolean (0/1)
 *    16: only read one dword and use that as result
 *    32: apply timestamp conversion
 *    64: store full 64 bits result
 *   128: store signed 32 bits result
 *   256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * BUFFER[0] = query result buffer
 * BUFFER[1] = previous summary buffer
 * BUFFER[2] = next summary buffer or user-supplied buffer
 */
static void r600_create_query_result_shader(struct r600_common_context *rctx)
{
	/* TEMP[0].xy = accumulated result so far
	 * TEMP[0].z = result not available
	 *
	 * TEMP[1].x = current result index
	 * TEMP[1].y = current pair index
	 */
	static const char text_tmpl[] =
		"COMP\n"
		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
		"DCL BUFFER[0]\n"
		"DCL BUFFER[1]\n"
		"DCL BUFFER[2]\n"
		"DCL CONST[0][0..1]\n"
		"DCL TEMP[0..5]\n"
		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
		"IMM[1] UINT32 {1, 2, 4, 8}\n"
		"IMM[2] UINT32 {16, 32, 64, 128}\n"
		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
		"IMM[4] UINT32 {256, 0, 0, 0}\n"

		"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
		"UIF TEMP[5]\n"
			/* Check result availability. */
			"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
			"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
			"MOV TEMP[1], TEMP[0].zzzz\n"
			"NOT TEMP[0].z, TEMP[0].zzzz\n"

			/* Load result if available.
			 */
			"UIF TEMP[1]\n"
				"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
			"ENDIF\n"
		"ELSE\n"
			/* Load previously accumulated result if requested. */
			"MOV TEMP[0], IMM[0].xxxx\n"
			"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
			"UIF TEMP[4]\n"
				"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
			"ENDIF\n"

			"MOV TEMP[1].x, IMM[0].xxxx\n"
			"BGNLOOP\n"
				/* Break if accumulated result so far is not available. */
				"UIF TEMP[0].zzzz\n"
					"BRK\n"
				"ENDIF\n"

				/* Break if result_index >= result_count. */
				"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
				"UIF TEMP[5]\n"
					"BRK\n"
				"ENDIF\n"

				/* Load fence and check result availability */
				"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
				"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
				"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
				"NOT TEMP[0].z, TEMP[0].zzzz\n"
				"UIF TEMP[0].zzzz\n"
					"BRK\n"
				"ENDIF\n"

				"MOV TEMP[1].y, IMM[0].xxxx\n"
				"BGNLOOP\n"
					/* Load start and end.
					 */
					"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
					"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
					"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

					"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
					"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

					"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"

					"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
					"UIF TEMP[5].zzzz\n"
						/* Load second start/end half-pair and
						 * take the difference
						 */
						"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
						"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
						"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

						"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
						"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
					"ENDIF\n"

					"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"

					/* Increment pair index */
					"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
					"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
					"UIF TEMP[5]\n"
						"BRK\n"
					"ENDIF\n"
				"ENDLOOP\n"

				/* Increment result index */
				"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
			"ENDLOOP\n"
		"ENDIF\n"

		"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
		"UIF TEMP[4]\n"
			/* Store accumulated data for chaining. */
			"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
		"ELSE\n"
			"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
			"UIF TEMP[4]\n"
				/* Store result availability. */
				"NOT TEMP[0].z, TEMP[0]\n"
				"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

				"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
				"UIF TEMP[4]\n"
					"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
				"ENDIF\n"
			"ELSE\n"
				/* Store result if it is available.
				 */
				"NOT TEMP[4], TEMP[0].zzzz\n"
				"UIF TEMP[4]\n"
					/* Apply timestamp conversion */
					"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
					"UIF TEMP[4]\n"
						"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
						"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
					"ENDIF\n"

					/* Convert to boolean */
					"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
					"UIF TEMP[4]\n"
						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
						"MOV TEMP[0].y, IMM[0].xxxx\n"
					"ENDIF\n"

					"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
					"UIF TEMP[4]\n"
						"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
					"ELSE\n"
						/* Clamping */
						"UIF TEMP[0].yyyy\n"
							"MOV TEMP[0].x, IMM[0].wwww\n"
						"ENDIF\n"

						"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
						"UIF TEMP[4]\n"
							"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
						"ENDIF\n"

						"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
					"ENDIF\n"
				"ENDIF\n"
			"ENDIF\n"
		"ENDIF\n"

		"END\n";

	char text[sizeof(text_tmpl) + 32];
	struct tgsi_token tokens[1024];
	struct pipe_compute_state state = {};

	/* Hard code the frequency into the shader so that the backend can
	 * use the full range of optimizations for divide-by-constant.
1560 */ 1561 snprintf(text, sizeof(text), text_tmpl, 1562 rctx->screen->info.clock_crystal_freq); 1563 1564 if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { 1565 assert(false); 1566 return; 1567 } 1568 1569 state.ir_type = PIPE_SHADER_IR_TGSI; 1570 state.prog = tokens; 1571 1572 rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state); 1573 } 1574 1575 static void r600_restore_qbo_state(struct r600_common_context *rctx, 1576 struct r600_qbo_state *st) 1577 { 1578 rctx->b.bind_compute_state(&rctx->b, st->saved_compute); 1579 1580 rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); 1581 pipe_resource_reference(&st->saved_const0.buffer, NULL); 1582 1583 rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); 1584 for (unsigned i = 0; i < 3; ++i) 1585 pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); 1586 } 1587 1588 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, 1589 struct r600_query *rquery, 1590 bool wait, 1591 enum pipe_query_value_type result_type, 1592 int index, 1593 struct pipe_resource *resource, 1594 unsigned offset) 1595 { 1596 struct r600_query_hw *query = (struct r600_query_hw *)rquery; 1597 struct r600_query_buffer *qbuf; 1598 struct r600_query_buffer *qbuf_prev; 1599 struct pipe_resource *tmp_buffer = NULL; 1600 unsigned tmp_buffer_offset = 0; 1601 struct r600_qbo_state saved_state = {}; 1602 struct pipe_grid_info grid = {}; 1603 struct pipe_constant_buffer constant_buffer = {}; 1604 struct pipe_shader_buffer ssbo[3]; 1605 struct r600_hw_query_params params; 1606 struct { 1607 uint32_t end_offset; 1608 uint32_t result_stride; 1609 uint32_t result_count; 1610 uint32_t config; 1611 uint32_t fence_offset; 1612 uint32_t pair_stride; 1613 uint32_t pair_count; 1614 } consts; 1615 1616 if (!rctx->query_result_shader) { 1617 r600_create_query_result_shader(rctx); 1618 if (!rctx->query_result_shader) 1619 return; 1620 } 1621 1622 if 
(query->buffer.previous) { 1623 u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16, 1624 &tmp_buffer_offset, &tmp_buffer); 1625 if (!tmp_buffer) 1626 return; 1627 } 1628 1629 rctx->save_qbo_state(&rctx->b, &saved_state); 1630 1631 r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, ¶ms); 1632 consts.end_offset = params.end_offset - params.start_offset; 1633 consts.fence_offset = params.fence_offset - params.start_offset; 1634 consts.result_stride = query->result_size; 1635 consts.pair_stride = params.pair_stride; 1636 consts.pair_count = params.pair_count; 1637 1638 constant_buffer.buffer_size = sizeof(consts); 1639 constant_buffer.user_buffer = &consts; 1640 1641 ssbo[1].buffer = tmp_buffer; 1642 ssbo[1].buffer_offset = tmp_buffer_offset; 1643 ssbo[1].buffer_size = 16; 1644 1645 ssbo[2] = ssbo[1]; 1646 1647 rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader); 1648 1649 grid.block[0] = 1; 1650 grid.block[1] = 1; 1651 grid.block[2] = 1; 1652 grid.grid[0] = 1; 1653 grid.grid[1] = 1; 1654 grid.grid[2] = 1; 1655 1656 consts.config = 0; 1657 if (index < 0) 1658 consts.config |= 4; 1659 if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) 1660 consts.config |= 8; 1661 else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || 1662 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) 1663 consts.config |= 8 | 256; 1664 else if (query->b.type == PIPE_QUERY_TIMESTAMP || 1665 query->b.type == PIPE_QUERY_TIME_ELAPSED) 1666 consts.config |= 32; 1667 1668 switch (result_type) { 1669 case PIPE_QUERY_TYPE_U64: 1670 case PIPE_QUERY_TYPE_I64: 1671 consts.config |= 64; 1672 break; 1673 case PIPE_QUERY_TYPE_I32: 1674 consts.config |= 128; 1675 break; 1676 case PIPE_QUERY_TYPE_U32: 1677 break; 1678 } 1679 1680 rctx->flags |= rctx->screen->barrier_flags.cp_to_L2; 1681 1682 for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { 1683 if (query->b.type != PIPE_QUERY_TIMESTAMP) { 1684 qbuf_prev = qbuf->previous; 1685 consts.result_count = qbuf->results_end 
					      / query->result_size;
			/* Bits 1/2 select chaining reads/writes per buffer. */
			consts.config &= ~3;
			if (qbuf != &query->buffer)
				consts.config |= 1; /* read previous accumulation */
			if (qbuf->previous)
				consts.config |= 2; /* write accumulation for chaining */
		} else {
			/* Only read the last timestamp. */
			qbuf_prev = NULL;
			consts.result_count = 0;
			consts.config |= 16;
			params.start_offset += qbuf->results_end - query->result_size;
		}

		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);

		ssbo[0].buffer = &qbuf->buf->b.b;
		ssbo[0].buffer_offset = params.start_offset;
		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;

		if (!qbuf->previous) {
			/* The oldest buffer's grid writes to the user buffer. */
			ssbo[2].buffer = resource;
			ssbo[2].buffer_offset = offset;
			ssbo[2].buffer_size = 8;

		}

		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);

		if (wait && qbuf == &query->buffer) {
			uint64_t va;

			/* Wait for result availability. Wait only for readiness
			 * of the last entry, since the fence writes should be
			 * serialized in the CP.
			 */
			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
			va += params.fence_offset;

			r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
		}

		rctx->b.launch_grid(&rctx->b, &grid);
		rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
	}

	r600_restore_qbo_state(rctx, &saved_state);
	pipe_resource_reference(&tmp_buffer, NULL);
}

/* pipe_context::render_condition - bind/unbind the conditional-rendering
 * query and pre-size the predication atom. */
static void r600_render_condition(struct pipe_context *ctx,
				  struct pipe_query *query,
				  boolean condition,
				  enum pipe_render_cond_flag mode)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query_hw *rquery = (struct r600_query_hw *)query;
	struct r600_query_buffer *qbuf;
	struct r600_atom *atom = &rctx->render_cond_atom;

	/* Compute the size of SET_PREDICATION packets.
	 */
	atom->num_dw = 0;
	if (query) {
		/* 5 dwords per SET_PREDICATION packet, one per result slice. */
		for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
			atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;

		/* The ANY predicate emits one packet per stream. */
		if (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
			atom->num_dw *= R600_MAX_STREAMS;
	}

	rctx->render_cond = query;
	rctx->render_cond_invert = condition;
	rctx->render_cond_mode = mode;

	rctx->set_atom_dirty(rctx, atom, query != NULL);
}

/* Emit stop events for all active queries (called around CS flushes). */
void r600_suspend_queries(struct r600_common_context *ctx)
{
	struct r600_query_hw *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
		r600_query_hw_emit_stop(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

/* Upper bound on the CS dwords needed to resume every query in query_list. */
static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
						    struct list_head *query_list)
{
	struct r600_query_hw *query;
	unsigned num_dw = 0;

	LIST_FOR_EACH_ENTRY(query, query_list, list) {
		/* begin + end */
		num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;

		/* Workaround for the fact that
		 * num_cs_dw_nontimer_queries_suspend is incremented for every
		 * resumed query, which raises the bar in need_cs_space for
		 * queries about to be resumed.
		 */
		num_dw += query->num_cs_dw_end;
	}
	/* primitives generated query */
	num_dw += ctx->streamout.enable_atom.num_dw;
	/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
	num_dw += 13;

	return num_dw;
}

/* Re-emit start events for all active queries (counterpart of
 * r600_suspend_queries, called after a CS flush). */
void r600_resume_queries(struct r600_common_context *ctx)
{
	struct r600_query_hw *query;
	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);

	assert(ctx->num_cs_dw_queries_suspend == 0);

	/* Check CS space here. Resuming must not be interrupted by flushes.
	 */
	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
		r600_query_hw_emit_start(ctx, query);
	}
}

/* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */
void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
{
	struct r600_common_context *ctx =
		(struct r600_common_context*)rscreen->aux_context;
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	struct r600_resource *buffer;
	uint32_t *results;
	unsigned i, mask = 0;
	unsigned max_rbs;

	if (ctx->family == CHIP_JUNIPER) {
		/*
		 * Fix for predication lockups - the chip can only ever have
		 * 4 RBs, however it looks like the predication logic assumes
		 * there's 8, trying to read results from query buffers never
		 * written to. By increasing this number we'll write the
		 * status bit for these as per the normal disabled rb logic.
		 */
		ctx->screen->info.num_render_backends = 8;
	}
	max_rbs = ctx->screen->info.num_render_backends;

	assert(rscreen->chip_class <= CAYMAN);

	/*
	 * if backend_map query is supported by the kernel.
	 * Note the kernel drm driver for a long time never filled in the
	 * associated data on eg/cm, only on r600/r700, hence ignore the valid
	 * bit there if the map is zero.
	 * (Albeit some chips with just one active rb can have a valid 0 map.)
	 */
	if (rscreen->info.r600_gb_backend_map_valid &&
	    (ctx->chip_class < EVERGREEN || rscreen->info.r600_gb_backend_map != 0)) {
		unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
		unsigned backend_map = rscreen->info.r600_gb_backend_map;
		unsigned item_width, item_mask;

		/* The per-pipe backend index field is 4 bits on EG+ and
		 * 2 bits before. */
		if (ctx->chip_class >= EVERGREEN) {
			item_width = 4;
			item_mask = 0x7;
		} else {
			item_width = 2;
			item_mask = 0x3;
		}

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			rscreen->info.enabled_rb_mask = mask;
			return;
		}
	}

	/* otherwise backup path for older kernels */

	/* create buffer for event data */
	buffer = (struct r600_resource*)
		pipe_buffer_create(ctx->b.screen, 0,
				   PIPE_USAGE_STAGING, max_rbs * 16);
	if (!buffer)
		return;

	/* initialize buffer with zeroes */
	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
	if (results) {
		memset(results, 0, max_rbs * 4 * 4);

		/* emit EVENT_WRITE for ZPASS_DONE */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, buffer->gpu_address);
		radeon_emit(cs, buffer->gpu_address >> 32);

		r600_emit_reloc(ctx, &ctx->gfx, buffer,
				RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);

		/* analyze results */
		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
		if (results) {
			for(i = 0; i < max_rbs; i++) {
				/* at least highest bit will be set if backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
		}
	}

	r600_resource_reference(&buffer, NULL);

	if (mask) {
		if (rscreen->debug_flags & DBG_INFO &&
		    mask != rscreen->info.enabled_rb_mask) {
			printf("enabled_rb_mask (fixed) = 0x%x\n", mask);
1909 } 1910 rscreen->info.enabled_rb_mask = mask; 1911 } 1912 } 1913 1914 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \ 1915 { \ 1916 .name = name_, \ 1917 .query_type = R600_QUERY_##query_type_, \ 1918 .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ 1919 .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \ 1920 .group_id = group_id_ \ 1921 } 1922 1923 #define X(name_, query_type_, type_, result_type_) \ 1924 XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) 1925 1926 #define XG(group_, name_, query_type_, type_, result_type_) \ 1927 XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_) 1928 1929 static struct pipe_driver_query_info r600_driver_query_list[] = { 1930 X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), 1931 X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), 1932 X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE), 1933 X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), 1934 X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), 1935 X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), 1936 X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE), 1937 X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), 1938 X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), 1939 X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), 1940 X("dma-calls", DMA_CALLS, UINT64, AVERAGE), 1941 X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), 1942 X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), 1943 X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), 1944 X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), 1945 X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE), 1946 X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE), 1947 X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE), 1948 X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE), 1949 X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE), 1950 
X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE), 1951 X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), 1952 X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE), 1953 X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), 1954 X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), 1955 X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), 1956 X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), 1957 X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), 1958 X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), 1959 X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), 1960 X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), 1961 X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), 1962 X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), 1963 X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), 1964 X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE), 1965 X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), 1966 X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), 1967 X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), 1968 1969 /* GPIN queries are for the benefit of old versions of GPUPerfStudio, 1970 * which use it as a fallback path to detect the GPU type. 1971 * 1972 * Note: The names of these queries are significant for GPUPerfStudio 1973 * (and possibly their order as well). */ 1974 XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), 1975 XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), 1976 XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), 1977 XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), 1978 XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), 1979 1980 X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), 1981 X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), 1982 X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), 1983 1984 /* The following queries must be at the end of the list because their 1985 * availability is adjusted dynamically based on the DRM version. 
*/ 1986 X("GPU-load", GPU_LOAD, UINT64, AVERAGE), 1987 X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), 1988 X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), 1989 X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), 1990 X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), 1991 X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), 1992 X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), 1993 X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), 1994 X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), 1995 X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), 1996 X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), 1997 X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), 1998 X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), 1999 X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), 2000 X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), 2001 X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), 2002 X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), 2003 X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), 2004 X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), 2005 X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), 2006 X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), 2007 }; 2008 2009 #undef X 2010 #undef XG 2011 #undef XFULL 2012 2013 static unsigned r600_get_num_queries(struct r600_common_screen *rscreen) 2014 { 2015 if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) 2016 return ARRAY_SIZE(r600_driver_query_list); 2017 else 2018 return ARRAY_SIZE(r600_driver_query_list) - 25; 2019 } 2020 2021 static int r600_get_driver_query_info(struct pipe_screen *screen, 2022 unsigned index, 2023 struct pipe_driver_query_info *info) 2024 { 2025 struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; 2026 unsigned num_queries = r600_get_num_queries(rscreen); 2027 2028 if (!info) { 2029 unsigned num_perfcounters = 2030 r600_get_perfcounter_info(rscreen, 0, NULL); 2031 2032 return num_queries + num_perfcounters; 2033 } 2034 2035 if (index >= num_queries) 2036 
		return r600_get_perfcounter_info(rscreen, index - num_queries, info);

	*info = r600_driver_query_list[index];

	/* Fill in an upper bound for queries that have a natural maximum,
	 * so consumers (e.g. HUD graphs) can scale them sensibly. */
	switch (info->query_type) {
	case R600_QUERY_REQUESTED_VRAM:
	case R600_QUERY_VRAM_USAGE:
	case R600_QUERY_MAPPED_VRAM:
		info->max_value.u64 = rscreen->info.vram_size;
		break;
	case R600_QUERY_REQUESTED_GTT:
	case R600_QUERY_GTT_USAGE:
	case R600_QUERY_MAPPED_GTT:
		info->max_value.u64 = rscreen->info.gart_size;
		break;
	case R600_QUERY_GPU_TEMPERATURE:
		/* presumably degrees Celsius — TODO confirm units */
		info->max_value.u64 = 125;
		break;
	case R600_QUERY_VRAM_VIS_USAGE:
		info->max_value.u64 = rscreen->info.vram_vis_size;
		break;
	}

	/* Driver query groups are numbered after the HW perfcounter groups. */
	if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
		info->group_id += rscreen->perfcounters->num_groups;

	return 1;
}

/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
 * performance counter groups, so be careful when changing this and related
 * functions.
 */
/* pipe_screen::get_driver_query_group_info hook.  Group indices
 * [0, num_pc_groups) are HW perfcounter groups; the following
 * R600_NUM_SW_QUERY_GROUPS indices are software groups (only the GPIN
 * group here).  When info is NULL, returns the total group count. */
static int r600_get_driver_query_group_info(struct pipe_screen *screen,
					    unsigned index,
					    struct pipe_driver_query_group_info *info)
{
	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
	unsigned num_pc_groups = 0;

	if (rscreen->perfcounters)
		num_pc_groups = rscreen->perfcounters->num_groups;

	if (!info)
		return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;

	/* Hardware performance counter groups come first. */
	if (index < num_pc_groups)
		return r600_get_perfcounter_group_info(rscreen, index, info);

	index -= num_pc_groups;
	if (index >= R600_NUM_SW_QUERY_GROUPS)
		return 0;

	/* The only software group: the five GPIN queries. */
	info->name = "GPIN";
	info->max_active_queries = 5;
	info->num_queries = 5;
	return 1;
}

/* Install the per-context query entry points and initialize the active
 * query list. */
void r600_query_init(struct r600_common_context *rctx)
{
	rctx->b.create_query = r600_create_query;
	rctx->b.create_batch_query = r600_create_batch_query;
	rctx->b.destroy_query = r600_destroy_query;
	rctx->b.begin_query = r600_begin_query;
	rctx->b.end_query = r600_end_query;
	rctx->b.get_query_result = r600_get_query_result;
	rctx->b.get_query_result_resource = r600_get_query_result_resource;
	rctx->render_cond_atom.emit = r600_emit_query_predication;

	/* Only expose render_condition when the GPU reports render backends.
	 * NOTE(review): presumably because predication depends on occlusion
	 * (ZPASS_DONE) results produced by the RBs — confirm. */
	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
		rctx->b.render_condition = r600_render_condition;

	LIST_INITHEAD(&rctx->active_queries);
}

/* Install the per-screen driver query entry points. */
void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
{
	rscreen->b.get_driver_query_info = r600_get_driver_query_info;
	rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
}