      1 /*
       2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
       3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * on the rights to use, copy, modify, merge, publish, distribute, sub
      9  * license, and/or sell copies of the Software, and to permit persons to whom
     10  * the Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     23  */
     24 
     25 #include "r600_query.h"
     26 #include "r600_pipe.h"
     27 #include "r600_cs.h"
     28 #include "util/u_memory.h"
     29 #include "util/u_upload_mgr.h"
     30 #include "util/os_time.h"
     31 #include "tgsi/tgsi_text.h"
     32 
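/* Maximum number of streamout vector streams; the *_ANY_PREDICATE queries
 * sample all of them. */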
     33 #define R600_MAX_STREAMS 4
     34 
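/* Describes where the begin/end values and the readiness fence of one query
 * result slot live, for use by the result-conversion compute shader.  A slot
 * may contain several begin/end pairs (one per render backend or per
 * streamout stream); pair_count and pair_stride describe that layout. */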
     35 struct r600_hw_query_params {
     36 	unsigned start_offset;
     37 	unsigned end_offset;
     38 	unsigned fence_offset;
     39 	unsigned pair_stride;
     40 	unsigned pair_count;
     41 };
     42 
     43 /* Queries without buffer handling or suspend/resume. */
     44 struct r600_query_sw {
     45 	struct r600_query b;
     46 
     47 	uint64_t begin_result;
     48 	uint64_t end_result;
     49 
     50 	uint64_t begin_time;
     51 	uint64_t end_time;
     52 
     53 	/* Fence for GPU_FINISHED. */
     54 	struct pipe_fence_handle *fence;
     55 };
     56 
     57 static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
     58 				  struct r600_query *rquery)
     59 {
     60 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
     61 
     62 	rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
     63 	FREE(query);
     64 }
     65 
     66 static enum radeon_value_id winsys_id_from_type(unsigned type)
     67 {
     68 	switch (type) {
     69 	case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
     70 	case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
     71 	case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
     72 	case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
     73 	case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
     74 	case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
     75 	case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
     76 	case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
     77 	case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
     78 	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
     79 	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
     80 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
     81 	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
     82 	case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
     83 	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
     84 	case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
     85 	case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
     86 	case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
     87 	case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
     88 	default: unreachable("query type does not correspond to winsys id");
     89 	}
     90 }
     91 
     92 static bool r600_query_sw_begin(struct r600_common_context *rctx,
     93 				struct r600_query *rquery)
     94 {
     95 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
     96 	enum radeon_value_id ws_id;
     97 
     98 	switch(query->b.type) {
     99 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
    100 	case PIPE_QUERY_GPU_FINISHED:
    101 		break;
    102 	case R600_QUERY_DRAW_CALLS:
    103 		query->begin_result = rctx->num_draw_calls;
    104 		break;
    105 	case R600_QUERY_DECOMPRESS_CALLS:
    106 		query->begin_result = rctx->num_decompress_calls;
    107 		break;
    108 	case R600_QUERY_MRT_DRAW_CALLS:
    109 		query->begin_result = rctx->num_mrt_draw_calls;
    110 		break;
    111 	case R600_QUERY_PRIM_RESTART_CALLS:
    112 		query->begin_result = rctx->num_prim_restart_calls;
    113 		break;
    114 	case R600_QUERY_SPILL_DRAW_CALLS:
    115 		query->begin_result = rctx->num_spill_draw_calls;
    116 		break;
    117 	case R600_QUERY_COMPUTE_CALLS:
    118 		query->begin_result = rctx->num_compute_calls;
    119 		break;
    120 	case R600_QUERY_SPILL_COMPUTE_CALLS:
    121 		query->begin_result = rctx->num_spill_compute_calls;
    122 		break;
    123 	case R600_QUERY_DMA_CALLS:
    124 		query->begin_result = rctx->num_dma_calls;
    125 		break;
    126 	case R600_QUERY_CP_DMA_CALLS:
    127 		query->begin_result = rctx->num_cp_dma_calls;
    128 		break;
    129 	case R600_QUERY_NUM_VS_FLUSHES:
    130 		query->begin_result = rctx->num_vs_flushes;
    131 		break;
    132 	case R600_QUERY_NUM_PS_FLUSHES:
    133 		query->begin_result = rctx->num_ps_flushes;
    134 		break;
    135 	case R600_QUERY_NUM_CS_FLUSHES:
    136 		query->begin_result = rctx->num_cs_flushes;
    137 		break;
    138 	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
    139 		query->begin_result = rctx->num_cb_cache_flushes;
    140 		break;
    141 	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
    142 		query->begin_result = rctx->num_db_cache_flushes;
    143 		break;
    144 	case R600_QUERY_NUM_RESIDENT_HANDLES:
    145 		query->begin_result = rctx->num_resident_handles;
    146 		break;
    147 	case R600_QUERY_TC_OFFLOADED_SLOTS:
    148 		query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
    149 		break;
    150 	case R600_QUERY_TC_DIRECT_SLOTS:
    151 		query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
    152 		break;
    153 	case R600_QUERY_TC_NUM_SYNCS:
    154 		query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
    155 		break;
    156 	case R600_QUERY_REQUESTED_VRAM:
    157 	case R600_QUERY_REQUESTED_GTT:
    158 	case R600_QUERY_MAPPED_VRAM:
    159 	case R600_QUERY_MAPPED_GTT:
    160 	case R600_QUERY_VRAM_USAGE:
    161 	case R600_QUERY_VRAM_VIS_USAGE:
    162 	case R600_QUERY_GTT_USAGE:
    163 	case R600_QUERY_GPU_TEMPERATURE:
    164 	case R600_QUERY_CURRENT_GPU_SCLK:
    165 	case R600_QUERY_CURRENT_GPU_MCLK:
    166 	case R600_QUERY_NUM_MAPPED_BUFFERS:
    167 		query->begin_result = 0;
    168 		break;
    169 	case R600_QUERY_BUFFER_WAIT_TIME:
    170 	case R600_QUERY_NUM_GFX_IBS:
    171 	case R600_QUERY_NUM_SDMA_IBS:
    172 	case R600_QUERY_NUM_BYTES_MOVED:
    173 	case R600_QUERY_NUM_EVICTIONS:
    174 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
    175 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
    176 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
    177 		break;
    178 	}
    179 	case R600_QUERY_GFX_BO_LIST_SIZE:
    180 		ws_id = winsys_id_from_type(query->b.type);
    181 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
    182 		query->begin_time = rctx->ws->query_value(rctx->ws,
    183 							  RADEON_NUM_GFX_IBS);
    184 		break;
    185 	case R600_QUERY_CS_THREAD_BUSY:
    186 		ws_id = winsys_id_from_type(query->b.type);
    187 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
    188 		query->begin_time = os_time_get_nano();
    189 		break;
    190 	case R600_QUERY_GALLIUM_THREAD_BUSY:
    191 		query->begin_result =
    192 			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
    193 		query->begin_time = os_time_get_nano();
    194 		break;
    195 	case R600_QUERY_GPU_LOAD:
    196 	case R600_QUERY_GPU_SHADERS_BUSY:
    197 	case R600_QUERY_GPU_TA_BUSY:
    198 	case R600_QUERY_GPU_GDS_BUSY:
    199 	case R600_QUERY_GPU_VGT_BUSY:
    200 	case R600_QUERY_GPU_IA_BUSY:
    201 	case R600_QUERY_GPU_SX_BUSY:
    202 	case R600_QUERY_GPU_WD_BUSY:
    203 	case R600_QUERY_GPU_BCI_BUSY:
    204 	case R600_QUERY_GPU_SC_BUSY:
    205 	case R600_QUERY_GPU_PA_BUSY:
    206 	case R600_QUERY_GPU_DB_BUSY:
    207 	case R600_QUERY_GPU_CP_BUSY:
    208 	case R600_QUERY_GPU_CB_BUSY:
    209 	case R600_QUERY_GPU_SDMA_BUSY:
    210 	case R600_QUERY_GPU_PFP_BUSY:
    211 	case R600_QUERY_GPU_MEQ_BUSY:
    212 	case R600_QUERY_GPU_ME_BUSY:
    213 	case R600_QUERY_GPU_SURF_SYNC_BUSY:
    214 	case R600_QUERY_GPU_CP_DMA_BUSY:
    215 	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
    216 		query->begin_result = r600_begin_counter(rctx->screen,
    217 							 query->b.type);
    218 		break;
    219 	case R600_QUERY_NUM_COMPILATIONS:
    220 		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
    221 		break;
    222 	case R600_QUERY_NUM_SHADERS_CREATED:
    223 		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
    224 		break;
    225 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
    226 		query->begin_result =
    227 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
    228 		break;
    229 	case R600_QUERY_GPIN_ASIC_ID:
    230 	case R600_QUERY_GPIN_NUM_SIMD:
    231 	case R600_QUERY_GPIN_NUM_RB:
    232 	case R600_QUERY_GPIN_NUM_SPI:
    233 	case R600_QUERY_GPIN_NUM_SE:
    234 		break;
    235 	default:
    236 		unreachable("r600_query_sw_begin: bad query type");
    237 	}
    238 
    239 	return true;
    240 }
    241 
    242 static bool r600_query_sw_end(struct r600_common_context *rctx,
    243 			      struct r600_query *rquery)
    244 {
    245 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
    246 	enum radeon_value_id ws_id;
    247 
    248 	switch(query->b.type) {
    249 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
    250 		break;
    251 	case PIPE_QUERY_GPU_FINISHED:
    252 		rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
    253 		break;
    254 	case R600_QUERY_DRAW_CALLS:
    255 		query->end_result = rctx->num_draw_calls;
    256 		break;
    257 	case R600_QUERY_DECOMPRESS_CALLS:
    258 		query->end_result = rctx->num_decompress_calls;
    259 		break;
    260 	case R600_QUERY_MRT_DRAW_CALLS:
    261 		query->end_result = rctx->num_mrt_draw_calls;
    262 		break;
    263 	case R600_QUERY_PRIM_RESTART_CALLS:
    264 		query->end_result = rctx->num_prim_restart_calls;
    265 		break;
    266 	case R600_QUERY_SPILL_DRAW_CALLS:
    267 		query->end_result = rctx->num_spill_draw_calls;
    268 		break;
    269 	case R600_QUERY_COMPUTE_CALLS:
    270 		query->end_result = rctx->num_compute_calls;
    271 		break;
    272 	case R600_QUERY_SPILL_COMPUTE_CALLS:
    273 		query->end_result = rctx->num_spill_compute_calls;
    274 		break;
    275 	case R600_QUERY_DMA_CALLS:
    276 		query->end_result = rctx->num_dma_calls;
    277 		break;
    278 	case R600_QUERY_CP_DMA_CALLS:
    279 		query->end_result = rctx->num_cp_dma_calls;
    280 		break;
    281 	case R600_QUERY_NUM_VS_FLUSHES:
    282 		query->end_result = rctx->num_vs_flushes;
    283 		break;
    284 	case R600_QUERY_NUM_PS_FLUSHES:
    285 		query->end_result = rctx->num_ps_flushes;
    286 		break;
    287 	case R600_QUERY_NUM_CS_FLUSHES:
    288 		query->end_result = rctx->num_cs_flushes;
    289 		break;
    290 	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
    291 		query->end_result = rctx->num_cb_cache_flushes;
    292 		break;
    293 	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
    294 		query->end_result = rctx->num_db_cache_flushes;
    295 		break;
    296 	case R600_QUERY_NUM_RESIDENT_HANDLES:
    297 		query->end_result = rctx->num_resident_handles;
    298 		break;
    299 	case R600_QUERY_TC_OFFLOADED_SLOTS:
    300 		query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
    301 		break;
    302 	case R600_QUERY_TC_DIRECT_SLOTS:
    303 		query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
    304 		break;
    305 	case R600_QUERY_TC_NUM_SYNCS:
    306 		query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
    307 		break;
    308 	case R600_QUERY_REQUESTED_VRAM:
    309 	case R600_QUERY_REQUESTED_GTT:
    310 	case R600_QUERY_MAPPED_VRAM:
    311 	case R600_QUERY_MAPPED_GTT:
    312 	case R600_QUERY_VRAM_USAGE:
    313 	case R600_QUERY_VRAM_VIS_USAGE:
    314 	case R600_QUERY_GTT_USAGE:
    315 	case R600_QUERY_GPU_TEMPERATURE:
    316 	case R600_QUERY_CURRENT_GPU_SCLK:
    317 	case R600_QUERY_CURRENT_GPU_MCLK:
    318 	case R600_QUERY_BUFFER_WAIT_TIME:
    319 	case R600_QUERY_NUM_MAPPED_BUFFERS:
    320 	case R600_QUERY_NUM_GFX_IBS:
    321 	case R600_QUERY_NUM_SDMA_IBS:
    322 	case R600_QUERY_NUM_BYTES_MOVED:
    323 	case R600_QUERY_NUM_EVICTIONS:
    324 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
    325 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
    326 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
    327 		break;
    328 	}
    329 	case R600_QUERY_GFX_BO_LIST_SIZE:
    330 		ws_id = winsys_id_from_type(query->b.type);
    331 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
    332 		query->end_time = rctx->ws->query_value(rctx->ws,
    333 							RADEON_NUM_GFX_IBS);
    334 		break;
    335 	case R600_QUERY_CS_THREAD_BUSY:
    336 		ws_id = winsys_id_from_type(query->b.type);
    337 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
    338 		query->end_time = os_time_get_nano();
    339 		break;
    340 	case R600_QUERY_GALLIUM_THREAD_BUSY:
    341 		query->end_result =
    342 			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
    343 		query->end_time = os_time_get_nano();
    344 		break;
    345 	case R600_QUERY_GPU_LOAD:
    346 	case R600_QUERY_GPU_SHADERS_BUSY:
    347 	case R600_QUERY_GPU_TA_BUSY:
    348 	case R600_QUERY_GPU_GDS_BUSY:
    349 	case R600_QUERY_GPU_VGT_BUSY:
    350 	case R600_QUERY_GPU_IA_BUSY:
    351 	case R600_QUERY_GPU_SX_BUSY:
    352 	case R600_QUERY_GPU_WD_BUSY:
    353 	case R600_QUERY_GPU_BCI_BUSY:
    354 	case R600_QUERY_GPU_SC_BUSY:
    355 	case R600_QUERY_GPU_PA_BUSY:
    356 	case R600_QUERY_GPU_DB_BUSY:
    357 	case R600_QUERY_GPU_CP_BUSY:
    358 	case R600_QUERY_GPU_CB_BUSY:
    359 	case R600_QUERY_GPU_SDMA_BUSY:
    360 	case R600_QUERY_GPU_PFP_BUSY:
    361 	case R600_QUERY_GPU_MEQ_BUSY:
    362 	case R600_QUERY_GPU_ME_BUSY:
    363 	case R600_QUERY_GPU_SURF_SYNC_BUSY:
    364 	case R600_QUERY_GPU_CP_DMA_BUSY:
    365 	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
    366 		query->end_result = r600_end_counter(rctx->screen,
    367 						     query->b.type,
    368 						     query->begin_result);
    369 		query->begin_result = 0;
    370 		break;
    371 	case R600_QUERY_NUM_COMPILATIONS:
    372 		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
    373 		break;
    374 	case R600_QUERY_NUM_SHADERS_CREATED:
    375 		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
    376 		break;
    377 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
    378 		query->end_result =
    379 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
    380 		break;
    381 	case R600_QUERY_GPIN_ASIC_ID:
    382 	case R600_QUERY_GPIN_NUM_SIMD:
    383 	case R600_QUERY_GPIN_NUM_RB:
    384 	case R600_QUERY_GPIN_NUM_SPI:
    385 	case R600_QUERY_GPIN_NUM_SE:
    386 		break;
    387 	default:
    388 		unreachable("r600_query_sw_end: bad query type");
    389 	}
    390 
    391 	return true;
    392 }
    393 
    394 static bool r600_query_sw_get_result(struct r600_common_context *rctx,
    395 				     struct r600_query *rquery,
    396 				     bool wait,
    397 				     union pipe_query_result *result)
    398 {
    399 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
    400 
    401 	switch (query->b.type) {
    402 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
    403 		/* Convert from cycles per millisecond to cycles per second (Hz). */
    404 		result->timestamp_disjoint.frequency =
    405 			(uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
    406 		result->timestamp_disjoint.disjoint = false;
    407 		return true;
    408 	case PIPE_QUERY_GPU_FINISHED: {
    409 		struct pipe_screen *screen = rctx->b.screen;
    410 		struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
    411 
    412 		result->b = screen->fence_finish(screen, ctx, query->fence,
    413 						 wait ? PIPE_TIMEOUT_INFINITE : 0);
    414 		return result->b;
    415 	}
    416 
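	/* For GFX_BO_LIST_SIZE, begin/end_time hold the gfx IB count, so the
	 * division below gives the average BO list size per IB. */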
    417 	case R600_QUERY_GFX_BO_LIST_SIZE:
    418 		result->u64 = (query->end_result - query->begin_result) /
    419 			      (query->end_time - query->begin_time);
    420 		return true;
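	/* begin/end_result are accumulated thread time and begin/end_time are
	 * wall-clock nanoseconds, so this is reported as a busy percentage. */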
    421 	case R600_QUERY_CS_THREAD_BUSY:
    422 	case R600_QUERY_GALLIUM_THREAD_BUSY:
    423 		result->u64 = (query->end_result - query->begin_result) * 100 /
    424 			      (query->end_time - query->begin_time);
    425 		return true;
    426 	case R600_QUERY_GPIN_ASIC_ID:
    427 		result->u32 = 0;
    428 		return true;
    429 	case R600_QUERY_GPIN_NUM_SIMD:
    430 		result->u32 = rctx->screen->info.num_good_compute_units;
    431 		return true;
    432 	case R600_QUERY_GPIN_NUM_RB:
    433 		result->u32 = rctx->screen->info.num_render_backends;
    434 		return true;
    435 	case R600_QUERY_GPIN_NUM_SPI:
    436 		result->u32 = 1; /* all supported chips have one SPI per SE */
    437 		return true;
    438 	case R600_QUERY_GPIN_NUM_SE:
    439 		result->u32 = rctx->screen->info.max_se;
    440 		return true;
    441 	}
    442 
    443 	result->u64 = query->end_result - query->begin_result;
    444 
    445 	switch (query->b.type) {
    446 	case R600_QUERY_BUFFER_WAIT_TIME:
    447 	case R600_QUERY_GPU_TEMPERATURE:
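		/* Wait time is reported in ns and temperature in millidegrees;
		 * convert to microseconds / degrees C. */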
    448 		result->u64 /= 1000;
    449 		break;
    450 	case R600_QUERY_CURRENT_GPU_SCLK:
    451 	case R600_QUERY_CURRENT_GPU_MCLK:
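		/* Clocks are reported in MHz; convert to Hz. */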
    452 		result->u64 *= 1000000;
    453 		break;
    454 	}
    455 
    456 	return true;
    457 }
    458 
    459 
    460 static struct r600_query_ops sw_query_ops = {
    461 	.destroy = r600_query_sw_destroy,
    462 	.begin = r600_query_sw_begin,
    463 	.end = r600_query_sw_end,
    464 	.get_result = r600_query_sw_get_result,
    465 	.get_result_resource = NULL
    466 };
    467 
    468 static struct pipe_query *r600_query_sw_create(unsigned query_type)
    469 {
    470 	struct r600_query_sw *query;
    471 
    472 	query = CALLOC_STRUCT(r600_query_sw);
    473 	if (!query)
    474 		return NULL;
    475 
    476 	query->b.type = query_type;
    477 	query->b.ops = &sw_query_ops;
    478 
    479 	return (struct pipe_query *)query;
    480 }
    481 
    482 void r600_query_hw_destroy(struct r600_common_screen *rscreen,
    483 			   struct r600_query *rquery)
    484 {
    485 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
    486 	struct r600_query_buffer *prev = query->buffer.previous;
    487 
    488 	/* Release all query buffers. */
    489 	while (prev) {
    490 		struct r600_query_buffer *qbuf = prev;
    491 		prev = prev->previous;
    492 		r600_resource_reference(&qbuf->buf, NULL);
    493 		FREE(qbuf);
    494 	}
    495 
    496 	r600_resource_reference(&query->buffer.buf, NULL);
    497 	FREE(rquery);
    498 }
    499 
    500 static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,
    501 						   struct r600_query_hw *query)
    502 {
    503 	unsigned buf_size = MAX2(query->result_size,
    504 				 rscreen->info.min_alloc_size);
    505 
     506 	/* Queries are normally read by the CPU after
     507 	 * being written by the GPU, hence staging is probably a good
     508 	 * usage pattern.
     509 	 */
    510 	struct r600_resource *buf = (struct r600_resource*)
    511 		pipe_buffer_create(&rscreen->b, 0,
    512 				   PIPE_USAGE_STAGING, buf_size);
    513 	if (!buf)
    514 		return NULL;
    515 
    516 	if (!query->ops->prepare_buffer(rscreen, query, buf)) {
    517 		r600_resource_reference(&buf, NULL);
    518 		return NULL;
    519 	}
    520 
    521 	return buf;
    522 }
    523 
    524 static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
    525 					 struct r600_query_hw *query,
    526 					 struct r600_resource *buffer)
    527 {
    528 	/* Callers ensure that the buffer is currently unused by the GPU. */
    529 	uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
    530 						   PIPE_TRANSFER_WRITE |
    531 						   PIPE_TRANSFER_UNSYNCHRONIZED);
    532 	if (!results)
    533 		return false;
    534 
    535 	memset(results, 0, buffer->b.b.width0);
    536 
    537 	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
    538 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
    539 		unsigned max_rbs = rscreen->info.num_render_backends;
    540 		unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
    541 		unsigned num_results;
    542 		unsigned i, j;
    543 
    544 		/* Set top bits for unused backends. */
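		/* Bit 31 of each high dword (i.e. bit 63 of each 64-bit ZPASS
		 * counter) is the "result written" flag tested by
		 * r600_query_read_result; setting it for disabled RBs makes
		 * them count as already complete. */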
    545 		num_results = buffer->b.b.width0 / query->result_size;
    546 		for (j = 0; j < num_results; j++) {
    547 			for (i = 0; i < max_rbs; i++) {
    548 				if (!(enabled_rb_mask & (1<<i))) {
    549 					results[(i * 4)+1] = 0x80000000;
    550 					results[(i * 4)+3] = 0x80000000;
    551 				}
    552 			}
    553 			results += 4 * max_rbs;
    554 		}
    555 	}
    556 
    557 	return true;
    558 }
    559 
    560 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
    561                                               struct r600_query *rquery,
    562                                               bool wait,
    563                                               enum pipe_query_value_type result_type,
    564                                               int index,
    565                                               struct pipe_resource *resource,
    566                                               unsigned offset);
    567 
    568 static struct r600_query_ops query_hw_ops = {
    569 	.destroy = r600_query_hw_destroy,
    570 	.begin = r600_query_hw_begin,
    571 	.end = r600_query_hw_end,
    572 	.get_result = r600_query_hw_get_result,
    573 	.get_result_resource = r600_query_hw_get_result_resource,
    574 };
    575 
    576 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
    577 					struct r600_query_hw *query,
    578 					struct r600_resource *buffer,
    579 					uint64_t va);
    580 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
    581 				       struct r600_query_hw *query,
    582 				       struct r600_resource *buffer,
    583 				       uint64_t va);
    584 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
    585 				     struct r600_query_hw *, void *buffer,
    586 				     union pipe_query_result *result);
    587 static void r600_query_hw_clear_result(struct r600_query_hw *,
    588 				       union pipe_query_result *);
    589 
    590 static struct r600_query_hw_ops query_hw_default_hw_ops = {
    591 	.prepare_buffer = r600_query_hw_prepare_buffer,
    592 	.emit_start = r600_query_hw_do_emit_start,
    593 	.emit_stop = r600_query_hw_do_emit_stop,
    594 	.clear_result = r600_query_hw_clear_result,
    595 	.add_result = r600_query_hw_add_result,
    596 };
    597 
    598 bool r600_query_hw_init(struct r600_common_screen *rscreen,
    599 			struct r600_query_hw *query)
    600 {
    601 	query->buffer.buf = r600_new_query_buffer(rscreen, query);
    602 	if (!query->buffer.buf)
    603 		return false;
    604 
    605 	return true;
    606 }
    607 
    608 static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,
    609 					       unsigned query_type,
    610 					       unsigned index)
    611 {
    612 	struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
    613 	if (!query)
    614 		return NULL;
    615 
    616 	query->b.type = query_type;
    617 	query->b.ops = &query_hw_ops;
    618 	query->ops = &query_hw_default_hw_ops;
    619 
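	/* result_size is the size in bytes of one set of begin/end results in
	 * the query buffer, including the fence dword where one is appended;
	 * num_cs_dw_begin/end reserve command-stream dwords for the packets
	 * emitted at query start/stop. */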
    620 	switch (query_type) {
    621 	case PIPE_QUERY_OCCLUSION_COUNTER:
    622 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    623 		query->result_size = 16 * rscreen->info.num_render_backends;
    624 		query->result_size += 16; /* for the fence + alignment */
    625 		query->num_cs_dw_begin = 6;
    626 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
    627 		break;
    628 	case PIPE_QUERY_TIME_ELAPSED:
    629 		query->result_size = 24;
    630 		query->num_cs_dw_begin = 8;
    631 		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
    632 		break;
    633 	case PIPE_QUERY_TIMESTAMP:
    634 		query->result_size = 16;
    635 		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
    636 		query->flags = R600_QUERY_HW_FLAG_NO_START;
    637 		break;
    638 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    639 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    640 	case PIPE_QUERY_SO_STATISTICS:
    641 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    642 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
    643 		query->result_size = 32;
    644 		query->num_cs_dw_begin = 6;
    645 		query->num_cs_dw_end = 6;
    646 		query->stream = index;
    647 		break;
    648 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
    649 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
    650 		query->result_size = 32 * R600_MAX_STREAMS;
    651 		query->num_cs_dw_begin = 6 * R600_MAX_STREAMS;
    652 		query->num_cs_dw_end = 6 * R600_MAX_STREAMS;
    653 		break;
    654 	case PIPE_QUERY_PIPELINE_STATISTICS:
    655 		/* 11 values on EG, 8 on R600. */
    656 		query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;
    657 		query->result_size += 8; /* for the fence + alignment */
    658 		query->num_cs_dw_begin = 6;
    659 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
    660 		break;
    661 	default:
    662 		assert(0);
    663 		FREE(query);
    664 		return NULL;
    665 	}
    666 
    667 	if (!r600_query_hw_init(rscreen, query)) {
    668 		FREE(query);
    669 		return NULL;
    670 	}
    671 
    672 	return (struct pipe_query *)query;
    673 }
    674 
    675 static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
    676 					      unsigned type, int diff)
    677 {
    678 	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
    679 	    type == PIPE_QUERY_OCCLUSION_PREDICATE) {
    680 		bool old_enable = rctx->num_occlusion_queries != 0;
    681 		bool old_perfect_enable =
    682 			rctx->num_perfect_occlusion_queries != 0;
    683 		bool enable, perfect_enable;
    684 
    685 		rctx->num_occlusion_queries += diff;
    686 		assert(rctx->num_occlusion_queries >= 0);
    687 
    688 		if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
    689 			rctx->num_perfect_occlusion_queries += diff;
    690 			assert(rctx->num_perfect_occlusion_queries >= 0);
    691 		}
    692 
    693 		enable = rctx->num_occlusion_queries != 0;
    694 		perfect_enable = rctx->num_perfect_occlusion_queries != 0;
    695 
    696 		if (enable != old_enable || perfect_enable != old_perfect_enable) {
    697 			struct r600_context *ctx = (struct r600_context*)rctx;
    698 			r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);
    699 		}
    700 	}
    701 }
    702 
    703 static unsigned event_type_for_stream(unsigned stream)
    704 {
    705 	switch (stream) {
    706 	default:
    707 	case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
    708 	case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
    709 	case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
    710 	case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
    711 	}
    712 }
    713 
    714 static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
    715 				  unsigned stream)
    716 {
    717 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    718 	radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
    719 	radeon_emit(cs, va);
    720 	radeon_emit(cs, va >> 32);
    721 }
    722 
    723 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
    724 					struct r600_query_hw *query,
    725 					struct r600_resource *buffer,
    726 					uint64_t va)
    727 {
    728 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
    729 
    730 	switch (query->b.type) {
    731 	case PIPE_QUERY_OCCLUSION_COUNTER:
    732 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    733 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    734 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
    735 		radeon_emit(cs, va);
    736 		radeon_emit(cs, va >> 32);
    737 		break;
    738 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    739 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    740 	case PIPE_QUERY_SO_STATISTICS:
    741 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    742 		emit_sample_streamout(cs, va, query->stream);
    743 		break;
    744 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
    745 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
    746 			emit_sample_streamout(cs, va + 32 * stream, stream);
    747 		break;
    748 	case PIPE_QUERY_TIME_ELAPSED:
    749 		/* Write the timestamp after the last draw is done.
    750 		 * (bottom-of-pipe)
    751 		 */
    752 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
    753 					 0, EOP_DATA_SEL_TIMESTAMP,
    754 					 NULL, va, 0, query->b.type);
    755 		break;
    756 	case PIPE_QUERY_PIPELINE_STATISTICS:
    757 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    758 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
    759 		radeon_emit(cs, va);
    760 		radeon_emit(cs, va >> 32);
    761 		break;
    762 	default:
    763 		assert(0);
    764 	}
    765 	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
    766 			RADEON_PRIO_QUERY);
    767 }
    768 
    769 static void r600_query_hw_emit_start(struct r600_common_context *ctx,
    770 				     struct r600_query_hw *query)
    771 {
    772 	uint64_t va;
    773 
    774 	if (!query->buffer.buf)
    775 		return; // previous buffer allocation failure
    776 
    777 	r600_update_occlusion_query_state(ctx, query->b.type, 1);
    778 	r600_update_prims_generated_query_state(ctx, query->b.type, 1);
    779 
    780 	ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
    781 			       true);
    782 
    783 	/* Get a new query buffer if needed. */
    784 	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
    785 		struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
    786 		*qbuf = query->buffer;
    787 		query->buffer.results_end = 0;
    788 		query->buffer.previous = qbuf;
    789 		query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
    790 		if (!query->buffer.buf)
    791 			return;
    792 	}
    793 
    794 	/* emit begin query */
    795 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
    796 
    797 	query->ops->emit_start(ctx, query, query->buffer.buf, va);
    798 
    799 	ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
    800 }
    801 
    802 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
    803 				       struct r600_query_hw *query,
    804 				       struct r600_resource *buffer,
    805 				       uint64_t va)
    806 {
    807 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
    808 	uint64_t fence_va = 0;
    809 
    810 	switch (query->b.type) {
    811 	case PIPE_QUERY_OCCLUSION_COUNTER:
    812 	case PIPE_QUERY_OCCLUSION_PREDICATE:
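		/* End counters are written 8 bytes after the begin counters in
		 * each 16-byte per-RB slot; the 32-bit fence goes after the
		 * last slot. */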
    813 		va += 8;
    814 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    815 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
    816 		radeon_emit(cs, va);
    817 		radeon_emit(cs, va >> 32);
    818 
    819 		fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
    820 		break;
    821 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    822 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    823 	case PIPE_QUERY_SO_STATISTICS:
    824 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    825 		va += 16;
    826 		emit_sample_streamout(cs, va, query->stream);
    827 		break;
    828 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
    829 		va += 16;
    830 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
    831 			emit_sample_streamout(cs, va + 32 * stream, stream);
    832 		break;
    833 	case PIPE_QUERY_TIME_ELAPSED:
    834 		va += 8;
    835 		/* fall through */
    836 	case PIPE_QUERY_TIMESTAMP:
    837 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
    838 					 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
    839 					 0, query->b.type);
    840 		fence_va = va + 8;
    841 		break;
    842 	case PIPE_QUERY_PIPELINE_STATISTICS: {
    843 		unsigned sample_size = (query->result_size - 8) / 2;
    844 
    845 		va += sample_size;
    846 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    847 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
    848 		radeon_emit(cs, va);
    849 		radeon_emit(cs, va >> 32);
    850 
    851 		fence_va = va + sample_size;
    852 		break;
    853 	}
    854 	default:
    855 		assert(0);
    856 	}
    857 	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
    858 			RADEON_PRIO_QUERY);
    859 
    860 	if (fence_va)
    861 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
    862 					 EOP_DATA_SEL_VALUE_32BIT,
    863 					 query->buffer.buf, fence_va, 0x80000000,
    864 					 query->b.type);
    865 }
    866 
    867 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
    868 				    struct r600_query_hw *query)
    869 {
    870 	uint64_t va;
    871 
    872 	if (!query->buffer.buf)
    873 		return; // previous buffer allocation failure
    874 
     875 	/* Queries that have a begin already reserved CS space for this in emit_start. */
    876 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
    877 		ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
    878 	}
    879 
    880 	/* emit end query */
    881 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
    882 
    883 	query->ops->emit_stop(ctx, query, query->buffer.buf, va);
    884 
    885 	query->buffer.results_end += query->result_size;
    886 
    887 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
    888 		ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
    889 
    890 	r600_update_occlusion_query_state(ctx, query->b.type, -1);
    891 	r600_update_prims_generated_query_state(ctx, query->b.type, -1);
    892 }
    893 
    894 static void emit_set_predicate(struct r600_common_context *ctx,
    895 			       struct r600_resource *buf, uint64_t va,
    896 			       uint32_t op)
    897 {
    898 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
    899 
    900 	radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
    901 	radeon_emit(cs, va);
    902 	radeon_emit(cs, op | ((va >> 32) & 0xFF));
    903 	r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ,
    904 			RADEON_PRIO_QUERY);
    905 }
    906 
    907 static void r600_emit_query_predication(struct r600_common_context *ctx,
    908 					struct r600_atom *atom)
    909 {
    910 	struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
    911 	struct r600_query_buffer *qbuf;
    912 	uint32_t op;
    913 	bool flag_wait, invert;
    914 
    915 	if (!query)
    916 		return;
    917 
    918 	invert = ctx->render_cond_invert;
    919 	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
    920 		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
    921 
    922 	switch (query->b.type) {
    923 	case PIPE_QUERY_OCCLUSION_COUNTER:
    924 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    925 		op = PRED_OP(PREDICATION_OP_ZPASS);
    926 		break;
    927 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    928 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
    929 		op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
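		/* The PRIMCOUNT predication check has the opposite sense to
		 * the GL overflow predicate, hence the extra inversion below. */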
    930 		invert = !invert;
    931 		break;
    932 	default:
    933 		assert(0);
    934 		return;
    935 	}
    936 
    937 	/* if true then invert, see GL_ARB_conditional_render_inverted */
    938 	if (invert)
    939 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
    940 	else
    941 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
    942 
    943 	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
    944 
    945 	/* emit predicate packets for all data blocks */
    946 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
    947 		unsigned results_base = 0;
    948 		uint64_t va_base = qbuf->buf->gpu_address;
    949 
    950 		while (results_base < qbuf->results_end) {
    951 			uint64_t va = va_base + results_base;
    952 
    953 			if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
    954 				for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
    955 					emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
    956 
    957 					/* set CONTINUE bit for all packets except the first */
    958 					op |= PREDICATION_CONTINUE;
    959 				}
    960 			} else {
    961 				emit_set_predicate(ctx, qbuf->buf, va, op);
    962 				op |= PREDICATION_CONTINUE;
    963 			}
    964 
    965 			results_base += query->result_size;
    966 		}
    967 	}
    968 }
    969 
    970 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
    971 {
    972 	struct r600_common_screen *rscreen =
    973 		(struct r600_common_screen *)ctx->screen;
    974 
    975 	if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
    976 	    query_type == PIPE_QUERY_GPU_FINISHED ||
    977 	    query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
    978 		return r600_query_sw_create(query_type);
    979 
    980 	return r600_query_hw_create(rscreen, query_type, index);
    981 }
    982 
    983 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
    984 {
    985 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    986 	struct r600_query *rquery = (struct r600_query *)query;
    987 
    988 	rquery->ops->destroy(rctx->screen, rquery);
    989 }
    990 
    991 static boolean r600_begin_query(struct pipe_context *ctx,
    992                                 struct pipe_query *query)
    993 {
    994 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    995 	struct r600_query *rquery = (struct r600_query *)query;
    996 
    997 	return rquery->ops->begin(rctx, rquery);
    998 }
    999 
   1000 void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
   1001 				 struct r600_query_hw *query)
   1002 {
   1003 	struct r600_query_buffer *prev = query->buffer.previous;
   1004 
   1005 	/* Discard the old query buffers. */
   1006 	while (prev) {
   1007 		struct r600_query_buffer *qbuf = prev;
   1008 		prev = prev->previous;
   1009 		r600_resource_reference(&qbuf->buf, NULL);
   1010 		FREE(qbuf);
   1011 	}
   1012 
   1013 	query->buffer.results_end = 0;
   1014 	query->buffer.previous = NULL;
   1015 
   1016 	/* Obtain a new buffer if the current one can't be mapped without a stall. */
   1017 	if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
   1018 	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
   1019 		r600_resource_reference(&query->buffer.buf, NULL);
   1020 		query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
   1021 	} else {
   1022 		if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
   1023 			r600_resource_reference(&query->buffer.buf, NULL);
   1024 	}
   1025 }
   1026 
   1027 bool r600_query_hw_begin(struct r600_common_context *rctx,
   1028 			 struct r600_query *rquery)
   1029 {
   1030 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
   1031 
   1032 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
   1033 		assert(0);
   1034 		return false;
   1035 	}
   1036 
   1037 	if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
   1038 		r600_query_hw_reset_buffers(rctx, query);
   1039 
   1040 	r600_query_hw_emit_start(rctx, query);
   1041 	if (!query->buffer.buf)
   1042 		return false;
   1043 
   1044 	LIST_ADDTAIL(&query->list, &rctx->active_queries);
   1045 	return true;
   1046 }
   1047 
   1048 static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
   1049 {
   1050 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1051 	struct r600_query *rquery = (struct r600_query *)query;
   1052 
   1053 	return rquery->ops->end(rctx, rquery);
   1054 }
   1055 
   1056 bool r600_query_hw_end(struct r600_common_context *rctx,
   1057 		       struct r600_query *rquery)
   1058 {
   1059 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
   1060 
   1061 	if (query->flags & R600_QUERY_HW_FLAG_NO_START)
   1062 		r600_query_hw_reset_buffers(rctx, query);
   1063 
   1064 	r600_query_hw_emit_stop(rctx, query);
   1065 
   1066 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
   1067 		LIST_DELINIT(&query->list);
   1068 
   1069 	if (!query->buffer.buf)
   1070 		return false;
   1071 
   1072 	return true;
   1073 }
   1074 
   1075 static void r600_get_hw_query_params(struct r600_common_context *rctx,
   1076 				     struct r600_query_hw *rquery, int index,
   1077 				     struct r600_hw_query_params *params)
   1078 {
   1079 	unsigned max_rbs = rctx->screen->info.num_render_backends;
   1080 
   1081 	params->pair_stride = 0;
   1082 	params->pair_count = 1;
   1083 
   1084 	switch (rquery->b.type) {
   1085 	case PIPE_QUERY_OCCLUSION_COUNTER:
   1086 	case PIPE_QUERY_OCCLUSION_PREDICATE:
   1087 		params->start_offset = 0;
   1088 		params->end_offset = 8;
   1089 		params->fence_offset = max_rbs * 16;
   1090 		params->pair_stride = 16;
   1091 		params->pair_count = max_rbs;
   1092 		break;
   1093 	case PIPE_QUERY_TIME_ELAPSED:
   1094 		params->start_offset = 0;
   1095 		params->end_offset = 8;
   1096 		params->fence_offset = 16;
   1097 		break;
   1098 	case PIPE_QUERY_TIMESTAMP:
   1099 		params->start_offset = 0;
   1100 		params->end_offset = 0;
   1101 		params->fence_offset = 8;
   1102 		break;
   1103 	case PIPE_QUERY_PRIMITIVES_EMITTED:
   1104 		params->start_offset = 8;
   1105 		params->end_offset = 24;
   1106 		params->fence_offset = params->end_offset + 4;
   1107 		break;
   1108 	case PIPE_QUERY_PRIMITIVES_GENERATED:
   1109 		params->start_offset = 0;
   1110 		params->end_offset = 16;
   1111 		params->fence_offset = params->end_offset + 4;
   1112 		break;
   1113 	case PIPE_QUERY_SO_STATISTICS:
   1114 		params->start_offset = 8 - index * 8;
   1115 		params->end_offset = 24 - index * 8;
   1116 		params->fence_offset = params->end_offset + 4;
   1117 		break;
   1118 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
   1119 		params->pair_count = R600_MAX_STREAMS;
   1120 		params->pair_stride = 32;
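		/* fall through */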
   1121 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   1122 		params->start_offset = 0;
   1123 		params->end_offset = 16;
   1124 
   1125 		/* We can re-use the high dword of the last 64-bit value as a
   1126 		 * fence: it is initialized as 0, and the high bit is set by
   1127 		 * the write of the streamout stats event.
   1128 		 */
   1129 		params->fence_offset = rquery->result_size - 4;
   1130 		break;
   1131 	case PIPE_QUERY_PIPELINE_STATISTICS:
   1132 	{
   1133 		/* Offsets apply to EG+ */
   1134 		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
   1135 		params->start_offset = offsets[index];
   1136 		params->end_offset = 88 + offsets[index];
   1137 		params->fence_offset = 2 * 88;
   1138 		break;
   1139 	}
   1140 	default:
   1141 		unreachable("r600_get_hw_query_params unsupported");
   1142 	}
   1143 }
   1144 
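/* Read one 64-bit begin/end pair from a mapped result buffer and return the
 * difference.  When test_status_bit is set, bit 63 of both values must be set
 * (the "result written" flag); otherwise the pair contributes 0. */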
   1145 static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
   1146 				       bool test_status_bit)
   1147 {
   1148 	uint32_t *current_result = (uint32_t*)map;
   1149 	uint64_t start, end;
   1150 
   1151 	start = (uint64_t)current_result[start_index] |
   1152 		(uint64_t)current_result[start_index+1] << 32;
   1153 	end = (uint64_t)current_result[end_index] |
   1154 	      (uint64_t)current_result[end_index+1] << 32;
   1155 
   1156 	if (!test_status_bit ||
   1157 	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
   1158 		return end - start;
   1159 	}
   1160 	return 0;
   1161 }
   1162 
   1163 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
   1164 				     struct r600_query_hw *query,
   1165 				     void *buffer,
   1166 				     union pipe_query_result *result)
   1167 {
   1168 	unsigned max_rbs = rscreen->info.num_render_backends;
   1169 
   1170 	switch (query->b.type) {
   1171 	case PIPE_QUERY_OCCLUSION_COUNTER: {
   1172 		for (unsigned i = 0; i < max_rbs; ++i) {
   1173 			unsigned results_base = i * 16;
   1174 			result->u64 +=
   1175 				r600_query_read_result(buffer + results_base, 0, 2, true);
   1176 		}
   1177 		break;
   1178 	}
   1179 	case PIPE_QUERY_OCCLUSION_PREDICATE: {
   1180 		for (unsigned i = 0; i < max_rbs; ++i) {
   1181 			unsigned results_base = i * 16;
   1182 			result->b = result->b ||
   1183 				r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
   1184 		}
   1185 		break;
   1186 	}
   1187 	case PIPE_QUERY_TIME_ELAPSED:
   1188 		result->u64 += r600_query_read_result(buffer, 0, 2, false);
   1189 		break;
   1190 	case PIPE_QUERY_TIMESTAMP:
   1191 		result->u64 = *(uint64_t*)buffer;
   1192 		break;
   1193 	case PIPE_QUERY_PRIMITIVES_EMITTED:
   1194 		/* SAMPLE_STREAMOUTSTATS stores this structure:
   1195 		 * {
   1196 		 *    u64 NumPrimitivesWritten;
   1197 		 *    u64 PrimitiveStorageNeeded;
   1198 		 * }
   1199 		 * We only need NumPrimitivesWritten here. */
   1200 		result->u64 += r600_query_read_result(buffer, 2, 6, true);
   1201 		break;
   1202 	case PIPE_QUERY_PRIMITIVES_GENERATED:
   1203 		/* Here we read PrimitiveStorageNeeded. */
   1204 		result->u64 += r600_query_read_result(buffer, 0, 4, true);
   1205 		break;
   1206 	case PIPE_QUERY_SO_STATISTICS:
   1207 		result->so_statistics.num_primitives_written +=
   1208 			r600_query_read_result(buffer, 2, 6, true);
   1209 		result->so_statistics.primitives_storage_needed +=
   1210 			r600_query_read_result(buffer, 0, 4, true);
   1211 		break;
   1212 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   1213 		result->b = result->b ||
   1214 			r600_query_read_result(buffer, 2, 6, true) !=
   1215 			r600_query_read_result(buffer, 0, 4, true);
   1216 		break;
   1217 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
   1218 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
   1219 			result->b = result->b ||
   1220 				r600_query_read_result(buffer, 2, 6, true) !=
   1221 				r600_query_read_result(buffer, 0, 4, true);
   1222 			buffer = (char *)buffer + 32;
   1223 		}
   1224 		break;
   1225 	case PIPE_QUERY_PIPELINE_STATISTICS:
   1226 		if (rscreen->chip_class >= EVERGREEN) {
   1227 			result->pipeline_statistics.ps_invocations +=
   1228 				r600_query_read_result(buffer, 0, 22, false);
   1229 			result->pipeline_statistics.c_primitives +=
   1230 				r600_query_read_result(buffer, 2, 24, false);
   1231 			result->pipeline_statistics.c_invocations +=
   1232 				r600_query_read_result(buffer, 4, 26, false);
   1233 			result->pipeline_statistics.vs_invocations +=
   1234 				r600_query_read_result(buffer, 6, 28, false);
   1235 			result->pipeline_statistics.gs_invocations +=
   1236 				r600_query_read_result(buffer, 8, 30, false);
   1237 			result->pipeline_statistics.gs_primitives +=
   1238 				r600_query_read_result(buffer, 10, 32, false);
   1239 			result->pipeline_statistics.ia_primitives +=
   1240 				r600_query_read_result(buffer, 12, 34, false);
   1241 			result->pipeline_statistics.ia_vertices +=
   1242 				r600_query_read_result(buffer, 14, 36, false);
   1243 			result->pipeline_statistics.hs_invocations +=
   1244 				r600_query_read_result(buffer, 16, 38, false);
   1245 			result->pipeline_statistics.ds_invocations +=
   1246 				r600_query_read_result(buffer, 18, 40, false);
   1247 			result->pipeline_statistics.cs_invocations +=
   1248 				r600_query_read_result(buffer, 20, 42, false);
   1249 		} else {
   1250 			result->pipeline_statistics.ps_invocations +=
   1251 				r600_query_read_result(buffer, 0, 16, false);
   1252 			result->pipeline_statistics.c_primitives +=
   1253 				r600_query_read_result(buffer, 2, 18, false);
   1254 			result->pipeline_statistics.c_invocations +=
   1255 				r600_query_read_result(buffer, 4, 20, false);
   1256 			result->pipeline_statistics.vs_invocations +=
   1257 				r600_query_read_result(buffer, 6, 22, false);
   1258 			result->pipeline_statistics.gs_invocations +=
   1259 				r600_query_read_result(buffer, 8, 24, false);
   1260 			result->pipeline_statistics.gs_primitives +=
   1261 				r600_query_read_result(buffer, 10, 26, false);
   1262 			result->pipeline_statistics.ia_primitives +=
   1263 				r600_query_read_result(buffer, 12, 28, false);
   1264 			result->pipeline_statistics.ia_vertices +=
   1265 				r600_query_read_result(buffer, 14, 30, false);
   1266 		}
   1267 #if 0 /* for testing */
   1268 		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
   1269 		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
   1270 		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
   1271 		       result->pipeline_statistics.ia_vertices,
   1272 		       result->pipeline_statistics.ia_primitives,
   1273 		       result->pipeline_statistics.vs_invocations,
   1274 		       result->pipeline_statistics.hs_invocations,
   1275 		       result->pipeline_statistics.ds_invocations,
   1276 		       result->pipeline_statistics.gs_invocations,
   1277 		       result->pipeline_statistics.gs_primitives,
   1278 		       result->pipeline_statistics.c_invocations,
   1279 		       result->pipeline_statistics.c_primitives,
   1280 		       result->pipeline_statistics.ps_invocations,
   1281 		       result->pipeline_statistics.cs_invocations);
   1282 #endif
   1283 		break;
   1284 	default:
   1285 		assert(0);
   1286 	}
   1287 }
   1288 
   1289 static boolean r600_get_query_result(struct pipe_context *ctx,
   1290 				     struct pipe_query *query, boolean wait,
   1291 				     union pipe_query_result *result)
   1292 {
   1293 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1294 	struct r600_query *rquery = (struct r600_query *)query;
   1295 
   1296 	return rquery->ops->get_result(rctx, rquery, wait, result);
   1297 }
   1298 
   1299 static void r600_get_query_result_resource(struct pipe_context *ctx,
   1300                                            struct pipe_query *query,
   1301                                            boolean wait,
   1302                                            enum pipe_query_value_type result_type,
   1303                                            int index,
   1304                                            struct pipe_resource *resource,
   1305                                            unsigned offset)
   1306 {
   1307 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1308 	struct r600_query *rquery = (struct r600_query *)query;
   1309 
   1310 	rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
   1311 	                                 resource, offset);
   1312 }
   1313 
   1314 static void r600_query_hw_clear_result(struct r600_query_hw *query,
   1315 				       union pipe_query_result *result)
   1316 {
   1317 	util_query_clear_result(result, query->b.type);
   1318 }
   1319 
   1320 bool r600_query_hw_get_result(struct r600_common_context *rctx,
   1321 			      struct r600_query *rquery,
   1322 			      bool wait, union pipe_query_result *result)
   1323 {
   1324 	struct r600_common_screen *rscreen = rctx->screen;
   1325 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
   1326 	struct r600_query_buffer *qbuf;
   1327 
   1328 	query->ops->clear_result(query, result);
   1329 
   1330 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
   1331 		unsigned usage = PIPE_TRANSFER_READ |
   1332 				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
   1333 		unsigned results_base = 0;
   1334 		void *map;
   1335 
   1336 		if (rquery->b.flushed)
   1337 			map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
   1338 		else
   1339 			map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
   1340 
   1341 		if (!map)
   1342 			return false;
   1343 
   1344 		while (results_base != qbuf->results_end) {
   1345 			query->ops->add_result(rscreen, query, map + results_base,
   1346 					       result);
   1347 			results_base += query->result_size;
   1348 		}
   1349 	}
   1350 
   1351 	/* Convert the time to expected units. */
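	/* clock_crystal_freq is in kHz, so the result ends up in nanoseconds. */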
   1352 	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
   1353 	    rquery->type == PIPE_QUERY_TIMESTAMP) {
   1354 		result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
   1355 	}
   1356 	return true;
   1357 }
   1358 
   1359 /* Create the compute shader that is used to collect the results.
   1360  *
   1361  * One compute grid with a single thread is launched for every query result
   1362  * buffer. The thread (optionally) reads a previous summary buffer, then
   1363  * accumulates data from the query result buffer, and writes the result either
   1364  * to a summary buffer to be consumed by the next grid invocation or to the
   1365  * user-supplied buffer.
   1366  *
   1367  * Data layout:
   1368  *
   1369  * CONST
   1370  *  0.x = end_offset
   1371  *  0.y = result_stride
   1372  *  0.z = result_count
   1373  *  0.w = bit field:
   1374  *          1: read previously accumulated values
   1375  *          2: write accumulated values for chaining
   1376  *          4: write result available
   1377  *          8: convert result to boolean (0/1)
   1378  *         16: only read one dword and use that as result
   1379  *         32: apply timestamp conversion
   1380  *         64: store full 64 bits result
   1381  *        128: store signed 32 bits result
   1382  *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
   1383  *  1.x = fence_offset
   1384  *  1.y = pair_stride
   1385  *  1.z = pair_count
   1386  *
   1387  * BUFFER[0] = query result buffer
   1388  * BUFFER[1] = previous summary buffer
   1389  * BUFFER[2] = next summary buffer or user-supplied buffer
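 *
 * For example (illustrative only), an intermediate grid that chains two query
 * result buffers for a 64-bit counter would set flags 1|2 in 0.w, reading the
 * previous summary from BUFFER[1] and writing a new one to BUFFER[2]; the
 * final grid would drop flag 2 and set flag 64 instead, storing the full
 * 64-bit result into the user-supplied buffer.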
   1390  */
   1391 static void r600_create_query_result_shader(struct r600_common_context *rctx)
   1392 {
   1393 	/* TEMP[0].xy = accumulated result so far
   1394 	 * TEMP[0].z = result not available
   1395 	 *
   1396 	 * TEMP[1].x = current result index
   1397 	 * TEMP[1].y = current pair index
   1398 	 */
   1399 	static const char text_tmpl[] =
   1400 		"COMP\n"
   1401 		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
   1402 		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
   1403 		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
   1404 		"DCL BUFFER[0]\n"
   1405 		"DCL BUFFER[1]\n"
   1406 		"DCL BUFFER[2]\n"
   1407 		"DCL CONST[0][0..1]\n"
   1408 		"DCL TEMP[0..5]\n"
   1409 		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
   1410 		"IMM[1] UINT32 {1, 2, 4, 8}\n"
   1411 		"IMM[2] UINT32 {16, 32, 64, 128}\n"
   1412 		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
   1413 		"IMM[4] UINT32 {256, 0, 0, 0}\n"
   1414 
   1415 		"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
   1416 		"UIF TEMP[5]\n"
   1417 			/* Check result availability. */
   1418 			"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
   1419 			"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
   1420 			"MOV TEMP[1], TEMP[0].zzzz\n"
   1421 			"NOT TEMP[0].z, TEMP[0].zzzz\n"
   1422 
   1423 			/* Load result if available. */
   1424 			"UIF TEMP[1]\n"
   1425 				"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
   1426 			"ENDIF\n"
   1427 		"ELSE\n"
   1428 			/* Load previously accumulated result if requested. */
   1429 			"MOV TEMP[0], IMM[0].xxxx\n"
   1430 			"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
   1431 			"UIF TEMP[4]\n"
   1432 				"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
   1433 			"ENDIF\n"
   1434 
   1435 			"MOV TEMP[1].x, IMM[0].xxxx\n"
   1436 			"BGNLOOP\n"
   1437 				/* Break if accumulated result so far is not available. */
   1438 				"UIF TEMP[0].zzzz\n"
   1439 					"BRK\n"
   1440 				"ENDIF\n"
   1441 
   1442 				/* Break if result_index >= result_count. */
   1443 				"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
   1444 				"UIF TEMP[5]\n"
   1445 					"BRK\n"
   1446 				"ENDIF\n"
   1447 
   1448 				/* Load fence and check result availability */
   1449 				"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
   1450 				"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
   1451 				"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
   1452 				"NOT TEMP[0].z, TEMP[0].zzzz\n"
   1453 				"UIF TEMP[0].zzzz\n"
   1454 					"BRK\n"
   1455 				"ENDIF\n"
   1456 
   1457 				"MOV TEMP[1].y, IMM[0].xxxx\n"
   1458 				"BGNLOOP\n"
   1459 					/* Load start and end. */
   1460 					"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
   1461 					"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
   1462 					"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
   1463 
   1464 					"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
   1465 					"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
   1466 
   1467 					"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
   1468 
   1469 					"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
   1470 					"UIF TEMP[5].zzzz\n"
   1471 						/* Load second start/end half-pair and
   1472 						 * take the difference
   1473 						 */
   1474 						"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
   1475 						"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
   1476 						"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
   1477 
   1478 						"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
   1479 						"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
   1480 					"ENDIF\n"
   1481 
   1482 					"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
   1483 
   1484 					/* Increment pair index */
   1485 					"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
   1486 					"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
   1487 					"UIF TEMP[5]\n"
   1488 						"BRK\n"
   1489 					"ENDIF\n"
   1490 				"ENDLOOP\n"
   1491 
   1492 				/* Increment result index */
   1493 				"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
   1494 			"ENDLOOP\n"
   1495 		"ENDIF\n"
   1496 
   1497 		"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
   1498 		"UIF TEMP[4]\n"
   1499 			/* Store accumulated data for chaining. */
   1500 			"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
   1501 		"ELSE\n"
   1502 			"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
   1503 			"UIF TEMP[4]\n"
   1504 				/* Store result availability. */
   1505 				"NOT TEMP[0].z, TEMP[0]\n"
   1506 				"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
   1507 				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
   1508 
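				/* config bit 64: also clear the high dword when a
				 * 64-bit availability value was requested. */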
   1509 				"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
   1510 				"UIF TEMP[4]\n"
   1511 					"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
   1512 				"ENDIF\n"
   1513 			"ELSE\n"
   1514 				/* Store result if it is available. */
   1515 				"NOT TEMP[4], TEMP[0].zzzz\n"
   1516 				"UIF TEMP[4]\n"
   1517 					/* Apply timestamp conversion */
   1518 					"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
   1519 					"UIF TEMP[4]\n"
   1520 						"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
   1521 						"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
   1522 					"ENDIF\n"
   1523 
   1524 					/* Convert to boolean */
   1525 					"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
   1526 					"UIF TEMP[4]\n"
   1527 						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
   1528 						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
   1529 						"MOV TEMP[0].y, IMM[0].xxxx\n"
   1530 					"ENDIF\n"
   1531 
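					/* config bit 64: store the full 64-bit result. */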
   1532 					"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
   1533 					"UIF TEMP[4]\n"
   1534 						"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
   1535 					"ELSE\n"
   1536 						/* Clamping */
   1537 						"UIF TEMP[0].yyyy\n"
   1538 							"MOV TEMP[0].x, IMM[0].wwww\n"
   1539 						"ENDIF\n"
   1540 
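						/* config bit 128: signed 32-bit result, clamp
						 * to INT32_MAX. */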
   1541 						"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
   1542 						"UIF TEMP[4]\n"
   1543 							"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
   1544 						"ENDIF\n"
   1545 
   1546 						"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
   1547 					"ENDIF\n"
   1548 				"ENDIF\n"
   1549 			"ENDIF\n"
   1550 		"ENDIF\n"
   1551 
   1552 		"END\n";
   1553 
   1554 	char text[sizeof(text_tmpl) + 32];
   1555 	struct tgsi_token tokens[1024];
   1556 	struct pipe_compute_state state = {};
   1557 
   1558 	/* Hard code the frequency into the shader so that the backend can
   1559 	 * use the full range of optimizations for divide-by-constant.
   1560 	 */
   1561 	snprintf(text, sizeof(text), text_tmpl,
   1562 		 rctx->screen->info.clock_crystal_freq);
   1563 
   1564 	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
   1565 		assert(false);
   1566 		return;
   1567 	}
   1568 
   1569 	state.ir_type = PIPE_SHADER_IR_TGSI;
   1570 	state.prog = tokens;
   1571 
   1572 	rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
   1573 }
   1574 
   1575 static void r600_restore_qbo_state(struct r600_common_context *rctx,
   1576 				   struct r600_qbo_state *st)
   1577 {
   1578 	rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
   1579 
   1580 	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
   1581 	pipe_resource_reference(&st->saved_const0.buffer, NULL);
   1582 
   1583 	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
   1584 	for (unsigned i = 0; i < 3; ++i)
   1585 		pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
   1586 }
   1587 
   1588 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
   1589                                               struct r600_query *rquery,
   1590                                               bool wait,
   1591                                               enum pipe_query_value_type result_type,
   1592                                               int index,
   1593                                               struct pipe_resource *resource,
   1594                                               unsigned offset)
   1595 {
   1596 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
   1597 	struct r600_query_buffer *qbuf;
   1598 	struct r600_query_buffer *qbuf_prev;
   1599 	struct pipe_resource *tmp_buffer = NULL;
   1600 	unsigned tmp_buffer_offset = 0;
   1601 	struct r600_qbo_state saved_state = {};
   1602 	struct pipe_grid_info grid = {};
   1603 	struct pipe_constant_buffer constant_buffer = {};
   1604 	struct pipe_shader_buffer ssbo[3];
   1605 	struct r600_hw_query_params params;
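	/* This layout must match CONST[0][0..1] as consumed by the query
	 * result shader (see the data layout comment above
	 * r600_create_query_result_shader). */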
   1606 	struct {
   1607 		uint32_t end_offset;
   1608 		uint32_t result_stride;
   1609 		uint32_t result_count;
   1610 		uint32_t config;
   1611 		uint32_t fence_offset;
   1612 		uint32_t pair_stride;
   1613 		uint32_t pair_count;
   1614 	} consts;
   1615 
   1616 	if (!rctx->query_result_shader) {
   1617 		r600_create_query_result_shader(rctx);
   1618 		if (!rctx->query_result_shader)
   1619 			return;
   1620 	}
   1621 
   1622 	if (query->buffer.previous) {
   1623 		u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
   1624 				     &tmp_buffer_offset, &tmp_buffer);
   1625 		if (!tmp_buffer)
   1626 			return;
   1627 	}
   1628 
   1629 	rctx->save_qbo_state(&rctx->b, &saved_state);
   1630 
   1631 	r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
   1632 	consts.end_offset = params.end_offset - params.start_offset;
   1633 	consts.fence_offset = params.fence_offset - params.start_offset;
   1634 	consts.result_stride = query->result_size;
   1635 	consts.pair_stride = params.pair_stride;
   1636 	consts.pair_count = params.pair_count;
   1637 
   1638 	constant_buffer.buffer_size = sizeof(consts);
   1639 	constant_buffer.user_buffer = &consts;
   1640 
   1641 	ssbo[1].buffer = tmp_buffer;
   1642 	ssbo[1].buffer_offset = tmp_buffer_offset;
   1643 	ssbo[1].buffer_size = 16;
   1644 
   1645 	ssbo[2] = ssbo[1];
   1646 
   1647 	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
   1648 
   1649 	grid.block[0] = 1;
   1650 	grid.block[1] = 1;
   1651 	grid.block[2] = 1;
   1652 	grid.grid[0] = 1;
   1653 	grid.grid[1] = 1;
   1654 	grid.grid[2] = 1;
   1655 
   1656 	consts.config = 0;
   1657 	if (index < 0)
   1658 		consts.config |= 4;
   1659 	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE)
   1660 		consts.config |= 8;
   1661 	else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
   1662 		 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
   1663 		consts.config |= 8 | 256;
   1664 	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
   1665 		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
   1666 		consts.config |= 32;
   1667 
   1668 	switch (result_type) {
   1669 	case PIPE_QUERY_TYPE_U64:
   1670 	case PIPE_QUERY_TYPE_I64:
   1671 		consts.config |= 64;
   1672 		break;
   1673 	case PIPE_QUERY_TYPE_I32:
   1674 		consts.config |= 128;
   1675 		break;
   1676 	case PIPE_QUERY_TYPE_U32:
   1677 		break;
   1678 	}
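	/* The bits set above are fixed for the whole query: 4 = write result
	 * availability, 8 = boolean, 256 = SO_OVERFLOW half-pair mode,
	 * 32 = timestamp conversion, 64/128 = result width.  Bits 1, 2 and 16
	 * are per-buffer and are filled in inside the loop below.
	 */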
   1679 
   1680 	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
   1681 
   1682 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
   1683 		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
   1684 			qbuf_prev = qbuf->previous;
   1685 			consts.result_count = qbuf->results_end / query->result_size;
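			/* Bits 1 (read the previous summary) and 2 (write a summary
			 * for chaining) are per-buffer, so recompute them here. */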
   1686 			consts.config &= ~3;
   1687 			if (qbuf != &query->buffer)
   1688 				consts.config |= 1;
   1689 			if (qbuf->previous)
   1690 				consts.config |= 2;
   1691 		} else {
   1692 			/* Only read the last timestamp. */
   1693 			qbuf_prev = NULL;
   1694 			consts.result_count = 0;
   1695 			consts.config |= 16;
   1696 			params.start_offset += qbuf->results_end - query->result_size;
   1697 		}
   1698 
   1699 		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
   1700 
   1701 		ssbo[0].buffer = &qbuf->buf->b.b;
   1702 		ssbo[0].buffer_offset = params.start_offset;
   1703 		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
   1704 
   1705 		if (!qbuf->previous) {
   1706 			ssbo[2].buffer = resource;
   1707 			ssbo[2].buffer_offset = offset;
   1708 			ssbo[2].buffer_size = 8;
   1709 
   1710 		}
   1711 
   1712 		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
   1713 
   1714 		if (wait && qbuf == &query->buffer) {
   1715 			uint64_t va;
   1716 
   1717 			/* Wait for result availability. Wait only for readiness
   1718 			 * of the last entry, since the fence writes should be
   1719 			 * serialized in the CP.
   1720 			 */
   1721 			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
   1722 			va += params.fence_offset;
   1723 
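			/* The top bit (0x80000000) of the fence dword should be set
			 * once the result has been written; it is the same bit the
			 * shader's availability check tests. */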
   1724 			r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
   1725 		}
   1726 
   1727 		rctx->b.launch_grid(&rctx->b, &grid);
   1728 		rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
   1729 	}
   1730 
   1731 	r600_restore_qbo_state(rctx, &saved_state);
   1732 	pipe_resource_reference(&tmp_buffer, NULL);
   1733 }
   1734 
   1735 static void r600_render_condition(struct pipe_context *ctx,
   1736 				  struct pipe_query *query,
   1737 				  boolean condition,
   1738 				  enum pipe_render_cond_flag mode)
   1739 {
   1740 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1741 	struct r600_query_hw *rquery = (struct r600_query_hw *)query;
   1742 	struct r600_query_buffer *qbuf;
   1743 	struct r600_atom *atom = &rctx->render_cond_atom;
   1744 
   1745 	/* Compute the size of SET_PREDICATION packets. */
   1746 	atom->num_dw = 0;
   1747 	if (query) {
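		/* Reserve 5 dwords of predication commands per stored result;
		 * SO_OVERFLOW_ANY multiplies this by the stream count below. */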
   1748 		for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
   1749 			atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
   1750 
   1751 		if (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
   1752 			atom->num_dw *= R600_MAX_STREAMS;
   1753 	}
   1754 
   1755 	rctx->render_cond = query;
   1756 	rctx->render_cond_invert = condition;
   1757 	rctx->render_cond_mode = mode;
   1758 
   1759 	rctx->set_atom_dirty(rctx, atom, query != NULL);
   1760 }
   1761 
   1762 void r600_suspend_queries(struct r600_common_context *ctx)
   1763 {
   1764 	struct r600_query_hw *query;
   1765 
   1766 	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
   1767 		r600_query_hw_emit_stop(ctx, query);
   1768 	}
   1769 	assert(ctx->num_cs_dw_queries_suspend == 0);
   1770 }
   1771 
   1772 static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
   1773 						    struct list_head *query_list)
   1774 {
   1775 	struct r600_query_hw *query;
   1776 	unsigned num_dw = 0;
   1777 
   1778 	LIST_FOR_EACH_ENTRY(query, query_list, list) {
   1779 		/* begin + end */
   1780 		num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
   1781 
   1782 		/* Workaround for the fact that
    1783 		 * num_cs_dw_queries_suspend is incremented for every
   1784 		 * resumed query, which raises the bar in need_cs_space for
   1785 		 * queries about to be resumed.
   1786 		 */
   1787 		num_dw += query->num_cs_dw_end;
   1788 	}
   1789 	/* primitives generated query */
   1790 	num_dw += ctx->streamout.enable_atom.num_dw;
   1791 	/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
   1792 	num_dw += 13;
   1793 
   1794 	return num_dw;
   1795 }
   1796 
   1797 void r600_resume_queries(struct r600_common_context *ctx)
   1798 {
   1799 	struct r600_query_hw *query;
   1800 	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
   1801 
   1802 	assert(ctx->num_cs_dw_queries_suspend == 0);
   1803 
   1804 	/* Check CS space here. Resuming must not be interrupted by flushes. */
   1805 	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);
   1806 
   1807 	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
   1808 		r600_query_hw_emit_start(ctx, query);
   1809 	}
   1810 }
   1811 
   1812 /* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */
   1813 void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
   1814 {
   1815 	struct r600_common_context *ctx =
   1816 		(struct r600_common_context*)rscreen->aux_context;
   1817 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
   1818 	struct r600_resource *buffer;
   1819 	uint32_t *results;
   1820 	unsigned i, mask = 0;
   1821 	unsigned max_rbs;
   1822 
   1823 	if (ctx->family == CHIP_JUNIPER) {
   1824 		/*
    1825 		 * Fix for predication lockups: the chip can only ever have
    1826 		 * 4 RBs, but the predication logic appears to assume there are
    1827 		 * 8 and tries to read results from query buffer slots that are
    1828 		 * never written. By raising this count we also write the status
    1829 		 * bits for those slots, as per the normal disabled-RB logic.
   1830 		 */
   1831 		ctx->screen->info.num_render_backends = 8;
   1832 	}
   1833 	max_rbs = ctx->screen->info.num_render_backends;
   1834 
   1835 	assert(rscreen->chip_class <= CAYMAN);
   1836 
   1837 	/*
    1838 	 * Use the backend_map query if it is supported by the kernel.
    1839 	 * Note that the kernel DRM driver for a long time never filled in
    1840 	 * the associated data on eg/cm, only on r600/r700, so ignore the
    1841 	 * valid bit there if the map is zero.
    1842 	 * (Although some chips with just one active RB can have a valid 0 map.)
   1843 	 */
   1844 	if (rscreen->info.r600_gb_backend_map_valid &&
   1845 	    (ctx->chip_class < EVERGREEN || rscreen->info.r600_gb_backend_map != 0)) {
   1846 		unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
   1847 		unsigned backend_map = rscreen->info.r600_gb_backend_map;
   1848 		unsigned item_width, item_mask;
   1849 
   1850 		if (ctx->chip_class >= EVERGREEN) {
   1851 			item_width = 4;
   1852 			item_mask = 0x7;
   1853 		} else {
   1854 			item_width = 2;
   1855 			item_mask = 0x3;
   1856 		}
   1857 
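		/* backend_map packs one RB index per tile pipe: 2 bits each on
		 * r600/r700, 4 bits (masked to 3) on evergreen and later.
		 * Collect the referenced RBs into a mask. */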
   1858 		while (num_tile_pipes--) {
   1859 			i = backend_map & item_mask;
   1860 			mask |= (1<<i);
   1861 			backend_map >>= item_width;
   1862 		}
   1863 		if (mask != 0) {
   1864 			rscreen->info.enabled_rb_mask = mask;
   1865 			return;
   1866 		}
   1867 	}
   1868 
    1869 	/* otherwise use the fallback path for older kernels */
   1870 
   1871 	/* create buffer for event data */
   1872 	buffer = (struct r600_resource*)
   1873 		pipe_buffer_create(ctx->b.screen, 0,
   1874 				   PIPE_USAGE_STAGING, max_rbs * 16);
   1875 	if (!buffer)
   1876 		return;
   1877 
   1878 	/* initialize buffer with zeroes */
   1879 	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
   1880 	if (results) {
   1881 		memset(results, 0, max_rbs * 4 * 4);
   1882 
   1883 		/* emit EVENT_WRITE for ZPASS_DONE */
   1884 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
   1885 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
   1886 		radeon_emit(cs, buffer->gpu_address);
   1887 		radeon_emit(cs, buffer->gpu_address >> 32);
   1888 
   1889 		r600_emit_reloc(ctx, &ctx->gfx, buffer,
   1890                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
   1891 
   1892 		/* analyze results */
   1893 		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
   1894 		if (results) {
    1895 			for (i = 0; i < max_rbs; i++) {
    1896 				/* at least the highest bit will be set if the backend is used */
   1897 				if (results[i*4 + 1])
   1898 					mask |= (1<<i);
   1899 			}
   1900 		}
   1901 	}
   1902 
   1903 	r600_resource_reference(&buffer, NULL);
   1904 
   1905 	if (mask) {
   1906 		if (rscreen->debug_flags & DBG_INFO &&
   1907 		    mask != rscreen->info.enabled_rb_mask) {
   1908 			printf("enabled_rb_mask (fixed) = 0x%x\n", mask);
   1909 		}
   1910 		rscreen->info.enabled_rb_mask = mask;
   1911 	}
   1912 }
   1913 
   1914 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
   1915 	{ \
   1916 		.name = name_, \
   1917 		.query_type = R600_QUERY_##query_type_, \
   1918 		.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
   1919 		.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
   1920 		.group_id = group_id_ \
   1921 	}
   1922 
   1923 #define X(name_, query_type_, type_, result_type_) \
   1924 	XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
   1925 
   1926 #define XG(group_, name_, query_type_, type_, result_type_) \
   1927 	XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
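/* For example, X("draw-calls", DRAW_CALLS, UINT64, AVERAGE) expands to
 * { .name = "draw-calls",
 *   .query_type = R600_QUERY_DRAW_CALLS,
 *   .type = PIPE_DRIVER_QUERY_TYPE_UINT64,
 *   .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
 *   .group_id = ~(unsigned)0 }
 */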
   1928 
   1929 static struct pipe_driver_query_info r600_driver_query_list[] = {
   1930 	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
   1931 	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
   1932 	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
   1933 	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
   1934 	X("decompress-calls",		DECOMPRESS_CALLS,	UINT64, AVERAGE),
   1935 	X("MRT-draw-calls",		MRT_DRAW_CALLS,		UINT64, AVERAGE),
   1936 	X("prim-restart-calls",		PRIM_RESTART_CALLS,	UINT64, AVERAGE),
   1937 	X("spill-draw-calls",		SPILL_DRAW_CALLS,	UINT64, AVERAGE),
   1938 	X("compute-calls",		COMPUTE_CALLS,		UINT64, AVERAGE),
   1939 	X("spill-compute-calls",	SPILL_COMPUTE_CALLS,	UINT64, AVERAGE),
   1940 	X("dma-calls",			DMA_CALLS,		UINT64, AVERAGE),
   1941 	X("cp-dma-calls",		CP_DMA_CALLS,		UINT64, AVERAGE),
   1942 	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
   1943 	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
   1944 	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
   1945 	X("num-CB-cache-flushes",	NUM_CB_CACHE_FLUSHES,	UINT64, AVERAGE),
   1946 	X("num-DB-cache-flushes",	NUM_DB_CACHE_FLUSHES,	UINT64, AVERAGE),
   1947 	X("num-resident-handles",	NUM_RESIDENT_HANDLES,	UINT64, AVERAGE),
   1948 	X("tc-offloaded-slots",		TC_OFFLOADED_SLOTS,     UINT64, AVERAGE),
   1949 	X("tc-direct-slots",		TC_DIRECT_SLOTS,	UINT64, AVERAGE),
   1950 	X("tc-num-syncs",		TC_NUM_SYNCS,		UINT64, AVERAGE),
   1951 	X("CS-thread-busy",		CS_THREAD_BUSY,		UINT64, AVERAGE),
   1952 	X("gallium-thread-busy",	GALLIUM_THREAD_BUSY,	UINT64, AVERAGE),
   1953 	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
   1954 	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
   1955 	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
   1956 	X("mapped-GTT",			MAPPED_GTT,		BYTES, AVERAGE),
   1957 	X("buffer-wait-time",		BUFFER_WAIT_TIME,	MICROSECONDS, CUMULATIVE),
   1958 	X("num-mapped-buffers",		NUM_MAPPED_BUFFERS,	UINT64, AVERAGE),
   1959 	X("num-GFX-IBs",		NUM_GFX_IBS,		UINT64, AVERAGE),
   1960 	X("num-SDMA-IBs",		NUM_SDMA_IBS,		UINT64, AVERAGE),
   1961 	X("GFX-BO-list-size",		GFX_BO_LIST_SIZE,	UINT64, AVERAGE),
   1962 	X("num-bytes-moved",		NUM_BYTES_MOVED,	BYTES, CUMULATIVE),
   1963 	X("num-evictions",		NUM_EVICTIONS,		UINT64, CUMULATIVE),
   1964 	X("VRAM-CPU-page-faults",	NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
   1965 	X("VRAM-usage",			VRAM_USAGE,		BYTES, AVERAGE),
   1966 	X("VRAM-vis-usage",		VRAM_VIS_USAGE,		BYTES, AVERAGE),
   1967 	X("GTT-usage",			GTT_USAGE,		BYTES, AVERAGE),
   1968 
   1969 	/* GPIN queries are for the benefit of old versions of GPUPerfStudio,
    1970 	 * which use them as a fallback path to detect the GPU type.
   1971 	 *
   1972 	 * Note: The names of these queries are significant for GPUPerfStudio
   1973 	 * (and possibly their order as well). */
   1974 	XG(GPIN, "GPIN_000",		GPIN_ASIC_ID,		UINT, AVERAGE),
   1975 	XG(GPIN, "GPIN_001",		GPIN_NUM_SIMD,		UINT, AVERAGE),
   1976 	XG(GPIN, "GPIN_002",		GPIN_NUM_RB,		UINT, AVERAGE),
   1977 	XG(GPIN, "GPIN_003",		GPIN_NUM_SPI,		UINT, AVERAGE),
   1978 	XG(GPIN, "GPIN_004",		GPIN_NUM_SE,		UINT, AVERAGE),
   1979 
   1980 	X("temperature",		GPU_TEMPERATURE,	UINT64, AVERAGE),
   1981 	X("shader-clock",		CURRENT_GPU_SCLK,	HZ, AVERAGE),
   1982 	X("memory-clock",		CURRENT_GPU_MCLK,	HZ, AVERAGE),
   1983 
   1984 	/* The following queries must be at the end of the list because their
   1985 	 * availability is adjusted dynamically based on the DRM version. */
   1986 	X("GPU-load",			GPU_LOAD,		UINT64, AVERAGE),
   1987 	X("GPU-shaders-busy",		GPU_SHADERS_BUSY,	UINT64, AVERAGE),
   1988 	X("GPU-ta-busy",		GPU_TA_BUSY,		UINT64, AVERAGE),
   1989 	X("GPU-gds-busy",		GPU_GDS_BUSY,		UINT64, AVERAGE),
   1990 	X("GPU-vgt-busy",		GPU_VGT_BUSY,		UINT64, AVERAGE),
   1991 	X("GPU-ia-busy",		GPU_IA_BUSY,		UINT64, AVERAGE),
   1992 	X("GPU-sx-busy",		GPU_SX_BUSY,		UINT64, AVERAGE),
   1993 	X("GPU-wd-busy",		GPU_WD_BUSY,		UINT64, AVERAGE),
   1994 	X("GPU-bci-busy",		GPU_BCI_BUSY,		UINT64, AVERAGE),
   1995 	X("GPU-sc-busy",		GPU_SC_BUSY,		UINT64, AVERAGE),
   1996 	X("GPU-pa-busy",		GPU_PA_BUSY,		UINT64, AVERAGE),
   1997 	X("GPU-db-busy",		GPU_DB_BUSY,		UINT64, AVERAGE),
   1998 	X("GPU-cp-busy",		GPU_CP_BUSY,		UINT64, AVERAGE),
   1999 	X("GPU-cb-busy",		GPU_CB_BUSY,		UINT64, AVERAGE),
   2000 	X("GPU-sdma-busy",		GPU_SDMA_BUSY,		UINT64, AVERAGE),
   2001 	X("GPU-pfp-busy",		GPU_PFP_BUSY,		UINT64, AVERAGE),
   2002 	X("GPU-meq-busy",		GPU_MEQ_BUSY,		UINT64, AVERAGE),
   2003 	X("GPU-me-busy",		GPU_ME_BUSY,		UINT64, AVERAGE),
   2004 	X("GPU-surf-sync-busy",		GPU_SURF_SYNC_BUSY,	UINT64, AVERAGE),
   2005 	X("GPU-cp-dma-busy",		GPU_CP_DMA_BUSY,	UINT64, AVERAGE),
   2006 	X("GPU-scratch-ram-busy",	GPU_SCRATCH_RAM_BUSY,	UINT64, AVERAGE),
   2007 };
   2008 
   2009 #undef X
   2010 #undef XG
   2011 #undef XFULL
   2012 
   2013 static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
   2014 {
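	/* The trailing entries of r600_driver_query_list (GPU sensors and
	 * *-busy counters) are only available from newer kernels (see the
	 * comment above them), so hide them on anything older than DRM 2.42. */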
   2015 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
   2016 		return ARRAY_SIZE(r600_driver_query_list);
   2017 	else
   2018 		return ARRAY_SIZE(r600_driver_query_list) - 25;
   2019 }
   2020 
   2021 static int r600_get_driver_query_info(struct pipe_screen *screen,
   2022 				      unsigned index,
   2023 				      struct pipe_driver_query_info *info)
   2024 {
   2025 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
   2026 	unsigned num_queries = r600_get_num_queries(rscreen);
   2027 
   2028 	if (!info) {
   2029 		unsigned num_perfcounters =
   2030 			r600_get_perfcounter_info(rscreen, 0, NULL);
   2031 
   2032 		return num_queries + num_perfcounters;
   2033 	}
   2034 
   2035 	if (index >= num_queries)
   2036 		return r600_get_perfcounter_info(rscreen, index - num_queries, info);
   2037 
   2038 	*info = r600_driver_query_list[index];
   2039 
   2040 	switch (info->query_type) {
   2041 	case R600_QUERY_REQUESTED_VRAM:
   2042 	case R600_QUERY_VRAM_USAGE:
   2043 	case R600_QUERY_MAPPED_VRAM:
   2044 		info->max_value.u64 = rscreen->info.vram_size;
   2045 		break;
   2046 	case R600_QUERY_REQUESTED_GTT:
   2047 	case R600_QUERY_GTT_USAGE:
   2048 	case R600_QUERY_MAPPED_GTT:
   2049 		info->max_value.u64 = rscreen->info.gart_size;
   2050 		break;
   2051 	case R600_QUERY_GPU_TEMPERATURE:
   2052 		info->max_value.u64 = 125;
   2053 		break;
   2054 	case R600_QUERY_VRAM_VIS_USAGE:
   2055 		info->max_value.u64 = rscreen->info.vram_vis_size;
   2056 		break;
   2057 	}
   2058 
   2059 	if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
   2060 		info->group_id += rscreen->perfcounters->num_groups;
   2061 
   2062 	return 1;
   2063 }
   2064 
   2065 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
   2066  * performance counter groups, so be careful when changing this and related
   2067  * functions.
   2068  */
   2069 static int r600_get_driver_query_group_info(struct pipe_screen *screen,
   2070 					    unsigned index,
   2071 					    struct pipe_driver_query_group_info *info)
   2072 {
   2073 	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
   2074 	unsigned num_pc_groups = 0;
   2075 
   2076 	if (rscreen->perfcounters)
   2077 		num_pc_groups = rscreen->perfcounters->num_groups;
   2078 
   2079 	if (!info)
   2080 		return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
   2081 
   2082 	if (index < num_pc_groups)
   2083 		return r600_get_perfcounter_group_info(rscreen, index, info);
   2084 
   2085 	index -= num_pc_groups;
   2086 	if (index >= R600_NUM_SW_QUERY_GROUPS)
   2087 		return 0;
   2088 
   2089 	info->name = "GPIN";
   2090 	info->max_active_queries = 5;
   2091 	info->num_queries = 5;
   2092 	return 1;
   2093 }
   2094 
   2095 void r600_query_init(struct r600_common_context *rctx)
   2096 {
   2097 	rctx->b.create_query = r600_create_query;
   2098 	rctx->b.create_batch_query = r600_create_batch_query;
   2099 	rctx->b.destroy_query = r600_destroy_query;
   2100 	rctx->b.begin_query = r600_begin_query;
   2101 	rctx->b.end_query = r600_end_query;
   2102 	rctx->b.get_query_result = r600_get_query_result;
   2103 	rctx->b.get_query_result_resource = r600_get_query_result_resource;
   2104 	rctx->render_cond_atom.emit = r600_emit_query_predication;
   2105 
   2106 	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
   2107 	    rctx->b.render_condition = r600_render_condition;
   2108 
   2109 	LIST_INITHEAD(&rctx->active_queries);
   2110 }
   2111 
   2112 void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
   2113 {
   2114 	rscreen->b.get_driver_query_info = r600_get_driver_query_info;
   2115 	rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
   2116 }
   2117