Home | History | Annotate | Download | only in radeon
      1 /*
      2  * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org>
      3  * Copyright 2014 Marek Olk <marek.olsak (at) amd.com>
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * on the rights to use, copy, modify, merge, publish, distribute, sub
      9  * license, and/or sell copies of the Software, and to permit persons to whom
     10  * the Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     23  */
     24 
     25 #include "r600_query.h"
     26 #include "r600_cs.h"
     27 #include "util/u_memory.h"
     28 #include "util/u_upload_mgr.h"
     29 
     30 #include "tgsi/tgsi_text.h"
     31 
     32 struct r600_hw_query_params {
     33 	unsigned start_offset;
     34 	unsigned end_offset;
     35 	unsigned fence_offset;
     36 	unsigned pair_stride;
     37 	unsigned pair_count;
     38 };
     39 
     40 /* Queries without buffer handling or suspend/resume. */
     41 struct r600_query_sw {
     42 	struct r600_query b;
     43 
     44 	uint64_t begin_result;
     45 	uint64_t end_result;
     46 	/* Fence for GPU_FINISHED. */
     47 	struct pipe_fence_handle *fence;
     48 };
     49 
     50 static void r600_query_sw_destroy(struct r600_common_context *rctx,
     51 				  struct r600_query *rquery)
     52 {
     53 	struct pipe_screen *screen = rctx->b.screen;
     54 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
     55 
     56 	screen->fence_reference(screen, &query->fence, NULL);
     57 	FREE(query);
     58 }
     59 
     60 static enum radeon_value_id winsys_id_from_type(unsigned type)
     61 {
     62 	switch (type) {
     63 	case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
     64 	case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
     65 	case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
     66 	case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
     67 	case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
     68 	case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
     69 	case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
     70 	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
     71 	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
     72 	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
     73 	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
     74 	case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
     75 	case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
     76 	case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
     77 	default: unreachable("query type does not correspond to winsys id");
     78 	}
     79 }
     80 
     81 static bool r600_query_sw_begin(struct r600_common_context *rctx,
     82 				struct r600_query *rquery)
     83 {
     84 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
     85 
     86 	switch(query->b.type) {
     87 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
     88 	case PIPE_QUERY_GPU_FINISHED:
     89 		break;
     90 	case R600_QUERY_DRAW_CALLS:
     91 		query->begin_result = rctx->num_draw_calls;
     92 		break;
     93 	case R600_QUERY_SPILL_DRAW_CALLS:
     94 		query->begin_result = rctx->num_spill_draw_calls;
     95 		break;
     96 	case R600_QUERY_COMPUTE_CALLS:
     97 		query->begin_result = rctx->num_compute_calls;
     98 		break;
     99 	case R600_QUERY_SPILL_COMPUTE_CALLS:
    100 		query->begin_result = rctx->num_spill_compute_calls;
    101 		break;
    102 	case R600_QUERY_DMA_CALLS:
    103 		query->begin_result = rctx->num_dma_calls;
    104 		break;
    105 	case R600_QUERY_CP_DMA_CALLS:
    106 		query->begin_result = rctx->num_cp_dma_calls;
    107 		break;
    108 	case R600_QUERY_NUM_VS_FLUSHES:
    109 		query->begin_result = rctx->num_vs_flushes;
    110 		break;
    111 	case R600_QUERY_NUM_PS_FLUSHES:
    112 		query->begin_result = rctx->num_ps_flushes;
    113 		break;
    114 	case R600_QUERY_NUM_CS_FLUSHES:
    115 		query->begin_result = rctx->num_cs_flushes;
    116 		break;
    117 	case R600_QUERY_NUM_FB_CACHE_FLUSHES:
    118 		query->begin_result = rctx->num_fb_cache_flushes;
    119 		break;
    120 	case R600_QUERY_NUM_L2_INVALIDATES:
    121 		query->begin_result = rctx->num_L2_invalidates;
    122 		break;
    123 	case R600_QUERY_NUM_L2_WRITEBACKS:
    124 		query->begin_result = rctx->num_L2_writebacks;
    125 		break;
    126 	case R600_QUERY_REQUESTED_VRAM:
    127 	case R600_QUERY_REQUESTED_GTT:
    128 	case R600_QUERY_MAPPED_VRAM:
    129 	case R600_QUERY_MAPPED_GTT:
    130 	case R600_QUERY_VRAM_USAGE:
    131 	case R600_QUERY_GTT_USAGE:
    132 	case R600_QUERY_GPU_TEMPERATURE:
    133 	case R600_QUERY_CURRENT_GPU_SCLK:
    134 	case R600_QUERY_CURRENT_GPU_MCLK:
    135 	case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
    136 		query->begin_result = 0;
    137 		break;
    138 	case R600_QUERY_BUFFER_WAIT_TIME:
    139 	case R600_QUERY_NUM_GFX_IBS:
    140 	case R600_QUERY_NUM_SDMA_IBS:
    141 	case R600_QUERY_NUM_BYTES_MOVED:
    142 	case R600_QUERY_NUM_EVICTIONS: {
    143 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
    144 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
    145 		break;
    146 	}
    147 	case R600_QUERY_GPU_LOAD:
    148 		query->begin_result = r600_begin_counter_gui(rctx->screen);
    149 		break;
    150 	case R600_QUERY_GPU_SHADERS_BUSY:
    151 		query->begin_result = r600_begin_counter_spi(rctx->screen);
    152 		break;
    153 	case R600_QUERY_NUM_COMPILATIONS:
    154 		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
    155 		break;
    156 	case R600_QUERY_NUM_SHADERS_CREATED:
    157 		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
    158 		break;
    159 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
    160 		query->begin_result =
    161 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
    162 		break;
    163 	case R600_QUERY_GPIN_ASIC_ID:
    164 	case R600_QUERY_GPIN_NUM_SIMD:
    165 	case R600_QUERY_GPIN_NUM_RB:
    166 	case R600_QUERY_GPIN_NUM_SPI:
    167 	case R600_QUERY_GPIN_NUM_SE:
    168 		break;
    169 	default:
    170 		unreachable("r600_query_sw_begin: bad query type");
    171 	}
    172 
    173 	return true;
    174 }
    175 
    176 static bool r600_query_sw_end(struct r600_common_context *rctx,
    177 			      struct r600_query *rquery)
    178 {
    179 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
    180 
    181 	switch(query->b.type) {
    182 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
    183 		break;
    184 	case PIPE_QUERY_GPU_FINISHED:
    185 		rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
    186 		break;
    187 	case R600_QUERY_DRAW_CALLS:
    188 		query->end_result = rctx->num_draw_calls;
    189 		break;
    190 	case R600_QUERY_SPILL_DRAW_CALLS:
    191 		query->end_result = rctx->num_spill_draw_calls;
    192 		break;
    193 	case R600_QUERY_COMPUTE_CALLS:
    194 		query->end_result = rctx->num_compute_calls;
    195 		break;
    196 	case R600_QUERY_SPILL_COMPUTE_CALLS:
    197 		query->end_result = rctx->num_spill_compute_calls;
    198 		break;
    199 	case R600_QUERY_DMA_CALLS:
    200 		query->end_result = rctx->num_dma_calls;
    201 		break;
    202 	case R600_QUERY_CP_DMA_CALLS:
    203 		query->end_result = rctx->num_cp_dma_calls;
    204 		break;
    205 	case R600_QUERY_NUM_VS_FLUSHES:
    206 		query->end_result = rctx->num_vs_flushes;
    207 		break;
    208 	case R600_QUERY_NUM_PS_FLUSHES:
    209 		query->end_result = rctx->num_ps_flushes;
    210 		break;
    211 	case R600_QUERY_NUM_CS_FLUSHES:
    212 		query->end_result = rctx->num_cs_flushes;
    213 		break;
    214 	case R600_QUERY_NUM_FB_CACHE_FLUSHES:
    215 		query->end_result = rctx->num_fb_cache_flushes;
    216 		break;
    217 	case R600_QUERY_NUM_L2_INVALIDATES:
    218 		query->end_result = rctx->num_L2_invalidates;
    219 		break;
    220 	case R600_QUERY_NUM_L2_WRITEBACKS:
    221 		query->end_result = rctx->num_L2_writebacks;
    222 		break;
    223 	case R600_QUERY_REQUESTED_VRAM:
    224 	case R600_QUERY_REQUESTED_GTT:
    225 	case R600_QUERY_MAPPED_VRAM:
    226 	case R600_QUERY_MAPPED_GTT:
    227 	case R600_QUERY_VRAM_USAGE:
    228 	case R600_QUERY_GTT_USAGE:
    229 	case R600_QUERY_GPU_TEMPERATURE:
    230 	case R600_QUERY_CURRENT_GPU_SCLK:
    231 	case R600_QUERY_CURRENT_GPU_MCLK:
    232 	case R600_QUERY_BUFFER_WAIT_TIME:
    233 	case R600_QUERY_NUM_GFX_IBS:
    234 	case R600_QUERY_NUM_SDMA_IBS:
    235 	case R600_QUERY_NUM_BYTES_MOVED:
    236 	case R600_QUERY_NUM_EVICTIONS: {
    237 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
    238 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
    239 		break;
    240 	}
    241 	case R600_QUERY_GPU_LOAD:
    242 		query->end_result = r600_end_counter_gui(rctx->screen,
    243 							 query->begin_result);
    244 		query->begin_result = 0;
    245 		break;
    246 	case R600_QUERY_GPU_SHADERS_BUSY:
    247 		query->end_result = r600_end_counter_spi(rctx->screen,
    248 							 query->begin_result);
    249 		query->begin_result = 0;
    250 		break;
    251 	case R600_QUERY_NUM_COMPILATIONS:
    252 		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
    253 		break;
    254 	case R600_QUERY_NUM_SHADERS_CREATED:
    255 		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
    256 		break;
    257 	case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
    258 		query->end_result = rctx->last_tex_ps_draw_ratio;
    259 		break;
    260 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
    261 		query->end_result =
    262 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
    263 		break;
    264 	case R600_QUERY_GPIN_ASIC_ID:
    265 	case R600_QUERY_GPIN_NUM_SIMD:
    266 	case R600_QUERY_GPIN_NUM_RB:
    267 	case R600_QUERY_GPIN_NUM_SPI:
    268 	case R600_QUERY_GPIN_NUM_SE:
    269 		break;
    270 	default:
    271 		unreachable("r600_query_sw_end: bad query type");
    272 	}
    273 
    274 	return true;
    275 }
    276 
    277 static bool r600_query_sw_get_result(struct r600_common_context *rctx,
    278 				     struct r600_query *rquery,
    279 				     bool wait,
    280 				     union pipe_query_result *result)
    281 {
    282 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
    283 
    284 	switch (query->b.type) {
    285 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
    286 		/* Convert from cycles per millisecond to cycles per second (Hz). */
    287 		result->timestamp_disjoint.frequency =
    288 			(uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
    289 		result->timestamp_disjoint.disjoint = false;
    290 		return true;
    291 	case PIPE_QUERY_GPU_FINISHED: {
    292 		struct pipe_screen *screen = rctx->b.screen;
    293 		result->b = screen->fence_finish(screen, &rctx->b, query->fence,
    294 						 wait ? PIPE_TIMEOUT_INFINITE : 0);
    295 		return result->b;
    296 	}
    297 
    298 	case R600_QUERY_GPIN_ASIC_ID:
    299 		result->u32 = 0;
    300 		return true;
    301 	case R600_QUERY_GPIN_NUM_SIMD:
    302 		result->u32 = rctx->screen->info.num_good_compute_units;
    303 		return true;
    304 	case R600_QUERY_GPIN_NUM_RB:
    305 		result->u32 = rctx->screen->info.num_render_backends;
    306 		return true;
    307 	case R600_QUERY_GPIN_NUM_SPI:
    308 		result->u32 = 1; /* all supported chips have one SPI per SE */
    309 		return true;
    310 	case R600_QUERY_GPIN_NUM_SE:
    311 		result->u32 = rctx->screen->info.max_se;
    312 		return true;
    313 	}
    314 
    315 	result->u64 = query->end_result - query->begin_result;
    316 
    317 	switch (query->b.type) {
    318 	case R600_QUERY_BUFFER_WAIT_TIME:
    319 	case R600_QUERY_GPU_TEMPERATURE:
    320 		result->u64 /= 1000;
    321 		break;
    322 	case R600_QUERY_CURRENT_GPU_SCLK:
    323 	case R600_QUERY_CURRENT_GPU_MCLK:
    324 		result->u64 *= 1000000;
    325 		break;
    326 	}
    327 
    328 	return true;
    329 }
    330 
    331 
    332 static struct r600_query_ops sw_query_ops = {
    333 	.destroy = r600_query_sw_destroy,
    334 	.begin = r600_query_sw_begin,
    335 	.end = r600_query_sw_end,
    336 	.get_result = r600_query_sw_get_result,
    337 	.get_result_resource = NULL
    338 };
    339 
    340 static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
    341 					       unsigned query_type)
    342 {
    343 	struct r600_query_sw *query;
    344 
    345 	query = CALLOC_STRUCT(r600_query_sw);
    346 	if (!query)
    347 		return NULL;
    348 
    349 	query->b.type = query_type;
    350 	query->b.ops = &sw_query_ops;
    351 
    352 	return (struct pipe_query *)query;
    353 }
    354 
    355 void r600_query_hw_destroy(struct r600_common_context *rctx,
    356 			   struct r600_query *rquery)
    357 {
    358 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
    359 	struct r600_query_buffer *prev = query->buffer.previous;
    360 
    361 	/* Release all query buffers. */
    362 	while (prev) {
    363 		struct r600_query_buffer *qbuf = prev;
    364 		prev = prev->previous;
    365 		r600_resource_reference(&qbuf->buf, NULL);
    366 		FREE(qbuf);
    367 	}
    368 
    369 	r600_resource_reference(&query->buffer.buf, NULL);
    370 	FREE(rquery);
    371 }
    372 
    373 static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx,
    374 						   struct r600_query_hw *query)
    375 {
    376 	unsigned buf_size = MAX2(query->result_size,
    377 				 ctx->screen->info.min_alloc_size);
    378 
    379 	/* Queries are normally read by the CPU after
    380 	 * being written by the gpu, hence staging is probably a good
    381 	 * usage pattern.
    382 	 */
    383 	struct r600_resource *buf = (struct r600_resource*)
    384 		pipe_buffer_create(ctx->b.screen, 0,
    385 				   PIPE_USAGE_STAGING, buf_size);
    386 	if (!buf)
    387 		return NULL;
    388 
    389 	if (!query->ops->prepare_buffer(ctx, query, buf)) {
    390 		r600_resource_reference(&buf, NULL);
    391 		return NULL;
    392 	}
    393 
    394 	return buf;
    395 }
    396 
    397 static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
    398 					 struct r600_query_hw *query,
    399 					 struct r600_resource *buffer)
    400 {
    401 	/* Callers ensure that the buffer is currently unused by the GPU. */
    402 	uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
    403 						PIPE_TRANSFER_WRITE |
    404 						PIPE_TRANSFER_UNSYNCHRONIZED);
    405 	if (!results)
    406 		return false;
    407 
    408 	memset(results, 0, buffer->b.b.width0);
    409 
    410 	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
    411 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
    412 		unsigned num_results;
    413 		unsigned i, j;
    414 
    415 		/* Set top bits for unused backends. */
    416 		num_results = buffer->b.b.width0 / query->result_size;
    417 		for (j = 0; j < num_results; j++) {
    418 			for (i = 0; i < ctx->max_db; i++) {
    419 				if (!(ctx->backend_mask & (1<<i))) {
    420 					results[(i * 4)+1] = 0x80000000;
    421 					results[(i * 4)+3] = 0x80000000;
    422 				}
    423 			}
    424 			results += 4 * ctx->max_db;
    425 		}
    426 	}
    427 
    428 	return true;
    429 }
    430 
    431 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
    432                                               struct r600_query *rquery,
    433                                               bool wait,
    434                                               enum pipe_query_value_type result_type,
    435                                               int index,
    436                                               struct pipe_resource *resource,
    437                                               unsigned offset);
    438 
    439 static struct r600_query_ops query_hw_ops = {
    440 	.destroy = r600_query_hw_destroy,
    441 	.begin = r600_query_hw_begin,
    442 	.end = r600_query_hw_end,
    443 	.get_result = r600_query_hw_get_result,
    444 	.get_result_resource = r600_query_hw_get_result_resource,
    445 };
    446 
    447 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
    448 					struct r600_query_hw *query,
    449 					struct r600_resource *buffer,
    450 					uint64_t va);
    451 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
    452 				       struct r600_query_hw *query,
    453 				       struct r600_resource *buffer,
    454 				       uint64_t va);
    455 static void r600_query_hw_add_result(struct r600_common_context *ctx,
    456 				     struct r600_query_hw *, void *buffer,
    457 				     union pipe_query_result *result);
    458 static void r600_query_hw_clear_result(struct r600_query_hw *,
    459 				       union pipe_query_result *);
    460 
    461 static struct r600_query_hw_ops query_hw_default_hw_ops = {
    462 	.prepare_buffer = r600_query_hw_prepare_buffer,
    463 	.emit_start = r600_query_hw_do_emit_start,
    464 	.emit_stop = r600_query_hw_do_emit_stop,
    465 	.clear_result = r600_query_hw_clear_result,
    466 	.add_result = r600_query_hw_add_result,
    467 };
    468 
    469 bool r600_query_hw_init(struct r600_common_context *rctx,
    470 			struct r600_query_hw *query)
    471 {
    472 	query->buffer.buf = r600_new_query_buffer(rctx, query);
    473 	if (!query->buffer.buf)
    474 		return false;
    475 
    476 	return true;
    477 }
    478 
    479 static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
    480 					       unsigned query_type,
    481 					       unsigned index)
    482 {
    483 	struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
    484 	if (!query)
    485 		return NULL;
    486 
    487 	query->b.type = query_type;
    488 	query->b.ops = &query_hw_ops;
    489 	query->ops = &query_hw_default_hw_ops;
    490 
    491 	switch (query_type) {
    492 	case PIPE_QUERY_OCCLUSION_COUNTER:
    493 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    494 		query->result_size = 16 * rctx->max_db;
    495 		query->result_size += 16; /* for the fence + alignment */
    496 		query->num_cs_dw_begin = 6;
    497 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
    498 		break;
    499 	case PIPE_QUERY_TIME_ELAPSED:
    500 		query->result_size = 24;
    501 		query->num_cs_dw_begin = 8;
    502 		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
    503 		break;
    504 	case PIPE_QUERY_TIMESTAMP:
    505 		query->result_size = 16;
    506 		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
    507 		query->flags = R600_QUERY_HW_FLAG_NO_START;
    508 		break;
    509 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    510 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    511 	case PIPE_QUERY_SO_STATISTICS:
    512 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    513 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
    514 		query->result_size = 32;
    515 		query->num_cs_dw_begin = 6;
    516 		query->num_cs_dw_end = 6;
    517 		query->stream = index;
    518 		break;
    519 	case PIPE_QUERY_PIPELINE_STATISTICS:
    520 		/* 11 values on EG, 8 on R600. */
    521 		query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
    522 		query->result_size += 8; /* for the fence + alignment */
    523 		query->num_cs_dw_begin = 6;
    524 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
    525 		break;
    526 	default:
    527 		assert(0);
    528 		FREE(query);
    529 		return NULL;
    530 	}
    531 
    532 	if (!r600_query_hw_init(rctx, query)) {
    533 		FREE(query);
    534 		return NULL;
    535 	}
    536 
    537 	return (struct pipe_query *)query;
    538 }
    539 
    540 static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
    541 					      unsigned type, int diff)
    542 {
    543 	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
    544 	    type == PIPE_QUERY_OCCLUSION_PREDICATE) {
    545 		bool old_enable = rctx->num_occlusion_queries != 0;
    546 		bool old_perfect_enable =
    547 			rctx->num_perfect_occlusion_queries != 0;
    548 		bool enable, perfect_enable;
    549 
    550 		rctx->num_occlusion_queries += diff;
    551 		assert(rctx->num_occlusion_queries >= 0);
    552 
    553 		if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
    554 			rctx->num_perfect_occlusion_queries += diff;
    555 			assert(rctx->num_perfect_occlusion_queries >= 0);
    556 		}
    557 
    558 		enable = rctx->num_occlusion_queries != 0;
    559 		perfect_enable = rctx->num_perfect_occlusion_queries != 0;
    560 
    561 		if (enable != old_enable || perfect_enable != old_perfect_enable) {
    562 			rctx->set_occlusion_query_state(&rctx->b, enable);
    563 		}
    564 	}
    565 }
    566 
    567 static unsigned event_type_for_stream(struct r600_query_hw *query)
    568 {
    569 	switch (query->stream) {
    570 	default:
    571 	case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
    572 	case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
    573 	case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
    574 	case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
    575 	}
    576 }
    577 
    578 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
    579 					struct r600_query_hw *query,
    580 					struct r600_resource *buffer,
    581 					uint64_t va)
    582 {
    583 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
    584 
    585 	switch (query->b.type) {
    586 	case PIPE_QUERY_OCCLUSION_COUNTER:
    587 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    588 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    589 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
    590 		radeon_emit(cs, va);
    591 		radeon_emit(cs, (va >> 32) & 0xFFFF);
    592 		break;
    593 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    594 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    595 	case PIPE_QUERY_SO_STATISTICS:
    596 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    597 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    598 		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
    599 		radeon_emit(cs, va);
    600 		radeon_emit(cs, (va >> 32) & 0xFFFF);
    601 		break;
    602 	case PIPE_QUERY_TIME_ELAPSED:
    603 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
    604 					 0, 3, NULL, va, 0, 0);
    605 		break;
    606 	case PIPE_QUERY_PIPELINE_STATISTICS:
    607 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    608 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
    609 		radeon_emit(cs, va);
    610 		radeon_emit(cs, (va >> 32) & 0xFFFF);
    611 		break;
    612 	default:
    613 		assert(0);
    614 	}
    615 	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
    616 			RADEON_PRIO_QUERY);
    617 }
    618 
    619 static void r600_query_hw_emit_start(struct r600_common_context *ctx,
    620 				     struct r600_query_hw *query)
    621 {
    622 	uint64_t va;
    623 
    624 	if (!query->buffer.buf)
    625 		return; // previous buffer allocation failure
    626 
    627 	r600_update_occlusion_query_state(ctx, query->b.type, 1);
    628 	r600_update_prims_generated_query_state(ctx, query->b.type, 1);
    629 
    630 	ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
    631 			       true);
    632 
    633 	/* Get a new query buffer if needed. */
    634 	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
    635 		struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
    636 		*qbuf = query->buffer;
    637 		query->buffer.results_end = 0;
    638 		query->buffer.previous = qbuf;
    639 		query->buffer.buf = r600_new_query_buffer(ctx, query);
    640 		if (!query->buffer.buf)
    641 			return;
    642 	}
    643 
    644 	/* emit begin query */
    645 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
    646 
    647 	query->ops->emit_start(ctx, query, query->buffer.buf, va);
    648 
    649 	ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
    650 }
    651 
    652 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
    653 				       struct r600_query_hw *query,
    654 				       struct r600_resource *buffer,
    655 				       uint64_t va)
    656 {
    657 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
    658 	uint64_t fence_va = 0;
    659 
    660 	switch (query->b.type) {
    661 	case PIPE_QUERY_OCCLUSION_COUNTER:
    662 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    663 		va += 8;
    664 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    665 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
    666 		radeon_emit(cs, va);
    667 		radeon_emit(cs, (va >> 32) & 0xFFFF);
    668 
    669 		fence_va = va + ctx->max_db * 16 - 8;
    670 		break;
    671 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    672 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    673 	case PIPE_QUERY_SO_STATISTICS:
    674 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    675 		va += query->result_size/2;
    676 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    677 		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
    678 		radeon_emit(cs, va);
    679 		radeon_emit(cs, (va >> 32) & 0xFFFF);
    680 		break;
    681 	case PIPE_QUERY_TIME_ELAPSED:
    682 		va += 8;
    683 		/* fall through */
    684 	case PIPE_QUERY_TIMESTAMP:
    685 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
    686 					 0, 3, NULL, va, 0, 0);
    687 		fence_va = va + 8;
    688 		break;
    689 	case PIPE_QUERY_PIPELINE_STATISTICS: {
    690 		unsigned sample_size = (query->result_size - 8) / 2;
    691 
    692 		va += sample_size;
    693 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
    694 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
    695 		radeon_emit(cs, va);
    696 		radeon_emit(cs, (va >> 32) & 0xFFFF);
    697 
    698 		fence_va = va + sample_size;
    699 		break;
    700 	}
    701 	default:
    702 		assert(0);
    703 	}
    704 	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
    705 			RADEON_PRIO_QUERY);
    706 
    707 	if (fence_va)
    708 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
    709 					 query->buffer.buf, fence_va, 0, 0x80000000);
    710 }
    711 
    712 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
    713 				    struct r600_query_hw *query)
    714 {
    715 	uint64_t va;
    716 
    717 	if (!query->buffer.buf)
    718 		return; // previous buffer allocation failure
    719 
    720 	/* The queries which need begin already called this in begin_query. */
    721 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
    722 		ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
    723 	}
    724 
    725 	/* emit end query */
    726 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
    727 
    728 	query->ops->emit_stop(ctx, query, query->buffer.buf, va);
    729 
    730 	query->buffer.results_end += query->result_size;
    731 
    732 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
    733 		ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
    734 
    735 	r600_update_occlusion_query_state(ctx, query->b.type, -1);
    736 	r600_update_prims_generated_query_state(ctx, query->b.type, -1);
    737 }
    738 
    739 static void r600_emit_query_predication(struct r600_common_context *ctx,
    740 					struct r600_atom *atom)
    741 {
    742 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
    743 	struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
    744 	struct r600_query_buffer *qbuf;
    745 	uint32_t op;
    746 	bool flag_wait;
    747 
    748 	if (!query)
    749 		return;
    750 
    751 	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
    752 		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
    753 
    754 	switch (query->b.type) {
    755 	case PIPE_QUERY_OCCLUSION_COUNTER:
    756 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    757 		op = PRED_OP(PREDICATION_OP_ZPASS);
    758 		break;
    759 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    760 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    761 	case PIPE_QUERY_SO_STATISTICS:
    762 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
    763 		op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
    764 		break;
    765 	default:
    766 		assert(0);
    767 		return;
    768 	}
    769 
    770 	/* if true then invert, see GL_ARB_conditional_render_inverted */
    771 	if (ctx->render_cond_invert)
    772 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */
    773 	else
    774 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */
    775 
    776 	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
    777 
    778 	/* emit predicate packets for all data blocks */
    779 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
    780 		unsigned results_base = 0;
    781 		uint64_t va = qbuf->buf->gpu_address;
    782 
    783 		while (results_base < qbuf->results_end) {
    784 			radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
    785 			radeon_emit(cs, va + results_base);
    786 			radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
    787 			r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
    788 					RADEON_PRIO_QUERY);
    789 			results_base += query->result_size;
    790 
    791 			/* set CONTINUE bit for all packets except the first */
    792 			op |= PREDICATION_CONTINUE;
    793 		}
    794 	}
    795 }
    796 
    797 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
    798 {
    799 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    800 
    801 	if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
    802 	    query_type == PIPE_QUERY_GPU_FINISHED ||
    803 	    query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
    804 		return r600_query_sw_create(ctx, query_type);
    805 
    806 	return r600_query_hw_create(rctx, query_type, index);
    807 }
    808 
    809 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
    810 {
    811 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    812 	struct r600_query *rquery = (struct r600_query *)query;
    813 
    814 	rquery->ops->destroy(rctx, rquery);
    815 }
    816 
    817 static boolean r600_begin_query(struct pipe_context *ctx,
    818                                 struct pipe_query *query)
    819 {
    820 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    821 	struct r600_query *rquery = (struct r600_query *)query;
    822 
    823 	return rquery->ops->begin(rctx, rquery);
    824 }
    825 
    826 void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
    827 				 struct r600_query_hw *query)
    828 {
    829 	struct r600_query_buffer *prev = query->buffer.previous;
    830 
    831 	/* Discard the old query buffers. */
    832 	while (prev) {
    833 		struct r600_query_buffer *qbuf = prev;
    834 		prev = prev->previous;
    835 		r600_resource_reference(&qbuf->buf, NULL);
    836 		FREE(qbuf);
    837 	}
    838 
    839 	query->buffer.results_end = 0;
    840 	query->buffer.previous = NULL;
    841 
    842 	/* Obtain a new buffer if the current one can't be mapped without a stall. */
    843 	if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
    844 	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
    845 		r600_resource_reference(&query->buffer.buf, NULL);
    846 		query->buffer.buf = r600_new_query_buffer(rctx, query);
    847 	} else {
    848 		if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf))
    849 			r600_resource_reference(&query->buffer.buf, NULL);
    850 	}
    851 }
    852 
    853 bool r600_query_hw_begin(struct r600_common_context *rctx,
    854 			 struct r600_query *rquery)
    855 {
    856 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
    857 
    858 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
    859 		assert(0);
    860 		return false;
    861 	}
    862 
    863 	if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
    864 		r600_query_hw_reset_buffers(rctx, query);
    865 
    866 	r600_query_hw_emit_start(rctx, query);
    867 	if (!query->buffer.buf)
    868 		return false;
    869 
    870 	LIST_ADDTAIL(&query->list, &rctx->active_queries);
    871 	return true;
    872 }
    873 
    874 static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
    875 {
    876 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    877 	struct r600_query *rquery = (struct r600_query *)query;
    878 
    879 	return rquery->ops->end(rctx, rquery);
    880 }
    881 
    882 bool r600_query_hw_end(struct r600_common_context *rctx,
    883 		       struct r600_query *rquery)
    884 {
    885 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
    886 
    887 	if (query->flags & R600_QUERY_HW_FLAG_NO_START)
    888 		r600_query_hw_reset_buffers(rctx, query);
    889 
    890 	r600_query_hw_emit_stop(rctx, query);
    891 
    892 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
    893 		LIST_DELINIT(&query->list);
    894 
    895 	if (!query->buffer.buf)
    896 		return false;
    897 
    898 	return true;
    899 }
    900 
    901 static void r600_get_hw_query_params(struct r600_common_context *rctx,
    902 				     struct r600_query_hw *rquery, int index,
    903 				     struct r600_hw_query_params *params)
    904 {
    905 	params->pair_stride = 0;
    906 	params->pair_count = 1;
    907 
    908 	switch (rquery->b.type) {
    909 	case PIPE_QUERY_OCCLUSION_COUNTER:
    910 	case PIPE_QUERY_OCCLUSION_PREDICATE:
    911 		params->start_offset = 0;
    912 		params->end_offset = 8;
    913 		params->fence_offset = rctx->max_db * 16;
    914 		params->pair_stride = 16;
    915 		params->pair_count = rctx->max_db;
    916 		break;
    917 	case PIPE_QUERY_TIME_ELAPSED:
    918 		params->start_offset = 0;
    919 		params->end_offset = 8;
    920 		params->fence_offset = 16;
    921 		break;
    922 	case PIPE_QUERY_TIMESTAMP:
    923 		params->start_offset = 0;
    924 		params->end_offset = 0;
    925 		params->fence_offset = 8;
    926 		break;
    927 	case PIPE_QUERY_PRIMITIVES_EMITTED:
    928 		params->start_offset = 8;
    929 		params->end_offset = 24;
    930 		params->fence_offset = params->end_offset + 4;
    931 		break;
    932 	case PIPE_QUERY_PRIMITIVES_GENERATED:
    933 		params->start_offset = 0;
    934 		params->end_offset = 16;
    935 		params->fence_offset = params->end_offset + 4;
    936 		break;
    937 	case PIPE_QUERY_SO_STATISTICS:
    938 		params->start_offset = 8 - index * 8;
    939 		params->end_offset = 24 - index * 8;
    940 		params->fence_offset = params->end_offset + 4;
    941 		break;
    942 	case PIPE_QUERY_PIPELINE_STATISTICS:
    943 	{
    944 		/* Offsets apply to EG+ */
    945 		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
    946 		params->start_offset = offsets[index];
    947 		params->end_offset = 88 + offsets[index];
    948 		params->fence_offset = 2 * 88;
    949 		break;
    950 	}
    951 	default:
    952 		unreachable("r600_get_hw_query_params unsupported");
    953 	}
    954 }
    955 
    956 static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
    957 				       bool test_status_bit)
    958 {
    959 	uint32_t *current_result = (uint32_t*)map;
    960 	uint64_t start, end;
    961 
    962 	start = (uint64_t)current_result[start_index] |
    963 		(uint64_t)current_result[start_index+1] << 32;
    964 	end = (uint64_t)current_result[end_index] |
    965 	      (uint64_t)current_result[end_index+1] << 32;
    966 
    967 	if (!test_status_bit ||
    968 	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
    969 		return end - start;
    970 	}
    971 	return 0;
    972 }
    973 
    974 static void r600_query_hw_add_result(struct r600_common_context *ctx,
    975 				     struct r600_query_hw *query,
    976 				     void *buffer,
    977 				     union pipe_query_result *result)
    978 {
    979 	switch (query->b.type) {
    980 	case PIPE_QUERY_OCCLUSION_COUNTER: {
    981 		for (unsigned i = 0; i < ctx->max_db; ++i) {
    982 			unsigned results_base = i * 16;
    983 			result->u64 +=
    984 				r600_query_read_result(buffer + results_base, 0, 2, true);
    985 		}
    986 		break;
    987 	}
    988 	case PIPE_QUERY_OCCLUSION_PREDICATE: {
    989 		for (unsigned i = 0; i < ctx->max_db; ++i) {
    990 			unsigned results_base = i * 16;
    991 			result->b = result->b ||
    992 				r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
    993 		}
    994 		break;
    995 	}
    996 	case PIPE_QUERY_TIME_ELAPSED:
    997 		result->u64 += r600_query_read_result(buffer, 0, 2, false);
    998 		break;
    999 	case PIPE_QUERY_TIMESTAMP:
   1000 		result->u64 = *(uint64_t*)buffer;
   1001 		break;
   1002 	case PIPE_QUERY_PRIMITIVES_EMITTED:
   1003 		/* SAMPLE_STREAMOUTSTATS stores this structure:
   1004 		 * {
   1005 		 *    u64 NumPrimitivesWritten;
   1006 		 *    u64 PrimitiveStorageNeeded;
   1007 		 * }
   1008 		 * We only need NumPrimitivesWritten here. */
   1009 		result->u64 += r600_query_read_result(buffer, 2, 6, true);
   1010 		break;
   1011 	case PIPE_QUERY_PRIMITIVES_GENERATED:
   1012 		/* Here we read PrimitiveStorageNeeded. */
   1013 		result->u64 += r600_query_read_result(buffer, 0, 4, true);
   1014 		break;
   1015 	case PIPE_QUERY_SO_STATISTICS:
   1016 		result->so_statistics.num_primitives_written +=
   1017 			r600_query_read_result(buffer, 2, 6, true);
   1018 		result->so_statistics.primitives_storage_needed +=
   1019 			r600_query_read_result(buffer, 0, 4, true);
   1020 		break;
   1021 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   1022 		result->b = result->b ||
   1023 			r600_query_read_result(buffer, 2, 6, true) !=
   1024 			r600_query_read_result(buffer, 0, 4, true);
   1025 		break;
   1026 	case PIPE_QUERY_PIPELINE_STATISTICS:
   1027 		if (ctx->chip_class >= EVERGREEN) {
   1028 			result->pipeline_statistics.ps_invocations +=
   1029 				r600_query_read_result(buffer, 0, 22, false);
   1030 			result->pipeline_statistics.c_primitives +=
   1031 				r600_query_read_result(buffer, 2, 24, false);
   1032 			result->pipeline_statistics.c_invocations +=
   1033 				r600_query_read_result(buffer, 4, 26, false);
   1034 			result->pipeline_statistics.vs_invocations +=
   1035 				r600_query_read_result(buffer, 6, 28, false);
   1036 			result->pipeline_statistics.gs_invocations +=
   1037 				r600_query_read_result(buffer, 8, 30, false);
   1038 			result->pipeline_statistics.gs_primitives +=
   1039 				r600_query_read_result(buffer, 10, 32, false);
   1040 			result->pipeline_statistics.ia_primitives +=
   1041 				r600_query_read_result(buffer, 12, 34, false);
   1042 			result->pipeline_statistics.ia_vertices +=
   1043 				r600_query_read_result(buffer, 14, 36, false);
   1044 			result->pipeline_statistics.hs_invocations +=
   1045 				r600_query_read_result(buffer, 16, 38, false);
   1046 			result->pipeline_statistics.ds_invocations +=
   1047 				r600_query_read_result(buffer, 18, 40, false);
   1048 			result->pipeline_statistics.cs_invocations +=
   1049 				r600_query_read_result(buffer, 20, 42, false);
   1050 		} else {
   1051 			result->pipeline_statistics.ps_invocations +=
   1052 				r600_query_read_result(buffer, 0, 16, false);
   1053 			result->pipeline_statistics.c_primitives +=
   1054 				r600_query_read_result(buffer, 2, 18, false);
   1055 			result->pipeline_statistics.c_invocations +=
   1056 				r600_query_read_result(buffer, 4, 20, false);
   1057 			result->pipeline_statistics.vs_invocations +=
   1058 				r600_query_read_result(buffer, 6, 22, false);
   1059 			result->pipeline_statistics.gs_invocations +=
   1060 				r600_query_read_result(buffer, 8, 24, false);
   1061 			result->pipeline_statistics.gs_primitives +=
   1062 				r600_query_read_result(buffer, 10, 26, false);
   1063 			result->pipeline_statistics.ia_primitives +=
   1064 				r600_query_read_result(buffer, 12, 28, false);
   1065 			result->pipeline_statistics.ia_vertices +=
   1066 				r600_query_read_result(buffer, 14, 30, false);
   1067 		}
   1068 #if 0 /* for testing */
   1069 		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
   1070 		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
   1071 		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
   1072 		       result->pipeline_statistics.ia_vertices,
   1073 		       result->pipeline_statistics.ia_primitives,
   1074 		       result->pipeline_statistics.vs_invocations,
   1075 		       result->pipeline_statistics.hs_invocations,
   1076 		       result->pipeline_statistics.ds_invocations,
   1077 		       result->pipeline_statistics.gs_invocations,
   1078 		       result->pipeline_statistics.gs_primitives,
   1079 		       result->pipeline_statistics.c_invocations,
   1080 		       result->pipeline_statistics.c_primitives,
   1081 		       result->pipeline_statistics.ps_invocations,
   1082 		       result->pipeline_statistics.cs_invocations);
   1083 #endif
   1084 		break;
   1085 	default:
   1086 		assert(0);
   1087 	}
   1088 }
   1089 
   1090 static boolean r600_get_query_result(struct pipe_context *ctx,
   1091 				     struct pipe_query *query, boolean wait,
   1092 				     union pipe_query_result *result)
   1093 {
   1094 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1095 	struct r600_query *rquery = (struct r600_query *)query;
   1096 
   1097 	return rquery->ops->get_result(rctx, rquery, wait, result);
   1098 }
   1099 
   1100 static void r600_get_query_result_resource(struct pipe_context *ctx,
   1101                                            struct pipe_query *query,
   1102                                            boolean wait,
   1103                                            enum pipe_query_value_type result_type,
   1104                                            int index,
   1105                                            struct pipe_resource *resource,
   1106                                            unsigned offset)
   1107 {
   1108 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1109 	struct r600_query *rquery = (struct r600_query *)query;
   1110 
   1111 	rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
   1112 	                                 resource, offset);
   1113 }
   1114 
   1115 static void r600_query_hw_clear_result(struct r600_query_hw *query,
   1116 				       union pipe_query_result *result)
   1117 {
   1118 	util_query_clear_result(result, query->b.type);
   1119 }
   1120 
   1121 bool r600_query_hw_get_result(struct r600_common_context *rctx,
   1122 			      struct r600_query *rquery,
   1123 			      bool wait, union pipe_query_result *result)
   1124 {
   1125 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
   1126 	struct r600_query_buffer *qbuf;
   1127 
   1128 	query->ops->clear_result(query, result);
   1129 
   1130 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
   1131 		unsigned results_base = 0;
   1132 		void *map;
   1133 
   1134 		map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf,
   1135 						      PIPE_TRANSFER_READ |
   1136 						      (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
   1137 		if (!map)
   1138 			return false;
   1139 
   1140 		while (results_base != qbuf->results_end) {
   1141 			query->ops->add_result(rctx, query, map + results_base,
   1142 					       result);
   1143 			results_base += query->result_size;
   1144 		}
   1145 	}
   1146 
   1147 	/* Convert the time to expected units. */
   1148 	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
   1149 	    rquery->type == PIPE_QUERY_TIMESTAMP) {
   1150 		result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq;
   1151 	}
   1152 	return true;
   1153 }
   1154 
   1155 /* Create the compute shader that is used to collect the results.
   1156  *
   1157  * One compute grid with a single thread is launched for every query result
   1158  * buffer. The thread (optionally) reads a previous summary buffer, then
   1159  * accumulates data from the query result buffer, and writes the result either
   1160  * to a summary buffer to be consumed by the next grid invocation or to the
   1161  * user-supplied buffer.
   1162  *
   1163  * Data layout:
   1164  *
   1165  * CONST
   1166  *  0.x = end_offset
   1167  *  0.y = result_stride
   1168  *  0.z = result_count
   1169  *  0.w = bit field:
   1170  *          1: read previously accumulated values
   1171  *          2: write accumulated values for chaining
   1172  *          4: write result available
   1173  *          8: convert result to boolean (0/1)
   1174  *         16: only read one dword and use that as result
   1175  *         32: apply timestamp conversion
   1176  *         64: store full 64 bits result
   1177  *        128: store signed 32 bits result
   1178  *  1.x = fence_offset
   1179  *  1.y = pair_stride
   1180  *  1.z = pair_count
   1181  *
   1182  * BUFFER[0] = query result buffer
   1183  * BUFFER[1] = previous summary buffer
   1184  * BUFFER[2] = next summary buffer or user-supplied buffer
   1185  */
   1186 static void r600_create_query_result_shader(struct r600_common_context *rctx)
   1187 {
   1188 	/* TEMP[0].xy = accumulated result so far
   1189 	 * TEMP[0].z = result not available
   1190 	 *
   1191 	 * TEMP[1].x = current result index
   1192 	 * TEMP[1].y = current pair index
   1193 	 */
   1194 	static const char text_tmpl[] =
   1195 		"COMP\n"
   1196 		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
   1197 		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
   1198 		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
   1199 		"DCL BUFFER[0]\n"
   1200 		"DCL BUFFER[1]\n"
   1201 		"DCL BUFFER[2]\n"
   1202 		"DCL CONST[0..1]\n"
   1203 		"DCL TEMP[0..5]\n"
   1204 		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
   1205 		"IMM[1] UINT32 {1, 2, 4, 8}\n"
   1206 		"IMM[2] UINT32 {16, 32, 64, 128}\n"
   1207 		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
   1208 
   1209 		"AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"
   1210 		"UIF TEMP[5]\n"
   1211 			/* Check result availability. */
   1212 			"LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n"
   1213 			"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
   1214 			"MOV TEMP[1], TEMP[0].zzzz\n"
   1215 			"NOT TEMP[0].z, TEMP[0].zzzz\n"
   1216 
   1217 			/* Load result if available. */
   1218 			"UIF TEMP[1]\n"
   1219 				"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
   1220 			"ENDIF\n"
   1221 		"ELSE\n"
   1222 			/* Load previously accumulated result if requested. */
   1223 			"MOV TEMP[0], IMM[0].xxxx\n"
   1224 			"AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n"
   1225 			"UIF TEMP[4]\n"
   1226 				"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
   1227 			"ENDIF\n"
   1228 
   1229 			"MOV TEMP[1].x, IMM[0].xxxx\n"
   1230 			"BGNLOOP\n"
   1231 				/* Break if accumulated result so far is not available. */
   1232 				"UIF TEMP[0].zzzz\n"
   1233 					"BRK\n"
   1234 				"ENDIF\n"
   1235 
   1236 				/* Break if result_index >= result_count. */
   1237 				"USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n"
   1238 				"UIF TEMP[5]\n"
   1239 					"BRK\n"
   1240 				"ENDIF\n"
   1241 
   1242 				/* Load fence and check result availability */
   1243 				"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n"
   1244 				"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
   1245 				"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
   1246 				"NOT TEMP[0].z, TEMP[0].zzzz\n"
   1247 				"UIF TEMP[0].zzzz\n"
   1248 					"BRK\n"
   1249 				"ENDIF\n"
   1250 
   1251 				"MOV TEMP[1].y, IMM[0].xxxx\n"
   1252 				"BGNLOOP\n"
   1253 					/* Load start and end. */
   1254 					"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n"
   1255 					"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n"
   1256 					"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
   1257 
   1258 					"UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n"
   1259 					"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n"
   1260 
   1261 					"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
   1262 					"U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n"
   1263 
   1264 					/* Increment pair index */
   1265 					"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
   1266 					"USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n"
   1267 					"UIF TEMP[5]\n"
   1268 						"BRK\n"
   1269 					"ENDIF\n"
   1270 				"ENDLOOP\n"
   1271 
   1272 				/* Increment result index */
   1273 				"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
   1274 			"ENDLOOP\n"
   1275 		"ENDIF\n"
   1276 
   1277 		"AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n"
   1278 		"UIF TEMP[4]\n"
   1279 			/* Store accumulated data for chaining. */
   1280 			"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
   1281 		"ELSE\n"
   1282 			"AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n"
   1283 			"UIF TEMP[4]\n"
   1284 				/* Store result availability. */
   1285 				"NOT TEMP[0].z, TEMP[0]\n"
   1286 				"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
   1287 				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
   1288 
   1289 				"AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
   1290 				"UIF TEMP[4]\n"
   1291 					"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
   1292 				"ENDIF\n"
   1293 			"ELSE\n"
   1294 				/* Store result if it is available. */
   1295 				"NOT TEMP[4], TEMP[0].zzzz\n"
   1296 				"UIF TEMP[4]\n"
   1297 					/* Apply timestamp conversion */
   1298 					"AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n"
   1299 					"UIF TEMP[4]\n"
   1300 						"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
   1301 						"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
   1302 					"ENDIF\n"
   1303 
   1304 					/* Convert to boolean */
   1305 					"AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
   1306 					"UIF TEMP[4]\n"
   1307 						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
   1308 						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
   1309 						"MOV TEMP[0].y, IMM[0].xxxx\n"
   1310 					"ENDIF\n"
   1311 
   1312 					"AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
   1313 					"UIF TEMP[4]\n"
   1314 						"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
   1315 					"ELSE\n"
   1316 						/* Clamping */
   1317 						"UIF TEMP[0].yyyy\n"
   1318 							"MOV TEMP[0].x, IMM[0].wwww\n"
   1319 						"ENDIF\n"
   1320 
   1321 						"AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n"
   1322 						"UIF TEMP[4]\n"
   1323 							"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
   1324 						"ENDIF\n"
   1325 
   1326 						"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
   1327 					"ENDIF\n"
   1328 				"ENDIF\n"
   1329 			"ENDIF\n"
   1330 		"ENDIF\n"
   1331 
   1332 		"END\n";
   1333 
   1334 	char text[sizeof(text_tmpl) + 32];
   1335 	struct tgsi_token tokens[1024];
   1336 	struct pipe_compute_state state = {};
   1337 
   1338 	/* Hard code the frequency into the shader so that the backend can
   1339 	 * use the full range of optimizations for divide-by-constant.
   1340 	 */
   1341 	snprintf(text, sizeof(text), text_tmpl,
   1342 		 rctx->screen->info.clock_crystal_freq);
   1343 
   1344 	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
   1345 		assert(false);
   1346 		return;
   1347 	}
   1348 
   1349 	state.ir_type = PIPE_SHADER_IR_TGSI;
   1350 	state.prog = tokens;
   1351 
   1352 	rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
   1353 }
   1354 
   1355 static void r600_restore_qbo_state(struct r600_common_context *rctx,
   1356 				   struct r600_qbo_state *st)
   1357 {
   1358 	rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
   1359 
   1360 	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
   1361 	pipe_resource_reference(&st->saved_const0.buffer, NULL);
   1362 
   1363 	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
   1364 	for (unsigned i = 0; i < 3; ++i)
   1365 		pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
   1366 }
   1367 
   1368 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
   1369                                               struct r600_query *rquery,
   1370                                               bool wait,
   1371                                               enum pipe_query_value_type result_type,
   1372                                               int index,
   1373                                               struct pipe_resource *resource,
   1374                                               unsigned offset)
   1375 {
   1376 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
   1377 	struct r600_query_buffer *qbuf;
   1378 	struct r600_query_buffer *qbuf_prev;
   1379 	struct pipe_resource *tmp_buffer = NULL;
   1380 	unsigned tmp_buffer_offset = 0;
   1381 	struct r600_qbo_state saved_state = {};
   1382 	struct pipe_grid_info grid = {};
   1383 	struct pipe_constant_buffer constant_buffer = {};
   1384 	struct pipe_shader_buffer ssbo[3];
   1385 	struct r600_hw_query_params params;
   1386 	struct {
   1387 		uint32_t end_offset;
   1388 		uint32_t result_stride;
   1389 		uint32_t result_count;
   1390 		uint32_t config;
   1391 		uint32_t fence_offset;
   1392 		uint32_t pair_stride;
   1393 		uint32_t pair_count;
   1394 	} consts;
   1395 
   1396 	if (!rctx->query_result_shader) {
   1397 		r600_create_query_result_shader(rctx);
   1398 		if (!rctx->query_result_shader)
   1399 			return;
   1400 	}
   1401 
   1402 	if (query->buffer.previous) {
   1403 		u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
   1404 				     &tmp_buffer_offset, &tmp_buffer);
   1405 		if (!tmp_buffer)
   1406 			return;
   1407 	}
   1408 
   1409 	rctx->save_qbo_state(&rctx->b, &saved_state);
   1410 
   1411 	r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
   1412 	consts.end_offset = params.end_offset - params.start_offset;
   1413 	consts.fence_offset = params.fence_offset - params.start_offset;
   1414 	consts.result_stride = query->result_size;
   1415 	consts.pair_stride = params.pair_stride;
   1416 	consts.pair_count = params.pair_count;
   1417 
   1418 	constant_buffer.buffer_size = sizeof(consts);
   1419 	constant_buffer.user_buffer = &consts;
   1420 
   1421 	ssbo[1].buffer = tmp_buffer;
   1422 	ssbo[1].buffer_offset = tmp_buffer_offset;
   1423 	ssbo[1].buffer_size = 16;
   1424 
   1425 	ssbo[2] = ssbo[1];
   1426 
   1427 	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
   1428 
   1429 	grid.block[0] = 1;
   1430 	grid.block[1] = 1;
   1431 	grid.block[2] = 1;
   1432 	grid.grid[0] = 1;
   1433 	grid.grid[1] = 1;
   1434 	grid.grid[2] = 1;
   1435 
   1436 	consts.config = 0;
   1437 	if (index < 0)
   1438 		consts.config |= 4;
   1439 	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
   1440 	    query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
   1441 		consts.config |= 8;
   1442 	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
   1443 		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
   1444 		consts.config |= 32;
   1445 
   1446 	switch (result_type) {
   1447 	case PIPE_QUERY_TYPE_U64:
   1448 	case PIPE_QUERY_TYPE_I64:
   1449 		consts.config |= 64;
   1450 		break;
   1451 	case PIPE_QUERY_TYPE_I32:
   1452 		consts.config |= 128;
   1453 		break;
   1454 	case PIPE_QUERY_TYPE_U32:
   1455 		break;
   1456 	}
   1457 
   1458 	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
   1459 
   1460 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
   1461 		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
   1462 			qbuf_prev = qbuf->previous;
   1463 			consts.result_count = qbuf->results_end / query->result_size;
   1464 			consts.config &= ~3;
   1465 			if (qbuf != &query->buffer)
   1466 				consts.config |= 1;
   1467 			if (qbuf->previous)
   1468 				consts.config |= 2;
   1469 		} else {
   1470 			/* Only read the last timestamp. */
   1471 			qbuf_prev = NULL;
   1472 			consts.result_count = 0;
   1473 			consts.config |= 16;
   1474 			params.start_offset += qbuf->results_end - query->result_size;
   1475 		}
   1476 
   1477 		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
   1478 
   1479 		ssbo[0].buffer = &qbuf->buf->b.b;
   1480 		ssbo[0].buffer_offset = params.start_offset;
   1481 		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
   1482 
   1483 		if (!qbuf->previous) {
   1484 			ssbo[2].buffer = resource;
   1485 			ssbo[2].buffer_offset = offset;
   1486 			ssbo[2].buffer_size = 8;
   1487 
   1488 			((struct r600_resource *)resource)->TC_L2_dirty = true;
   1489 		}
   1490 
   1491 		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
   1492 
   1493 		if (wait && qbuf == &query->buffer) {
   1494 			uint64_t va;
   1495 
   1496 			/* Wait for result availability. Wait only for readiness
   1497 			 * of the last entry, since the fence writes should be
   1498 			 * serialized in the CP.
   1499 			 */
   1500 			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
   1501 			va += params.fence_offset;
   1502 
   1503 			r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
   1504 		}
   1505 
   1506 		rctx->b.launch_grid(&rctx->b, &grid);
   1507 		rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
   1508 	}
   1509 
   1510 	r600_restore_qbo_state(rctx, &saved_state);
   1511 	pipe_resource_reference(&tmp_buffer, NULL);
   1512 }
   1513 
   1514 static void r600_render_condition(struct pipe_context *ctx,
   1515 				  struct pipe_query *query,
   1516 				  boolean condition,
   1517 				  uint mode)
   1518 {
   1519 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
   1520 	struct r600_query_hw *rquery = (struct r600_query_hw *)query;
   1521 	struct r600_query_buffer *qbuf;
   1522 	struct r600_atom *atom = &rctx->render_cond_atom;
   1523 
   1524 	rctx->render_cond = query;
   1525 	rctx->render_cond_invert = condition;
   1526 	rctx->render_cond_mode = mode;
   1527 
   1528 	/* Compute the size of SET_PREDICATION packets. */
   1529 	atom->num_dw = 0;
   1530 	if (query) {
   1531 		for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
   1532 			atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
   1533 	}
   1534 
   1535 	rctx->set_atom_dirty(rctx, atom, query != NULL);
   1536 }
   1537 
   1538 void r600_suspend_queries(struct r600_common_context *ctx)
   1539 {
   1540 	struct r600_query_hw *query;
   1541 
   1542 	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
   1543 		r600_query_hw_emit_stop(ctx, query);
   1544 	}
   1545 	assert(ctx->num_cs_dw_queries_suspend == 0);
   1546 }
   1547 
   1548 static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
   1549 						    struct list_head *query_list)
   1550 {
   1551 	struct r600_query_hw *query;
   1552 	unsigned num_dw = 0;
   1553 
   1554 	LIST_FOR_EACH_ENTRY(query, query_list, list) {
   1555 		/* begin + end */
   1556 		num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
   1557 
   1558 		/* Workaround for the fact that
   1559 		 * num_cs_dw_nontimer_queries_suspend is incremented for every
   1560 		 * resumed query, which raises the bar in need_cs_space for
   1561 		 * queries about to be resumed.
   1562 		 */
   1563 		num_dw += query->num_cs_dw_end;
   1564 	}
   1565 	/* primitives generated query */
   1566 	num_dw += ctx->streamout.enable_atom.num_dw;
   1567 	/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
   1568 	num_dw += 13;
   1569 
   1570 	return num_dw;
   1571 }
   1572 
   1573 void r600_resume_queries(struct r600_common_context *ctx)
   1574 {
   1575 	struct r600_query_hw *query;
   1576 	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
   1577 
   1578 	assert(ctx->num_cs_dw_queries_suspend == 0);
   1579 
   1580 	/* Check CS space here. Resuming must not be interrupted by flushes. */
   1581 	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);
   1582 
   1583 	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
   1584 		r600_query_hw_emit_start(ctx, query);
   1585 	}
   1586 }
   1587 
   1588 /* Get backends mask */
   1589 void r600_query_init_backend_mask(struct r600_common_context *ctx)
   1590 {
   1591 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
   1592 	struct r600_resource *buffer;
   1593 	uint32_t *results;
   1594 	unsigned num_backends = ctx->screen->info.num_render_backends;
   1595 	unsigned i, mask = 0;
   1596 
   1597 	/* if backend_map query is supported by the kernel */
   1598 	if (ctx->screen->info.r600_gb_backend_map_valid) {
   1599 		unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes;
   1600 		unsigned backend_map = ctx->screen->info.r600_gb_backend_map;
   1601 		unsigned item_width, item_mask;
   1602 
   1603 		if (ctx->chip_class >= EVERGREEN) {
   1604 			item_width = 4;
   1605 			item_mask = 0x7;
   1606 		} else {
   1607 			item_width = 2;
   1608 			item_mask = 0x3;
   1609 		}
   1610 
   1611 		while (num_tile_pipes--) {
   1612 			i = backend_map & item_mask;
   1613 			mask |= (1<<i);
   1614 			backend_map >>= item_width;
   1615 		}
   1616 		if (mask != 0) {
   1617 			ctx->backend_mask = mask;
   1618 			return;
   1619 		}
   1620 	}
   1621 
   1622 	/* otherwise backup path for older kernels */
   1623 
   1624 	/* create buffer for event data */
   1625 	buffer = (struct r600_resource*)
   1626 		pipe_buffer_create(ctx->b.screen, 0,
   1627 				   PIPE_USAGE_STAGING, ctx->max_db*16);
   1628 	if (!buffer)
   1629 		goto err;
   1630 
   1631 	/* initialize buffer with zeroes */
   1632 	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
   1633 	if (results) {
   1634 		memset(results, 0, ctx->max_db * 4 * 4);
   1635 
   1636 		/* emit EVENT_WRITE for ZPASS_DONE */
   1637 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
   1638 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
   1639 		radeon_emit(cs, buffer->gpu_address);
   1640 		radeon_emit(cs, buffer->gpu_address >> 32);
   1641 
   1642 		r600_emit_reloc(ctx, &ctx->gfx, buffer,
   1643                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
   1644 
   1645 		/* analyze results */
   1646 		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
   1647 		if (results) {
   1648 			for(i = 0; i < ctx->max_db; i++) {
   1649 				/* at least highest bit will be set if backend is used */
   1650 				if (results[i*4 + 1])
   1651 					mask |= (1<<i);
   1652 			}
   1653 		}
   1654 	}
   1655 
   1656 	r600_resource_reference(&buffer, NULL);
   1657 
   1658 	if (mask != 0) {
   1659 		ctx->backend_mask = mask;
   1660 		return;
   1661 	}
   1662 
   1663 err:
   1664 	/* fallback to old method - set num_backends lower bits to 1 */
   1665 	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
   1666 	return;
   1667 }
   1668 
   1669 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
   1670 	{ \
   1671 		.name = name_, \
   1672 		.query_type = R600_QUERY_##query_type_, \
   1673 		.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
   1674 		.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
   1675 		.group_id = group_id_ \
   1676 	}
   1677 
   1678 #define X(name_, query_type_, type_, result_type_) \
   1679 	XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
   1680 
   1681 #define XG(group_, name_, query_type_, type_, result_type_) \
   1682 	XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
   1683 
   1684 static struct pipe_driver_query_info r600_driver_query_list[] = {
   1685 	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
   1686 	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
   1687 	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
   1688 	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
   1689 	X("spill-draw-calls",		SPILL_DRAW_CALLS,	UINT64, AVERAGE),
   1690 	X("compute-calls",		COMPUTE_CALLS,		UINT64, AVERAGE),
   1691 	X("spill-compute-calls",	SPILL_COMPUTE_CALLS,	UINT64, AVERAGE),
   1692 	X("dma-calls",			DMA_CALLS,		UINT64, AVERAGE),
   1693 	X("cp-dma-calls",		CP_DMA_CALLS,		UINT64, AVERAGE),
   1694 	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
   1695 	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
   1696 	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
   1697 	X("num-fb-cache-flushes",	NUM_FB_CACHE_FLUSHES,	UINT64, AVERAGE),
   1698 	X("num-L2-invalidates",		NUM_L2_INVALIDATES,	UINT64, AVERAGE),
   1699 	X("num-L2-writebacks",		NUM_L2_WRITEBACKS,	UINT64, AVERAGE),
   1700 	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
   1701 	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
   1702 	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
   1703 	X("mapped-GTT",			MAPPED_GTT,		BYTES, AVERAGE),
   1704 	X("buffer-wait-time",		BUFFER_WAIT_TIME,	MICROSECONDS, CUMULATIVE),
   1705 	X("num-GFX-IBs",		NUM_GFX_IBS,		UINT64, AVERAGE),
   1706 	X("num-SDMA-IBs",		NUM_SDMA_IBS,		UINT64, AVERAGE),
   1707 	X("num-bytes-moved",		NUM_BYTES_MOVED,	BYTES, CUMULATIVE),
   1708 	X("num-evictions",		NUM_EVICTIONS,		UINT64, CUMULATIVE),
   1709 	X("VRAM-usage",			VRAM_USAGE,		BYTES, AVERAGE),
   1710 	X("GTT-usage",			GTT_USAGE,		BYTES, AVERAGE),
   1711 	X("back-buffer-ps-draw-ratio",	BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
   1712 
   1713 	/* GPIN queries are for the benefit of old versions of GPUPerfStudio,
   1714 	 * which use it as a fallback path to detect the GPU type.
   1715 	 *
   1716 	 * Note: The names of these queries are significant for GPUPerfStudio
   1717 	 * (and possibly their order as well). */
   1718 	XG(GPIN, "GPIN_000",		GPIN_ASIC_ID,		UINT, AVERAGE),
   1719 	XG(GPIN, "GPIN_001",		GPIN_NUM_SIMD,		UINT, AVERAGE),
   1720 	XG(GPIN, "GPIN_002",		GPIN_NUM_RB,		UINT, AVERAGE),
   1721 	XG(GPIN, "GPIN_003",		GPIN_NUM_SPI,		UINT, AVERAGE),
   1722 	XG(GPIN, "GPIN_004",		GPIN_NUM_SE,		UINT, AVERAGE),
   1723 
   1724 	/* The following queries must be at the end of the list because their
   1725 	 * availability is adjusted dynamically based on the DRM version. */
   1726 	X("GPU-load",			GPU_LOAD,		UINT64, AVERAGE),
   1727 	X("GPU-shaders-busy",		GPU_SHADERS_BUSY,	UINT64, AVERAGE),
   1728 	X("temperature",		GPU_TEMPERATURE,	UINT64, AVERAGE),
   1729 	X("shader-clock",		CURRENT_GPU_SCLK,	HZ, AVERAGE),
   1730 	X("memory-clock",		CURRENT_GPU_MCLK,	HZ, AVERAGE),
   1731 };
   1732 
   1733 #undef X
   1734 #undef XG
   1735 #undef XFULL
   1736 
   1737 static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
   1738 {
   1739 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
   1740 		return ARRAY_SIZE(r600_driver_query_list);
   1741 	else if (rscreen->info.drm_major == 3)
   1742 		return ARRAY_SIZE(r600_driver_query_list) - 3;
   1743 	else
   1744 		return ARRAY_SIZE(r600_driver_query_list) - 5;
   1745 }
   1746 
   1747 static int r600_get_driver_query_info(struct pipe_screen *screen,
   1748 				      unsigned index,
   1749 				      struct pipe_driver_query_info *info)
   1750 {
   1751 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
   1752 	unsigned num_queries = r600_get_num_queries(rscreen);
   1753 
   1754 	if (!info) {
   1755 		unsigned num_perfcounters =
   1756 			r600_get_perfcounter_info(rscreen, 0, NULL);
   1757 
   1758 		return num_queries + num_perfcounters;
   1759 	}
   1760 
   1761 	if (index >= num_queries)
   1762 		return r600_get_perfcounter_info(rscreen, index - num_queries, info);
   1763 
   1764 	*info = r600_driver_query_list[index];
   1765 
   1766 	switch (info->query_type) {
   1767 	case R600_QUERY_REQUESTED_VRAM:
   1768 	case R600_QUERY_VRAM_USAGE:
   1769 	case R600_QUERY_MAPPED_VRAM:
   1770 		info->max_value.u64 = rscreen->info.vram_size;
   1771 		break;
   1772 	case R600_QUERY_REQUESTED_GTT:
   1773 	case R600_QUERY_GTT_USAGE:
   1774 	case R600_QUERY_MAPPED_GTT:
   1775 		info->max_value.u64 = rscreen->info.gart_size;
   1776 		break;
   1777 	case R600_QUERY_GPU_TEMPERATURE:
   1778 		info->max_value.u64 = 125;
   1779 		break;
   1780 	}
   1781 
   1782 	if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
   1783 		info->group_id += rscreen->perfcounters->num_groups;
   1784 
   1785 	return 1;
   1786 }
   1787 
   1788 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
   1789  * performance counter groups, so be careful when changing this and related
   1790  * functions.
   1791  */
   1792 static int r600_get_driver_query_group_info(struct pipe_screen *screen,
   1793 					    unsigned index,
   1794 					    struct pipe_driver_query_group_info *info)
   1795 {
   1796 	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
   1797 	unsigned num_pc_groups = 0;
   1798 
   1799 	if (rscreen->perfcounters)
   1800 		num_pc_groups = rscreen->perfcounters->num_groups;
   1801 
   1802 	if (!info)
   1803 		return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
   1804 
   1805 	if (index < num_pc_groups)
   1806 		return r600_get_perfcounter_group_info(rscreen, index, info);
   1807 
   1808 	index -= num_pc_groups;
   1809 	if (index >= R600_NUM_SW_QUERY_GROUPS)
   1810 		return 0;
   1811 
   1812 	info->name = "GPIN";
   1813 	info->max_active_queries = 5;
   1814 	info->num_queries = 5;
   1815 	return 1;
   1816 }
   1817 
   1818 void r600_query_init(struct r600_common_context *rctx)
   1819 {
   1820 	rctx->b.create_query = r600_create_query;
   1821 	rctx->b.create_batch_query = r600_create_batch_query;
   1822 	rctx->b.destroy_query = r600_destroy_query;
   1823 	rctx->b.begin_query = r600_begin_query;
   1824 	rctx->b.end_query = r600_end_query;
   1825 	rctx->b.get_query_result = r600_get_query_result;
   1826 	rctx->b.get_query_result_resource = r600_get_query_result_resource;
   1827 	rctx->render_cond_atom.emit = r600_emit_query_predication;
   1828 
   1829 	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
   1830 	    rctx->b.render_condition = r600_render_condition;
   1831 
   1832 	LIST_INITHEAD(&rctx->active_queries);
   1833 }
   1834 
   1835 void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
   1836 {
   1837 	rscreen->b.get_driver_query_info = r600_get_driver_query_info;
   1838 	rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
   1839 }
   1840