/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get backends mask */
void si_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width, item_mask;

		if (ctx->chip_class >= CAYMAN) {
			item_width = 4;
			item_mask = 0x7;
		} else {
			/* pre-Cayman parts pack 2 bits per tile pipe; not
			 * reachable on radeonsi, kept so item_width/item_mask
			 * are never used uninitialized */
			item_width = 2;
			item_mask = 0x3;
		}

		while(num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise use the fallback path for older kernels */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for(i = 0; i < ctx->max_db; i++) {
				/* at least the highest bit will be set if the backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fall back to the old method: set the lowest num_backends bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}
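
/* Worked example of the backend_map decode above (values are illustrative,
 * not taken from any particular ASIC): with item_width = 4, item_mask = 0x7
 * and backend_map = 0x10, the first item is 0x10 & 0x7 = 0 -> mask |= 1 << 0,
 * then backend_map >>= 4 leaves 0x1 and the next item is 1 -> mask |= 1 << 1,
 * so a two-pipe configuration ends up with backend_mask = 0x3. */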

/* initialize */
void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}
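
/* Example budget (illustrative): a draw call with count_draw_in set and no
 * active queries or streamout reserves cs->cdw + pm4_dirty_cdwords +
 * SI_MAX_DRAW_CS_DWORDS + 7 (framebuffer flush) + 16 (fence) dwords; if that
 * total exceeds RADEON_MAX_CMDBUF_DWORDS, the CS is flushed before the draw
 * is emitted. */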

static void r600_flush_framebuffer(struct r600_context *ctx)
{
	struct si_pm4_state *pm4;

	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	pm4 = CALLOC_STRUCT(si_pm4_state);
	si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
				S_0085F0_CB1_DEST_BASE_ENA(1) |
				S_0085F0_CB2_DEST_BASE_ENA(1) |
				S_0085F0_CB3_DEST_BASE_ENA(1) |
				S_0085F0_CB4_DEST_BASE_ENA(1) |
				S_0085F0_CB5_DEST_BASE_ENA(1) |
				S_0085F0_CB6_DEST_BASE_ENA(1) |
				S_0085F0_CB7_DEST_BASE_ENA(1) |
				S_0085F0_DB_ACTION_ENA(1) |
				S_0085F0_DB_DEST_BASE_ENA(1));
	si_pm4_emit(ctx, pm4);
	si_pm4_free_state(ctx, pm4, ~0);

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

void si_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	bool queries_suspended = false;

#if 0
	bool streamout_suspended = false;
#endif

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

#if 0
	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}
#endif

	r600_flush_framebuffer(ctx);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

#if 0
	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}
#endif

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}
	/* set all valid groups as dirty so they get re-emitted on the
	 * next draw command
	 */
	si_pm4_reset_emitted(ctx);
}

void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	si_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value;                   /* DATA_LO */
	cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}
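
/* The EVENT_WRITE_EOP payload above is, in order: ADDRESS_LO, then
 * DATA_SEL (bit 29, 1 selects a 32-bit data write) | INT_EN (bit 24,
 * interrupts disabled here) | ADDRESS_HI, then DATA_LO = value and
 * DATA_HI = 0, so once the preceding work completes the GPU writes
 * 'value' to fence_bo at dword offset 'offset'. */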

static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}
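
/* Each counter is a 64-bit value stored as two consecutive dwords (lo, hi);
 * start_index/end_index select the begin and end snapshots of that counter.
 * With test_status_bit set, bit 63 of both snapshots must be set, which is
 * how ZPASS_DONE results are marked as actually written (see the handling
 * of unused backends in r600_query_begin below). */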

static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}
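
/* The result buffer is treated as a ring: results_start..results_end
 * (modulo width0) is the unread region. After accumulating, that region is
 * consumed by setting results_start = results_end; a non-blocking call that
 * fails to map the buffer (DONTBLOCK) returns FALSE without consuming
 * anything and can simply be retried later. */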

void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	si_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}
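
/* Zeroing the result slot and pre-setting bit 31 of the high dwords for
 * backends not present in ctx->backend_mask ("Set top bits for unused
 * backends" above) ensures r600_query_read_result's status-bit test still
 * passes for DBs that will never write their part of the ZPASS_DONE data. */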

void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		si_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		si_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}
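
/* The block-count math above works even when the unread region wraps around
 * the ring: e.g. with width0 = 4096, results_start = 3968 and
 * results_end = 128, count = (4096 + 128 - 3968) % 4096 = 256 bytes of
 * pending results, i.e. 256 / result_size data blocks. */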

struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}
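
/* Example of the wrap-math adjustment above (illustrative max_db): if
 * max_db were 12, an occlusion query would have result_size = 16 * 12 = 192,
 * and 4096 % 192 = 64, so buffer_size becomes 4032 and results_end always
 * lands on a whole result slot after the modulo-width0 wrap. */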

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	FREE(query);
}

boolean r600_context_query_result(struct r600_context *ctx,
				struct r600_query *query,
				boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}
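
/* For TIME_ELAPSED the accumulated counter is in clock-crystal ticks;
 * r600_clock_crystal_freq is reported by the kernel in kHz, so
 * (1000000 * ticks) / freq converts the result to nanoseconds, which is
 * what the gallium query interface expects for elapsed-time queries. */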

void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}

void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	si_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
						   RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}