Home | History | Annotate | Download | only in a4xx
      1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
      2 
      3 /*
      4  * Copyright (C) 2014 Rob Clark <robclark (at) freedesktop.org>
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Rob Clark <robclark (at) freedesktop.org>
     27  */
     28 
     29 #include "freedreno_query_hw.h"
     30 #include "freedreno_context.h"
     31 #include "freedreno_util.h"
     32 
     33 #include "fd4_query.h"
     34 #include "fd4_context.h"
     35 #include "fd4_draw.h"
     36 #include "fd4_format.h"
     37 
     38 
/* Block of RB sample counters dumped to memory by the ZPASS_DONE event:
 * sixteen 64b slots per sample.  Only ctr[0] is consumed below (see
 * count_samples()); presumably it is the passed-depth-test sample count —
 * the remaining slots are reserved/unused here.
 */
struct fd_rb_samp_ctrs {
	uint64_t ctr[16];
};
     42 
     43 /*
     44  * Occlusion Query:
     45  *
     46  * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
     47  * interpret results
     48  */
     49 
/* Emit cmds to capture one occlusion sample for the current tile:
 * program the RB sample-counter destination address, kick a minimal
 * draw, and request a ZPASS_DONE counter dump.  Returns the sample
 * object tracking the per-tile result slot.
 */
static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	/* Reserve room for the full counter block the HW writes out: */
	struct fd_hw_sample *samp =
			fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

	/* low bits of sample addr should be zero (since they are control
	 * flags in RB_SAMPLE_COUNT_CONTROL):
	 */
	debug_assert((samp->offset & 0x3) == 0);

	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
	 * HW_QUERY_BASE_REG register:
	 */
	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
	OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
	OUT_RING(ring, HW_QUERY_BASE_REG);
	OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY |
			samp->offset);

	/* NOTE(review): a one-instance, zero-index point draw — appears to be
	 * needed to latch the sample-count state before the event; confirm
	 * against a3xx/a4xx query code in other drivers:
	 */
	OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
	OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
						INDEX4_SIZE_32_BIT, USE_VISIBILITY));
	OUT_RING(ring, 1);             /* NumInstances */
	OUT_RING(ring, 0);             /* NumIndices */

	/* ZPASS_DONE triggers the RB to dump the counters to the addr
	 * programmed above:
	 */
	fd_event_write(batch, ring, ZPASS_DONE);

	return samp;
}
     80 
     81 static uint64_t
     82 count_samples(const struct fd_rb_samp_ctrs *start,
     83 		const struct fd_rb_samp_ctrs *end)
     84 {
     85 	return end->ctr[0] - start->ctr[0];
     86 }
     87 
     88 static void
     89 occlusion_counter_accumulate_result(struct fd_context *ctx,
     90 		const void *start, const void *end,
     91 		union pipe_query_result *result)
     92 {
     93 	uint64_t n = count_samples(start, end);
     94 	result->u64 += n;
     95 }
     96 
     97 static void
     98 occlusion_predicate_accumulate_result(struct fd_context *ctx,
     99 		const void *start, const void *end,
    100 		union pipe_query_result *result)
    101 {
    102 	uint64_t n = count_samples(start, end);
    103 	result->b |= (n > 0);
    104 }
    105 
    106 /*
    107  * Time Elapsed Query:
    108  *
    109  * Note: we could in theory support timestamp queries, but they
    110  * won't give sensible results for tilers.
    111  */
    112 
/* Select the CP_ALWAYS_COUNT countable on CP perf-counter 0, which
 * time_elapsed_get_sample() then reads back via RBBM_PERFCTR_CP_0_LO/_HI.
 */
static void
time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
	/* Right now, the assignment of countable to counter register is
	 * just hard coded.  If we start exposing more countables than we
	 * have counters, we will need to be more clever.
	 */
	fd_wfi(ctx->batch, ring);
	OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
	OUT_RING(ring, CP_ALWAYS_COUNT);
}
    124 
/* Capture a 64b snapshot of the always-count CP counter into this
 * sample's per-tile result slot.  See the long comment below for why
 * this takes five packets instead of one.
 */
static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

	/* use unused part of vsc_size_mem as scratch space, to avoid
	 * extra allocation:
	 */
	struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
	const int sample_off = 128;            /* 8 bytes: saved counter value */
	const int addr_off = sample_off + 8;   /* 4 bytes: computed dest address */

	debug_assert(batch->ctx->screen->max_freq > 0);

	/* Basic issue is that we need to read counter value to a relative
	 * destination (with per-tile offset) rather than absolute dest
	 * addr.  But there is no pm4 packet that can do that.  This is
	 * where it would be *really* nice if we could write our own fw
	 * since afaict implementing the sort of packet we need would be
	 * trivial.
	 *
	 * Instead, we:
	 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
	 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
	 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
	 *     address to the per-sample offset in the scratch buffer
	 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
	 *     to CP_ME_NRT_ADDR
	 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
	 *     buffer to CP_ME_NRT_DATA to trigger the write out to query
	 *     result buffer
	 *
	 * Straightforward, right?
	 *
	 * Maybe could swap the order of things in the scratch buffer to
	 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
	 * shot, but that's really just polishing a turd..
	 */

	fd_wfi(batch, ring);

	/* copy sample counter _LO and _HI to scratch: */
	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
	OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
			CP_REG_TO_MEM_0_64B |
			CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */
	OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0);

	/* ok... here we really *would* like to use the CP_SET_CONSTANT
	 * mode which can add a constant to value in reg2 and write to
	 * reg1... *but* that only works for banked/context registers,
	 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
	 * CP math to the scratch buffer instead:
	 *
	 * (note first 8 bytes are counter value, use offset 0x8 for
	 * address calculation)
	 */

	/* per-sample offset to scratch bo: */
	OUT_PKT3(ring, CP_MEM_WRITE, 2);
	OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);
	OUT_RING(ring, samp->offset);

	/* now add to that the per-tile base: */
	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
	OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
			CP_REG_TO_MEM_0_ACCUMULATE |
			CP_REG_TO_MEM_0_CNT(1-1));       /* readback 1 regs */
	OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);

	/* now copy that back to CP_ME_NRT_ADDR: */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

	/* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
	 * to trigger the write to result buffer
	 */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

	/* and again to get the value of the _HI reg from scratch: */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
	OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

	/* Sigh.. */

	return samp;
}
    216 
    217 static void
    218 time_elapsed_accumulate_result(struct fd_context *ctx,
    219 		const void *start, const void *end,
    220 		union pipe_query_result *result)
    221 {
    222 	uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
    223 	/* max_freq is in Hz, convert cycle count to ns: */
    224 	result->u64 += n * 1000000000 / ctx->screen->max_freq;
    225 }
    226 
    227 static void
    228 timestamp_accumulate_result(struct fd_context *ctx,
    229 		const void *start, const void *end,
    230 		union pipe_query_result *result)
    231 {
    232 	/* just return the value from fist tile: */
    233 	if (result->u64 != 0)
    234 		return;
    235 	uint64_t n = *(uint64_t *)start;
    236 	/* max_freq is in Hz, convert cycle count to ns: */
    237 	result->u64 = n * 1000000000 / ctx->screen->max_freq;
    238 }
    239 
/* PIPE_QUERY_OCCLUSION_COUNTER: 64b count of samples passing the depth
 * test, sampled only around draws.
 */
static const struct fd_hw_sample_provider occlusion_counter = {
		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_counter_accumulate_result,
};
    246 
/* PIPE_QUERY_OCCLUSION_PREDICATE: same samples as the counter query,
 * accumulated into a boolean "any sample passed" result.
 */
static const struct fd_hw_sample_provider occlusion_predicate = {
		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_predicate_accumulate_result,
};
    253 
/* PIPE_QUERY_TIME_ELAPSED: CP always-count counter deltas converted to
 * ns, sampled around both draws and clears.
 */
static const struct fd_hw_sample_provider time_elapsed = {
		.query_type = PIPE_QUERY_TIME_ELAPSED,
		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
		.enable = time_elapsed_enable,
		.get_sample = time_elapsed_get_sample,
		.accumulate_result = time_elapsed_accumulate_result,
};
    261 
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
		.query_type = PIPE_QUERY_TIMESTAMP,
		.active = FD_STAGE_ALL,
		.enable = time_elapsed_enable,
		.get_sample = time_elapsed_get_sample,
		.accumulate_result = timestamp_accumulate_result,
};
    275 
    276 void fd4_query_context_init(struct pipe_context *pctx)
    277 {
    278 	fd_hw_query_register_provider(pctx, &occlusion_counter);
    279 	fd_hw_query_register_provider(pctx, &occlusion_predicate);
    280 	fd_hw_query_register_provider(pctx, &time_elapsed);
    281 	fd_hw_query_register_provider(pctx, &timestamp);
    282 }
    283