1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ 2 3 /* 4 * Copyright (C) 2014 Rob Clark <robclark (at) freedesktop.org> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 * SOFTWARE. 24 * 25 * Authors: 26 * Rob Clark <robclark (at) freedesktop.org> 27 */ 28 29 #include "freedreno_query_hw.h" 30 #include "freedreno_context.h" 31 #include "freedreno_util.h" 32 33 #include "fd4_query.h" 34 #include "fd4_context.h" 35 #include "fd4_draw.h" 36 #include "fd4_format.h" 37 38 39 struct fd_rb_samp_ctrs { 40 uint64_t ctr[16]; 41 }; 42 43 /* 44 * Occlusion Query: 45 * 46 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they 47 * interpret results 48 */ 49 50 static struct fd_hw_sample * 51 occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) 52 { 53 struct fd_hw_sample *samp = 54 fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); 55 56 /* low bits of sample addr should be zero (since they are control 57 * flags in RB_SAMPLE_COUNT_CONTROL): 58 */ 59 debug_assert((samp->offset & 0x3) == 0); 60 61 /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of 62 * HW_QUERY_BASE_REG register: 63 */ 64 OUT_PKT3(ring, CP_SET_CONSTANT, 3); 65 OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); 66 OUT_RING(ring, HW_QUERY_BASE_REG); 67 OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | 68 samp->offset); 69 70 OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); 71 OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, 72 INDEX4_SIZE_32_BIT, USE_VISIBILITY)); 73 OUT_RING(ring, 1); /* NumInstances */ 74 OUT_RING(ring, 0); /* NumIndices */ 75 76 fd_event_write(batch, ring, ZPASS_DONE); 77 78 return samp; 79 } 80 81 static uint64_t 82 count_samples(const struct fd_rb_samp_ctrs *start, 83 const struct fd_rb_samp_ctrs *end) 84 { 85 return end->ctr[0] - start->ctr[0]; 86 } 87 88 static void 89 occlusion_counter_accumulate_result(struct fd_context *ctx, 90 const void *start, const void *end, 91 union pipe_query_result *result) 92 { 93 uint64_t n = count_samples(start, end); 94 result->u64 += n; 95 } 96 97 static void 98 occlusion_predicate_accumulate_result(struct fd_context *ctx, 99 const void *start, const void *end, 100 union pipe_query_result *result) 101 { 102 uint64_t n = count_samples(start, end); 103 result->b |= (n > 0); 104 } 105 106 /* 107 * Time Elapsed Query: 108 * 109 * Note: we could in theory support timestamp queries, but they 110 * won't give sensible results for tilers. 111 */ 112 113 static void 114 time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring) 115 { 116 /* Right now, the assignment of countable to counter register is 117 * just hard coded. If we start exposing more countables than we 118 * have counters, we will need to be more clever. 119 */ 120 fd_wfi(ctx->batch, ring); 121 OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1); 122 OUT_RING(ring, CP_ALWAYS_COUNT); 123 } 124 125 static struct fd_hw_sample * 126 time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) 127 { 128 struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t)); 129 130 /* use unused part of vsc_size_mem as scratch space, to avoid 131 * extra allocation: 132 */ 133 struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem; 134 const int sample_off = 128; 135 const int addr_off = sample_off + 8; 136 137 debug_assert(batch->ctx->screen->max_freq > 0); 138 139 /* Basic issue is that we need to read counter value to a relative 140 * destination (with per-tile offset) rather than absolute dest 141 * addr. But there is no pm4 packet that can do that. This is 142 * where it would be *really* nice if we could write our own fw 143 * since afaict implementing the sort of packet we need would be 144 * trivial. 145 * 146 * Instead, we: 147 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer 148 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer 149 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base 150 * address to the per-sample offset in the scratch buffer 151 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3 152 * to CP_ME_NRT_ADDR 153 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch 154 * buffer to CP_ME_NRT_DATA to trigger the write out to query 155 * result buffer 156 * 157 * Straightforward, right? 158 * 159 * Maybe could swap the order of things in the scratch buffer to 160 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one 161 * shot, but that's really just polishing a turd.. 162 */ 163 164 fd_wfi(batch, ring); 165 166 /* copy sample counter _LO and _HI to scratch: */ 167 OUT_PKT3(ring, CP_REG_TO_MEM, 2); 168 OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | 169 CP_REG_TO_MEM_0_64B | 170 CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */ 171 OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0); 172 173 /* ok... here we really *would* like to use the CP_SET_CONSTANT 174 * mode which can add a constant to value in reg2 and write to 175 * reg1... *but* that only works for banked/context registers, 176 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some 177 * CP math to the scratch buffer instead: 178 * 179 * (note first 8 bytes are counter value, use offset 0x8 for 180 * address calculation) 181 */ 182 183 /* per-sample offset to scratch bo: */ 184 OUT_PKT3(ring, CP_MEM_WRITE, 2); 185 OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); 186 OUT_RING(ring, samp->offset); 187 188 /* now add to that the per-tile base: */ 189 OUT_PKT3(ring, CP_REG_TO_MEM, 2); 190 OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | 191 CP_REG_TO_MEM_0_ACCUMULATE | 192 CP_REG_TO_MEM_0_CNT(1-1)); /* readback 1 regs */ 193 OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); 194 195 /* now copy that back to CP_ME_NRT_ADDR: */ 196 OUT_PKT3(ring, CP_MEM_TO_REG, 2); 197 OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR); 198 OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); 199 200 /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA 201 * to trigger the write to result buffer 202 */ 203 OUT_PKT3(ring, CP_MEM_TO_REG, 2); 204 OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); 205 OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); 206 207 /* and again to get the value of the _HI reg from scratch: */ 208 OUT_PKT3(ring, CP_MEM_TO_REG, 2); 209 OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); 210 OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0); 211 212 /* Sigh.. */ 213 214 return samp; 215 } 216 217 static void 218 time_elapsed_accumulate_result(struct fd_context *ctx, 219 const void *start, const void *end, 220 union pipe_query_result *result) 221 { 222 uint64_t n = *(uint64_t *)end - *(uint64_t *)start; 223 /* max_freq is in Hz, convert cycle count to ns: */ 224 result->u64 += n * 1000000000 / ctx->screen->max_freq; 225 } 226 227 static void 228 timestamp_accumulate_result(struct fd_context *ctx, 229 const void *start, const void *end, 230 union pipe_query_result *result) 231 { 232 /* just return the value from fist tile: */ 233 if (result->u64 != 0) 234 return; 235 uint64_t n = *(uint64_t *)start; 236 /* max_freq is in Hz, convert cycle count to ns: */ 237 result->u64 = n * 1000000000 / ctx->screen->max_freq; 238 } 239 240 static const struct fd_hw_sample_provider occlusion_counter = { 241 .query_type = PIPE_QUERY_OCCLUSION_COUNTER, 242 .active = FD_STAGE_DRAW, 243 .get_sample = occlusion_get_sample, 244 .accumulate_result = occlusion_counter_accumulate_result, 245 }; 246 247 static const struct fd_hw_sample_provider occlusion_predicate = { 248 .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, 249 .active = FD_STAGE_DRAW, 250 .get_sample = occlusion_get_sample, 251 .accumulate_result = occlusion_predicate_accumulate_result, 252 }; 253 254 static const struct fd_hw_sample_provider time_elapsed = { 255 .query_type = PIPE_QUERY_TIME_ELAPSED, 256 .active = FD_STAGE_DRAW | FD_STAGE_CLEAR, 257 .enable = time_elapsed_enable, 258 .get_sample = time_elapsed_get_sample, 259 .accumulate_result = time_elapsed_accumulate_result, 260 }; 261 262 /* NOTE: timestamp query isn't going to give terribly sensible results 263 * on a tiler. But it is needed by qapitrace profile heatmap. If you 264 * add in a binning pass, the results get even more non-sensical. So 265 * we just return the timestamp on the first tile and hope that is 266 * kind of good enough. 267 */ 268 static const struct fd_hw_sample_provider timestamp = { 269 .query_type = PIPE_QUERY_TIMESTAMP, 270 .active = FD_STAGE_ALL, 271 .enable = time_elapsed_enable, 272 .get_sample = time_elapsed_get_sample, 273 .accumulate_result = timestamp_accumulate_result, 274 }; 275 276 void fd4_query_context_init(struct pipe_context *pctx) 277 { 278 fd_hw_query_register_provider(pctx, &occlusion_counter); 279 fd_hw_query_register_provider(pctx, &occlusion_predicate); 280 fd_hw_query_register_provider(pctx, &time_elapsed); 281 fd_hw_query_register_provider(pctx, ×tamp); 282 } 283