1 /* 2 * Copyright 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric (at) anholt.net> 25 * 26 */ 27 28 /** @file brw_queryobj.c 29 * 30 * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query, 31 * GL_EXT_transform_feedback, and friends). 32 * 33 * The hardware provides a PIPE_CONTROL command that can report the number of 34 * fragments that passed the depth test, or the hardware timer. They are 35 * appropriately synced with the stage of the pipeline for our extensions' 36 * needs. 37 */ 38 #include "main/imports.h" 39 40 #include "brw_context.h" 41 #include "brw_defines.h" 42 #include "brw_state.h" 43 #include "intel_batchbuffer.h" 44 45 /** 46 * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. 47 */ 48 void 49 brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx) 50 { 51 if (brw->gen == 6) { 52 /* Emit Sandybridge workaround flush: */ 53 brw_emit_pipe_control_flush(brw, 54 PIPE_CONTROL_CS_STALL | 55 PIPE_CONTROL_STALL_AT_SCOREBOARD); 56 } 57 58 uint32_t flags = PIPE_CONTROL_WRITE_TIMESTAMP; 59 60 if (brw->gen == 9 && brw->gt == 4) 61 flags |= PIPE_CONTROL_CS_STALL; 62 63 brw_emit_pipe_control_write(brw, flags, 64 query_bo, idx * sizeof(uint64_t), 0, 0); 65 } 66 67 /** 68 * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer. 69 */ 70 void 71 brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx) 72 { 73 uint32_t flags = PIPE_CONTROL_WRITE_DEPTH_COUNT | PIPE_CONTROL_DEPTH_STALL; 74 75 if (brw->gen == 9 && brw->gt == 4) 76 flags |= PIPE_CONTROL_CS_STALL; 77 78 brw_emit_pipe_control_write(brw, flags, 79 query_bo, idx * sizeof(uint64_t), 80 0, 0); 81 } 82 83 /** 84 * Wait on the query object's BO and calculate the final result. 85 */ 86 static void 87 brw_queryobj_get_results(struct gl_context *ctx, 88 struct brw_query_object *query) 89 { 90 struct brw_context *brw = brw_context(ctx); 91 92 int i; 93 uint64_t *results; 94 95 assert(brw->gen < 6); 96 97 if (query->bo == NULL) 98 return; 99 100 /* If the application has requested the query result, but this batch is 101 * still contributing to it, flush it now so the results will be present 102 * when mapped. 103 */ 104 if (drm_intel_bo_references(brw->batch.bo, query->bo)) 105 intel_batchbuffer_flush(brw); 106 107 if (unlikely(brw->perf_debug)) { 108 if (drm_intel_bo_busy(query->bo)) { 109 perf_debug("Stalling on the GPU waiting for a query object.\n"); 110 } 111 } 112 113 drm_intel_bo_map(query->bo, false); 114 results = query->bo->virtual; 115 switch (query->Base.Target) { 116 case GL_TIME_ELAPSED_EXT: 117 /* The query BO contains the starting and ending timestamps. 118 * Subtract the two and convert to nanoseconds. 119 */ 120 query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32)); 121 break; 122 123 case GL_TIMESTAMP: 124 /* The query BO contains a single timestamp value in results[0]. */ 125 query->Base.Result = 1000 * (results[0] >> 32); 126 break; 127 128 case GL_SAMPLES_PASSED_ARB: 129 /* Loop over pairs of values from the BO, which are the PS_DEPTH_COUNT 130 * value at the start and end of the batchbuffer. Subtract them to 131 * get the number of fragments which passed the depth test in each 132 * individual batch, and add those differences up to get the number 133 * of fragments for the entire query. 134 * 135 * Note that query->Base.Result may already be non-zero. We may have 136 * run out of space in the query's BO and allocated a new one. If so, 137 * this function was already called to accumulate the results so far. 138 */ 139 for (i = 0; i < query->last_index; i++) { 140 query->Base.Result += results[i * 2 + 1] - results[i * 2]; 141 } 142 break; 143 144 case GL_ANY_SAMPLES_PASSED: 145 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 146 /* If the starting and ending PS_DEPTH_COUNT from any of the batches 147 * differ, then some fragments passed the depth test. 148 */ 149 for (i = 0; i < query->last_index; i++) { 150 if (results[i * 2 + 1] != results[i * 2]) { 151 query->Base.Result = GL_TRUE; 152 break; 153 } 154 } 155 break; 156 157 default: 158 unreachable("Unrecognized query target in brw_queryobj_get_results()"); 159 } 160 drm_intel_bo_unmap(query->bo); 161 162 /* Now that we've processed the data stored in the query's buffer object, 163 * we can release it. 164 */ 165 drm_intel_bo_unreference(query->bo); 166 query->bo = NULL; 167 } 168 169 /** 170 * The NewQueryObject() driver hook. 171 * 172 * Allocates and initializes a new query object. 173 */ 174 static struct gl_query_object * 175 brw_new_query_object(struct gl_context *ctx, GLuint id) 176 { 177 struct brw_query_object *query; 178 179 query = calloc(1, sizeof(struct brw_query_object)); 180 181 query->Base.Id = id; 182 query->Base.Result = 0; 183 query->Base.Active = false; 184 query->Base.Ready = true; 185 186 return &query->Base; 187 } 188 189 /** 190 * The DeleteQuery() driver hook. 191 */ 192 static void 193 brw_delete_query(struct gl_context *ctx, struct gl_query_object *q) 194 { 195 struct brw_query_object *query = (struct brw_query_object *)q; 196 197 drm_intel_bo_unreference(query->bo); 198 free(query); 199 } 200 201 /** 202 * Gen4-5 driver hook for glBeginQuery(). 203 * 204 * Initializes driver structures and emits any GPU commands required to begin 205 * recording data for the query. 206 */ 207 static void 208 brw_begin_query(struct gl_context *ctx, struct gl_query_object *q) 209 { 210 struct brw_context *brw = brw_context(ctx); 211 struct brw_query_object *query = (struct brw_query_object *)q; 212 213 assert(brw->gen < 6); 214 215 switch (query->Base.Target) { 216 case GL_TIME_ELAPSED_EXT: 217 /* For timestamp queries, we record the starting time right away so that 218 * we measure the full time between BeginQuery and EndQuery. There's 219 * some debate about whether this is the right thing to do. Our decision 220 * is based on the following text from the ARB_timer_query extension: 221 * 222 * "(5) Should the extension measure total time elapsed between the full 223 * completion of the BeginQuery and EndQuery commands, or just time 224 * spent in the graphics library? 225 * 226 * RESOLVED: This extension will measure the total time elapsed 227 * between the full completion of these commands. Future extensions 228 * may implement a query to determine time elapsed at different stages 229 * of the graphics pipeline." 230 * 231 * We write a starting timestamp now (at index 0). At EndQuery() time, 232 * we'll write a second timestamp (at index 1), and subtract the two to 233 * obtain the time elapsed. Notably, this includes time elapsed while 234 * the system was doing other work, such as running other applications. 235 */ 236 drm_intel_bo_unreference(query->bo); 237 query->bo = drm_intel_bo_alloc(brw->bufmgr, "timer query", 4096, 4096); 238 brw_write_timestamp(brw, query->bo, 0); 239 break; 240 241 case GL_ANY_SAMPLES_PASSED: 242 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 243 case GL_SAMPLES_PASSED_ARB: 244 /* For occlusion queries, we delay taking an initial sample until the 245 * first drawing occurs in this batch. See the reasoning in the comments 246 * for brw_emit_query_begin() below. 247 * 248 * Since we're starting a new query, we need to be sure to throw away 249 * any previous occlusion query results. 250 */ 251 drm_intel_bo_unreference(query->bo); 252 query->bo = NULL; 253 query->last_index = -1; 254 255 brw->query.obj = query; 256 257 /* Depth statistics on Gen4 require strange workarounds, so we try to 258 * avoid them when necessary. They're required for occlusion queries, 259 * so turn them on now. 260 */ 261 brw->stats_wm++; 262 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 263 break; 264 265 default: 266 unreachable("Unrecognized query target in brw_begin_query()"); 267 } 268 } 269 270 /** 271 * Gen4-5 driver hook for glEndQuery(). 272 * 273 * Emits GPU commands to record a final query value, ending any data capturing. 274 * However, the final result isn't necessarily available until the GPU processes 275 * those commands. brw_queryobj_get_results() processes the captured data to 276 * produce the final result. 277 */ 278 static void 279 brw_end_query(struct gl_context *ctx, struct gl_query_object *q) 280 { 281 struct brw_context *brw = brw_context(ctx); 282 struct brw_query_object *query = (struct brw_query_object *)q; 283 284 assert(brw->gen < 6); 285 286 switch (query->Base.Target) { 287 case GL_TIME_ELAPSED_EXT: 288 /* Write the final timestamp. */ 289 brw_write_timestamp(brw, query->bo, 1); 290 break; 291 292 case GL_ANY_SAMPLES_PASSED: 293 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 294 case GL_SAMPLES_PASSED_ARB: 295 296 /* No query->bo means that EndQuery was called after BeginQuery with no 297 * intervening drawing. Rather than doing nothing at all here in this 298 * case, we emit the query_begin and query_end state to the 299 * hardware. This is to guarantee that waiting on the result of this 300 * empty state will cause all previous queries to complete at all, as 301 * required by the specification: 302 * 303 * It must always be true that if any query object 304 * returns a result available of TRUE, all queries of the 305 * same type issued prior to that query must also return 306 * TRUE. [Open GL 4.3 (Core Profile) Section 4.2.1] 307 */ 308 if (!query->bo) { 309 brw_emit_query_begin(brw); 310 } 311 312 assert(query->bo); 313 314 brw_emit_query_end(brw); 315 316 brw->query.obj = NULL; 317 318 brw->stats_wm--; 319 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 320 break; 321 322 default: 323 unreachable("Unrecognized query target in brw_end_query()"); 324 } 325 } 326 327 /** 328 * The Gen4-5 WaitQuery() driver hook. 329 * 330 * Wait for a query result to become available and return it. This is the 331 * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname. 332 */ 333 static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q) 334 { 335 struct brw_query_object *query = (struct brw_query_object *)q; 336 337 assert(brw_context(ctx)->gen < 6); 338 339 brw_queryobj_get_results(ctx, query); 340 query->Base.Ready = true; 341 } 342 343 /** 344 * The Gen4-5 CheckQuery() driver hook. 345 * 346 * Checks whether a query result is ready yet. If not, flushes. 347 * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname. 348 */ 349 static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q) 350 { 351 struct brw_context *brw = brw_context(ctx); 352 struct brw_query_object *query = (struct brw_query_object *)q; 353 354 assert(brw->gen < 6); 355 356 /* From the GL_ARB_occlusion_query spec: 357 * 358 * "Instead of allowing for an infinite loop, performing a 359 * QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is 360 * not ready yet on the first time it is queried. This ensures that 361 * the async query will return true in finite time. 362 */ 363 if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo)) 364 intel_batchbuffer_flush(brw); 365 366 if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) { 367 brw_queryobj_get_results(ctx, query); 368 query->Base.Ready = true; 369 } 370 } 371 372 /** 373 * Ensure there query's BO has enough space to store a new pair of values. 374 * 375 * If not, gather the existing BO's results and create a new buffer of the 376 * same size. 377 */ 378 static void 379 ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query) 380 { 381 struct brw_context *brw = brw_context(ctx); 382 383 assert(brw->gen < 6); 384 385 if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) { 386 387 if (query->bo != NULL) { 388 /* The old query BO did not have enough space, so we allocated a new 389 * one. Gather the results so far (adding up the differences) and 390 * release the old BO. 391 */ 392 brw_queryobj_get_results(ctx, query); 393 } 394 395 query->bo = drm_intel_bo_alloc(brw->bufmgr, "query", 4096, 1); 396 query->last_index = 0; 397 } 398 } 399 400 /** 401 * Record the PS_DEPTH_COUNT value (for occlusion queries) just before 402 * primitive drawing. 403 * 404 * In a pre-hardware context world, the single PS_DEPTH_COUNT register is 405 * shared among all applications using the GPU. However, our query value 406 * needs to only include fragments generated by our application/GL context. 407 * 408 * To accommodate this, we record PS_DEPTH_COUNT at the start and end of 409 * each batchbuffer (technically, the first primitive drawn and flush time). 410 * Subtracting each pair of values calculates the change in PS_DEPTH_COUNT 411 * caused by a batchbuffer. Since there is no preemption inside batches, 412 * this is guaranteed to only measure the effects of our current application. 413 * 414 * Adding each of these differences (in case drawing is done over many batches) 415 * produces the final expected value. 416 * 417 * In a world with hardware contexts, PS_DEPTH_COUNT is saved and restored 418 * as part of the context state, so this is unnecessary, and skipped. 419 */ 420 void 421 brw_emit_query_begin(struct brw_context *brw) 422 { 423 struct gl_context *ctx = &brw->ctx; 424 struct brw_query_object *query = brw->query.obj; 425 426 if (brw->hw_ctx) 427 return; 428 429 /* Skip if we're not doing any queries, or we've already recorded the 430 * initial query value for this batchbuffer. 431 */ 432 if (!query || brw->query.begin_emitted) 433 return; 434 435 ensure_bo_has_space(ctx, query); 436 437 brw_write_depth_count(brw, query->bo, query->last_index * 2); 438 439 brw->query.begin_emitted = true; 440 } 441 442 /** 443 * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT 444 * (for non-hardware context platforms). 445 * 446 * See the explanation in brw_emit_query_begin(). 447 */ 448 void 449 brw_emit_query_end(struct brw_context *brw) 450 { 451 struct brw_query_object *query = brw->query.obj; 452 453 if (brw->hw_ctx) 454 return; 455 456 if (!brw->query.begin_emitted) 457 return; 458 459 brw_write_depth_count(brw, query->bo, query->last_index * 2 + 1); 460 461 brw->query.begin_emitted = false; 462 query->last_index++; 463 } 464 465 /** 466 * Driver hook for glQueryCounter(). 467 * 468 * This handles GL_TIMESTAMP queries, which perform a pipelined read of the 469 * current GPU time. This is unlike GL_TIME_ELAPSED, which measures the 470 * time while the query is active. 471 */ 472 void 473 brw_query_counter(struct gl_context *ctx, struct gl_query_object *q) 474 { 475 struct brw_context *brw = brw_context(ctx); 476 struct brw_query_object *query = (struct brw_query_object *) q; 477 478 assert(q->Target == GL_TIMESTAMP); 479 480 drm_intel_bo_unreference(query->bo); 481 query->bo = drm_intel_bo_alloc(brw->bufmgr, "timestamp query", 4096, 4096); 482 brw_write_timestamp(brw, query->bo, 0); 483 484 query->flushed = false; 485 } 486 487 /** 488 * Read the TIMESTAMP register immediately (in a non-pipelined fashion). 489 * 490 * This is used to implement the GetTimestamp() driver hook. 491 */ 492 static uint64_t 493 brw_get_timestamp(struct gl_context *ctx) 494 { 495 struct brw_context *brw = brw_context(ctx); 496 uint64_t result = 0; 497 498 switch (brw->screen->hw_has_timestamp) { 499 case 3: /* New kernel, always full 36bit accuracy */ 500 drm_intel_reg_read(brw->bufmgr, TIMESTAMP | 1, &result); 501 break; 502 case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */ 503 drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result); 504 result = result >> 32; 505 break; 506 case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */ 507 drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result); 508 break; 509 } 510 511 /* See logic in brw_queryobj_get_results() */ 512 result *= 80; 513 result &= (1ull << 36) - 1; 514 return result; 515 } 516 517 /** 518 * Is this type of query written by PIPE_CONTROL? 519 */ 520 bool 521 brw_is_query_pipelined(struct brw_query_object *query) 522 { 523 switch (query->Base.Target) { 524 case GL_TIMESTAMP: 525 case GL_TIME_ELAPSED: 526 case GL_ANY_SAMPLES_PASSED: 527 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 528 case GL_SAMPLES_PASSED_ARB: 529 return true; 530 531 case GL_PRIMITIVES_GENERATED: 532 case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: 533 case GL_VERTICES_SUBMITTED_ARB: 534 case GL_PRIMITIVES_SUBMITTED_ARB: 535 case GL_VERTEX_SHADER_INVOCATIONS_ARB: 536 case GL_GEOMETRY_SHADER_INVOCATIONS: 537 case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: 538 case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: 539 case GL_CLIPPING_INPUT_PRIMITIVES_ARB: 540 case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: 541 case GL_COMPUTE_SHADER_INVOCATIONS_ARB: 542 case GL_TESS_CONTROL_SHADER_PATCHES_ARB: 543 case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: 544 return false; 545 546 default: 547 unreachable("Unrecognized query target in is_query_pipelined()"); 548 } 549 } 550 551 /* Initialize query object functions used on all generations. */ 552 void brw_init_common_queryobj_functions(struct dd_function_table *functions) 553 { 554 functions->NewQueryObject = brw_new_query_object; 555 functions->DeleteQuery = brw_delete_query; 556 functions->GetTimestamp = brw_get_timestamp; 557 } 558 559 /* Initialize Gen4/5-specific query object functions. */ 560 void gen4_init_queryobj_functions(struct dd_function_table *functions) 561 { 562 functions->BeginQuery = brw_begin_query; 563 functions->EndQuery = brw_end_query; 564 functions->CheckQuery = brw_check_query; 565 functions->WaitQuery = brw_wait_query; 566 functions->QueryCounter = brw_query_counter; 567 } 568