/*
 * Copyright 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric (at) anholt.net>
 *
 */

/** @file brw_queryobj.c
 *
 * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query,
 * GL_EXT_transform_feedback, and friends).
 *
 * The hardware provides a PIPE_CONTROL command that can report the number of
 * fragments that passed the depth test, or the hardware timer.  They are
 * appropriately synced with the stage of the pipeline for our extensions'
 * needs.
37 */ 38 #include "main/imports.h" 39 40 #include "brw_context.h" 41 #include "brw_defines.h" 42 #include "brw_state.h" 43 #include "intel_batchbuffer.h" 44 45 uint64_t 46 brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp) 47 { 48 const struct gen_device_info *devinfo = &brw->screen->devinfo; 49 50 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency; 51 } 52 53 /* As best we know currently, the Gen HW timestamps are 36bits across 54 * all platforms, which we need to account for when calculating a 55 * delta to measure elapsed time. 56 * 57 * The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes 58 * only have 32bits due to a kernel bug and so in that case we make sure to 59 * treat all raw timestamps as 32bits so they overflow consistently and remain 60 * comparable. (Note: the timestamps being passed here are not from the kernel 61 * so we don't need to be taking the upper 32bits in this buggy kernel case we 62 * are just clipping to 32bits here for consistency.) 63 */ 64 uint64_t 65 brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1) 66 { 67 if (brw->screen->hw_has_timestamp == 2) { 68 /* Kernel clips timestamps to 32bits in this case, so we also clip 69 * PIPE_CONTROL timestamps for consistency. 70 */ 71 return (uint32_t)time1 - (uint32_t)time0; 72 } else { 73 if (time0 > time1) { 74 return (1ULL << 36) + time1 - time0; 75 } else { 76 return time1 - time0; 77 } 78 } 79 } 80 81 /** 82 * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. 
83 */ 84 void 85 brw_write_timestamp(struct brw_context *brw, struct brw_bo *query_bo, int idx) 86 { 87 const struct gen_device_info *devinfo = &brw->screen->devinfo; 88 89 if (devinfo->gen == 6) { 90 /* Emit Sandybridge workaround flush: */ 91 brw_emit_pipe_control_flush(brw, 92 PIPE_CONTROL_CS_STALL | 93 PIPE_CONTROL_STALL_AT_SCOREBOARD); 94 } 95 96 uint32_t flags = PIPE_CONTROL_WRITE_TIMESTAMP; 97 98 if (devinfo->gen == 9 && devinfo->gt == 4) 99 flags |= PIPE_CONTROL_CS_STALL; 100 101 brw_emit_pipe_control_write(brw, flags, 102 query_bo, idx * sizeof(uint64_t), 0); 103 } 104 105 /** 106 * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer. 107 */ 108 void 109 brw_write_depth_count(struct brw_context *brw, struct brw_bo *query_bo, int idx) 110 { 111 const struct gen_device_info *devinfo = &brw->screen->devinfo; 112 uint32_t flags = PIPE_CONTROL_WRITE_DEPTH_COUNT | PIPE_CONTROL_DEPTH_STALL; 113 114 if (devinfo->gen == 9 && devinfo->gt == 4) 115 flags |= PIPE_CONTROL_CS_STALL; 116 117 if (devinfo->gen >= 10) { 118 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable bit set 119 * prior to programming a PIPE_CONTROL with Write PS Depth Count Post sync 120 * operation." 121 */ 122 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); 123 } 124 125 brw_emit_pipe_control_write(brw, flags, 126 query_bo, idx * sizeof(uint64_t), 0); 127 } 128 129 /** 130 * Wait on the query object's BO and calculate the final result. 131 */ 132 static void 133 brw_queryobj_get_results(struct gl_context *ctx, 134 struct brw_query_object *query) 135 { 136 struct brw_context *brw = brw_context(ctx); 137 const struct gen_device_info *devinfo = &brw->screen->devinfo; 138 139 int i; 140 uint64_t *results; 141 142 assert(devinfo->gen < 6); 143 144 if (query->bo == NULL) 145 return; 146 147 /* If the application has requested the query result, but this batch is 148 * still contributing to it, flush it now so the results will be present 149 * when mapped. 
150 */ 151 if (brw_batch_references(&brw->batch, query->bo)) 152 intel_batchbuffer_flush(brw); 153 154 if (unlikely(brw->perf_debug)) { 155 if (brw_bo_busy(query->bo)) { 156 perf_debug("Stalling on the GPU waiting for a query object.\n"); 157 } 158 } 159 160 results = brw_bo_map(brw, query->bo, MAP_READ); 161 switch (query->Base.Target) { 162 case GL_TIME_ELAPSED_EXT: 163 /* The query BO contains the starting and ending timestamps. 164 * Subtract the two and convert to nanoseconds. 165 */ 166 query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]); 167 query->Base.Result = brw_timebase_scale(brw, query->Base.Result); 168 break; 169 170 case GL_TIMESTAMP: 171 /* The query BO contains a single timestamp value in results[0]. */ 172 query->Base.Result = brw_timebase_scale(brw, results[0]); 173 174 /* Ensure the scaled timestamp overflows according to 175 * GL_QUERY_COUNTER_BITS 176 */ 177 query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; 178 break; 179 180 case GL_SAMPLES_PASSED_ARB: 181 /* Loop over pairs of values from the BO, which are the PS_DEPTH_COUNT 182 * value at the start and end of the batchbuffer. Subtract them to 183 * get the number of fragments which passed the depth test in each 184 * individual batch, and add those differences up to get the number 185 * of fragments for the entire query. 186 * 187 * Note that query->Base.Result may already be non-zero. We may have 188 * run out of space in the query's BO and allocated a new one. If so, 189 * this function was already called to accumulate the results so far. 190 */ 191 for (i = 0; i < query->last_index; i++) { 192 query->Base.Result += results[i * 2 + 1] - results[i * 2]; 193 } 194 break; 195 196 case GL_ANY_SAMPLES_PASSED: 197 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 198 /* If the starting and ending PS_DEPTH_COUNT from any of the batches 199 * differ, then some fragments passed the depth test. 
200 */ 201 for (i = 0; i < query->last_index; i++) { 202 if (results[i * 2 + 1] != results[i * 2]) { 203 query->Base.Result = GL_TRUE; 204 break; 205 } 206 } 207 break; 208 209 default: 210 unreachable("Unrecognized query target in brw_queryobj_get_results()"); 211 } 212 brw_bo_unmap(query->bo); 213 214 /* Now that we've processed the data stored in the query's buffer object, 215 * we can release it. 216 */ 217 brw_bo_unreference(query->bo); 218 query->bo = NULL; 219 } 220 221 /** 222 * The NewQueryObject() driver hook. 223 * 224 * Allocates and initializes a new query object. 225 */ 226 static struct gl_query_object * 227 brw_new_query_object(struct gl_context *ctx, GLuint id) 228 { 229 struct brw_query_object *query; 230 231 query = calloc(1, sizeof(struct brw_query_object)); 232 233 query->Base.Id = id; 234 query->Base.Result = 0; 235 query->Base.Active = false; 236 query->Base.Ready = true; 237 238 return &query->Base; 239 } 240 241 /** 242 * The DeleteQuery() driver hook. 243 */ 244 static void 245 brw_delete_query(struct gl_context *ctx, struct gl_query_object *q) 246 { 247 struct brw_query_object *query = (struct brw_query_object *)q; 248 249 brw_bo_unreference(query->bo); 250 free(query); 251 } 252 253 /** 254 * Gen4-5 driver hook for glBeginQuery(). 255 * 256 * Initializes driver structures and emits any GPU commands required to begin 257 * recording data for the query. 258 */ 259 static void 260 brw_begin_query(struct gl_context *ctx, struct gl_query_object *q) 261 { 262 struct brw_context *brw = brw_context(ctx); 263 struct brw_query_object *query = (struct brw_query_object *)q; 264 const struct gen_device_info *devinfo = &brw->screen->devinfo; 265 266 assert(devinfo->gen < 6); 267 268 switch (query->Base.Target) { 269 case GL_TIME_ELAPSED_EXT: 270 /* For timestamp queries, we record the starting time right away so that 271 * we measure the full time between BeginQuery and EndQuery. There's 272 * some debate about whether this is the right thing to do. 
Our decision 273 * is based on the following text from the ARB_timer_query extension: 274 * 275 * "(5) Should the extension measure total time elapsed between the full 276 * completion of the BeginQuery and EndQuery commands, or just time 277 * spent in the graphics library? 278 * 279 * RESOLVED: This extension will measure the total time elapsed 280 * between the full completion of these commands. Future extensions 281 * may implement a query to determine time elapsed at different stages 282 * of the graphics pipeline." 283 * 284 * We write a starting timestamp now (at index 0). At EndQuery() time, 285 * we'll write a second timestamp (at index 1), and subtract the two to 286 * obtain the time elapsed. Notably, this includes time elapsed while 287 * the system was doing other work, such as running other applications. 288 */ 289 brw_bo_unreference(query->bo); 290 query->bo = brw_bo_alloc(brw->bufmgr, "timer query", 4096, 4096); 291 brw_write_timestamp(brw, query->bo, 0); 292 break; 293 294 case GL_ANY_SAMPLES_PASSED: 295 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 296 case GL_SAMPLES_PASSED_ARB: 297 /* For occlusion queries, we delay taking an initial sample until the 298 * first drawing occurs in this batch. See the reasoning in the comments 299 * for brw_emit_query_begin() below. 300 * 301 * Since we're starting a new query, we need to be sure to throw away 302 * any previous occlusion query results. 303 */ 304 brw_bo_unreference(query->bo); 305 query->bo = NULL; 306 query->last_index = -1; 307 308 brw->query.obj = query; 309 310 /* Depth statistics on Gen4 require strange workarounds, so we try to 311 * avoid them when necessary. They're required for occlusion queries, 312 * so turn them on now. 313 */ 314 brw->stats_wm++; 315 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 316 break; 317 318 default: 319 unreachable("Unrecognized query target in brw_begin_query()"); 320 } 321 } 322 323 /** 324 * Gen4-5 driver hook for glEndQuery(). 
325 * 326 * Emits GPU commands to record a final query value, ending any data capturing. 327 * However, the final result isn't necessarily available until the GPU processes 328 * those commands. brw_queryobj_get_results() processes the captured data to 329 * produce the final result. 330 */ 331 static void 332 brw_end_query(struct gl_context *ctx, struct gl_query_object *q) 333 { 334 struct brw_context *brw = brw_context(ctx); 335 struct brw_query_object *query = (struct brw_query_object *)q; 336 const struct gen_device_info *devinfo = &brw->screen->devinfo; 337 338 assert(devinfo->gen < 6); 339 340 switch (query->Base.Target) { 341 case GL_TIME_ELAPSED_EXT: 342 /* Write the final timestamp. */ 343 brw_write_timestamp(brw, query->bo, 1); 344 break; 345 346 case GL_ANY_SAMPLES_PASSED: 347 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 348 case GL_SAMPLES_PASSED_ARB: 349 350 /* No query->bo means that EndQuery was called after BeginQuery with no 351 * intervening drawing. Rather than doing nothing at all here in this 352 * case, we emit the query_begin and query_end state to the 353 * hardware. This is to guarantee that waiting on the result of this 354 * empty state will cause all previous queries to complete at all, as 355 * required by the specification: 356 * 357 * It must always be true that if any query object 358 * returns a result available of TRUE, all queries of the 359 * same type issued prior to that query must also return 360 * TRUE. [Open GL 4.3 (Core Profile) Section 4.2.1] 361 */ 362 if (!query->bo) { 363 brw_emit_query_begin(brw); 364 } 365 366 assert(query->bo); 367 368 brw_emit_query_end(brw); 369 370 brw->query.obj = NULL; 371 372 brw->stats_wm--; 373 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 374 break; 375 376 default: 377 unreachable("Unrecognized query target in brw_end_query()"); 378 } 379 } 380 381 /** 382 * The Gen4-5 WaitQuery() driver hook. 383 * 384 * Wait for a query result to become available and return it. 
This is the 385 * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname. 386 */ 387 static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q) 388 { 389 struct brw_query_object *query = (struct brw_query_object *)q; 390 const struct gen_device_info *devinfo = &brw_context(ctx)->screen->devinfo; 391 392 assert(devinfo->gen < 6); 393 394 brw_queryobj_get_results(ctx, query); 395 query->Base.Ready = true; 396 } 397 398 /** 399 * The Gen4-5 CheckQuery() driver hook. 400 * 401 * Checks whether a query result is ready yet. If not, flushes. 402 * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname. 403 */ 404 static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q) 405 { 406 struct brw_context *brw = brw_context(ctx); 407 struct brw_query_object *query = (struct brw_query_object *)q; 408 const struct gen_device_info *devinfo = &brw->screen->devinfo; 409 410 assert(devinfo->gen < 6); 411 412 /* From the GL_ARB_occlusion_query spec: 413 * 414 * "Instead of allowing for an infinite loop, performing a 415 * QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is 416 * not ready yet on the first time it is queried. This ensures that 417 * the async query will return true in finite time. 418 */ 419 if (query->bo && brw_batch_references(&brw->batch, query->bo)) 420 intel_batchbuffer_flush(brw); 421 422 if (query->bo == NULL || !brw_bo_busy(query->bo)) { 423 brw_queryobj_get_results(ctx, query); 424 query->Base.Ready = true; 425 } 426 } 427 428 /** 429 * Ensure there query's BO has enough space to store a new pair of values. 430 * 431 * If not, gather the existing BO's results and create a new buffer of the 432 * same size. 
433 */ 434 static void 435 ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query) 436 { 437 struct brw_context *brw = brw_context(ctx); 438 const struct gen_device_info *devinfo = &brw->screen->devinfo; 439 440 assert(devinfo->gen < 6); 441 442 if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) { 443 444 if (query->bo != NULL) { 445 /* The old query BO did not have enough space, so we allocated a new 446 * one. Gather the results so far (adding up the differences) and 447 * release the old BO. 448 */ 449 brw_queryobj_get_results(ctx, query); 450 } 451 452 query->bo = brw_bo_alloc(brw->bufmgr, "query", 4096, 1); 453 query->last_index = 0; 454 } 455 } 456 457 /** 458 * Record the PS_DEPTH_COUNT value (for occlusion queries) just before 459 * primitive drawing. 460 * 461 * In a pre-hardware context world, the single PS_DEPTH_COUNT register is 462 * shared among all applications using the GPU. However, our query value 463 * needs to only include fragments generated by our application/GL context. 464 * 465 * To accommodate this, we record PS_DEPTH_COUNT at the start and end of 466 * each batchbuffer (technically, the first primitive drawn and flush time). 467 * Subtracting each pair of values calculates the change in PS_DEPTH_COUNT 468 * caused by a batchbuffer. Since there is no preemption inside batches, 469 * this is guaranteed to only measure the effects of our current application. 470 * 471 * Adding each of these differences (in case drawing is done over many batches) 472 * produces the final expected value. 473 * 474 * In a world with hardware contexts, PS_DEPTH_COUNT is saved and restored 475 * as part of the context state, so this is unnecessary, and skipped. 
476 */ 477 void 478 brw_emit_query_begin(struct brw_context *brw) 479 { 480 struct gl_context *ctx = &brw->ctx; 481 struct brw_query_object *query = brw->query.obj; 482 483 /* Skip if we're not doing any queries, or we've already recorded the 484 * initial query value for this batchbuffer. 485 */ 486 if (!query || brw->query.begin_emitted) 487 return; 488 489 ensure_bo_has_space(ctx, query); 490 491 brw_write_depth_count(brw, query->bo, query->last_index * 2); 492 493 brw->query.begin_emitted = true; 494 } 495 496 /** 497 * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT 498 * (for non-hardware context platforms). 499 * 500 * See the explanation in brw_emit_query_begin(). 501 */ 502 void 503 brw_emit_query_end(struct brw_context *brw) 504 { 505 struct brw_query_object *query = brw->query.obj; 506 507 if (!brw->query.begin_emitted) 508 return; 509 510 brw_write_depth_count(brw, query->bo, query->last_index * 2 + 1); 511 512 brw->query.begin_emitted = false; 513 query->last_index++; 514 } 515 516 /** 517 * Driver hook for glQueryCounter(). 518 * 519 * This handles GL_TIMESTAMP queries, which perform a pipelined read of the 520 * current GPU time. This is unlike GL_TIME_ELAPSED, which measures the 521 * time while the query is active. 522 */ 523 void 524 brw_query_counter(struct gl_context *ctx, struct gl_query_object *q) 525 { 526 struct brw_context *brw = brw_context(ctx); 527 struct brw_query_object *query = (struct brw_query_object *) q; 528 529 assert(q->Target == GL_TIMESTAMP); 530 531 brw_bo_unreference(query->bo); 532 query->bo = brw_bo_alloc(brw->bufmgr, "timestamp query", 4096, 4096); 533 brw_write_timestamp(brw, query->bo, 0); 534 535 query->flushed = false; 536 } 537 538 /** 539 * Read the TIMESTAMP register immediately (in a non-pipelined fashion). 540 * 541 * This is used to implement the GetTimestamp() driver hook. 
542 */ 543 static uint64_t 544 brw_get_timestamp(struct gl_context *ctx) 545 { 546 struct brw_context *brw = brw_context(ctx); 547 uint64_t result = 0; 548 549 switch (brw->screen->hw_has_timestamp) { 550 case 3: /* New kernel, always full 36bit accuracy */ 551 brw_reg_read(brw->bufmgr, TIMESTAMP | 1, &result); 552 break; 553 case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */ 554 brw_reg_read(brw->bufmgr, TIMESTAMP, &result); 555 result = result >> 32; 556 break; 557 case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */ 558 brw_reg_read(brw->bufmgr, TIMESTAMP, &result); 559 break; 560 } 561 562 /* Scale to nanosecond units */ 563 result = brw_timebase_scale(brw, result); 564 565 /* Ensure the scaled timestamp overflows according to 566 * GL_QUERY_COUNTER_BITS. Technically this isn't required if 567 * querying GL_TIMESTAMP via glGetInteger but it seems best to keep 568 * QueryObject and GetInteger timestamps consistent. 569 */ 570 result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; 571 return result; 572 } 573 574 /** 575 * Is this type of query written by PIPE_CONTROL? 
576 */ 577 bool 578 brw_is_query_pipelined(struct brw_query_object *query) 579 { 580 switch (query->Base.Target) { 581 case GL_TIMESTAMP: 582 case GL_TIME_ELAPSED: 583 case GL_ANY_SAMPLES_PASSED: 584 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 585 case GL_SAMPLES_PASSED_ARB: 586 return true; 587 588 case GL_PRIMITIVES_GENERATED: 589 case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: 590 case GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB: 591 case GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB: 592 case GL_VERTICES_SUBMITTED_ARB: 593 case GL_PRIMITIVES_SUBMITTED_ARB: 594 case GL_VERTEX_SHADER_INVOCATIONS_ARB: 595 case GL_GEOMETRY_SHADER_INVOCATIONS: 596 case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: 597 case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: 598 case GL_CLIPPING_INPUT_PRIMITIVES_ARB: 599 case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: 600 case GL_COMPUTE_SHADER_INVOCATIONS_ARB: 601 case GL_TESS_CONTROL_SHADER_PATCHES_ARB: 602 case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: 603 return false; 604 605 default: 606 unreachable("Unrecognized query target in is_query_pipelined()"); 607 } 608 } 609 610 /* Initialize query object functions used on all generations. */ 611 void brw_init_common_queryobj_functions(struct dd_function_table *functions) 612 { 613 functions->NewQueryObject = brw_new_query_object; 614 functions->DeleteQuery = brw_delete_query; 615 functions->GetTimestamp = brw_get_timestamp; 616 } 617 618 /* Initialize Gen4/5-specific query object functions. */ 619 void gen4_init_queryobj_functions(struct dd_function_table *functions) 620 { 621 functions->BeginQuery = brw_begin_query; 622 functions->EndQuery = brw_end_query; 623 functions->CheckQuery = brw_check_query; 624 functions->WaitQuery = brw_wait_query; 625 functions->QueryCounter = brw_query_counter; 626 } 627