Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright  2008 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  *
     23  * Authors:
     24  *    Eric Anholt <eric (at) anholt.net>
     25  *
     26  */
     27 
     28 /** @file brw_queryobj.c
     29  *
     30  * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query,
     31  * GL_EXT_transform_feedback, and friends).
     32  *
     33  * The hardware provides a PIPE_CONTROL command that can report the number of
     34  * fragments that passed the depth test, or the hardware timer.  They are
     35  * appropriately synced with the stage of the pipeline for our extensions'
     36  * needs.
     37  *
     38  * To avoid getting samples from another context's rendering in our results,
     39  * we capture the counts at the start and end of every batchbuffer while the
     40  * query is active, and sum up the differences.  (We should do so for
     41  * GL_TIME_ELAPSED as well, but don't).
     42  */
     43 #include "main/imports.h"
     44 
     45 #include "brw_context.h"
     46 #include "brw_defines.h"
     47 #include "brw_state.h"
     48 #include "intel_batchbuffer.h"
     49 #include "intel_reg.h"
     50 
     51 static void
     52 write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
     53 {
     54    if (intel->gen >= 6) {
     55       /* Emit workaround flushes: */
     56       if (intel->gen == 6) {
     57          /* The timestamp write below is a non-zero post-sync op, which on
     58           * Gen6 necessitates a CS stall.  CS stalls need stall at scoreboard
     59           * set.  See the comments for intel_emit_post_sync_nonzero_flush().
     60           */
     61          BEGIN_BATCH(4);
     62          OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
     63          OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD);
     64          OUT_BATCH(0);
     65          OUT_BATCH(0);
     66          ADVANCE_BATCH();
     67       }
     68 
     69       BEGIN_BATCH(5);
     70       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
     71       OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP);
     72       OUT_RELOC(query_bo,
     73                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
     74                 PIPE_CONTROL_GLOBAL_GTT_WRITE |
     75                 idx * sizeof(uint64_t));
     76       OUT_BATCH(0);
     77       OUT_BATCH(0);
     78       ADVANCE_BATCH();
     79    } else {
     80       BEGIN_BATCH(4);
     81       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
     82                 PIPE_CONTROL_WRITE_TIMESTAMP);
     83       OUT_RELOC(query_bo,
     84                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
     85                 PIPE_CONTROL_GLOBAL_GTT_WRITE |
     86                 idx * sizeof(uint64_t));
     87       OUT_BATCH(0);
     88       OUT_BATCH(0);
     89       ADVANCE_BATCH();
     90    }
     91 }
     92 
     93 static void
     94 write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
     95 {
     96    if (intel->gen >= 6) {
     97       /* Emit Sandybridge workaround flush: */
     98       if (intel->gen == 6)
     99          intel_emit_post_sync_nonzero_flush(intel);
    100 
    101       BEGIN_BATCH(5);
    102       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
    103       OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
    104                 PIPE_CONTROL_WRITE_DEPTH_COUNT);
    105       OUT_RELOC(query_bo,
    106                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
    107                 PIPE_CONTROL_GLOBAL_GTT_WRITE |
    108                 (idx * sizeof(uint64_t)));
    109       OUT_BATCH(0);
    110       OUT_BATCH(0);
    111       ADVANCE_BATCH();
    112    } else {
    113       BEGIN_BATCH(4);
    114       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
    115                 PIPE_CONTROL_DEPTH_STALL |
    116                 PIPE_CONTROL_WRITE_DEPTH_COUNT);
    117       /* This object could be mapped cacheable, but we don't have an exposed
    118        * mechanism to support that.  Since it's going uncached, tell GEM that
    119        * we're writing to it.  The usual clflush should be all that's required
    120        * to pick up the results.
    121        */
    122       OUT_RELOC(query_bo,
    123                 I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
    124                 PIPE_CONTROL_GLOBAL_GTT_WRITE |
    125                 (idx * sizeof(uint64_t)));
    126       OUT_BATCH(0);
    127       OUT_BATCH(0);
    128       ADVANCE_BATCH();
    129    }
    130 }
    131 
    132 /** Waits on the query object's BO and totals the results for this query */
    133 static void
    134 brw_queryobj_get_results(struct gl_context *ctx,
    135 			 struct brw_query_object *query)
    136 {
    137    struct intel_context *intel = intel_context(ctx);
    138 
    139    int i;
    140    uint64_t *results;
    141 
    142    if (query->bo == NULL)
    143       return;
    144 
    145    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
    146       if (drm_intel_bo_busy(query->bo)) {
    147          perf_debug("Stalling on the GPU waiting for a query object.\n");
    148       }
    149    }
    150 
    151    drm_intel_bo_map(query->bo, false);
    152    results = query->bo->virtual;
    153    switch (query->Base.Target) {
    154    case GL_TIME_ELAPSED_EXT:
    155       if (intel->gen >= 6)
    156 	 query->Base.Result += 80 * (results[1] - results[0]);
    157       else
    158 	 query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
    159       break;
    160 
    161    case GL_TIMESTAMP:
    162       if (intel->gen >= 6) {
    163          /* Our timer is a clock that increments every 80ns (regardless of
    164           * other clock scaling in the system).  The timestamp register we can
    165           * read for glGetTimestamp() masks out the top 32 bits, so we do that
    166           * here too to let the two counters be compared against each other.
    167           *
    168           * If we just multiplied that 32 bits of data by 80, it would roll
    169           * over at a non-power-of-two, so an application couldn't use
    170           * GL_QUERY_COUNTER_BITS to handle rollover correctly.  Instead, we
    171           * report 36 bits and truncate at that (rolling over 5 times as often
    172           * as the HW counter), and when the 32-bit counter rolls over, it
    173           * happens to also be at a rollover in the reported value from near
    174           * (1<<36) to 0.
    175           *
    176           * The low 32 bits rolls over in ~343 seconds.  Our 36-bit result
    177           * rolls over every ~69 seconds.
    178           */
    179 	 query->Base.Result = 80 * (results[1] & 0xffffffff);
    180          query->Base.Result &= (1ull << 36) - 1;
    181       } else {
    182 	 query->Base.Result = 1000 * (results[1] >> 32);
    183       }
    184 
    185       break;
    186 
    187    case GL_SAMPLES_PASSED_ARB:
    188       /* Map and count the pixels from the current query BO */
    189       for (i = query->first_index; i <= query->last_index; i++) {
    190 	 query->Base.Result += results[i * 2 + 1] - results[i * 2];
    191       }
    192       break;
    193 
    194    case GL_ANY_SAMPLES_PASSED:
    195       /* Set true if any of the sub-queries passed. */
    196       for (i = query->first_index; i <= query->last_index; i++) {
    197 	 if (results[i * 2 + 1] != results[i * 2]) {
    198             query->Base.Result = GL_TRUE;
    199             break;
    200          }
    201       }
    202       break;
    203 
    204    case GL_PRIMITIVES_GENERATED:
    205    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
    206       /* We don't actually query the hardware for this value, so query->bo
    207        * should always be NULL and execution should never reach here.
    208        */
    209       assert(!"Unreachable");
    210       break;
    211 
    212    default:
    213       assert(!"Unrecognized query target in brw_queryobj_get_results()");
    214       break;
    215    }
    216    drm_intel_bo_unmap(query->bo);
    217 
    218    drm_intel_bo_unreference(query->bo);
    219    query->bo = NULL;
    220 }
    221 
    222 static struct gl_query_object *
    223 brw_new_query_object(struct gl_context *ctx, GLuint id)
    224 {
    225    struct brw_query_object *query;
    226 
    227    query = calloc(1, sizeof(struct brw_query_object));
    228 
    229    query->Base.Id = id;
    230    query->Base.Result = 0;
    231    query->Base.Active = false;
    232    query->Base.Ready = true;
    233 
    234    return &query->Base;
    235 }
    236 
    237 static void
    238 brw_delete_query(struct gl_context *ctx, struct gl_query_object *q)
    239 {
    240    struct brw_query_object *query = (struct brw_query_object *)q;
    241 
    242    drm_intel_bo_unreference(query->bo);
    243    free(query);
    244 }
    245 
    246 static void
    247 brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
    248 {
    249    struct brw_context *brw = brw_context(ctx);
    250    struct intel_context *intel = intel_context(ctx);
    251    struct brw_query_object *query = (struct brw_query_object *)q;
    252 
    253    switch (query->Base.Target) {
    254    case GL_TIME_ELAPSED_EXT:
    255       drm_intel_bo_unreference(query->bo);
    256       query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
    257       write_timestamp(intel, query->bo, 0);
    258       break;
    259 
    260    case GL_ANY_SAMPLES_PASSED:
    261    case GL_SAMPLES_PASSED_ARB:
    262       /* Reset our driver's tracking of query state. */
    263       drm_intel_bo_unreference(query->bo);
    264       query->bo = NULL;
    265       query->first_index = -1;
    266       query->last_index = -1;
    267 
    268       brw->query.obj = query;
    269       intel->stats_wm++;
    270       break;
    271 
    272    case GL_PRIMITIVES_GENERATED:
    273       /* We don't actually query the hardware for this value; we keep track of
    274        * it a software counter.  So just reset the counter.
    275        */
    276       brw->sol.primitives_generated = 0;
    277       brw->sol.counting_primitives_generated = true;
    278       break;
    279 
    280    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
    281       /* We don't actually query the hardware for this value; we keep track of
    282        * it a software counter.  So just reset the counter.
    283        */
    284       brw->sol.primitives_written = 0;
    285       brw->sol.counting_primitives_written = true;
    286       break;
    287 
    288    default:
    289       assert(!"Unrecognized query target in brw_begin_query()");
    290       break;
    291    }
    292 }
    293 
    294 /**
    295  * Begin the ARB_occlusion_query query on a query object.
    296  */
    297 static void
    298 brw_end_query(struct gl_context *ctx, struct gl_query_object *q)
    299 {
    300    struct brw_context *brw = brw_context(ctx);
    301    struct intel_context *intel = intel_context(ctx);
    302    struct brw_query_object *query = (struct brw_query_object *)q;
    303 
    304    switch (query->Base.Target) {
    305    case GL_TIMESTAMP:
    306       drm_intel_bo_unreference(query->bo);
    307       query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query",
    308 				     4096, 4096);
    309       /* FALLTHROUGH */
    310 
    311    case GL_TIME_ELAPSED_EXT:
    312       write_timestamp(intel, query->bo, 1);
    313       intel_batchbuffer_flush(intel);
    314       break;
    315 
    316    case GL_ANY_SAMPLES_PASSED:
    317    case GL_SAMPLES_PASSED_ARB:
    318       /* Flush the batchbuffer in case it has writes to our query BO.
    319        * Have later queries write to a new query BO so that further rendering
    320        * doesn't delay the collection of our results.
    321        */
    322       if (query->bo) {
    323 	 brw_emit_query_end(brw);
    324 	 intel_batchbuffer_flush(intel);
    325 
    326 	 drm_intel_bo_unreference(brw->query.bo);
    327 	 brw->query.bo = NULL;
    328       }
    329 
    330       brw->query.obj = NULL;
    331 
    332       intel->stats_wm--;
    333       break;
    334 
    335    case GL_PRIMITIVES_GENERATED:
    336       /* We don't actually query the hardware for this value; we keep track of
    337        * it in a software counter.  So just read the counter and store it in
    338        * the query object.
    339        */
    340       query->Base.Result = brw->sol.primitives_generated;
    341       brw->sol.counting_primitives_generated = false;
    342 
    343       /* And set brw->query.obj to NULL so that this query won't try to wait
    344        * for any rendering to complete.
    345        */
    346       query->bo = NULL;
    347       break;
    348 
    349    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
    350       /* We don't actually query the hardware for this value; we keep track of
    351        * it in a software counter.  So just read the counter and store it in
    352        * the query object.
    353        */
    354       query->Base.Result = brw->sol.primitives_written;
    355       brw->sol.counting_primitives_written = false;
    356 
    357       /* And set brw->query.obj to NULL so that this query won't try to wait
    358        * for any rendering to complete.
    359        */
    360       query->bo = NULL;
    361       break;
    362 
    363    default:
    364       assert(!"Unrecognized query target in brw_end_query()");
    365       break;
    366    }
    367 }
    368 
    369 static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q)
    370 {
    371    struct brw_query_object *query = (struct brw_query_object *)q;
    372 
    373    brw_queryobj_get_results(ctx, query);
    374    query->Base.Ready = true;
    375 }
    376 
    377 static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
    378 {
    379    struct brw_query_object *query = (struct brw_query_object *)q;
    380 
    381    if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
    382       brw_queryobj_get_results(ctx, query);
    383       query->Base.Ready = true;
    384    }
    385 }
    386 
    387 /** Called to set up the query BO and account for its aperture space */
    388 void
    389 brw_prepare_query_begin(struct brw_context *brw)
    390 {
    391    struct intel_context *intel = &brw->intel;
    392 
    393    /* Skip if we're not doing any queries. */
    394    if (!brw->query.obj)
    395       return;
    396 
    397    /* Get a new query BO if we're going to need it. */
    398    if (brw->query.bo == NULL ||
    399        brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
    400       drm_intel_bo_unreference(brw->query.bo);
    401       brw->query.bo = NULL;
    402 
    403       brw->query.bo = drm_intel_bo_alloc(intel->bufmgr, "query", 4096, 1);
    404 
    405       /* clear target buffer */
    406       drm_intel_bo_map(brw->query.bo, true);
    407       memset((char *)brw->query.bo->virtual, 0, 4096);
    408       drm_intel_bo_unmap(brw->query.bo);
    409 
    410       brw->query.index = 0;
    411    }
    412 }
    413 
    414 /** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
    415 void
    416 brw_emit_query_begin(struct brw_context *brw)
    417 {
    418    struct intel_context *intel = &brw->intel;
    419    struct gl_context *ctx = &intel->ctx;
    420    struct brw_query_object *query = brw->query.obj;
    421 
    422    /* Skip if we're not doing any queries, or we've emitted the start. */
    423    if (!query || brw->query.active)
    424       return;
    425 
    426    write_depth_count(intel, brw->query.bo, brw->query.index * 2);
    427 
    428    if (query->bo != brw->query.bo) {
    429       if (query->bo != NULL)
    430 	 brw_queryobj_get_results(ctx, query);
    431       drm_intel_bo_reference(brw->query.bo);
    432       query->bo = brw->query.bo;
    433       query->first_index = brw->query.index;
    434    }
    435    query->last_index = brw->query.index;
    436    brw->query.active = true;
    437 }
    438 
    439 /** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
    440 void
    441 brw_emit_query_end(struct brw_context *brw)
    442 {
    443    struct intel_context *intel = &brw->intel;
    444 
    445    if (!brw->query.active)
    446       return;
    447 
    448    write_depth_count(intel, brw->query.bo, brw->query.index * 2 + 1);
    449 
    450    brw->query.active = false;
    451    brw->query.index++;
    452 }
    453 
    454 static uint64_t
    455 brw_get_timestamp(struct gl_context *ctx)
    456 {
    457    struct intel_context *intel = intel_context(ctx);
    458    uint64_t result = 0;
    459 
    460    drm_intel_reg_read(intel->bufmgr, TIMESTAMP, &result);
    461 
    462    /* See logic in brw_queryobj_get_results() */
    463    result = result >> 32;
    464    result *= 80;
    465    result &= (1ull << 36) - 1;
    466 
    467    return result;
    468 }
    469 
    470 void brw_init_queryobj_functions(struct dd_function_table *functions)
    471 {
    472    functions->NewQueryObject = brw_new_query_object;
    473    functions->DeleteQuery = brw_delete_query;
    474    functions->BeginQuery = brw_begin_query;
    475    functions->EndQuery = brw_end_query;
    476    functions->CheckQuery = brw_check_query;
    477    functions->WaitQuery = brw_wait_query;
    478    functions->GetTimestamp = brw_get_timestamp;
    479 }
    480