/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file brw_performance_query.c
 *
 * Implementation of the GL_INTEL_performance_query extension.
 *
 * Currently there are two possible counter sources exposed here:
 *
 * On Gen6+ hardware we have numerous 64bit Pipeline Statistics Registers
 * that we can snapshot at the beginning and end of a query.
 *
 * On Gen7.5+ we have Observability Architecture counters which are
 * covered in a separate document from the rest of the PRMs.  It is
 * available at:
 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
 * => 2013 Intel Core Processor Family => Observability Performance Counters
 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell,
 * though notably we currently only support OA counters for Haswell+)
 */

#include <limits.h>
#include <dirent.h>

/* put before sys/types.h to silence glibc warnings */
#ifdef MAJOR_IN_MKDEV
#include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#include <sys/sysmacros.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

#include <xf86drm.h>
#include <i915_drm.h>

#include "main/hash.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "main/performance_query.h"

#include "util/bitset.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "util/list.h"

#include "brw_context.h"
#include "brw_defines.h"
#include "brw_performance_query.h"
#include "brw_oa_hsw.h"
#include "brw_oa_bdw.h"
#include "brw_oa_chv.h"
#include "brw_oa_sklgt2.h"
#include "brw_oa_sklgt3.h"
#include "brw_oa_sklgt4.h"
#include "brw_oa_bxt.h"
#include "brw_oa_kblgt2.h"
#include "brw_oa_kblgt3.h"
#include "brw_oa_glk.h"
#include "brw_oa_cflgt2.h"
#include "brw_oa_cflgt3.h"
#include "intel_batchbuffer.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

/*
 * The largest OA formats we can use include:
 * For Haswell:
 *   1 timestamp, 45 A counters, 8 B counters and 8 C counters
 *   (i.e. 1 + 45 + 8 + 8 = 62, which is the larger of the two cases)
 * For Gen8+:
 *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
 */
#define MAX_OA_REPORT_COUNTERS 62

#define OAREPORT_REASON_MASK           0x3f
#define OAREPORT_REASON_SHIFT          19
#define OAREPORT_REASON_TIMER          (1<<0)
#define OAREPORT_REASON_TRIGGER1       (1<<1)
#define OAREPORT_REASON_TRIGGER2       (1<<2)
#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
#define OAREPORT_REASON_GO_TRANSITION  (1<<4)
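
/* A minimal sketch of how the reason bits above are meant to be used,
 * assuming a pointer to the raw report dwords (on Gen8+ the reason
 * field sits in the upper bits of the report's first dword):
 *
 *    uint32_t reason = (report[0] >> OAREPORT_REASON_SHIFT) &
 *                      OAREPORT_REASON_MASK;
 *    if (reason & OAREPORT_REASON_CTX_SWITCH)
 *       ... handle a context-switch report ...
 */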

#define I915_PERF_OA_SAMPLE_SIZE (8 +   /* drm_i915_perf_record_header */ \
                                  256)  /* OA counter report */

/**
 * Periodic OA samples are read() into these buffer structures via the
 * i915 perf kernel interface and appended to the
 * brw->perfquery.sample_buffers linked list. When we process the
 * results of an OA metrics query we need to consider all the periodic
 * samples between the Begin and End MI_REPORT_PERF_COUNT command
 * markers.
 *
 * 'Periodic' is a simplification as there are other automatic reports
 * written by the hardware also buffered here.
 *
 * Considering three queries, A, B and C:
 *
 *  Time ---->
 *                ________________A_________________
 *                |                                |
 *                | ________B_________ _____C___________
 *                | |                | |           |   |
 *
 * And an illustration of sample buffers read over this time frame:
 * [HEAD ][     ][     ][     ][     ][     ][     ][     ][TAIL ]
 *
 * These nodes may hold samples for query A:
 * [     ][     ][  A  ][  A  ][  A  ][  A  ][  A  ][     ][     ]
 *
 * These nodes may hold samples for query B:
 * [     ][     ][  B  ][  B  ][  B  ][     ][     ][     ][     ]
 *
 * These nodes may hold samples for query C:
 * [     ][     ][     ][     ][     ][  C  ][  C  ][  C  ][     ]
 *
 * The illustration assumes we have an even distribution of periodic
 * samples so all nodes have the same size plotted against time.
 *
 * Note, to simplify code, the list is never empty.
 *
 * With overlapping queries we can see that periodic OA reports may
 * relate to multiple queries and care needs to be taken to keep
 * track of sample buffers until there are no queries that might
 * depend on their contents.
 *
 * We use a node ref counting system where a reference ensures that a
 * node and all following nodes can't be freed/recycled until the
 * reference drops to zero.
 *
 * E.g. with a ref of one here:
 * [  0  ][  0  ][  1  ][  0  ][  0  ][  0  ][  0  ][  0  ][  0  ]
 *
 * These nodes could be freed or recycled ("reaped"):
 * [  0  ][  0  ]
 *
 * These must be preserved until the leading ref drops to zero:
 *               [  1  ][  0  ][  0  ][  0  ][  0  ][  0  ][  0  ]
 *
 * When a query starts we take a reference on the current tail of
 * the list, knowing that no already-buffered samples can possibly
 * relate to the newly-started query. A pointer to this node is
 * also saved in the query object's ->oa.samples_head.
 *
 * E.g. starting query A while there are two nodes in .sample_buffers:
 *                ________________A________
 *                |
 *
 * [  0  ][  1  ]
 *           ^_______ Add a reference and store pointer to node in
 *                    A->oa.samples_head
 *
 * Moving forward to when the B query starts with no new buffer nodes:
 * (for reference, i915 perf reads() are only done when queries finish)
 *                ________________A_______
 *                | ________B___
 *                | |
 *
 * [  0  ][  2  ]
 *           ^_______ Add a reference and store pointer to
 *                    node in B->oa.samples_head
 *
 * Once a query is finished, i.e. once an OA query has become 'Ready',
 * the End OA report has landed and we have processed all the
 * intermediate periodic samples, then we drop the ->oa.samples_head
 * reference we took at the start.
 *
 * So when the B query has finished we have:
 *                ________________A________
 *                | ______B___________
 *                | |                |
 * [  0  ][  1  ][  0  ][  0  ][  0  ]
 *           ^_______ Drop B->oa.samples_head reference
 *
 * We still can't free these due to the A->oa.samples_head ref:
 *        [  1  ][  0  ][  0  ][  0  ]
 *
 * When the A query finishes: (note there's a new ref for C's samples_head)
 *                ________________A_________________
 *                |                                |
 *                |                    _____C_________
 *                |                    |           |
 * [  0  ][  0  ][  0  ][  0  ][  1  ][  0  ][  0  ]
 *           ^_______ Drop A->oa.samples_head reference
 *
 * And we can now reap these nodes up to the C->oa.samples_head:
 * [  X  ][  X  ][  X  ][  X  ]
 *                  keeping -> [  1  ][  0  ][  0  ]
 *
 * We reap old sample buffers each time we finish processing an OA
 * query by iterating the sample_buffers list from the head until we
 * find a referenced node and stop.
 *
 * Reaped buffers move to a perfquery.free_sample_buffers list and
 * when we come to read() we first look to recycle a buffer from the
 * free_sample_buffers list before allocating a new buffer.
 */
struct brw_oa_sample_buf {
   struct exec_node link;
   int refcount;
   int len;
   uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
   uint32_t last_timestamp;
};
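
/* Note on sizing: each buf can hold up to 10 maximally-sized sample
 * records, i.e. (8 + 256) * 10 = 2640 bytes per node, since every
 * DRM_I915_PERF_RECORD_SAMPLE here is a record header followed by a
 * single OA report.
 */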

/**
 * i965 representation of a performance query object.
 *
 * NB: We want to keep this structure relatively lean considering that
 * applications may expect to allocate enough objects to be able to
 * query around all draw calls in a frame.
 */
struct brw_perf_query_object
{
   struct gl_perf_query_object base;

   const struct brw_perf_query_info *query;

   /* See query->kind to know which state below is in use... */
   union {
      struct {

         /**
          * BO containing OA counter snapshots at query Begin/End time.
          */
         struct brw_bo *bo;

         /**
          * Address of the mapping of @bo
          */
         void *map;

         /**
          * The MI_REPORT_PERF_COUNT command lets us specify a unique
          * ID that will be reflected in the resulting OA report
          * that's written by the GPU. This is the ID we're expecting
          * in the begin report, and the end report should have an ID
          * of @begin_report_id + 1.
          */
         int begin_report_id;

         /**
          * Reference the head of the brw->perfquery.sample_buffers
          * list at the time that the query started (so we only need
          * to look at nodes after this point when looking for samples
          * related to this query)
          *
          * (See struct brw_oa_sample_buf description for more details)
          */
         struct exec_node *samples_head;

         /**
          * Storage for the final accumulated OA counters.
          */
         uint64_t accumulator[MAX_OA_REPORT_COUNTERS];

         /**
          * false while in the unaccumulated_elements list, and set to
          * true when the final, end MI_RPC snapshot has been
          * accumulated.
          */
         bool results_accumulated;

      } oa;

      struct {
         /**
          * BO containing starting and ending snapshots for the
          * statistics counters.
          */
         struct brw_bo *bo;
      } pipeline_stats;
   };
};

/** Downcasting convenience macro. */
static inline struct brw_perf_query_object *
brw_perf_query(struct gl_perf_query_object *o)
{
   return (struct brw_perf_query_object *) o;
}

#define STATS_BO_SIZE               4096
#define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
#define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)

#define MI_RPC_BO_SIZE              4096
#define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)
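
/* Both BOs are split in half: begin snapshots land at offset 0 and end
 * snapshots at the 2048 byte mark. For pipeline statistics that leaves
 * room for 2048 / 8 = 256 64-bit counters per snapshot
 * (MAX_STAT_COUNTERS).
 */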

/******************************************************************************/

static bool
brw_is_perf_query_ready(struct gl_context *ctx,
                        struct gl_perf_query_object *o);

static void
dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
{
   struct gl_context *ctx = brw_void;
   struct gl_perf_query_object *o = query_void;
   struct brw_perf_query_object *obj = query_void;

   switch (obj->query->kind) {
   case OA_COUNTERS:
      DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
          id,
          o->Used ? "Dirty," : "New,",
          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
          obj->oa.bo ? "yes," : "no,",
          brw_is_perf_query_ready(ctx, o) ? "ready," : "not ready,",
          obj->oa.results_accumulated ? "accumulated" : "not accumulated");
      break;
   case PIPELINE_STATS:
      DBG("%4d: %-6s %-8s BO: %-4s\n",
          id,
          o->Used ? "Dirty," : "New,",
          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
          obj->pipeline_stats.bo ? "yes" : "no");
      break;
   }
}

static void
dump_perf_queries(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   DBG("Queries: (Open queries = %d, OA users = %d)\n",
       brw->perfquery.n_active_oa_queries, brw->perfquery.n_oa_users);
   _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
}

/******************************************************************************/

static struct brw_oa_sample_buf *
get_free_sample_buf(struct brw_context *brw)
{
   struct exec_node *node = exec_list_pop_head(&brw->perfquery.free_sample_buffers);
   struct brw_oa_sample_buf *buf;

   if (node)
      buf = exec_node_data(struct brw_oa_sample_buf, node, link);
   else {
      buf = ralloc_size(brw, sizeof(*buf));

      exec_node_init(&buf->link);
      buf->refcount = 0;
      buf->len = 0;
   }

   return buf;
}

static void
reap_old_sample_buffers(struct brw_context *brw)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&brw->perfquery.sample_buffers);
   struct brw_oa_sample_buf *tail_buf =
      exec_node_data(struct brw_oa_sample_buf, tail_node, link);

   /* Remove all old, unreferenced sample buffers walking forward from
    * the head of the list, except always leave at least one node in
    * the list so we always have a node to reference when we Begin
    * a new query.
    */
   foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
                           &brw->perfquery.sample_buffers)
   {
      if (buf->refcount == 0 && buf != tail_buf) {
         exec_node_remove(&buf->link);
         exec_list_push_head(&brw->perfquery.free_sample_buffers, &buf->link);
      } else
         return;
   }
}

static void
free_sample_bufs(struct brw_context *brw)
{
   foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
                           &brw->perfquery.free_sample_buffers)
      ralloc_free(buf);

   exec_list_make_empty(&brw->perfquery.free_sample_buffers);
}

/******************************************************************************/

/**
 * Driver hook for glGetPerfQueryInfoINTEL().
 */
static void
brw_get_perf_query_info(struct gl_context *ctx,
                        unsigned query_index,
                        const char **name,
                        GLuint *data_size,
                        GLuint *n_counters,
                        GLuint *n_active)
{
   struct brw_context *brw = brw_context(ctx);
   const struct brw_perf_query_info *query =
      &brw->perfquery.queries[query_index];

   *name = query->name;
   *data_size = query->data_size;
   *n_counters = query->n_counters;

   switch (query->kind) {
   case OA_COUNTERS:
      *n_active = brw->perfquery.n_active_oa_queries;
      break;

   case PIPELINE_STATS:
      *n_active = brw->perfquery.n_active_pipeline_stats_queries;
      break;
   }
}

/**
 * Driver hook for glGetPerfCounterInfoINTEL().
 */
static void
brw_get_perf_counter_info(struct gl_context *ctx,
                          unsigned query_index,
                          unsigned counter_index,
                          const char **name,
                          const char **desc,
                          GLuint *offset,
                          GLuint *data_size,
                          GLuint *type_enum,
                          GLuint *data_type_enum,
                          GLuint64 *raw_max)
{
   struct brw_context *brw = brw_context(ctx);
   const struct brw_perf_query_info *query =
      &brw->perfquery.queries[query_index];
   const struct brw_perf_query_counter *counter =
      &query->counters[counter_index];

   *name = counter->name;
   *desc = counter->desc;
   *offset = counter->offset;
   *data_size = counter->size;
   *type_enum = counter->type;
   *data_type_enum = counter->data_type;
   *raw_max = counter->raw_max;
}

/******************************************************************************/

/**
 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
 * pipeline statistics for the performance query object.
 */
static void
snapshot_statistics_registers(struct brw_context *brw,
                              struct brw_perf_query_object *obj,
                              uint32_t offset_in_bytes)
{
   const struct brw_perf_query_info *query = obj->query;
   const int n_counters = query->n_counters;

   for (int i = 0; i < n_counters; i++) {
      const struct brw_perf_query_counter *counter = &query->counters[i];

      assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);

      brw_store_register_mem64(brw, obj->pipeline_stats.bo,
                               counter->pipeline_stat.reg,
                               offset_in_bytes + i * sizeof(uint64_t));
   }
}

/**
 * Add a query to the global list of "unaccumulated queries."
 *
 * Queries are tracked here until all the associated OA reports have
 * been accumulated via accumulate_oa_reports() after the end
 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
 */
static void
add_to_unaccumulated_query_list(struct brw_context *brw,
                                struct brw_perf_query_object *obj)
{
   if (brw->perfquery.unaccumulated_elements >=
       brw->perfquery.unaccumulated_array_size)
   {
      brw->perfquery.unaccumulated_array_size *= 1.5;
      brw->perfquery.unaccumulated =
         reralloc(brw, brw->perfquery.unaccumulated,
                  struct brw_perf_query_object *,
                  brw->perfquery.unaccumulated_array_size);
   }

   brw->perfquery.unaccumulated[brw->perfquery.unaccumulated_elements++] = obj;
}

/**
 * Remove a query from the global list of unaccumulated queries,
 * either after successfully accumulating the OA reports associated
 * with the query in accumulate_oa_reports() or when discarding
 * unwanted query results.
 */
static void
drop_from_unaccumulated_query_list(struct brw_context *brw,
                                   struct brw_perf_query_object *obj)
{
   for (int i = 0; i < brw->perfquery.unaccumulated_elements; i++) {
      if (brw->perfquery.unaccumulated[i] == obj) {
         int last_elt = --brw->perfquery.unaccumulated_elements;

         if (i == last_elt)
            brw->perfquery.unaccumulated[i] = NULL;
         else {
            brw->perfquery.unaccumulated[i] =
               brw->perfquery.unaccumulated[last_elt];
         }

         break;
      }
   }

   /* Drop our samples_head reference so that associated periodic
    * sample data buffers can potentially be reaped if they aren't
    * referenced by any other queries...
    */

   struct brw_oa_sample_buf *buf =
      exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);

   assert(buf->refcount > 0);
   buf->refcount--;

   obj->oa.samples_head = NULL;

   reap_old_sample_buffers(brw);
}

static uint64_t
timebase_scale(struct brw_context *brw, uint32_t u32_time_delta)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint64_t tmp = ((uint64_t)u32_time_delta) * 1000000000ull;

   return tmp ? tmp / devinfo->timestamp_frequency : 0;
}
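
/* E.g. with Haswell's 80ns timestamp period (a 12.5MHz timestamp
 * frequency), one raw timestamp tick scales to
 * 1000000000 / 12500000 = 80ns, so timebase_scale(brw, 1) returns 80.
 */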

static void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   *accumulator += (uint32_t)(*report1 - *report0);
}

static void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
   uint64_t value0 = report0[a_index + 4] | high0;
   uint64_t value1 = report1[a_index + 4] | high1;
   uint64_t delta;

   if (value0 > value1)
      delta = (1ULL << 40) + value1 - value0;
   else
      delta = value1 - value0;

   *accumulator += delta;
}
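
/* Layout assumed by accumulate_uint40() for the A32u40_A4u32_B8_C8
 * format: the low 32 bits of the 40-bit A counters live in dwords
 * 4..35 of a report and the corresponding high bytes are packed one
 * byte per counter starting at dword 40 (byte offset 160). A counter
 * that wrapped between the two reports is handled by adding 2^40 to
 * the delta.
 */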

/**
 * Given pointers to starting and ending OA snapshots, add the deltas for each
 * counter to the results.
 */
static void
add_deltas(struct brw_context *brw,
           struct brw_perf_query_object *obj,
           const uint32_t *start,
           const uint32_t *end)
{
   const struct brw_perf_query_info *query = obj->query;
   uint64_t *accumulator = obj->oa.accumulator;
   int idx = 0;
   int i;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
      accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++)
         accumulate_uint40(i, start, end, accumulator + idx++);

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++)
         accumulate_uint32(start + 36 + i, end + 36 + i, accumulator + idx++);

      /* 8x 32bit B counters + 8x 32bit C counters... */
      for (i = 0; i < 16; i++)
         accumulate_uint32(start + 48 + i, end + 48 + i, accumulator + idx++);

      break;
   case I915_OA_FORMAT_A45_B8_C8:
      accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */

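      /* 45x 32bit A counters + 8x 32bit B counters + 8x 32bit C
       * counters (45 + 8 + 8 = 61), starting at dword 3...
       */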
      for (i = 0; i < 61; i++)
         accumulate_uint32(start + 3 + i, end + 3 + i, accumulator + 1 + i);

      break;
   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}

static bool
inc_n_oa_users(struct brw_context *brw)
{
   if (brw->perfquery.n_oa_users == 0 &&
       drmIoctl(brw->perfquery.oa_stream_fd,
                I915_PERF_IOCTL_ENABLE, 0) < 0)
   {
      return false;
   }
   ++brw->perfquery.n_oa_users;

   return true;
}

static void
dec_n_oa_users(struct brw_context *brw)
{
   /* Disabling the i915 perf stream will effectively disable the OA
    * counters.  Note it's important to be sure there are no outstanding
    * MI_RPC commands at this point since they could stall the CS
    * indefinitely once OACONTROL is disabled.
    */
   --brw->perfquery.n_oa_users;
   if (brw->perfquery.n_oa_users == 0 &&
       drmIoctl(brw->perfquery.oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
   {
      DBG("WARNING: Error disabling i915 perf stream: %m\n");
   }
}

/* In general, if we see anything spurious while accumulating results
 * we don't try to continue accumulating the current query, hoping for
 * the best; we scrap anything outstanding and then hope for the best
 * with new queries.
 */
static void
discard_all_queries(struct brw_context *brw)
{
   while (brw->perfquery.unaccumulated_elements) {
      struct brw_perf_query_object *obj = brw->perfquery.unaccumulated[0];

      obj->oa.results_accumulated = true;
      drop_from_unaccumulated_query_list(brw, brw->perfquery.unaccumulated[0]);

      dec_n_oa_users(brw);
   }
}

enum OaReadStatus {
   OA_READ_STATUS_ERROR,
   OA_READ_STATUS_UNFINISHED,
   OA_READ_STATUS_FINISHED,
};

static enum OaReadStatus
read_oa_samples_until(struct brw_context *brw,
                      uint32_t start_timestamp,
                      uint32_t end_timestamp)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&brw->perfquery.sample_buffers);
   struct brw_oa_sample_buf *tail_buf =
      exec_node_data(struct brw_oa_sample_buf, tail_node, link);
   uint32_t last_timestamp = tail_buf->last_timestamp;

   while (1) {
      struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
      uint32_t offset;
      int len;

      while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
                         sizeof(buf->buf))) < 0 && errno == EINTR)
         ;

      if (len <= 0) {
         exec_list_push_tail(&brw->perfquery.free_sample_buffers, &buf->link);

         if (len < 0) {
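            /* The stream fd was opened with I915_PERF_FLAG_FD_NONBLOCK,
             * so EAGAIN simply means no more samples are ready yet;
             * whether that counts as "finished" depends on how far the
             * timestamps have advanced.
             */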
            if (errno == EAGAIN)
               return ((last_timestamp - start_timestamp) >=
                       (end_timestamp - start_timestamp)) ?
                      OA_READ_STATUS_FINISHED :
                      OA_READ_STATUS_UNFINISHED;
            else {
               DBG("Error reading i915 perf samples: %m\n");
            }
         } else
            DBG("Spurious EOF reading i915 perf samples\n");

         return OA_READ_STATUS_ERROR;
      }

      buf->len = len;
      exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);

      /* Go through the reports and update the last timestamp. */
      offset = 0;
      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
         uint32_t *report = (uint32_t *) (header + 1);

         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
            last_timestamp = report[1];

         offset += header->size;
      }

      buf->last_timestamp = last_timestamp;
   }

   unreachable("not reached");
   return OA_READ_STATUS_ERROR;
}

/**
 * Try to read all the reports until either the delimiting timestamp
 * or an error arises.
 */
static bool
read_oa_samples_for_query(struct brw_context *brw,
                          struct brw_perf_query_object *obj)
{
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;

   /* We need the MI_REPORT_PERF_COUNT to land before we can start
    * to accumulate. */
   assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
          !brw_bo_busy(obj->oa.bo));

   /* Map the BO once here and let accumulate_oa_reports() unmap
    * it. */
   if (obj->oa.map == NULL)
      obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);

   start = last = obj->oa.map;
   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != obj->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      return true;
   }
   if (end[0] != (obj->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      return true;
   }

   /* Read the reports until the end timestamp. */
   switch (read_oa_samples_until(brw, start[1], end[1])) {
   case OA_READ_STATUS_ERROR:
      /* Fallthrough and let accumulate_oa_reports() deal with the
       * error. */
   case OA_READ_STATUS_FINISHED:
      return true;
   case OA_READ_STATUS_UNFINISHED:
      return false;
   }

   unreachable("invalid read status");
   return false;
}

/**
 * Accumulate raw OA counter values based on deltas between pairs of
 * OA reports.
 *
 * Accumulation starts from the first report captured via
 * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
 * last MI_RPC report requested by brw_end_perf_query(). Between these
 * two reports there may also be some number of periodically sampled OA
 * reports collected via the i915 perf interface - depending on the
 * duration of the query.
 *
 * These periodic snapshots help to ensure we handle counter overflow
 * correctly by being frequent enough to ensure we don't miss multiple
 * overflows of a counter between snapshots. For Gen8+ the i915 perf
 * snapshots provide the extra context-switch reports that let us
 * subtract out the progress of counters associated with other
 * contexts running on the system.
 */
static void
accumulate_oa_reports(struct brw_context *brw,
                      struct brw_perf_query_object *obj)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_perf_query_object *o = &obj->base;
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct exec_node *first_samples_node;
   bool in_ctx = true;
   uint32_t ctx_id;
   int out_duration = 0;

   assert(o->Ready);
   assert(obj->oa.map != NULL);

   start = last = obj->oa.map;
   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != obj->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      goto error;
   }
   if (end[0] != (obj->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      goto error;
   }

   ctx_id = start[2];

   /* See if we have any periodic reports to accumulate too... */

   /* N.B. The oa.samples_head was set when the query began and
    * pointed to the tail of the brw->perfquery.sample_buffers list at
    * the time the query started. Since the buffer existed before the
    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
    * that no data in this particular node's buffer can possibly be
    * associated with the query - so skip ahead one...
    */
   first_samples_node = obj->oa.samples_head->next;

   foreach_list_typed_from(struct brw_oa_sample_buf, buf, link,
                           &brw->perfquery.sample_buffers,
                           first_samples_node)
   {
      int offset = 0;

      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *)(buf->buf + offset);

         assert(header->size != 0);
         assert(header->size <= buf->len);

         offset += header->size;

         switch (header->type) {
         case DRM_I915_PERF_RECORD_SAMPLE: {
            uint32_t *report = (uint32_t *)(header + 1);
            bool add = true;

            /* Ignore reports that come before the start marker.
             * (Note: takes care to allow overflow of 32bit timestamps;
             * an earlier report shows up as a wrapped, very large u32
             * delta which scales to well over 5 seconds)
             */
            if (timebase_scale(brw, report[1] - start[1]) > 5000000000)
               continue;

            /* Ignore reports that come after the end marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (timebase_scale(brw, report[1] - end[1]) <= 5000000000)
               goto end;

            /* For Gen8+, since the counters continue while other
             * contexts are running, we need to discount any unrelated
             * deltas. The hardware automatically generates a report
             * on context switch which gives us a new reference point
             * to continue adding deltas from.
             *
             * For Haswell we can rely on the HW to stop the progress
             * of OA counters while any other context is active.
             */
            if (devinfo->gen >= 8) {
               if (in_ctx && report[2] != ctx_id) {
                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
                  in_ctx = false;
                  out_duration = 0;
               } else if (in_ctx == false && report[2] == ctx_id) {
                  DBG("i915 perf: Switch TO\n");
                  in_ctx = true;

                  /* From experimentation in IGT, we found that the OA unit
                   * might label some report as "idle" (using an invalid
                   * context ID), right after a report for a given context.
                   * Deltas generated by those reports actually belong to the
                   * previous context, even though they're not labelled as
                   * such.
                   *
                   * We didn't *really* Switch AWAY in the case that we e.g.
                   * saw a single periodic report while idle...
                   */
                  if (out_duration >= 1)
                     add = false;
               } else if (in_ctx) {
                  assert(report[2] == ctx_id);
                  DBG("i915 perf: Continuation IN\n");
               } else {
                  assert(report[2] != ctx_id);
                  DBG("i915 perf: Continuation OUT\n");
                  add = false;
                  out_duration++;
               }
            }

            if (add)
               add_deltas(brw, obj, last, report);

            last = report;

            break;
         }

         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
            DBG("i915 perf: OA error: all reports lost\n");
            goto error;
         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
            DBG("i915 perf: OA report lost\n");
            break;
         }
      }
   }

end:

   add_deltas(brw, obj, last, end);

   DBG("Marking %d accumulated - results gathered\n", o->Id);

   brw_bo_unmap(obj->oa.bo);
   obj->oa.map = NULL;
   obj->oa.results_accumulated = true;
   drop_from_unaccumulated_query_list(brw, obj);
   dec_n_oa_users(brw);

   return;

error:

   brw_bo_unmap(obj->oa.bo);
   obj->oa.map = NULL;
   discard_all_queries(brw);
}

/******************************************************************************/

static bool
open_i915_perf_oa_stream(struct brw_context *brw,
                         int metrics_set_id,
                         int report_format,
                         int period_exponent,
                         int drm_fd,
                         uint32_t ctx_id)
{
   uint64_t properties[] = {
      /* Single context sampling */
      DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,

      /* Include OA reports in samples */
      DRM_I915_PERF_PROP_SAMPLE_OA, true,

      /* OA unit configuration */
      DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
      DRM_I915_PERF_PROP_OA_FORMAT, report_format,
      DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
   };
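
   /* The perf interface takes the properties as a flat array of
    * (key, value) u64 pairs, which is why num_properties below is
    * half the array length.
    */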
   struct drm_i915_perf_open_param param = {
      .flags = I915_PERF_FLAG_FD_CLOEXEC |
               I915_PERF_FLAG_FD_NONBLOCK |
               I915_PERF_FLAG_DISABLED,
      .num_properties = ARRAY_SIZE(properties) / 2,
      .properties_ptr = (uintptr_t) properties,
   };
   int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
   if (fd == -1) {
      DBG("Error opening i915 perf OA stream: %m\n");
      return false;
   }

   brw->perfquery.oa_stream_fd = fd;

   brw->perfquery.current_oa_metrics_set_id = metrics_set_id;
   brw->perfquery.current_oa_format = report_format;

   return true;
}

static void
close_perf(struct brw_context *brw)
{
   if (brw->perfquery.oa_stream_fd != -1) {
      close(brw->perfquery.oa_stream_fd);
      brw->perfquery.oa_stream_fd = -1;
   }
}

/**
 * Driver hook for glBeginPerfQueryINTEL().
 */
static bool
brw_begin_perf_query(struct gl_context *ctx,
                     struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   const struct brw_perf_query_info *query = obj->query;

   /* We can assume the frontend hides mistaken attempts to Begin a
    * query object multiple times before its End. Similarly if an
    * application reuses a query object before results have arrived
    * the frontend will wait for prior results so we don't need
    * to support abandoning in-flight results.
    */
   assert(!o->Active);
   assert(!o->Used || o->Ready); /* no in-flight query to worry about */

   DBG("Begin(%d)\n", o->Id);

   /* XXX: We have to consider that the command parser unit that parses batch
    * buffer commands and is used to capture begin/end counter snapshots isn't
    * implicitly synchronized with what's currently running across other GPU
    * units (such as the EUs running shaders) that the performance counters are
    * associated with.
    *
    * The intention of performance queries is to measure the work associated
    * with commands between the begin/end delimiters and so for that to be the
    * case we need to explicitly synchronize the parsing of commands to capture
    * Begin/End counter snapshots with what's running across other parts of the
    * GPU.
    *
    * When the command parser reaches a Begin marker it effectively needs to
    * drain everything currently running on the GPU until the hardware is idle
    * before capturing the first snapshot of counters - otherwise the results
    * would also be measuring the effects of earlier commands.
    *
    * When the command parser reaches an End marker it needs to stall until
    * everything currently running on the GPU has finished before capturing the
    * end snapshot - otherwise the results won't be a complete representation
    * of the work.
    *
    * Theoretically there could be opportunities to minimize how much of the
    * GPU pipeline is drained, or how long we stall, when we know what specific
    * units the performance counters being queried relate to, but we don't
    * currently attempt to be clever here.
    *
    * Note: with our current simple approach here then for back-to-back queries
    * we will redundantly emit duplicate commands to synchronize the command
    * streamer with the rest of the GPU pipeline, but we assume that in HW the
    * second synchronization is effectively a NOOP.
    *
    * N.B. The final results are based on deltas of counters between (inside)
    * Begin/End markers so even though the total wall clock time of the
    * workload is stretched by larger pipeline bubbles the bubbles themselves
    * are generally invisible to the query results. Whether that's a good or a
    * bad thing depends on the use case. For a lower real-time impact while
    * capturing metrics then periodic sampling may be a better choice than
    * INTEL_performance_query.
    *
    *
    * This is our Begin synchronization point to drain current work on the
    * GPU before we capture our first counter snapshot...
    */
   brw_emit_mi_flush(brw);

   switch (query->kind) {
   case OA_COUNTERS:

      /* Opening an i915 perf stream implies exclusive access to the OA unit
       * which will generate counter reports for a specific counter set with a
       * specific layout/format so we can't begin any OA based queries that
       * require a different counter set or format unless we get an opportunity
       * to close the stream and open a new one...
       */
      if (brw->perfquery.oa_stream_fd != -1 &&
          brw->perfquery.current_oa_metrics_set_id !=
          query->oa_metrics_set_id) {

         if (brw->perfquery.n_oa_users != 0)
            return false;
         else
            close_perf(brw);
      }

      /* If the OA counters aren't already on, enable them. */
      if (brw->perfquery.oa_stream_fd == -1) {
         __DRIscreen *screen = brw->screen->driScrnPriv;
         const struct gen_device_info *devinfo = &brw->screen->devinfo;

         /* The period_exponent gives a sampling period as follows:
          *   sample_period = timestamp_period * 2^(period_exponent + 1)
          *
          * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
          * ~83ns (GEN8/9).
          *
          * The counter overflow period is derived from the EuActive counter
          * which reads a counter that increments by the number of clock
          * cycles multiplied by the number of EUs. It can be calculated as:
          *
          * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
          *
          * (E.g. 40 EUs @ 1GHz = ~53ms)
          *
          * We select a sampling period lower than that overflow period to
          * ensure we cannot see more than 1 counter overflow, otherwise we
          * could lose information.
          */
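
         /* E.g. with an 80ns timestamp period, period_exponent = 15
          * would give a sampling period of 80ns * 2^16 = ~5.2ms, well
          * below the ~53ms overflow period in the example above.
          */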

         int a_counter_in_bits = 32;
         if (devinfo->gen >= 8)
            a_counter_in_bits = 40;

         uint64_t overflow_period = pow(2, a_counter_in_bits) /
            (brw->perfquery.sys_vars.n_eus *
             /* drop 1GHz freq to have units in nanoseconds */
             2);

         DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
             overflow_period, overflow_period / 1000000ul, brw->perfquery.sys_vars.n_eus);

         int period_exponent = 0;
         uint64_t prev_sample_period, next_sample_period;
         for (int e = 0; e < 30; e++) {
            prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
            next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;

            /* Take the previous sampling period, lower than the overflow
             * period.
             */
            if (prev_sample_period < overflow_period &&
                next_sample_period > overflow_period)
               period_exponent = e + 1;
         }

         if (period_exponent == 0) {
            DBG("WARNING: unable to find a sampling exponent\n");
            return false;
         }

         DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
             prev_sample_period / 1000000ul);

         if (!open_i915_perf_oa_stream(brw,
                                       query->oa_metrics_set_id,
                                       query->oa_format,
                                       period_exponent,
                                       screen->fd, /* drm fd */
                                       brw->hw_ctx))
            return false;
      } else {
         assert(brw->perfquery.current_oa_metrics_set_id ==
                query->oa_metrics_set_id &&
                brw->perfquery.current_oa_format ==
                query->oa_format);
      }

      if (!inc_n_oa_users(brw)) {
         DBG("WARNING: Error enabling i915 perf stream: %m\n");
         return false;
      }

      if (obj->oa.bo) {
         brw_bo_unreference(obj->oa.bo);
         obj->oa.bo = NULL;
      }

      obj->oa.bo =
         brw_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo",
                      MI_RPC_BO_SIZE, 64);
#ifdef DEBUG
      /* Pre-filling the BO helps debug whether writes landed. */
      void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
      memset(map, 0x80, MI_RPC_BO_SIZE);
      brw_bo_unmap(obj->oa.bo);
#endif

      obj->oa.begin_report_id = brw->perfquery.next_query_start_report_id;
      brw->perfquery.next_query_start_report_id += 2;

      /* We flush the batchbuffer here to minimize the chances that MI_RPC
       * delimiting commands end up in different batchbuffers. If that's the
       * case, the measurement will include the time it takes for the kernel
       * scheduler to load a new request into the hardware. This is manifested
       * in tools like frameretrace by spikes in the "GPU Core Clocks" counter.
       */
      intel_batchbuffer_flush(brw);

      /* Take a starting OA counter snapshot. */
      brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
                                          obj->oa.begin_report_id);
      ++brw->perfquery.n_active_oa_queries;

      /* No already-buffered samples can possibly be associated with this query
       * so create a marker within the list of sample buffers enabling us to
       * easily ignore earlier samples when processing this query after
       * completion.
       */
      assert(!exec_list_is_empty(&brw->perfquery.sample_buffers));
      obj->oa.samples_head = exec_list_get_tail(&brw->perfquery.sample_buffers);

      struct brw_oa_sample_buf *buf =
         exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);

      /* This reference will ensure that future/following sample
       * buffers (that may relate to this query) can't be freed until
       * this drops to zero.
       */
      buf->refcount++;

      memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
      obj->oa.results_accumulated = false;

      add_to_unaccumulated_query_list(brw, obj);
      break;

   case PIPELINE_STATS:
      if (obj->pipeline_stats.bo) {
         brw_bo_unreference(obj->pipeline_stats.bo);
         obj->pipeline_stats.bo = NULL;
      }

      obj->pipeline_stats.bo =
         brw_bo_alloc(brw->bufmgr, "perf. query pipeline stats bo",
                      STATS_BO_SIZE, 64);

      /* Take starting snapshots. */
      snapshot_statistics_registers(brw, obj, 0);

      ++brw->perfquery.n_active_pipeline_stats_queries;
      break;
   }

   if (INTEL_DEBUG & DEBUG_PERFMON)
      dump_perf_queries(brw);

   return true;
}

/**
 * Driver hook for glEndPerfQueryINTEL().
 */
static void
brw_end_perf_query(struct gl_context *ctx,
                   struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);

   DBG("End(%d)\n", o->Id);

   /* Ensure that the work associated with the queried commands will have
    * finished before taking our query end counter readings.
    *
    * For more details see comment in brw_begin_perf_query for
    * corresponding flush.
    */
   brw_emit_mi_flush(brw);

   switch (obj->query->kind) {
   case OA_COUNTERS:

      /* NB: It's possible that the query will have already been marked
       * as 'accumulated' if an error was seen while reading samples
       * from perf. In this case we mustn't try and emit a closing
       * MI_RPC command in case the OA unit has already been disabled
       */
      if (!obj->oa.results_accumulated) {
         /* Take an ending OA counter snapshot. */
         brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
                                             MI_RPC_BO_END_OFFSET_BYTES,
                                             obj->oa.begin_report_id + 1);
      }

      --brw->perfquery.n_active_oa_queries;

      /* NB: even though the query has now ended, it can't be accumulated
       * until the end MI_REPORT_PERF_COUNT snapshot has been written
       * to query->oa.bo
       */
      break;

   case PIPELINE_STATS:
      snapshot_statistics_registers(brw, obj,
                                    STATS_BO_END_OFFSET_BYTES);
      --brw->perfquery.n_active_pipeline_stats_queries;
      break;
   }
}

static void
brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   struct brw_bo *bo = NULL;

   assert(!o->Ready);

   switch (obj->query->kind) {
   case OA_COUNTERS:
      bo = obj->oa.bo;
      break;

   case PIPELINE_STATS:
      bo = obj->pipeline_stats.bo;
      break;
   }

   if (bo == NULL)
      return;

   /* If the current batch references our results bo then we need to
    * flush first...
    */
   if (brw_batch_references(&brw->batch, bo))
      intel_batchbuffer_flush(brw);

   brw_bo_wait_rendering(bo);

   /* Due to a race condition between the OA unit signaling report
    * availability and the report actually being written into memory,
    * we need to wait for all the reports to come in before we can
    * read them.
    */
   if (obj->query->kind == OA_COUNTERS) {
      while (!read_oa_samples_for_query(brw, obj))
         ;
   }
}

static bool
brw_is_perf_query_ready(struct gl_context *ctx,
                        struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);

   if (o->Ready)
      return true;

   switch (obj->query->kind) {
   case OA_COUNTERS:
      return (obj->oa.results_accumulated ||
              (obj->oa.bo &&
               !brw_batch_references(&brw->batch, obj->oa.bo) &&
               !brw_bo_busy(obj->oa.bo) &&
               read_oa_samples_for_query(brw, obj)));
   case PIPELINE_STATS:
      return (obj->pipeline_stats.bo &&
              !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
              !brw_bo_busy(obj->pipeline_stats.bo));
   }

   unreachable("missing ready check for unknown query kind");
   return false;
}
   1394 
   1395 static int
   1396 get_oa_counter_data(struct brw_context *brw,
   1397                     struct brw_perf_query_object *obj,
   1398                     size_t data_size,
   1399                     uint8_t *data)
   1400 {
   1401    const struct brw_perf_query_info *query = obj->query;
   1402    int n_counters = query->n_counters;
   1403    int written = 0;
   1404 
   1405    if (!obj->oa.results_accumulated) {
   1406       accumulate_oa_reports(brw, obj);
   1407       assert(obj->oa.results_accumulated);
   1408    }
   1409 
   1410    for (int i = 0; i < n_counters; i++) {
   1411       const struct brw_perf_query_counter *counter = &query->counters[i];
   1412       uint64_t *out_uint64;
   1413       float *out_float;
   1414 
   1415       if (counter->size) {
   1416          switch (counter->data_type) {
   1417          case GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL:
   1418             out_uint64 = (uint64_t *)(data + counter->offset);
   1419             *out_uint64 = counter->oa_counter_read_uint64(brw, query,
   1420                                                           obj->oa.accumulator);
   1421             break;
   1422          case GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL:
   1423             out_float = (float *)(data + counter->offset);
   1424             *out_float = counter->oa_counter_read_float(brw, query,
   1425                                                         obj->oa.accumulator);
   1426             break;
   1427          default:
   1428             /* So far we aren't using uint32, double or bool32... */
   1429             unreachable("unexpected counter data type");
   1430          }
   1431          written = counter->offset + counter->size;
   1432       }
   1433    }
   1434 
   1435    return written;
   1436 }
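
        /* A sketch of how a client is expected to consume the blob written
         * above: each counter's offset, size and data type are advertised
         * via glGetPerfCounterInfoINTEL(), so reading one counter's value
         * out of the query data amounts to something like the following
         * (local variable names here are hypothetical):
         *
         *    GLchar name[64], desc[256];
         *    GLuint offset, data_size, type_enum, data_type_enum;
         *    GLuint64 raw_max;
         *
         *    glGetPerfCounterInfoINTEL(query_id, counter_id,
         *                              sizeof(name), name,
         *                              sizeof(desc), desc,
         *                              &offset, &data_size, &type_enum,
         *                              &data_type_enum, &raw_max);
         *
         *    uint64_t value;
         *    memcpy(&value, data + offset, sizeof(value));
         *
         * This is why the loop above writes each value at counter->offset
         * instead of packing values sequentially.
         */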
   1437 
   1438 static int
   1439 get_pipeline_stats_data(struct brw_context *brw,
   1440                         struct brw_perf_query_object *obj,
   1441                         size_t data_size,
   1442                         uint8_t *data)
   1444 {
   1445    const struct brw_perf_query_info *query = obj->query;
   1446    int n_counters = query->n_counters;
   1447    uint8_t *p = data;
   1448 
   1449    uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
   1450    uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
   1451 
   1452    for (int i = 0; i < n_counters; i++) {
   1453       const struct brw_perf_query_counter *counter = &query->counters[i];
   1454       uint64_t value = end[i] - start[i];
   1455 
   1456       if (counter->pipeline_stat.numerator !=
   1457           counter->pipeline_stat.denominator) {
   1458          value *= counter->pipeline_stat.numerator;
   1459          value /= counter->pipeline_stat.denominator;
   1460       }
   1461 
   1462       *((uint64_t *)p) = value;
   1463       p += 8;
   1464    }
   1465 
   1466    brw_bo_unmap(obj->pipeline_stats.bo);
   1467 
   1468    return p - data;
   1469 }
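
        /* For reference, the layout of obj->pipeline_stats.bo assumed above
         * (it is filled by snapshot_statistics_registers() at Begin and
         * End): one 64bit register snapshot per counter starting at byte 0,
         * and a second set of snapshots starting at
         * STATS_BO_END_OFFSET_BYTES:
         *
         *    byte 0                      STATS_BO_END_OFFSET_BYTES
         *    | start[0] .. start[n-1] .. | end[0] .. end[n-1] |
         *
         * so counter i is reported as end[i] - start[i], optionally scaled
         * by the counter's numerator/denominator fixup.
         */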
   1470 
   1471 /**
   1472  * Driver hook for glGetPerfQueryDataINTEL().
   1473  */
   1474 static void
   1475 brw_get_perf_query_data(struct gl_context *ctx,
   1476                         struct gl_perf_query_object *o,
   1477                         GLsizei data_size,
   1478                         GLuint *data,
   1479                         GLuint *bytes_written)
   1480 {
   1481    struct brw_context *brw = brw_context(ctx);
   1482    struct brw_perf_query_object *obj = brw_perf_query(o);
   1483    int written = 0;
   1484 
   1485    assert(brw_is_perf_query_ready(ctx, o));
   1486 
   1487    DBG("GetData(%d)\n", o->Id);
   1488 
   1489    if (INTEL_DEBUG & DEBUG_PERFMON)
   1490       dump_perf_queries(brw);
   1491 
   1492    /* We expect that the frontend only calls this hook when it knows
   1493     * that results are available.
   1494     */
   1495    assert(o->Ready);
   1496 
   1497    switch (obj->query->kind) {
   1498    case OA_COUNTERS:
   1499       written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
   1500       break;
   1501 
   1502    case PIPELINE_STATS:
   1503       written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
   1504       break;
   1505    }
   1506 
   1507    if (bytes_written)
   1508       *bytes_written = written;
   1509 }
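
        /* For reference, a minimal client-side flow driving the hooks above
         * might look as follows (illustrative only; error handling is
         * omitted and MAX_DATA_SIZE is a hypothetical constant):
         *
         *    GLuint query_id, query;
         *    GLuint bytes = 0;
         *    uint8_t data[MAX_DATA_SIZE];
         *
         *    glGetPerfQueryIdByNameINTEL("Pipeline Statistics Registers",
         *                                &query_id);
         *    glCreatePerfQueryINTEL(query_id, &query);
         *    glBeginPerfQueryINTEL(query);
         *    ... issue the GL work to be measured ...
         *    glEndPerfQueryINTEL(query);
         *
         *    glGetPerfQueryDataINTEL(query, GL_PERFQUERY_WAIT_INTEL,
         *                            sizeof(data), data, &bytes);
         *
         * With GL_PERFQUERY_WAIT_INTEL the frontend blocks on the
         * WaitPerfQuery/IsPerfQueryReady hooks before calling into
         * brw_get_perf_query_data(), which is why we can assert o->Ready.
         */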
   1510 
   1511 static struct gl_perf_query_object *
   1512 brw_new_perf_query_object(struct gl_context *ctx, unsigned query_index)
   1513 {
   1514    struct brw_context *brw = brw_context(ctx);
   1515    const struct brw_perf_query_info *query =
   1516       &brw->perfquery.queries[query_index];
   1517    struct brw_perf_query_object *obj =
   1518       calloc(1, sizeof(struct brw_perf_query_object));
   1519 
   1520    if (!obj)
   1521       return NULL;
   1522 
   1523    obj->query = query;
   1524 
   1525    brw->perfquery.n_query_instances++;
   1526 
   1527    return &obj->base;
   1528 }
   1529 
   1530 /**
   1531  * Driver hook for glDeletePerfQueryINTEL().
   1532  */
   1533 static void
   1534 brw_delete_perf_query(struct gl_context *ctx,
   1535                       struct gl_perf_query_object *o)
   1536 {
   1537    struct brw_context *brw = brw_context(ctx);
   1538    struct brw_perf_query_object *obj = brw_perf_query(o);
   1539 
   1540    /* We can assume that the frontend waits for a query to complete
   1541     * before ever calling into here, so we don't have to worry about
   1542     * deleting an in-flight query object.
   1543     */
   1544    assert(!o->Active);
   1545    assert(!o->Used || o->Ready);
   1546 
   1547    DBG("Delete(%d)\n", o->Id);
   1548 
   1549    switch (obj->query->kind) {
   1550    case OA_COUNTERS:
   1551       if (obj->oa.bo) {
   1552          if (!obj->oa.results_accumulated) {
   1553             drop_from_unaccumulated_query_list(brw, obj);
   1554             dec_n_oa_users(brw);
   1555          }
   1556 
   1557          brw_bo_unreference(obj->oa.bo);
   1558          obj->oa.bo = NULL;
   1559       }
   1560 
   1561       obj->oa.results_accumulated = false;
   1562       break;
   1563 
   1564    case PIPELINE_STATS:
   1565       if (obj->pipeline_stats.bo) {
   1566          brw_bo_unreference(obj->pipeline_stats.bo);
   1567          obj->pipeline_stats.bo = NULL;
   1568       }
   1569       break;
   1570    }
   1571 
   1572    free(obj);
   1573 
   1574    /* As an indication that the INTEL_performance_query extension is no
   1575     * longer in use, it's a good time to free our cache of sample
   1576     * buffers and close any current i915-perf stream.
   1577     */
   1578    if (--brw->perfquery.n_query_instances == 0) {
   1579       free_sample_bufs(brw);
   1580       close_perf(brw);
   1581    }
   1582 }
   1583 
   1584 /******************************************************************************/
   1585 
   1586 static struct brw_perf_query_info *
   1587 append_query_info(struct brw_context *brw)
   1588 {
   1589    brw->perfquery.queries =
   1590       reralloc(brw, brw->perfquery.queries,
   1591                struct brw_perf_query_info, ++brw->perfquery.n_queries);
   1592 
   1593    return &brw->perfquery.queries[brw->perfquery.n_queries - 1];
   1594 }
   1595 
   1596 static void
   1597 add_stat_reg(struct brw_perf_query_info *query,
   1598              uint32_t reg,
   1599              uint32_t numerator,
   1600              uint32_t denominator,
   1601              const char *name,
   1602              const char *description)
   1603 {
   1604    struct brw_perf_query_counter *counter;
   1605 
   1606    assert(query->n_counters < MAX_STAT_COUNTERS);
   1607 
   1608    counter = &query->counters[query->n_counters];
   1609    counter->name = name;
   1610    counter->desc = description;
   1611    counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
   1612    counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
   1613    counter->size = sizeof(uint64_t);
   1614    counter->offset = sizeof(uint64_t) * query->n_counters;
   1615    counter->pipeline_stat.reg = reg;
   1616    counter->pipeline_stat.numerator = numerator;
   1617    counter->pipeline_stat.denominator = denominator;
   1618 
   1619    query->n_counters++;
   1620 }
   1621 
   1622 static void
   1623 add_basic_stat_reg(struct brw_perf_query_info *query,
   1624                    uint32_t reg, const char *name)
   1625 {
   1626    add_stat_reg(query, reg, 1, 1, name, name);
   1627 }
   1628 
   1629 static void
   1630 init_pipeline_statistic_query_registers(struct brw_context *brw)
   1631 {
   1632    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1633    struct brw_perf_query_info *query = append_query_info(brw);
   1634 
   1635    query->kind = PIPELINE_STATS;
   1636    query->name = "Pipeline Statistics Registers";
   1637    query->n_counters = 0;
   1638    query->counters =
   1639       rzalloc_array(brw, struct brw_perf_query_counter, MAX_STAT_COUNTERS);
   1640 
   1641    add_basic_stat_reg(query, IA_VERTICES_COUNT,
   1642                       "N vertices submitted");
   1643    add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
   1644                       "N primitives submitted");
   1645    add_basic_stat_reg(query, VS_INVOCATION_COUNT,
   1646                       "N vertex shader invocations");
   1647 
   1648    if (devinfo->gen == 6) {
   1649       add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
   1650                    "SO_PRIM_STORAGE_NEEDED",
   1651                    "N geometry shader stream-out primitives (total)");
   1652       add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
   1653                    "SO_NUM_PRIMS_WRITTEN",
   1654                    "N geometry shader stream-out primitives (written)");
   1655    } else {
   1656       add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
   1657                    "SO_PRIM_STORAGE_NEEDED (Stream 0)",
   1658                    "N stream-out (stream 0) primitives (total)");
   1659       add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
   1660                    "SO_PRIM_STORAGE_NEEDED (Stream 1)",
   1661                    "N stream-out (stream 1) primitives (total)");
   1662       add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
   1663                    "SO_PRIM_STORAGE_NEEDED (Stream 2)",
   1664                    "N stream-out (stream 2) primitives (total)");
   1665       add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
   1666                    "SO_PRIM_STORAGE_NEEDED (Stream 3)",
   1667                    "N stream-out (stream 3) primitives (total)");
   1668       add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
   1669                    "SO_NUM_PRIMS_WRITTEN (Stream 0)",
   1670                    "N stream-out (stream 0) primitives (written)");
   1671       add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
   1672                    "SO_NUM_PRIMS_WRITTEN (Stream 1)",
   1673                    "N stream-out (stream 1) primitives (written)");
   1674       add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
   1675                    "SO_NUM_PRIMS_WRITTEN (Stream 2)",
   1676                    "N stream-out (stream 2) primitives (written)");
   1677       add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
   1678                    "SO_NUM_PRIMS_WRITTEN (Stream 3)",
   1679                    "N stream-out (stream 3) primitives (written)");
   1680    }
   1681 
   1682    add_basic_stat_reg(query, HS_INVOCATION_COUNT,
   1683                       "N TCS shader invocations");
   1684    add_basic_stat_reg(query, DS_INVOCATION_COUNT,
   1685                       "N TES shader invocations");
   1686 
   1687    add_basic_stat_reg(query, GS_INVOCATION_COUNT,
   1688                       "N geometry shader invocations");
   1689    add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
   1690                       "N geometry shader primitives emitted");
   1691 
   1692    add_basic_stat_reg(query, CL_INVOCATION_COUNT,
   1693                       "N primitives entering clipping");
   1694    add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
   1695                       "N primitives leaving clipping");
   1696 
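           /* On Haswell and Gen8 the raw PS_INVOCATION_COUNT value reads high
            * (cf. the WaDividePSInvocationCountBy4 workaround applied to
            * occlusion/statistics queries elsewhere in the driver), so the
            * value is scaled by 1/4 via the numerator/denominator fixup.
            */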
   1697    if (devinfo->is_haswell || devinfo->gen == 8)
   1698       add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
   1699                    "N fragment shader invocations",
   1700                    "N fragment shader invocations");
   1701    else
   1702       add_basic_stat_reg(query, PS_INVOCATION_COUNT,
   1703                          "N fragment shader invocations");
   1704 
   1705    add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");
   1706 
   1707    if (devinfo->gen >= 7)
   1708       add_basic_stat_reg(query, CS_INVOCATION_COUNT,
   1709                          "N compute shader invocations");
   1710 
   1711    query->data_size = sizeof(uint64_t) * query->n_counters;
   1712 }
   1713 
   1714 static bool
   1715 read_file_uint64(const char *file, uint64_t *val)
   1716 {
   1717     char buf[32];
   1718     int fd, n;
   1719 
   1720     fd = open(file, O_RDONLY);
   1721     if (fd < 0)
   1722         return false;
   1723     while ((n = read(fd, buf, sizeof(buf) - 1)) < 0 &&
   1724            errno == EINTR);
   1725     close(fd);
   1726     if (n < 0)
   1727         return false;
   1728 
   1729     buf[n] = '\0';
   1730     *val = strtoull(buf, NULL, 0);
   1731 
   1732     return true;
   1733 }
   1734 
   1735 static void
   1736 register_oa_config(struct brw_context *brw,
   1737                    const struct brw_perf_query_info *query,
   1738                    uint64_t config_id)
   1739 {
   1740    struct brw_perf_query_info *registered_query = append_query_info(brw);
   1741    *registered_query = *query;
   1742    registered_query->oa_metrics_set_id = config_id;
   1743    DBG("metric set registered: id = %" PRIu64 ", guid = %s\n",
   1744        registered_query->oa_metrics_set_id, query->guid);
   1745 }
   1746 
   1747 static void
   1748 enumerate_sysfs_metrics(struct brw_context *brw, const char *sysfs_dev_dir)
   1749 {
   1750    char buf[256];
   1751    DIR *metricsdir = NULL;
   1752    struct dirent *metric_entry;
   1753    int len;
   1754 
   1755    len = snprintf(buf, sizeof(buf), "%s/metrics", sysfs_dev_dir);
   1756    if (len < 0 || len >= sizeof(buf)) {
   1757       DBG("Failed to concatenate path to sysfs metrics/ directory\n");
   1758       return;
   1759    }
   1760 
   1761    metricsdir = opendir(buf);
   1762    if (!metricsdir) {
   1763       DBG("Failed to open %s: %m\n", buf);
   1764       return;
   1765    }
   1766 
   1767    while ((metric_entry = readdir(metricsdir))) {
   1768       struct hash_entry *entry;
   1769 
   1770       if ((metric_entry->d_type != DT_DIR &&
   1771            metric_entry->d_type != DT_LNK) ||
   1772           metric_entry->d_name[0] == '.')
   1773          continue;
   1774 
   1775       DBG("metric set: %s\n", metric_entry->d_name);
   1776       entry = _mesa_hash_table_search(brw->perfquery.oa_metrics_table,
   1777                                       metric_entry->d_name);
   1778       if (entry) {
   1779          uint64_t id;
   1780 
   1781          len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
   1782                         sysfs_dev_dir, metric_entry->d_name);
   1783          if (len < 0 || len >= sizeof(buf)) {
   1784             DBG("Failed to concatenate path to sysfs metric id file\n");
   1785             continue;
   1786          }
   1787 
   1788          if (!read_file_uint64(buf, &id)) {
   1789             DBG("Failed to read metric set id from %s: %m\n", buf);
   1790             continue;
   1791          }
   1792 
   1793          register_oa_config(brw, (const struct brw_perf_query_info *)entry->data, id);
   1794       } else
   1795          DBG("metric set not known by mesa (skipping)\n");
   1796    }
   1797 
   1798    closedir(metricsdir);
   1799 }
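
        /* For reference, the sysfs layout being enumerated above looks
         * roughly like this (the card number and guid are just examples):
         *
         *    /sys/dev/char/<major>:<minor>/device/drm/card0/
         *        metrics/
         *            403d8832-1a27-4aa6-a64e-f5389ce7b212/
         *                id      <- u64 metric set id for i915-perf
         *
         * get_sysfs_dev_dir() below resolves the cardX directory from the
         * DRM fd's major:minor device numbers.
         */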
   1800 
   1801 static bool
   1802 read_sysfs_drm_device_file_uint64(struct brw_context *brw,
   1803                                   const char *sysfs_dev_dir,
   1804                                   const char *file,
   1805                                   uint64_t *value)
   1806 {
   1807    char buf[512];
   1808    int len;
   1809 
   1810    len = snprintf(buf, sizeof(buf), "%s/%s", sysfs_dev_dir, file);
   1811    if (len < 0 || len >= sizeof(buf)) {
   1812       DBG("Failed to concatenate sys filename to read u64 from\n");
   1813       return false;
   1814    }
   1815 
   1816    return read_file_uint64(buf, value);
   1817 }
   1818 
   1819 static bool
   1820 kernel_has_dynamic_config_support(struct brw_context *brw,
   1821                                   const char *sysfs_dev_dir)
   1822 {
   1823    __DRIscreen *screen = brw->screen->driScrnPriv;
   1824    struct hash_entry *entry;
   1825 
   1826    hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
   1827       struct brw_perf_query_info *query = entry->data;
   1828       char config_path[256];
   1829       uint64_t config_id;
   1830 
   1831       snprintf(config_path, sizeof(config_path),
   1832                "%s/metrics/%s/id", sysfs_dev_dir, query->guid);
   1833 
   1834       /* Look for the test config, which we know we can't replace. */
   1835       if (read_file_uint64(config_path, &config_id) && config_id == 1) {
   1836          uint32_t mux_regs[] = { 0x9888 /* NOA_WRITE */, 0x0 };
   1837          struct drm_i915_perf_oa_config config;
   1838 
   1839          memset(&config, 0, sizeof(config));
   1840 
   1841          memcpy(config.uuid, query->guid, sizeof(config.uuid));
   1842 
   1843          config.n_mux_regs = 1;
   1844          config.mux_regs_ptr = (uintptr_t) mux_regs;
   1845 
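                 /* Trying to remove the test config only tells us whether
                  * the REMOVE_CONFIG ioctl exists: kernels that support
                  * dynamic configs refuse to remove it with ENOENT, while
                  * older kernels reject the unknown ioctl with a different
                  * errno (e.g. EINVAL). Note that only config_id is passed
                  * to the ioctl; the drm_i915_perf_oa_config built above is
                  * not.
                  */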
   1846          if (drmIoctl(screen->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config_id) < 0 &&
   1847              errno == ENOENT)
   1848             return true;
   1849 
   1850          break;
   1851       }
   1852    }
   1853 
   1854    return false;
   1855 }
   1856 
   1857 static void
   1858 init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
   1859 {
   1860    __DRIscreen *screen = brw->screen->driScrnPriv;
   1861    struct hash_entry *entry;
   1862 
   1863    hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
   1864       const struct brw_perf_query_info *query = entry->data;
   1865       struct drm_i915_perf_oa_config config;
   1866       char config_path[256];
   1867       uint64_t config_id;
   1868       int ret;
   1869 
   1870       snprintf(config_path, sizeof(config_path),
   1871                "%s/metrics/%s/id", sysfs_dev_dir, query->guid);
   1872 
   1873       /* Don't recreate already loaded configs. */
   1874       if (read_file_uint64(config_path, &config_id)) {
   1875          register_oa_config(brw, query, config_id);
   1876          continue;
   1877       }
   1878 
   1879       memset(&config, 0, sizeof(config));
   1880 
   1881       memcpy(config.uuid, query->guid, sizeof(config.uuid));
   1882 
   1883       config.n_mux_regs = query->n_mux_regs;
   1884       config.mux_regs_ptr = (uintptr_t) query->mux_regs;
   1885 
   1886       config.n_boolean_regs = query->n_b_counter_regs;
   1887       config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
   1888 
   1889       config.n_flex_regs = query->n_flex_regs;
   1890       config.flex_regs_ptr = (uintptr_t) query->flex_regs;
   1891 
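              /* On success DRM_IOCTL_I915_PERF_ADD_CONFIG returns the id of
               * the newly created metric set, which register_oa_config()
               * records as oa_metrics_set_id for opening the i915-perf
               * stream later.
               */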
   1892       ret = drmIoctl(screen->fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
   1893       if (ret < 0) {
   1894          DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
   1895              query->name, query->guid, strerror(errno));
   1896          continue;
   1897       }
   1898 
   1899       register_oa_config(brw, query, ret);
   1900    }
   1901 }
   1902 
   1903 static bool
   1904 init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
   1905 {
   1906    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   1907    uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
   1908    __DRIscreen *screen = brw->screen->driScrnPriv;
   1909 
   1910    if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
   1911                                           "gt_min_freq_mhz",
   1912                                           &min_freq_mhz))
   1913       return false;
   1914 
   1915    if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
   1916                                           "gt_max_freq_mhz",
   1917                                           &max_freq_mhz))
   1918       return false;
   1919 
   1920    brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
   1921    brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
   1922    brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
   1923 
   1924    brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
   1925    brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
   1926    /* Assuming a uniform distribution of subslices per slice. */
   1927    brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
   1928 
   1929    if (devinfo->is_haswell) {
   1930       brw->perfquery.sys_vars.slice_mask = 0;
   1931       brw->perfquery.sys_vars.subslice_mask = 0;
   1932 
   1933       for (int s = 0; s < devinfo->num_slices; s++)
   1934          brw->perfquery.sys_vars.slice_mask |= 1U << s;
   1935       for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
   1936          brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
   1937 
   1938       if (devinfo->gt == 1) {
   1939          brw->perfquery.sys_vars.n_eus = 10;
   1940       } else if (devinfo->gt == 2) {
   1941          brw->perfquery.sys_vars.n_eus = 20;
   1942       } else if (devinfo->gt == 3) {
   1943          brw->perfquery.sys_vars.n_eus = 40;
   1944       } else
   1945          unreachable("not reached");
   1946    } else {
   1947       drm_i915_getparam_t gp;
   1948       int ret;
   1949       int slice_mask = 0;
   1950       int ss_mask = 0;
   1951       /* maximum number of slices */
   1952       int s_max = devinfo->num_slices;
   1953       /* maximum number of subslices per slice (assuming a uniform number
   1954        * of subslices per slice)
   1955        */
   1956       int ss_max = devinfo->num_subslices[0];
   1957       uint64_t subslice_mask = 0;
   1958       int s;
   1959 
   1960       gp.param = I915_PARAM_SLICE_MASK;
   1961       gp.value = &slice_mask;
   1962       ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
   1963       if (ret)
   1964          return false;
   1965 
   1966       gp.param = I915_PARAM_SUBSLICE_MASK;
   1967       gp.value = &ss_mask;
   1968       ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
   1969       if (ret)
   1970          return false;
   1971 
   1972       brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
   1973       brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
   1974       brw->perfquery.sys_vars.slice_mask = slice_mask;
   1975 
   1976       /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
   1977        * which applies to all slices.
   1978        *
   1979        * Note: some of the metrics we have (as described in XML) are
   1980        * conditional on a $SubsliceMask variable which is expected to also
   1981        * reflect the slice mask by packing together subslice masks for each
   1982        * slice into one value.
   1983        */
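              /* For example: with slice_mask = 0x3, ss_max = 3 and
               * ss_mask = 0x7, the loop below packs subslice_mask =
               * 0x7 | (0x7 << 3) = 0x3f.
               */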
   1984       for (s = 0; s < s_max; s++) {
   1985          if (slice_mask & (1<<s)) {
   1986             subslice_mask |= ss_mask << (ss_max * s);
   1987          }
   1988       }
   1989 
   1990       brw->perfquery.sys_vars.subslice_mask = subslice_mask;
   1991       brw->perfquery.sys_vars.n_eu_sub_slices =
   1992          __builtin_popcount(subslice_mask);
   1993    }
   1994 
   1995    brw->perfquery.sys_vars.eu_threads_count =
   1996       brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
   1997 
   1998    return true;
   1999 }
   2000 
   2001 static bool
   2002 get_sysfs_dev_dir(struct brw_context *brw,
   2003                   char *path_buf,
   2004                   int path_buf_len)
   2005 {
   2006    __DRIscreen *screen = brw->screen->driScrnPriv;
   2007    struct stat sb;
   2008    int min, maj;
   2009    DIR *drmdir;
   2010    struct dirent *drm_entry;
   2011    int len;
   2012 
   2013    assert(path_buf);
   2014    assert(path_buf_len);
   2015    path_buf[0] = '\0';
   2016 
   2017    if (fstat(screen->fd, &sb)) {
   2018       DBG("Failed to stat DRM fd\n");
   2019       return false;
   2020    }
   2021 
   2022    maj = major(sb.st_rdev);
   2023    min = minor(sb.st_rdev);
   2024 
   2025    if (!S_ISCHR(sb.st_mode)) {
   2026       DBG("DRM fd is not a character device as expected\n");
   2027       return false;
   2028    }
   2029 
   2030    len = snprintf(path_buf, path_buf_len,
   2031                   "/sys/dev/char/%d:%d/device/drm", maj, min);
   2032    if (len < 0 || len >= path_buf_len) {
   2033       DBG("Failed to concatenate sysfs path to drm device\n");
   2034       return false;
   2035    }
   2036 
   2037    drmdir = opendir(path_buf);
   2038    if (!drmdir) {
   2039       DBG("Failed to open %s: %m\n", path_buf);
   2040       return false;
   2041    }
   2042 
   2043    while ((drm_entry = readdir(drmdir))) {
   2044       if ((drm_entry->d_type == DT_DIR ||
   2045            drm_entry->d_type == DT_LNK) &&
   2046           strncmp(drm_entry->d_name, "card", 4) == 0)
   2047       {
   2048          len = snprintf(path_buf, path_buf_len,
   2049                         "/sys/dev/char/%d:%d/device/drm/%s",
   2050                         maj, min, drm_entry->d_name);
   2051          closedir(drmdir);
   2052          return len >= 0 && len < path_buf_len;
   2056       }
   2057    }
   2058 
   2059    closedir(drmdir);
   2060 
   2061    DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
   2062        maj, min);
   2063 
   2064    return false;
   2065 }
   2066 
   2067 typedef void (*perf_register_oa_queries_t)(struct brw_context *);
   2068 
   2069 static perf_register_oa_queries_t
   2070 get_register_queries_function(const struct gen_device_info *devinfo)
   2071 {
   2072    if (devinfo->is_haswell)
   2073       return brw_oa_register_queries_hsw;
   2074    if (devinfo->is_cherryview)
   2075       return brw_oa_register_queries_chv;
   2076    if (devinfo->is_broadwell)
   2077       return brw_oa_register_queries_bdw;
   2078    if (devinfo->is_broxton)
   2079       return brw_oa_register_queries_bxt;
   2080    if (devinfo->is_skylake) {
   2081       if (devinfo->gt == 2)
   2082          return brw_oa_register_queries_sklgt2;
   2083       if (devinfo->gt == 3)
   2084          return brw_oa_register_queries_sklgt3;
   2085       if (devinfo->gt == 4)
   2086          return brw_oa_register_queries_sklgt4;
   2087    }
   2088    if (devinfo->is_kabylake) {
   2089       if (devinfo->gt == 2)
   2090          return brw_oa_register_queries_kblgt2;
   2091       if (devinfo->gt == 3)
   2092          return brw_oa_register_queries_kblgt3;
   2093    }
   2094    if (devinfo->is_geminilake)
   2095       return brw_oa_register_queries_glk;
   2096    if (devinfo->is_coffeelake) {
   2097       if (devinfo->gt == 2)
   2098          return brw_oa_register_queries_cflgt2;
   2099       if (devinfo->gt == 3)
   2100          return brw_oa_register_queries_cflgt3;
   2101    }
   2102 
   2103    return NULL;
   2104 }
   2105 
   2106 static unsigned
   2107 brw_init_perf_query_info(struct gl_context *ctx)
   2108 {
   2109    struct brw_context *brw = brw_context(ctx);
   2110    const struct gen_device_info *devinfo = &brw->screen->devinfo;
   2111    bool i915_perf_oa_available = false;
   2112    struct stat sb;
   2113    char sysfs_dev_dir[128];
   2114    perf_register_oa_queries_t oa_register;
   2115 
   2116    if (brw->perfquery.n_queries)
   2117       return brw->perfquery.n_queries;
   2118 
   2119    init_pipeline_statistic_query_registers(brw);
   2120 
   2121    oa_register = get_register_queries_function(devinfo);
   2122 
   2123    /* The existence of this sysctl parameter implies the kernel supports
   2124     * the i915 perf interface.
   2125     */
   2126    if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
   2127 
   2128       /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
   2129        * metrics unless running as root.
   2130        */
   2131       if (devinfo->is_haswell)
   2132          i915_perf_oa_available = true;
   2133       else {
   2134          uint64_t paranoid = 1;
   2135 
   2136          read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
   2137 
   2138          if (paranoid == 0 || geteuid() == 0)
   2139             i915_perf_oa_available = true;
   2140       }
   2141    }
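
           /* On Gen8+, OA access for unprivileged users can be allowed ahead
            * of time with, e.g.:
            *
            *    sysctl dev.i915.perf_stream_paranoid=0
            */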
   2142 
   2143    if (i915_perf_oa_available &&
   2144        oa_register &&
   2145        get_sysfs_dev_dir(brw, sysfs_dev_dir, sizeof(sysfs_dev_dir)) &&
   2146        init_oa_sys_vars(brw, sysfs_dev_dir))
   2147    {
   2148       brw->perfquery.oa_metrics_table =
   2149          _mesa_hash_table_create(NULL, _mesa_key_hash_string,
   2150                                  _mesa_key_string_equal);
   2151 
   2152       /* Index all the metric sets mesa knows about before looking to see what
   2153        * the kernel is advertising.
   2154        */
   2155       oa_register(brw);
   2156 
   2157       if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
   2158           kernel_has_dynamic_config_support(brw, sysfs_dev_dir))
   2159          init_oa_configs(brw, sysfs_dev_dir);
   2160       else
   2161          enumerate_sysfs_metrics(brw, sysfs_dev_dir);
   2162    }
   2163 
   2164    brw->perfquery.unaccumulated =
   2165       ralloc_array(brw, struct brw_perf_query_object *, 2);
   2166    brw->perfquery.unaccumulated_elements = 0;
   2167    brw->perfquery.unaccumulated_array_size = 2;
   2168 
   2169    exec_list_make_empty(&brw->perfquery.sample_buffers);
   2170    exec_list_make_empty(&brw->perfquery.free_sample_buffers);
   2171 
   2172    /* It's convenient to guarantee that this linked list of sample
   2173     * buffers is never empty so we add an empty head so when we
   2174     * Begin an OA query we can always take a reference on a buffer
   2175     * in this list.
   2176     */
   2177    struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
   2178    exec_list_push_head(&brw->perfquery.sample_buffers, &buf->link);
   2179 
   2180    brw->perfquery.oa_stream_fd = -1;
   2181 
   2182    brw->perfquery.next_query_start_report_id = 1000;
   2183 
   2184    return brw->perfquery.n_queries;
   2185 }
   2186 
   2187 void
   2188 brw_init_performance_queries(struct brw_context *brw)
   2189 {
   2190    struct gl_context *ctx = &brw->ctx;
   2191 
   2192    ctx->Driver.InitPerfQueryInfo = brw_init_perf_query_info;
   2193    ctx->Driver.GetPerfQueryInfo = brw_get_perf_query_info;
   2194    ctx->Driver.GetPerfCounterInfo = brw_get_perf_counter_info;
   2195    ctx->Driver.NewPerfQueryObject = brw_new_perf_query_object;
   2196    ctx->Driver.DeletePerfQuery = brw_delete_perf_query;
   2197    ctx->Driver.BeginPerfQuery = brw_begin_perf_query;
   2198    ctx->Driver.EndPerfQuery = brw_end_perf_query;
   2199    ctx->Driver.WaitPerfQuery = brw_wait_perf_query;
   2200    ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
   2201    ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
   2202 }
   2203