Home | History | Annotate | Download | only in nvc0
      1 /*
      2  * Copyright 2015 Samuel Pitoiset
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "nvc0/nvc0_context.h"
     24 #include "nvc0/nvc0_query_hw_metric.h"
     25 #include "nvc0/nvc0_query_hw_sm.h"
     26 
     27 #define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d }
     28 static const struct nvc0_hw_metric_cfg {
     29    unsigned id;
     30    const char *name;
     31    enum pipe_driver_query_type type;
     32    const char *desc;
     33 } nvc0_hw_metric_queries[] = {
     34    _Q(ACHIEVED_OCCUPANCY,
     35       "metric-achieved_occupancy",
     36       PERCENTAGE,
     37       "Ratio of the average active warps per active cycle to the maximum "
     38       "number of warps supported on a multiprocessor"),
     39 
     40    _Q(BRANCH_EFFICIENCY,
     41       "metric-branch_efficiency",
     42       PERCENTAGE,
     43       "Ratio of non-divergent branches to total branches"),
     44 
     45    _Q(INST_ISSUED,
     46       "metric-inst_issued",
     47       UINT64,
     48       "The number of instructions issued"),
     49 
     50    _Q(INST_PER_WRAP,
     51       "metric-inst_per_wrap",
     52       UINT64,
     53       "Average number of instructions executed by each warp"),
     54 
     55    _Q(INST_REPLAY_OVERHEAD,
     56       "metric-inst_replay_overhead",
     57       UINT64,
     58       "Average number of replays for each instruction executed"),
     59 
     60    _Q(ISSUED_IPC,
     61       "metric-issued_ipc",
     62       UINT64,
     63       "Instructions issued per cycle"),
     64 
     65    _Q(ISSUE_SLOTS,
     66       "metric-issue_slots",
     67       UINT64,
     68       "The number of issue slots used"),
     69 
     70    _Q(ISSUE_SLOT_UTILIZATION,
     71       "metric-issue_slot_utilization",
     72       PERCENTAGE,
     73       "Percentage of issue slots that issued at least one instruction, "
     74       "averaged across all cycles"),
     75 
     76    _Q(IPC,
     77       "metric-ipc",
     78       UINT64,
     79       "Instructions executed per cycle"),
     80 
     81    _Q(SHARED_REPLAY_OVERHEAD,
     82       "metric-shared_replay_overhead",
     83       UINT64,
     84       "Average number of replays due to shared memory conflicts for each "
     85       "instruction executed"),
     86 
     87    _Q(WARP_EXECUTION_EFFICIENCY,
     88       "metric-warp_execution_efficiency",
     89       PERCENTAGE,
     90       "Ratio of the average active threads per warp to the maximum number of "
     91       "threads per warp supported on a multiprocessor"),
     92 
     93    _Q(WARP_NONPRED_EXECUTION_EFFICIENCY,
     94       "metric-warp_nonpred_execution_efficiency",
     95       PERCENTAGE,
     96       "Ratio of the average active threads per warp executing non-predicated "
     97       "instructions to the maximum number of threads per warp supported on a "
     98       "multiprocessor"),
     99 };
    100 
    101 #undef _Q
    102 
    103 static inline const struct nvc0_hw_metric_cfg *
    104 nvc0_hw_metric_get_cfg(unsigned metric_id)
    105 {
    106    unsigned i;
    107 
    108    for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) {
    109       if (nvc0_hw_metric_queries[i].id == metric_id)
    110          return &nvc0_hw_metric_queries[i];
    111    }
    112    assert(0);
    113    return NULL;
    114 }
    115 
    116 struct nvc0_hw_metric_query_cfg {
    117    unsigned type;
    118    uint32_t queries[8];
    119    uint32_t num_queries;
    120 };
    121 
    122 #define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n)
    123 
    124 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
    125 static const struct nvc0_hw_metric_query_cfg
    126 sm20_achieved_occupancy =
    127 {
    128    .type        = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY,
    129    .queries[0]  = _SM(ACTIVE_WARPS),
    130    .queries[1]  = _SM(ACTIVE_CYCLES),
    131    .num_queries = 2,
    132 };
    133 
    134 static const struct nvc0_hw_metric_query_cfg
    135 sm20_branch_efficiency =
    136 {
    137    .type        = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
    138    .queries[0]  = _SM(BRANCH),
    139    .queries[1]  = _SM(DIVERGENT_BRANCH),
    140    .num_queries = 2,
    141 };
    142 
    143 static const struct nvc0_hw_metric_query_cfg
    144 sm20_inst_per_wrap =
    145 {
    146    .type        = NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
    147    .queries[0]  = _SM(INST_EXECUTED),
    148    .queries[1]  = _SM(WARPS_LAUNCHED),
    149    .num_queries = 2,
    150 };
    151 
    152 static const struct nvc0_hw_metric_query_cfg
    153 sm20_inst_replay_overhead =
    154 {
    155    .type        = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
    156    .queries[0]  = _SM(INST_ISSUED),
    157    .queries[1]  = _SM(INST_EXECUTED),
    158    .num_queries = 2,
    159 };
    160 
    161 static const struct nvc0_hw_metric_query_cfg
    162 sm20_issued_ipc =
    163 {
    164    .type        = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
    165    .queries[0]  = _SM(INST_ISSUED),
    166    .queries[1]  = _SM(ACTIVE_CYCLES),
    167    .num_queries = 2,
    168 };
    169 
    170 static const struct nvc0_hw_metric_query_cfg
    171 sm20_issue_slot_utilization =
    172 {
    173    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
    174    .queries[0]  = _SM(INST_ISSUED),
    175    .queries[1]  = _SM(ACTIVE_CYCLES),
    176    .num_queries = 2,
    177 };
    178 
    179 static const struct nvc0_hw_metric_query_cfg
    180 sm20_ipc =
    181 {
    182    .type        = NVC0_HW_METRIC_QUERY_IPC,
    183    .queries[0]  = _SM(INST_EXECUTED),
    184    .queries[1]  = _SM(ACTIVE_CYCLES),
    185    .num_queries = 2,
    186 };
    187 
    188 static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
    189 {
    190    &sm20_achieved_occupancy,
    191    &sm20_branch_efficiency,
    192    &sm20_inst_per_wrap,
    193    &sm20_inst_replay_overhead,
    194    &sm20_ipc,
    195    &sm20_issued_ipc,
    196    &sm20_issue_slot_utilization,
    197 };
    198 
    199 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
    200 static const struct nvc0_hw_metric_query_cfg
    201 sm21_inst_issued =
    202 {
    203    .type        = NVC0_HW_METRIC_QUERY_INST_ISSUED,
    204    .queries[0]  = _SM(INST_ISSUED1_0),
    205    .queries[1]  = _SM(INST_ISSUED1_1),
    206    .queries[2]  = _SM(INST_ISSUED2_0),
    207    .queries[3]  = _SM(INST_ISSUED2_1),
    208    .num_queries = 4,
    209 };
    210 
    211 static const struct nvc0_hw_metric_query_cfg
    212 sm21_inst_replay_overhead =
    213 {
    214    .type        = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
    215    .queries[0]  = _SM(INST_ISSUED1_0),
    216    .queries[1]  = _SM(INST_ISSUED1_1),
    217    .queries[2]  = _SM(INST_ISSUED2_0),
    218    .queries[3]  = _SM(INST_ISSUED2_1),
    219    .queries[4]  = _SM(INST_EXECUTED),
    220    .num_queries = 5,
    221 };
    222 
    223 static const struct nvc0_hw_metric_query_cfg
    224 sm21_issued_ipc =
    225 {
    226    .type        = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
    227    .queries[0]  = _SM(INST_ISSUED1_0),
    228    .queries[1]  = _SM(INST_ISSUED1_1),
    229    .queries[2]  = _SM(INST_ISSUED2_0),
    230    .queries[3]  = _SM(INST_ISSUED2_1),
    231    .queries[4]  = _SM(ACTIVE_CYCLES),
    232    .num_queries = 5,
    233 };
    234 
    235 static const struct nvc0_hw_metric_query_cfg
    236 sm21_issue_slots =
    237 {
    238    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
    239    .queries[0]  = _SM(INST_ISSUED1_0),
    240    .queries[1]  = _SM(INST_ISSUED1_1),
    241    .queries[2]  = _SM(INST_ISSUED2_0),
    242    .queries[3]  = _SM(INST_ISSUED2_1),
    243    .num_queries = 4,
    244 };
    245 
    246 static const struct nvc0_hw_metric_query_cfg
    247 sm21_issue_slot_utilization =
    248 {
    249    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
    250    .queries[0]  = _SM(INST_ISSUED1_0),
    251    .queries[1]  = _SM(INST_ISSUED1_1),
    252    .queries[2]  = _SM(INST_ISSUED2_0),
    253    .queries[3]  = _SM(INST_ISSUED2_1),
    254    .queries[4]  = _SM(ACTIVE_CYCLES),
    255    .num_queries = 5,
    256 };
    257 
    258 static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
    259 {
    260    &sm20_achieved_occupancy,
    261    &sm20_branch_efficiency,
    262    &sm21_inst_issued,
    263    &sm20_inst_per_wrap,
    264    &sm21_inst_replay_overhead,
    265    &sm20_ipc,
    266    &sm21_issued_ipc,
    267    &sm21_issue_slots,
    268    &sm21_issue_slot_utilization,
    269 };
    270 
    271 /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */
    272 static const struct nvc0_hw_metric_query_cfg
    273 sm30_inst_issued =
    274 {
    275    .type        = NVC0_HW_METRIC_QUERY_INST_ISSUED,
    276    .queries[0]  = _SM(INST_ISSUED1),
    277    .queries[1]  = _SM(INST_ISSUED2),
    278    .num_queries = 2,
    279 };
    280 
    281 static const struct nvc0_hw_metric_query_cfg
    282 sm30_inst_replay_overhead =
    283 {
    284    .type        = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
    285    .queries[0]  = _SM(INST_ISSUED1),
    286    .queries[1]  = _SM(INST_ISSUED2),
    287    .queries[2]  = _SM(INST_EXECUTED),
    288    .num_queries = 3,
    289 };
    290 
    291 static const struct nvc0_hw_metric_query_cfg
    292 sm30_issued_ipc =
    293 {
    294    .type        = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
    295    .queries[0]  = _SM(INST_ISSUED1),
    296    .queries[1]  = _SM(INST_ISSUED2),
    297    .queries[2]  = _SM(ACTIVE_CYCLES),
    298    .num_queries = 3,
    299 };
    300 
    301 static const struct nvc0_hw_metric_query_cfg
    302 sm30_issue_slots =
    303 {
    304    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
    305    .queries[0]  = _SM(INST_ISSUED1),
    306    .queries[1]  = _SM(INST_ISSUED2),
    307    .num_queries = 2,
    308 };
    309 
    310 static const struct nvc0_hw_metric_query_cfg
    311 sm30_issue_slot_utilization =
    312 {
    313    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
    314    .queries[0]  = _SM(INST_ISSUED1),
    315    .queries[1]  = _SM(INST_ISSUED2),
    316    .queries[2]  = _SM(ACTIVE_CYCLES),
    317    .num_queries = 3,
    318 };
    319 
    320 static const struct nvc0_hw_metric_query_cfg
    321 sm30_shared_replay_overhead =
    322 {
    323    .type        = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD,
    324    .queries[0]  = _SM(SHARED_LD_REPLAY),
    325    .queries[1]  = _SM(SHARED_ST_REPLAY),
    326    .queries[2]  = _SM(INST_EXECUTED),
    327    .num_queries = 3,
    328 };
    329 
    330 static const struct nvc0_hw_metric_query_cfg
    331 sm30_warp_execution_efficiency =
    332 {
    333    .type        = NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY,
    334    .queries[0]  = _SM(INST_EXECUTED),
    335    .queries[1]  = _SM(TH_INST_EXECUTED),
    336    .num_queries = 2,
    337 };
    338 
    339 static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] =
    340 {
    341    &sm20_achieved_occupancy,
    342    &sm20_branch_efficiency,
    343    &sm30_inst_issued,
    344    &sm20_inst_per_wrap,
    345    &sm30_inst_replay_overhead,
    346    &sm20_ipc,
    347    &sm30_issued_ipc,
    348    &sm30_issue_slots,
    349    &sm30_issue_slot_utilization,
    350    &sm30_shared_replay_overhead,
    351    &sm30_warp_execution_efficiency,
    352 };
    353 
    354 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
    355 static const struct nvc0_hw_metric_query_cfg
    356 sm35_warp_nonpred_execution_efficiency =
    357 {
    358    .type        = NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY,
    359    .queries[0]  = _SM(INST_EXECUTED),
    360    .queries[1]  = _SM(NOT_PRED_OFF_INST_EXECUTED),
    361    .num_queries = 2,
    362 };
    363 
    364 static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
    365 {
    366    &sm20_achieved_occupancy,
    367    &sm30_inst_issued,
    368    &sm20_inst_per_wrap,
    369    &sm30_inst_replay_overhead,
    370    &sm20_ipc,
    371    &sm30_issued_ipc,
    372    &sm30_issue_slots,
    373    &sm30_issue_slot_utilization,
    374    &sm30_shared_replay_overhead,
    375    &sm30_warp_execution_efficiency,
    376    &sm35_warp_nonpred_execution_efficiency,
    377 };
    378 
    379 /* ==== Compute capability 5.0 (GM107/GM108) ==== */
    380 static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
    381 {
    382    &sm20_achieved_occupancy,
    383    &sm20_branch_efficiency,
    384    &sm30_inst_issued,
    385    &sm20_inst_per_wrap,
    386    &sm30_inst_replay_overhead,
    387    &sm20_ipc,
    388    &sm30_issued_ipc,
    389    &sm30_issue_slots,
    390    &sm30_issue_slot_utilization,
    391    &sm30_warp_execution_efficiency,
    392    &sm35_warp_nonpred_execution_efficiency,
    393 };
    394 
    395 #undef _SM
    396 
    397 static inline const struct nvc0_hw_metric_query_cfg **
    398 nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
    399 {
    400    struct nouveau_device *dev = screen->base.device;
    401 
    402    switch (screen->base.class_3d) {
    403    case GM200_3D_CLASS:
    404    case GM107_3D_CLASS:
    405       return sm50_hw_metric_queries;
    406    case NVF0_3D_CLASS:
    407       return sm35_hw_metric_queries;
    408    case NVE4_3D_CLASS:
    409       return sm30_hw_metric_queries;
    410    default:
    411       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
    412          return sm20_hw_metric_queries;
    413       return sm21_hw_metric_queries;
    414    }
    415    assert(0);
    416    return NULL;
    417 }
    418 
    419 unsigned
    420 nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
    421 {
    422    struct nouveau_device *dev = screen->base.device;
    423 
    424    switch (screen->base.class_3d) {
    425    case GM200_3D_CLASS:
    426    case GM107_3D_CLASS:
    427       return ARRAY_SIZE(sm50_hw_metric_queries);
    428    case NVF0_3D_CLASS:
    429       return ARRAY_SIZE(sm35_hw_metric_queries);
    430    case NVE4_3D_CLASS:
    431       return ARRAY_SIZE(sm30_hw_metric_queries);
    432    default:
    433       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
    434          return ARRAY_SIZE(sm20_hw_metric_queries);
    435       return ARRAY_SIZE(sm21_hw_metric_queries);
    436    }
    437    return 0;
    438 }
    439 
    440 static const struct nvc0_hw_metric_query_cfg *
    441 nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
    442 {
    443    const struct nvc0_hw_metric_query_cfg **queries;
    444    struct nvc0_screen *screen = nvc0->screen;
    445    struct nvc0_query *q = &hq->base;
    446    unsigned num_queries;
    447    unsigned i;
    448 
    449    num_queries = nvc0_hw_metric_get_num_queries(screen);
    450    queries = nvc0_hw_metric_get_queries(screen);
    451 
    452    for (i = 0; i < num_queries; i++) {
    453       if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type)
    454          return queries[i];
    455    }
    456    assert(0);
    457    return NULL;
    458 }
    459 
    460 static void
    461 nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
    462                              struct nvc0_hw_query *hq)
    463 {
    464    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
    465    unsigned i;
    466 
    467    for (i = 0; i < hmq->num_queries; i++)
    468       if (hmq->queries[i]->funcs->destroy_query)
    469          hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
    470    FREE(hmq);
    471 }
    472 
    473 static boolean
    474 nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
    475 {
    476    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
    477    boolean ret = false;
    478    unsigned i;
    479 
    480    for (i = 0; i < hmq->num_queries; i++) {
    481       ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]);
    482       if (!ret)
    483          return ret;
    484    }
    485    return ret;
    486 }
    487 
    488 static void
    489 nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
    490 {
    491    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
    492    unsigned i;
    493 
    494    for (i = 0; i < hmq->num_queries; i++)
    495       hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]);
    496 }
    497 
    498 static uint64_t
    499 sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
    500 {
    501    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
    502    case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
    503       /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
    504       if (res64[1])
    505          return ((res64[0] / (double)res64[1]) / 48) * 100;
    506       break;
    507    case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
    508       /* (branch / (branch + divergent_branch)) * 100 */
    509       if (res64[0] + res64[1])
    510          return (res64[0] / (double)(res64[0] + res64[1])) * 100;
    511       break;
    512    case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
    513       /* inst_executed / warps_launched */
    514       if (res64[1])
    515          return res64[0] / (double)res64[1];
    516       break;
    517    case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
    518       /* (inst_issued - inst_executed) / inst_executed */
    519       if (res64[1])
    520          return (res64[0] - res64[1]) / (double)res64[1];
    521       break;
    522    case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
    523       /* inst_issued / active_cycles */
    524       if (res64[1])
    525          return res64[0] / (double)res64[1];
    526       break;
    527    case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
    528       /* ((inst_issued / 2) / active_cycles) * 100 */
    529       if (res64[1])
    530          return ((res64[0] / 2) / (double)res64[1]) * 100;
    531       break;
    532    case NVC0_HW_METRIC_QUERY_IPC:
    533       /* inst_executed / active_cycles */
    534       if (res64[1])
    535          return res64[0] / (double)res64[1];
    536       break;
    537    default:
    538       debug_printf("invalid metric type: %d\n",
    539                    hq->base.type - NVC0_HW_METRIC_QUERY(0));
    540       break;
    541    }
    542    return 0;
    543 }
    544 
    545 static uint64_t
    546 sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
    547 {
    548    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
    549    case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
    550       return sm20_hw_metric_calc_result(hq, res64);
    551    case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
    552       return sm20_hw_metric_calc_result(hq, res64);
    553    case NVC0_HW_METRIC_QUERY_INST_ISSUED:
    554       /* issued1_0 + issued1_1 + (issued2_0 + issued2_1) * 2 */
    555       return res64[0] + res64[1] + (res64[2] + res64[3]) * 2;
    556       break;
    557    case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
    558       return sm20_hw_metric_calc_result(hq, res64);
    559    case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
    560       /* (metric-inst_issued - inst_executed) / inst_executed */
    561       if (res64[4])
    562          return (((res64[0] + res64[1] + (res64[2] + res64[3]) * 2) -
    563                    res64[4]) / (double)res64[4]);
    564       break;
    565    case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
    566       /* metric-inst_issued / active_cycles */
    567       if (res64[4])
    568          return (res64[0] + res64[1] + (res64[2] + res64[3]) * 2) /
    569                 (double)res64[4];
    570       break;
    571    case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
    572       /* issued1_0 + issued1_1 + issued2_0 + issued2_1 */
    573       return res64[0] + res64[1] + res64[2] + res64[3];
    574       break;
    575    case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
    576       /* ((metric-issue_slots / 2) / active_cycles) * 100 */
    577       if (res64[4])
    578          return (((res64[0] + res64[1] + res64[2] + res64[3]) / 2) /
    579                  (double)res64[4]) * 100;
    580       break;
    581    case NVC0_HW_METRIC_QUERY_IPC:
    582       return sm20_hw_metric_calc_result(hq, res64);
    583    default:
    584       debug_printf("invalid metric type: %d\n",
    585                    hq->base.type - NVC0_HW_METRIC_QUERY(0));
    586       break;
    587    }
    588    return 0;
    589 }
    590 
    591 static uint64_t
    592 sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
    593 {
    594    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
    595    case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
    596       /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
    597       if (res64[1])
    598          return ((res64[0] / (double)res64[1]) / 64) * 100;
    599       break;
    600    case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
    601       return sm20_hw_metric_calc_result(hq, res64);
    602    case NVC0_HW_METRIC_QUERY_INST_ISSUED:
    603       /* inst_issued1 + inst_issued2 * 2 */
    604       return res64[0] + res64[1] * 2;
    605    case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
    606       return sm20_hw_metric_calc_result(hq, res64);
    607    case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
    608       /* (metric-inst_issued - inst_executed) / inst_executed */
    609       if (res64[2])
    610          return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]);
    611       break;
    612    case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
    613       /* metric-inst_issued / active_cycles */
    614       if (res64[2])
    615          return (res64[0] + res64[1] * 2) / (double)res64[2];
    616       break;
    617    case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
    618       /* inst_issued1 + inst_issued2 */
    619       return res64[0] + res64[1];
    620    case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
    621       /* ((metric-issue_slots / 2) / active_cycles) * 100 */
    622       if (res64[2])
    623          return (((res64[0] + res64[1]) / 2) / (double)res64[2]) * 100;
    624       break;
    625    case NVC0_HW_METRIC_QUERY_IPC:
    626       return sm20_hw_metric_calc_result(hq, res64);
    627    case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD:
    628       /* (shared_load_replay + shared_store_replay) / inst_executed */
    629       if (res64[2])
    630          return (res64[0] + res64[1]) / (double)res64[2];
    631       break;
    632    case NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY:
    633       /* thread_inst_executed / (inst_executed * max. number of threads per
    634        * wrap) * 100 */
    635       if (res64[0])
    636          return (res64[1] / ((double)res64[0] * 32)) * 100;
    637       break;
    638    default:
    639       debug_printf("invalid metric type: %d\n",
    640                    hq->base.type - NVC0_HW_METRIC_QUERY(0));
    641       break;
    642    }
    643    return 0;
    644 }
    645 
    646 static uint64_t
    647 sm35_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
    648 {
    649    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
    650    case NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY:
    651       /* not_predicated_off_thread_inst_executed / (inst_executed * max. number
    652        * of threads per wrap) * 100 */
    653       if (res64[0])
    654          return (res64[1] / ((double)res64[0] * 32)) * 100;
    655       break;
    656    default:
    657       return sm30_hw_metric_calc_result(hq, res64);
    658    }
    659    return 0;
    660 }
    661 
    662 static boolean
    663 nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
    664                                 struct nvc0_hw_query *hq, boolean wait,
    665                                 union pipe_query_result *result)
    666 {
    667    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
    668    struct nvc0_screen *screen = nvc0->screen;
    669    struct nouveau_device *dev = screen->base.device;
    670    union pipe_query_result results[8] = {};
    671    uint64_t res64[8] = {};
    672    uint64_t value = 0;
    673    boolean ret = false;
    674    unsigned i;
    675 
    676    for (i = 0; i < hmq->num_queries; i++) {
    677       ret = hmq->queries[i]->funcs->get_query_result(nvc0, hmq->queries[i],
    678                                                      wait, &results[i]);
    679       if (!ret)
    680          return ret;
    681       res64[i] = *(uint64_t *)&results[i];
    682    }
    683 
    684    switch (screen->base.class_3d) {
    685    case GM200_3D_CLASS:
    686    case GM107_3D_CLASS:
    687    case NVF0_3D_CLASS:
    688       value = sm35_hw_metric_calc_result(hq, res64);
    689       break;
    690    case NVE4_3D_CLASS:
    691       value = sm30_hw_metric_calc_result(hq, res64);
    692       break;
    693    default:
    694       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
    695          value = sm20_hw_metric_calc_result(hq, res64);
    696       else
    697          value = sm21_hw_metric_calc_result(hq, res64);
    698       break;
    699    }
    700 
    701    *(uint64_t *)result = value;
    702    return ret;
    703 }
    704 
    705 static const struct nvc0_hw_query_funcs hw_metric_query_funcs = {
    706    .destroy_query = nvc0_hw_metric_destroy_query,
    707    .begin_query = nvc0_hw_metric_begin_query,
    708    .end_query = nvc0_hw_metric_end_query,
    709    .get_query_result = nvc0_hw_metric_get_query_result,
    710 };
    711 
    712 struct nvc0_hw_query *
    713 nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type)
    714 {
    715    const struct nvc0_hw_metric_query_cfg *cfg;
    716    struct nvc0_hw_metric_query *hmq;
    717    struct nvc0_hw_query *hq;
    718    unsigned i;
    719 
    720    if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)
    721       return NULL;
    722 
    723    hmq = CALLOC_STRUCT(nvc0_hw_metric_query);
    724    if (!hmq)
    725       return NULL;
    726 
    727    hq = &hmq->base;
    728    hq->funcs = &hw_metric_query_funcs;
    729    hq->base.type = type;
    730 
    731    cfg = nvc0_hw_metric_query_get_cfg(nvc0, hq);
    732 
    733    for (i = 0; i < cfg->num_queries; i++) {
    734       hmq->queries[i] = nvc0_hw_sm_create_query(nvc0, cfg->queries[i]);
    735       if (!hmq->queries[i]) {
    736          nvc0_hw_metric_destroy_query(nvc0, hq);
    737          return NULL;
    738       }
    739       hmq->num_queries++;
    740    }
    741 
    742    return hq;
    743 }
    744 
    745 int
    746 nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
    747                                      struct pipe_driver_query_info *info)
    748 {
    749    int count = 0;
    750 
    751    if (screen->base.drm->version >= 0x01000101) {
    752       if (screen->compute)
    753          count = nvc0_hw_metric_get_num_queries(screen);
    754    }
    755 
    756    if (!info)
    757       return count;
    758 
    759    if (id < count) {
    760       if (screen->compute) {
    761          if (screen->base.class_3d <= GM200_3D_CLASS) {
    762             const struct nvc0_hw_metric_query_cfg **queries =
    763                nvc0_hw_metric_get_queries(screen);
    764             const struct nvc0_hw_metric_cfg *cfg =
    765                nvc0_hw_metric_get_cfg(queries[id]->type);
    766 
    767             info->name = cfg->name;
    768             info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type);
    769             info->type = cfg->type;
    770             info->group_id = NVC0_HW_METRIC_QUERY_GROUP;
    771             return 1;
    772          }
    773       }
    774    }
    775    return 0;
    776 }
    777