Home | History | Annotate | Download | only in nvc0
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  * Copyright 2015 Samuel Pitoiset
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      9  * and/or sell copies of the Software, and to permit persons to whom the
     10  * Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice shall be included in
     13  * all copies or substantial portions of the Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     21  * OTHER DEALINGS IN THE SOFTWARE.
     22  */
     23 
     24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
     25 
     26 #include "nvc0/nvc0_context.h"
     27 #include "nvc0/nvc0_query_hw_sm.h"
     28 
     29 #include "nv_object.xml.h"
     30 #include "nvc0/nve4_compute.xml.h"
     31 #include "nvc0/nvc0_compute.xml.h"
     32 
/* Name/description table for the hardware SM performance-counter queries.
 * Each entry pairs an NVC0_HW_SM_QUERY_* type with a short name and a
 * human-readable description string.
 *
 * NOTE: intentionally using the same names as NV
 */
/* _Q(t, n, d): build one table entry for query type NVC0_HW_SM_QUERY_<t>. */
#define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }
static const struct {
   unsigned type;    /* NVC0_HW_SM_QUERY_* identifier */
   const char *name; /* short name of the query */
   const char *desc; /* description of what the counter measures */
} nvc0_hw_sm_queries[] = {
   _Q(ACTIVE_CTAS,
      "active_ctas",
      "Accumulated number of active blocks per cycle. For every cycle it "
      "increments by the number of active blocks in the cycle which can be in "
      "the range 0 to 32."),

   _Q(ACTIVE_CYCLES,
      "active_cycles",
      "Number of cycles a multiprocessor has at least one active warp"),

   _Q(ACTIVE_WARPS,
      "active_warps",
      "Accumulated number of active warps per cycle. For every cycle it "
      "increments by the number of active warps in the cycle which can be in "
      "the range 0 to 64"),

   _Q(ATOM_CAS_COUNT,
      "atom_cas_count",
      "Number of warps executing atomic compare and swap operations. Increments "
      "by one if at least one thread in a warp executes the instruction."),

   _Q(ATOM_COUNT,
      "atom_count",
      "Number of warps executing atomic reduction operations. Increments by one "
      "if at least one thread in a warp executes the instruction"),

   _Q(BRANCH,
      "branch",
      "Number of branch instructions executed per warp on a multiprocessor"),

   _Q(DIVERGENT_BRANCH,
      "divergent_branch",
      "Number of divergent branches within a warp. This counter will be "
      "incremented by one if at least one thread in a warp diverges (that is, "
      "follows a different execution path) via a conditional branch"),

   _Q(GLD_REQUEST,
      "gld_request",
      "Number of executed load instructions where the state space is not "
      "specified and hence generic addressing is used, increments per warp on a "
      "multiprocessor. It can include the load operations from global,local and "
      "shared state space"),

   _Q(GLD_MEM_DIV_REPLAY,
      "global_ld_mem_divergence_replays",
      "Number of instruction replays for global memory loads. Instruction is "
      "replayed if the instruction is accessing more than one cache line of "
      "128 bytes. For each extra cache line access the counter is incremented "
      "by 1"),

   _Q(GLOBAL_ATOM_CAS,
      "global_atom_cas",
      "Number of ATOM.CAS instructions executed per warp."),

   _Q(GLOBAL_LD,
      "global_load",
      "Number of executed load instructions where state space is specified as "
      "global, increments per warp on a multiprocessor."),

   _Q(GLOBAL_ST,
      "global_store",
      "Number of executed store instructions where state space is specified as "
      "global, increments per warp on a multiprocessor."),

   _Q(GST_TRANSACTIONS,
      "global_store_transaction",
      "Number of global store transactions. Increments by 1 per transaction. "
      "Transaction can be 32/64/96/128B"),

   _Q(GST_MEM_DIV_REPLAY,
      "global_st_mem_divergence_replays",
      "Number of instruction replays for global memory stores. Instruction is "
      "replayed if the instruction is accessing more than one cache line of "
      "128 bytes. For each extra cache line access the counter is incremented "
      "by 1"),

   _Q(GRED_COUNT,
      "gred_count",
      "Number of warps executing reduction operations on global memory. "
      "Increments by one if at least one thread in a warp executes the "
      "instruction"),

   _Q(GST_REQUEST,
      "gst_request",
      "Number of executed store instructions where the state space is not "
      "specified and hence generic addressing is used, increments per warp on a "
      "multiprocessor. It can include the store operations to global,local and "
      "shared state space"),

   _Q(INST_EXECUTED,
      "inst_executed",
      "Number of instructions executed, do not include replays"),

   _Q(INST_ISSUED,
      "inst_issued",
      "Number of instructions issued including replays"),

   _Q(INST_ISSUED0,
      "inst_issued0",
      "Number of cycles that did not issue any instruction, increments per "
      "warp."),

   _Q(INST_ISSUED1,
      "inst_issued1",
      "Number of single instruction issued per cycle"),

   _Q(INST_ISSUED2,
      "inst_issued2",
      "Number of dual instructions issued per cycle"),

   _Q(INST_ISSUED1_0,
      "inst_issued1_0",
      "Number of single instruction issued per cycle in pipeline 0"),

   _Q(INST_ISSUED1_1,
      "inst_issued1_1",
      "Number of single instruction issued per cycle in pipeline 1"),

   _Q(INST_ISSUED2_0,
      "inst_issued2_0",
      "Number of dual instructions issued per cycle in pipeline 0"),

   _Q(INST_ISSUED2_1,
      "inst_issued2_1",
      "Number of dual instructions issued per cycle in pipeline 1"),

   _Q(L1_GLD_HIT,
      "l1_global_load_hit",
      "Number of cache lines that hit in L1 cache for global memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_GLD_MISS,
      "l1_global_load_miss",
      "Number of cache lines that miss in L1 cache for global memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_GLD_TRANSACTIONS,
      "__l1_global_load_transactions",
      "Number of global load transactions from L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B"),

   _Q(L1_GST_TRANSACTIONS,
      "__l1_global_store_transactions",
      "Number of global store transactions from L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B"),

   _Q(L1_LOCAL_LD_HIT,
      "l1_local_load_hit",
      "Number of cache lines that hit in L1 cache for local memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_LOCAL_LD_MISS,
      "l1_local_load_miss",
      "Number of cache lines that miss in L1 cache for local memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_LOCAL_ST_HIT,
      "l1_local_store_hit",
      "Number of cache lines that hit in L1 cache for local memory store "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_LOCAL_ST_MISS,
      "l1_local_store_miss",
      "Number of cache lines that miss in L1 cache for local memory store "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32,64 and 128 bit accesses by a warp respectively"),

   _Q(L1_SHARED_LD_TRANSACTIONS,
      "l1_shared_load_transactions",
      "Number of shared load transactions. Increments by 1 per transaction. "
      "Transaction can be 32/64/96/128B"),

   _Q(L1_SHARED_ST_TRANSACTIONS,
      "l1_shared_store_transactions",
      "Number of shared store transactions. Increments by 1 per transaction. "
      "Transaction can be 32/64/96/128B"),

   _Q(LOCAL_LD,
      "local_load",
      "Number of executed load instructions where state space is specified as "
      "local, increments per warp on a multiprocessor"),

   _Q(LOCAL_LD_TRANSACTIONS,
      "local_load_transactions",
      "Number of local load transactions from L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B"),

   _Q(LOCAL_ST,
      "local_store",
      "Number of executed store instructions where state space is specified as "
      "local, increments per warp on a multiprocessor"),

   _Q(LOCAL_ST_TRANSACTIONS,
      "local_store_transactions",
      "Number of local store transactions to L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B."),

   _Q(NOT_PRED_OFF_INST_EXECUTED,
      "not_predicated_off_thread_inst_executed",
      "Number of not predicated off instructions executed by all threads, does "
      "not include replays. For each instruction it increments by the number of "
      "threads that execute this instruction"),

   _Q(PROF_TRIGGER_0,
      "prof_trigger_00",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_1,
      "prof_trigger_01",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_2,
      "prof_trigger_02",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_3,
      "prof_trigger_03",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_4,
      "prof_trigger_04",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_5,
      "prof_trigger_05",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_6,
      "prof_trigger_06",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_7,
      "prof_trigger_07",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(SHARED_ATOM,
      "shared_atom",
      "Number of ATOMS instructions executed per warp."),

   _Q(SHARED_ATOM_CAS,
      "shared_atom_cas",
      "Number of ATOMS.CAS instructions executed per warp."),

   _Q(SHARED_LD,
      "shared_load",
      "Number of executed load instructions where state space is specified as "
      "shared, increments per warp on a multiprocessor"),

   _Q(SHARED_LD_BANK_CONFLICT,
      "shared_load_bank_conflict",
      "Number of shared load bank conflict generated when the addresses for "
      "two or more shared memory load requests fall in the same memory bank."),

   _Q(SHARED_LD_REPLAY,
      "shared_load_replay",
      "Replays caused due to shared load bank conflict (when the addresses for "
      "two or more shared memory load requests fall in the same memory bank) or "
      "when there is no conflict but the total number of words accessed by all "
      "threads in the warp executing that instruction exceed the number of words "
      "that can be loaded in one cycle (256 bytes)"),

   _Q(SHARED_LD_TRANSACTIONS,
      "shared_ld_transactions",
      "Number of transactions for shared load accesses. Maximum transaction "
      "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
      "will cause multiple transactions for a shared load instruction. This "
      "also includes extra transactions caused by shared bank conflicts."),

   _Q(SHARED_ST,
      "shared_store",
      "Number of executed store instructions where state space is specified as "
      "shared, increments per warp on a multiprocessor"),

   _Q(SHARED_ST_BANK_CONFLICT,
      "shared_store_bank_conflict",
      "Number of shared store bank conflict generated when the addresses for "
      "two or more shared memory store requests fall in the same memory bank."),

   _Q(SHARED_ST_REPLAY,
      "shared_store_replay",
      "Replays caused due to shared store bank conflict (when the addresses for "
      "two or more shared memory store requests fall in the same memory bank) or "
      "when there is no conflict but the total number of words accessed by all "
      "threads in the warp executing that instruction exceed the number of words "
      "that can be stored in one cycle"),

   _Q(SHARED_ST_TRANSACTIONS,
      "shared_st_transactions",
      "Number of transactions for shared store accesses. Maximum transaction "
      "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
      "will cause multiple transactions for a shared store instruction. This "
      "also includes extra transactions caused by shared bank conflicts."),

   _Q(SM_CTA_LAUNCHED,
      "sm_cta_launched",
      "Number of thread blocks launched on a multiprocessor"),

   _Q(THREADS_LAUNCHED,
      "threads_launched",
      "Number of threads launched on a multiprocessor"),

   _Q(TH_INST_EXECUTED,
      "thread_inst_executed",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction"),

   _Q(TH_INST_EXECUTED_0,
      "thread_inst_executed_0",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 0"),

   _Q(TH_INST_EXECUTED_1,
      "thread_inst_executed_1",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 1"),

   _Q(TH_INST_EXECUTED_2,
      "thread_inst_executed_2",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 2"),

   _Q(TH_INST_EXECUTED_3,
      "thread_inst_executed_3",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 3"),

   _Q(UNCACHED_GLD_TRANSACTIONS,
      "uncached_global_load_transaction",
      "Number of uncached global load transactions. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B."),

   _Q(WARPS_LAUNCHED,
      "warps_launched",
      "Number of warps launched on a multiprocessor"),
};

/* Drop the table-entry helper; _Q is defined again later with another shape. */
#undef _Q
    395 
    396 static inline const char *
    397 nvc0_hw_sm_query_get_name(unsigned query_type)
    398 {
    399    unsigned i;
    400 
    401    for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) {
    402       if (nvc0_hw_sm_queries[i].type == query_type)
    403          return nvc0_hw_sm_queries[i].name;
    404    }
    405    assert(0);
    406    return NULL;
    407 }
    408 
    409 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
    410 
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 *
 * The array holds the pre-assembled kernel; the comment inside it lists the
 * matching disassembly. Going by that listing:
 *  - $pm0..$pm7 are read into $r0..$r7;
 *  - only the thread with $tidx == 0 continues, all others exit;
 *  - the destination is the 64-bit address taken from c7[0x620]/c7[0x624]
 *    plus offsets computed from two bit-fields extracted from $physid;
 *  - $r0..$r3 are stored unconditionally, $r4..$r7 only when $p1 is set,
 *    and finally the 32-bit value read from c7[0x628] is stored at +0x50.
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c7[0x620]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c7[0x624]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c7[0x628]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x28005c1880029de4ULL,
   0x7000c01050c21c03ULL,
   0x28005c189002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x28005c18a0001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
    496 
/* Pre-assembled counter read-out kernel stored as raw 64-bit instruction
 * words; no disassembly is kept alongside this variant.
 * NOTE(review): the nvf0_ prefix suggests this is the NVF0 encoding, and the
 * "Same kernel as GK104" note presumably refers to the kernel disassembled in
 * nve4_read_hw_sm_counters_code - confirm before editing either blob.
 */
static const uint64_t nvf0_read_hw_sm_counters_code[] =
{
   /* Same kernel as GK104 */
   0x0880808080808080ULL,
   0x86400000109c0022ULL,
   0x86400000019c0032ULL,
   0x86400000021c0002ULL,
   0x86400000029c0006ULL,
   0x86400000031c000aULL,
   0x86400000039c000eULL,
   0x86400000041c0012ULL,
   0x08ac1080108c8080ULL,
   0x86400000049c0016ULL,
   0x86400000051c001aULL,
   0x86400000059c001eULL,
   0xdb201c007f9c201eULL,
   0x64c03ce0c41c002aULL,
   0xc00000020a1c3021ULL,
   0x64c03ce0c49c002eULL,
   0x0810a0808010b810ULL,
   0xc0000001041c3025ULL,
   0x180000000020003cULL,
   0xdb201c007f9c243eULL,
   0xc1c00000301c2021ULL,
   0xc1c00000081c2431ULL,
   0xc1c00000021c2435ULL,
   0xe0800000069c2026ULL,
   0x08b010b010b010a0ULL,
   0xe0800000061c2022ULL,
   0xe4c03c00051c0032ULL,
   0xe0840000041c282aULL,
   0xe4c03c00059c0036ULL,
   0xe08040007f9c2c2eULL,
   0xe0840000049c3032ULL,
   0xfe800000001c2800ULL,
   0x080000b81080b010ULL,
   0x64c03ce0c51c0002ULL,
   0xe08040007f9c3436ULL,
   0xfe80000020043010ULL,
   0xfc800000281c3000ULL,
   0x18000000001c003cULL,
};
    539 
/* Pre-assembled counter read-out kernel (gm107_ variant) stored as raw 64-bit
 * instruction words, each annotated with its disassembly.
 *
 * Per those annotations the kernel reads $tidx, $virtid and $pm0..$pm7,
 * exits unless $p0 (set from comparing $tidx with 0) holds, and stores the
 * counters relative to the 64-bit address from c7[0x620]/c7[0x624], followed
 * by the 32-bit value from c7[0x628] at +0x50. The store of $r4 onwards is
 * predicated on $p1.
 */
static const uint64_t gm107_read_hw_sm_counters_code[] =
{
   0x001d0400e4200701ULL, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2)  */
   0xf0c8000002170008ULL, /* mov $r8 $tidx                                          */
   0xf0c800000037000cULL, /* mov $r12 $virtid                                       */
   0xf0c8000000470000ULL, /* mov $r0 $pm0                                           */
   0x001e8400f0200761ULL, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wr 0x5)  */
   0xf0c8000000570001ULL, /* mov $r1 $pm1                                           */
   0xf0c8000000670002ULL, /* mov $r2 $pm2                                           */
   0xf0c8000000770003ULL, /* mov $r3 $pm3                                           */
   0x001e8400f42007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wr 0x5) (st 0x1 wr 0x5)  */
   0xf0c8000000870004ULL, /* mov $r4 $pm4                                           */
   0xf0c8000000970005ULL, /* mov $r5 $pm5                                           */
   0xf0c8000000a70006ULL, /* mov $r6 $pm6                                           */
   0x001f8401fc2007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wt 0x1) (st 0x1)         */
   0xf0c8000000b70007ULL, /* mov $r7 $pm7                                           */
   0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1                   */
   0x4c98079c1887000aULL, /* mov $r10 c7[0x620] 0xf                                 */
   0x001fa400fc2017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x1) (st 0x9)                */
   0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914                                 */
   0x4c98079c1897000bULL, /* mov $r11 c7[0x624] 0xf                                 */
   0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208                                 */
   0x001c1800fc2007edULL, /* sched (st 0xd) (st 0x1) (st 0x6 wr 0x0)                */
   0xe30000000008000fULL, /* not $p0 exit                                           */
   0x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1                   */
   0x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60                              */
   0x003f8400e0c00726ULL, /* sched (st 0x6 wr 0x1) (st 0x6 wr 0x0) (st 0x1 wt 0x1)  */
   0x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10                             */
   0x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4                              */
   0x5c10000000d70809ULL, /* iadd $r9 $r8 $r13                                      */
   0x001f8400fcc017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x6) (st 0x1)                */
   0x5c10000000c70808ULL, /* iadd $r8 $r8 $r12                                      */
   0x5c98078000a7000cULL, /* mov $r12 $r10 0xf                                      */
   0x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8                                  */
   0x001f8400fc2007e6ULL, /* sched (st 0x6) (st 0x1) (st 0x1)                       */
   0x5c98078000b7000dULL, /* mov $r13 $r11 0xf                                      */
   0x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0                                   */
   0x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9                                  */
   0x003f983c1c4007e1ULL, /* sched (st 0x1) (st 0x2 rd 0x0 wt 0x3c) (st 0x6 wt 0x1) */
   0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0                                   */
   0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1                           */
   0x4c98079c18a70000ULL, /* mov $r0 c7[0x628] 0xf                                  */
   0x001fbc00fc2007e6ULL, /* sched (st 0x1) (st 0x1) (st 0xf)                       */
   0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1                  */
   0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1                       */
   0xe30000000007000fULL, /* exit                                                   */
};
    587 
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Configuration of a single hardware counter.
 *
 * Member order is significant: the _CA/_CB helper macros initialise this
 * struct positionally (func, mode, sig_dom, sig_sel, src_mask, src_sel).
 */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
    603 
/* Describes how one query is built out of hardware counters. */
struct nvc0_hw_sm_query_cfg
{
   unsigned type; /* query identifier; the configs below use NVC0_HW_SM_QUERY_* */
   struct nvc0_hw_sm_counter_cfg ctr[8]; /* per-counter configuration slots */
   uint8_t num_counters; /* presumably the number of ctr[] slots in use - confirm */
   uint8_t norm[2]; /* normalization num,denom */
};
    611 
/* Positional initialisers for struct nvc0_hw_sm_counter_cfg:
 *   f -> func, m -> mode (expanded to NVE4_COMPUTE_MP_PM_FUNC_MODE_<m>),
 *   g -> sig_sel (signal group), s -> src_sel; src_mask is always 0.
 * _CA sets sig_dom to 0 (MP_PM_A), _CB sets sig_dom to 1 (MP_PM_B).
 */
#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, g, 0, s }
#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, g, 0, s }
/* Designated array initialiser: places c at index NVE4_HW_SM_QUERY_<n>. */
#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
    615 
    616 /* ==== Compute capability 3.0 (GK104:GK110) ==== */
/* SM 3.0 config for the ACTIVE_CYCLES query: one counter in signal domain 1
 * (MP_PM_B), mode B6, func 0x0001, signal group 0x02, src_sel 0; norm 1/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_active_cycles =
{
   .type         = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
   .ctr[0]       = _CB(0x0001, B6, 0x02, 0x00000000),
   .num_counters = 1,
   .norm         = { 1, 1 },
};
    625 
/* SM 3.0 config for the ACTIVE_WARPS query: one counter in signal domain 1
 * (MP_PM_B), mode B6, func 0x003f, signal group 0x02, src_sel 0x31483104.
 * Unlike most configs here, the normalization pair is 2/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_active_warps =
{
   .type         = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
   .ctr[0]       = _CB(0x003f, B6, 0x02, 0x31483104),
   .num_counters = 1,
   .norm         = { 2, 1 },
};
    634 
    635 static const struct nvc0_hw_sm_query_cfg
    636 sm30_atom_cas_count =
    637 {
    638    .type         = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
    639    .ctr[0]       = _CA(0x0001, B6, 0x1c, 0x000000004),
    640    .num_counters = 1,
    641    .norm         = { 1, 1 },
    642 };
    643 
/* SM 3.0 config for the ATOM_COUNT query: one counter in signal domain 0
 * (MP_PM_A), mode B6, func 0x0001, signal group 0x1c, src_sel 0; norm 1/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_atom_count =
{
   .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
   .ctr[0]       = _CA(0x0001, B6, 0x1c, 0x00000000),
   .num_counters = 1,
   .norm         = { 1, 1 },
};
    652 
/* SM 3.0 config for the BRANCH query: one counter in signal domain 0
 * (MP_PM_A), mode B6, func 0x0001, signal group 0x1c, src_sel 0xc; norm 1/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_branch =
{
   .type         = NVC0_HW_SM_QUERY_BRANCH,
   .ctr[0]       = _CA(0x0001, B6, 0x1c, 0x0000000c),
   .num_counters = 1,
   .norm         = { 1, 1 },
};
    661 
/* SM 3.0 config for the DIVERGENT_BRANCH query: one counter in signal domain
 * 0 (MP_PM_A), mode B6, func 0x0001, signal group 0x1c, src_sel 0x10;
 * norm 1/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_divergent_branch =
{
   .type         = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
   .ctr[0]       = _CA(0x0001, B6, 0x1c, 0x00000010),
   .num_counters = 1,
   .norm         = { 1, 1 },
};
    670 
/* SM 3.0 config for the GLD_REQUEST query: one counter in signal domain 0
 * (MP_PM_A), mode B6, func 0x0001, signal group 0x1b, src_sel 0x10; norm 1/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_gld_request =
{
   .type         = NVC0_HW_SM_QUERY_GLD_REQUEST,
   .ctr[0]       = _CA(0x0001, B6, 0x1b, 0x00000010),
   .num_counters = 1,
   .norm         = { 1, 1 },
};
    679 
/* SM 3.0 config for the GLD_MEM_DIV_REPLAY query: one counter in signal
 * domain 1 (MP_PM_B), mode B6, func 0x0001, signal group 0x08, src_sel 0x10;
 * norm 1/1.
 */
static const struct nvc0_hw_sm_query_cfg
sm30_gld_mem_div_replay =
{
   .type         = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
   .ctr[0]       = _CB(0x0001, B6, 0x08, 0x00000010),
   .num_counters = 1,
   .norm         = { 1, 1 },
};
    688 
    /* sm30 variant of the GST_TRANSACTIONS query config: one counter set up via
     * _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    689 static const struct nvc0_hw_sm_query_cfg
    690 sm30_gst_transactions =
    691 {
    692    .type         = NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
    693    .ctr[0]       = _CB(0x0001, B6, 0x11, 0x00000004),
    694    .num_counters = 1,
    695    .norm         = { 1, 1 },
    696 };
    697 
    /* sm30 variant of the GST_MEM_DIV_REPLAY query config: one counter set up
     * via _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    698 static const struct nvc0_hw_sm_query_cfg
    699 sm30_gst_mem_div_replay =
    700 {
    701    .type         = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
    702    .ctr[0]       = _CB(0x0001, B6, 0x08, 0x00000014),
    703    .num_counters = 1,
    704    .norm         = { 1, 1 },
    705 };
    706 
    /* sm30 variant of the GRED_COUNT query config: one counter set up via _CA.
     * Listed in sm30_hw_sm_queries only; the sm35 list uses sm35_gred_count. */
    707 static const struct nvc0_hw_sm_query_cfg
    708 sm30_gred_count =
    709 {
    710    .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
    711    .ctr[0]       = _CA(0x0001, B6, 0x1c, 0x00000008),
    712    .num_counters = 1,
    713    .norm         = { 1, 1 },
    714 };
    715 
    /* sm30 variant of the GST_REQUEST query config: one counter set up via _CA.
     * Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    716 static const struct nvc0_hw_sm_query_cfg
    717 sm30_gst_request =
    718 {
    719    .type         = NVC0_HW_SM_QUERY_GST_REQUEST,
    720    .ctr[0]       = _CA(0x0001, B6, 0x1b, 0x00000014),
    721    .num_counters = 1,
    722    .norm         = { 1, 1 },
    723 };
    724 
    /* sm30 variant of the INST_EXECUTED query config: one counter set up via
     * _CA, with 0x0003 as the first argument where most single-counter configs
     * pass 0x0001.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries.  See
     * the NOTES comment above sm30_hw_sm_queries about inst_executed counting
     * a single warp scheduler. */
    725 static const struct nvc0_hw_sm_query_cfg
    726 sm30_inst_executed =
    727 {
    728    .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
    729    .ctr[0]       = _CA(0x0003, B6, 0x04, 0x00000398),
    730    .num_counters = 1,
    731    .norm         = { 1, 1 },
    732 };
    733 
    /* sm30 variant of the INST_ISSUED1 query config: one counter set up via
     * _CA.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    734 static const struct nvc0_hw_sm_query_cfg
    735 sm30_inst_issued1 =
    736 {
    737    .type         = NVC0_HW_SM_QUERY_INST_ISSUED1,
    738    .ctr[0]       = _CA(0x0001, B6, 0x05, 0x00000004),
    739    .num_counters = 1,
    740    .norm         = { 1, 1 },
    741 };
    742 
    /* sm30 variant of the INST_ISSUED2 query config: same _CA arguments as
     * sm30_inst_issued1 except the last (0x08 instead of 0x04).  Shared by
     * sm30_hw_sm_queries and sm35_hw_sm_queries. */
    743 static const struct nvc0_hw_sm_query_cfg
    744 sm30_inst_issued2 =
    745 {
    746    .type         = NVC0_HW_SM_QUERY_INST_ISSUED2,
    747    .ctr[0]       = _CA(0x0001, B6, 0x05, 0x00000008),
    748    .num_counters = 1,
    749    .norm         = { 1, 1 },
    750 };
    751 
    /* sm30 variant of the L1_GLD_HIT query config: one counter set up via _CB.
     * Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    752 static const struct nvc0_hw_sm_query_cfg
    753 sm30_l1_gld_hit =
    754 {
    755    .type         = NVC0_HW_SM_QUERY_L1_GLD_HIT,
    756    .ctr[0]       = _CB(0x0001, B6, 0x10, 0x00000010),
    757    .num_counters = 1,
    758    .norm         = { 1, 1 },
    759 };
    760 
    /* sm30 variant of the L1_GLD_MISS query config: same _CB arguments as
     * sm30_l1_gld_hit except the last (0x14 instead of 0x10).  Shared by
     * sm30_hw_sm_queries and sm35_hw_sm_queries. */
    761 static const struct nvc0_hw_sm_query_cfg
    762 sm30_l1_gld_miss =
    763 {
    764    .type         = NVC0_HW_SM_QUERY_L1_GLD_MISS,
    765    .ctr[0]       = _CB(0x0001, B6, 0x10, 0x00000014),
    766    .num_counters = 1,
    767    .norm         = { 1, 1 },
    768 };
    769 
    /* sm30 variant of the L1_GLD_TRANSACTIONS query config: one counter set up
     * via _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    770 static const struct nvc0_hw_sm_query_cfg
    771 sm30_l1_gld_transactions =
    772 {
    773    .type         = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS,
    774    .ctr[0]       = _CB(0x0001, B6, 0x0f, 0x00000000),
    775    .num_counters = 1,
    776    .norm         = { 1, 1 },
    777 };
    778 
    /* sm30 variant of the L1_GST_TRANSACTIONS query config: same _CB arguments
     * as sm30_l1_gld_transactions except the last (0x04 instead of 0x00).
     * Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    779 static const struct nvc0_hw_sm_query_cfg
    780 sm30_l1_gst_transactions =
    781 {
    782    .type         = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS,
    783    .ctr[0]       = _CB(0x0001, B6, 0x0f, 0x00000004),
    784    .num_counters = 1,
    785    .norm         = { 1, 1 },
    786 };
    787 
    /* sm30 variant of the L1_LOCAL_LD_HIT query config: one counter set up via
     * _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    788 static const struct nvc0_hw_sm_query_cfg
    789 sm30_l1_local_ld_hit =
    790 {
    791    .type         = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT,
    792    .ctr[0]       = _CB(0x0001, B6, 0x10, 0x00000000),
    793    .num_counters = 1,
    794    .norm         = { 1, 1 },
    795 };
    796 
    /* sm30 variant of the L1_LOCAL_LD_MISS query config: same _CB arguments as
     * sm30_l1_local_ld_hit except the last (0x04 instead of 0x00).  Shared by
     * sm30_hw_sm_queries and sm35_hw_sm_queries. */
    797 static const struct nvc0_hw_sm_query_cfg
    798 sm30_l1_local_ld_miss =
    799 {
    800    .type         = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS,
    801    .ctr[0]       = _CB(0x0001, B6, 0x10, 0x00000004),
    802    .num_counters = 1,
    803    .norm         = { 1, 1 },
    804 };
    805 
    /* sm30 variant of the L1_LOCAL_ST_HIT query config: one counter set up via
     * _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    806 static const struct nvc0_hw_sm_query_cfg
    807 sm30_l1_local_st_hit =
    808 {
    809    .type         = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT,
    810    .ctr[0]       = _CB(0x0001, B6, 0x10, 0x00000008),
    811    .num_counters = 1,
    812    .norm         = { 1, 1 },
    813 };
    814 
    /* sm30 variant of the L1_LOCAL_ST_MISS query config: same _CB arguments as
     * sm30_l1_local_st_hit except the last (0x0c instead of 0x08).  Shared by
     * sm30_hw_sm_queries and sm35_hw_sm_queries. */
    815 static const struct nvc0_hw_sm_query_cfg
    816 sm30_l1_local_st_miss =
    817 {
    818    .type         = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS,
    819    .ctr[0]       = _CB(0x0001, B6, 0x10, 0x0000000c),
    820    .num_counters = 1,
    821    .norm         = { 1, 1 },
    822 };
    823 
    /* sm30 variant of the L1_SHARED_LD_TRANSACTIONS query config: one counter
     * set up via _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    824 static const struct nvc0_hw_sm_query_cfg
    825 sm30_l1_shared_ld_transactions =
    826 {
    827    .type         = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
    828    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000008),
    829    .num_counters = 1,
    830    .norm         = { 1, 1 },
    831 };
    832 
    /* sm30 variant of the L1_SHARED_ST_TRANSACTIONS query config: same _CB
     * arguments as sm30_l1_shared_ld_transactions except the last (0x0c instead
     * of 0x08).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    833 static const struct nvc0_hw_sm_query_cfg
    834 sm30_l1_shared_st_transactions =
    835 {
    836    .type         = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
    837    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x0000000c),
    838    .num_counters = 1,
    839    .norm         = { 1, 1 },
    840 };
    841 
    /* sm30 variant of the LOCAL_LD query config: one counter set up via _CA.
     * Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    842 static const struct nvc0_hw_sm_query_cfg
    843 sm30_local_ld =
    844 {
    845    .type         = NVC0_HW_SM_QUERY_LOCAL_LD,
    846    .ctr[0]       = _CA(0x0001, B6, 0x1b, 0x00000008),
    847    .num_counters = 1,
    848    .norm         = { 1, 1 },
    849 };
    850 
    /* sm30 variant of the LOCAL_LD_TRANSACTIONS query config: one counter set
     * up via _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    851 static const struct nvc0_hw_sm_query_cfg
    852 sm30_local_ld_transactions =
    853 {
    854    .type         = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
    855    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000000),
    856    .num_counters = 1,
    857    .norm         = { 1, 1 },
    858 };
    859 
    /* sm30 variant of the LOCAL_ST query config: same _CA arguments as
     * sm30_local_ld except the last (0x0c instead of 0x08).  Shared by
     * sm30_hw_sm_queries and sm35_hw_sm_queries. */
    860 static const struct nvc0_hw_sm_query_cfg
    861 sm30_local_st =
    862 {
    863    .type         = NVC0_HW_SM_QUERY_LOCAL_ST,
    864    .ctr[0]       = _CA(0x0001, B6, 0x1b, 0x0000000c),
    865    .num_counters = 1,
    866    .norm         = { 1, 1 },
    867 };
    868 
    /* sm30 variant of the LOCAL_ST_TRANSACTIONS query config: same _CB
     * arguments as sm30_local_ld_transactions except the last (0x04 instead of
     * 0x00).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    869 static const struct nvc0_hw_sm_query_cfg
    870 sm30_local_st_transactions =
    871 {
    872    .type         = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
    873    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000004),
    874    .num_counters = 1,
    875    .norm         = { 1, 1 },
    876 };
    877 
    /* sm30 variant of the PROF_TRIGGER_0 query config.  The eight
     * sm30_prof_trigger_N configs differ only in .type and in the last _CA
     * argument, which is 4 * N (0x00 here).  Shared by sm30_hw_sm_queries and
     * sm35_hw_sm_queries. */
    878 static const struct nvc0_hw_sm_query_cfg
    879 sm30_prof_trigger_0 =
    880 {
    881    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
    882    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x00000000),
    883    .num_counters = 1,
    884    .norm         = { 1, 1 },
    885 };
    886 
    /* sm30 variant of the PROF_TRIGGER_1 query config (last _CA argument
     * 0x04).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    887 static const struct nvc0_hw_sm_query_cfg
    888 sm30_prof_trigger_1 =
    889 {
    890    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
    891    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x00000004),
    892    .num_counters = 1,
    893    .norm         = { 1, 1 },
    894 };
    895 
    /* sm30 variant of the PROF_TRIGGER_2 query config (last _CA argument
     * 0x08).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    896 static const struct nvc0_hw_sm_query_cfg
    897 sm30_prof_trigger_2 =
    898 {
    899    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
    900    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x00000008),
    901    .num_counters = 1,
    902    .norm         = { 1, 1 },
    903 };
    904 
    /* sm30 variant of the PROF_TRIGGER_3 query config (last _CA argument
     * 0x0c).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    905 static const struct nvc0_hw_sm_query_cfg
    906 sm30_prof_trigger_3 =
    907 {
    908    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
    909    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x0000000c),
    910    .num_counters = 1,
    911    .norm         = { 1, 1 },
    912 };
    913 
    /* sm30 variant of the PROF_TRIGGER_4 query config (last _CA argument
     * 0x10).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    914 static const struct nvc0_hw_sm_query_cfg
    915 sm30_prof_trigger_4 =
    916 {
    917    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
    918    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x00000010),
    919    .num_counters = 1,
    920    .norm         = { 1, 1 },
    921 };
    922 
    /* sm30 variant of the PROF_TRIGGER_5 query config (last _CA argument
     * 0x14).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    923 static const struct nvc0_hw_sm_query_cfg
    924 sm30_prof_trigger_5 =
    925 {
    926    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
    927    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x00000014),
    928    .num_counters = 1,
    929    .norm         = { 1, 1 },
    930 };
    931 
    /* sm30 variant of the PROF_TRIGGER_6 query config (last _CA argument
     * 0x18).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    932 static const struct nvc0_hw_sm_query_cfg
    933 sm30_prof_trigger_6 =
    934 {
    935    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
    936    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x00000018),
    937    .num_counters = 1,
    938    .norm         = { 1, 1 },
    939 };
    940 
    /* sm30 variant of the PROF_TRIGGER_7 query config (last _CA argument
     * 0x1c).  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    941 static const struct nvc0_hw_sm_query_cfg
    942 sm30_prof_trigger_7 =
    943 {
    944    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
    945    .ctr[0]       = _CA(0x0001, B6, 0x01, 0x0000001c),
    946    .num_counters = 1,
    947    .norm         = { 1, 1 },
    948 };
    949 
    /* sm30 variant of the SHARED_LD query config: one counter set up via _CA.
     * Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    950 static const struct nvc0_hw_sm_query_cfg
    951 sm30_shared_ld =
    952 {
    953    .type         = NVC0_HW_SM_QUERY_SHARED_LD,
    954    .ctr[0]       = _CA(0x0001, B6, 0x1b, 0x00000000),
    955    .num_counters = 1,
    956    .norm         = { 1, 1 },
    957 };
    958 
    /* sm30 variant of the SHARED_LD_REPLAY query config: one counter set up via
     * _CB.  Listed in sm30_hw_sm_queries only; the sm35 list uses the
     * two-counter sm35_shared_ld_replay instead. */
    959 static const struct nvc0_hw_sm_query_cfg
    960 sm30_shared_ld_replay =
    961 {
    962    .type         = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
    963    .ctr[0]       = _CB(0x0001, B6, 0x08, 0x00000008),
    964    .num_counters = 1,
    965    .norm         = { 1, 1 },
    966 };
    967 
    /* sm30 variant of the SHARED_ST query config: same _CA arguments as
     * sm30_shared_ld except the last (0x04 instead of 0x00).  Shared by
     * sm30_hw_sm_queries and sm35_hw_sm_queries. */
    968 static const struct nvc0_hw_sm_query_cfg
    969 sm30_shared_st =
    970 {
    971    .type         = NVC0_HW_SM_QUERY_SHARED_ST,
    972    .ctr[0]       = _CA(0x0001, B6, 0x1b, 0x00000004),
    973    .num_counters = 1,
    974    .norm         = { 1, 1 },
    975 };
    976 
    /* sm30 variant of the SHARED_ST_REPLAY query config: one counter set up via
     * _CB.  Listed in sm30_hw_sm_queries only; the sm35 list uses the
     * two-counter sm35_shared_st_replay instead. */
    977 static const struct nvc0_hw_sm_query_cfg
    978 sm30_shared_st_replay =
    979 {
    980    .type         = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
    981    .ctr[0]       = _CB(0x0001, B6, 0x08, 0x0000000c),
    982    .num_counters = 1,
    983    .norm         = { 1, 1 },
    984 };
    985 
    /* sm30 variant of the SM_CTA_LAUNCHED query config: one counter set up via
     * _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    986 static const struct nvc0_hw_sm_query_cfg
    987 sm30_sm_cta_launched =
    988 {
    989    .type         = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
    990    .ctr[0]       = _CB(0x0001, B6, 0x02, 0x0000001c),
    991    .num_counters = 1,
    992    .norm         = { 1, 1 },
    993 };
    994 
    /* sm30 variant of the THREADS_LAUNCHED query config: one counter set up via
     * _CA, with 0x003f as the first argument where most single-counter configs
     * pass 0x0001.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
    995 static const struct nvc0_hw_sm_query_cfg
    996 sm30_threads_launched =
    997 {
    998    .type         = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
    999    .ctr[0]       = _CA(0x003f, B6, 0x03, 0x398a4188),
   1000    .num_counters = 1,
   1001    .norm         = { 1, 1 },
   1002 };
   1003 
   /* sm30 variant of the UNCACHED_GLD_TRANSACTIONS query config: one counter
    * set up via _CB.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
   1004 static const struct nvc0_hw_sm_query_cfg
   1005 sm30_uncached_gld_transactions =
   1006 {
   1007    .type         = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
   1008    .ctr[0]       = _CB(0x0001, B6, 0x11, 0x00000000),
   1009    .num_counters = 1,
   1010    .norm         = { 1, 1 },
   1011 };
   1012 
   /* sm30 variant of the WARPS_LAUNCHED query config: one counter set up via
    * _CA.  Shared by sm30_hw_sm_queries and sm35_hw_sm_queries. */
   1013 static const struct nvc0_hw_sm_query_cfg
   1014 sm30_warps_launched =
   1015 {
   1016    .type         = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   1017    .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000004),
   1018    .num_counters = 1,
   1019    .norm         = { 1, 1 },
   1020 };
   1021 
   1022 /* NOTES:
   1023  * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
   1024  * inst_executed etc.: we only count a single warp scheduler
   1025  */
   /* Table of pointers to every sm30_* query config.  All entries point at
    * file-scope constants; sm30_active_cycles is defined earlier in the file,
    * the rest directly above. */
   1026 static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] =
   1027 {
   1028    &sm30_active_cycles,
   1029    &sm30_active_warps,
   1030    &sm30_atom_cas_count,
   1031    &sm30_atom_count,
   1032    &sm30_branch,
   1033    &sm30_divergent_branch,
   1034    &sm30_gld_request,
   1035    &sm30_gld_mem_div_replay,
   1036    &sm30_gst_transactions,
   1037    &sm30_gst_mem_div_replay,
   1038    &sm30_gred_count,
   1039    &sm30_gst_request,
   1040    &sm30_inst_executed,
   1041    &sm30_inst_issued1,
   1042    &sm30_inst_issued2,
   1043    &sm30_l1_gld_hit,
   1044    &sm30_l1_gld_miss,
   1045    &sm30_l1_gld_transactions,
   1046    &sm30_l1_gst_transactions,
   1047    &sm30_l1_local_ld_hit,
   1048    &sm30_l1_local_ld_miss,
   1049    &sm30_l1_local_st_hit,
   1050    &sm30_l1_local_st_miss,
   1051    &sm30_l1_shared_ld_transactions,
   1052    &sm30_l1_shared_st_transactions,
   1053    &sm30_local_ld,
   1054    &sm30_local_ld_transactions,
   1055    &sm30_local_st,
   1056    &sm30_local_st_transactions,
   1057    &sm30_prof_trigger_0,
   1058    &sm30_prof_trigger_1,
   1059    &sm30_prof_trigger_2,
   1060    &sm30_prof_trigger_3,
   1061    &sm30_prof_trigger_4,
   1062    &sm30_prof_trigger_5,
   1063    &sm30_prof_trigger_6,
   1064    &sm30_prof_trigger_7,
   1065    &sm30_shared_ld,
   1066    &sm30_shared_ld_replay,
   1067    &sm30_shared_st,
   1068    &sm30_shared_st_replay,
   1069    &sm30_sm_cta_launched,
   1070    &sm30_threads_launched,
   1071    &sm30_uncached_gld_transactions,
   1072    &sm30_warps_launched,
   1073 };
   1074 
   1075 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
   /* sm35 variant of the ATOM_CAS_COUNT query config: one counter set up via
    * _CA.  Takes the place of sm30_atom_cas_count in sm35_hw_sm_queries. */
   1076 static const struct nvc0_hw_sm_query_cfg
   1077 sm35_atom_cas_count =
   1078 {
   1079    .type         = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
   1080    .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000014),
   1081    .num_counters = 1,
   1082    .norm         = { 1, 1 },
   1083 };
   1084 
   /* sm35 variant of the ATOM_COUNT query config: one counter set up via _CA.
    * Takes the place of sm30_atom_count in sm35_hw_sm_queries. */
   1085 static const struct nvc0_hw_sm_query_cfg
   1086 sm35_atom_count =
   1087 {
   1088    .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
   1089    .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000010),
   1090    .num_counters = 1,
   1091    .norm         = { 1, 1 },
   1092 };
   1093 
   /* sm35 variant of the GRED_COUNT query config: one counter set up via _CA.
    * Takes the place of sm30_gred_count in sm35_hw_sm_queries. */
   1094 static const struct nvc0_hw_sm_query_cfg
   1095 sm35_gred_count =
   1096 {
   1097    .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
   1098    .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000018),
   1099    .num_counters = 1,
   1100    .norm         = { 1, 1 },
   1101 };
   1102 
   /* sm35 config for the NOT_PRED_OFF_INST_EXECUTED query, which has no sm30
    * counterpart in sm30_hw_sm_queries.  Differs from sm35_th_inst_executed
    * only in .type and the third _CA argument (0x14 vs 0x11). */
   1103 static const struct nvc0_hw_sm_query_cfg
   1104 sm35_not_pred_off_inst_executed =
   1105 {
   1106    .type         = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
   1107    .ctr[0]       = _CA(0x003f, B6, 0x14, 0x29062080),
   1108    .num_counters = 1,
   1109    .norm         = { 1, 1 },
   1110 };
   1111 
   /* sm35 variant of the SHARED_LD_REPLAY query config.  Unlike the
    * single-counter configs it fills two counters (num_counters = 2), and both
    * pass LOGOP as the second _CB argument where the others pass B6.  It
    * differs from sm35_shared_st_replay only in .type and the last argument of
    * ctr[1] (0x151 vs 0x1d1). */
   1112 static const struct nvc0_hw_sm_query_cfg
   1113 sm35_shared_ld_replay =
   1114 {
   1115    .type         = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
   1116    .ctr[0]       = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),
   1117    .ctr[1]       = _CB(0x8888, LOGOP, 0x08, 0x00000151),
   1118    .num_counters = 2,
   1119    .norm         = { 1, 1 },
   1120 };
   1121 
   /* sm35 variant of the SHARED_ST_REPLAY query config.  Unlike the
    * single-counter configs it fills two counters (num_counters = 2), and both
    * pass LOGOP as the second _CB argument where the others pass B6.  It
    * differs from sm35_shared_ld_replay only in .type and the last argument of
    * ctr[1] (0x1d1 vs 0x151). */
   1122 static const struct nvc0_hw_sm_query_cfg
   1123 sm35_shared_st_replay =
   1124 {
   1125    .type         = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
   1126    .ctr[0]       = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),
   1127    .ctr[1]       = _CB(0x8888, LOGOP, 0x08, 0x000001d1),
   1128    .num_counters = 2,
   1129    .norm         = { 1, 1 },
   1130 };
   1131 
   /* sm35 config for the TH_INST_EXECUTED query, which has no sm30 counterpart
    * in sm30_hw_sm_queries.  Differs from sm35_not_pred_off_inst_executed only
    * in .type and the third _CA argument (0x11 vs 0x14). */
   1132 static const struct nvc0_hw_sm_query_cfg
   1133 sm35_th_inst_executed =
   1134 {
   1135    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
   1136    .ctr[0]       = _CA(0x003f, B6, 0x11, 0x29062080),
   1137    .num_counters = 1,
   1138    .norm         = { 1, 1 },
   1139 };
   1140 
   /* Table of pointers to the query configs for sm35.  Most entries reuse the
    * sm30_* configs.  sm35_* versions replace atom_cas_count, atom_count,
    * gred_count, shared_ld_replay and shared_st_replay, and
    * not_pred_off_inst_executed and th_inst_executed are added.  Compared with
    * sm30_hw_sm_queries there is no branch or divergent_branch entry. */
   1141 static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
   1142 {
   1143    &sm30_active_cycles,
   1144    &sm30_active_warps,
   1145    &sm35_atom_cas_count,
   1146    &sm35_atom_count,
   1147    &sm30_gld_request,
   1148    &sm30_gld_mem_div_replay,
   1149    &sm30_gst_transactions,
   1150    &sm30_gst_mem_div_replay,
   1151    &sm35_gred_count,
   1152    &sm30_gst_request,
   1153    &sm30_inst_executed,
   1154    &sm30_inst_issued1,
   1155    &sm30_inst_issued2,
   1156    &sm30_l1_gld_hit,
   1157    &sm30_l1_gld_miss,
   1158    &sm30_l1_gld_transactions,
   1159    &sm30_l1_gst_transactions,
   1160    &sm30_l1_local_ld_hit,
   1161    &sm30_l1_local_ld_miss,
   1162    &sm30_l1_local_st_hit,
   1163    &sm30_l1_local_st_miss,
   1164    &sm30_l1_shared_ld_transactions,
   1165    &sm30_l1_shared_st_transactions,
   1166    &sm30_local_ld,
   1167    &sm30_local_ld_transactions,
   1168    &sm30_local_st,
   1169    &sm30_local_st_transactions,
   1170    &sm35_not_pred_off_inst_executed,
   1171    &sm30_prof_trigger_0,
   1172    &sm30_prof_trigger_1,
   1173    &sm30_prof_trigger_2,
   1174    &sm30_prof_trigger_3,
   1175    &sm30_prof_trigger_4,
   1176    &sm30_prof_trigger_5,
   1177    &sm30_prof_trigger_6,
   1178    &sm30_prof_trigger_7,
   1179    &sm30_shared_ld,
   1180    &sm35_shared_ld_replay,
   1181    &sm30_shared_st,
   1182    &sm35_shared_st_replay,
   1183    &sm30_sm_cta_launched,
   1184    &sm35_th_inst_executed,
   1185    &sm30_threads_launched,
   1186    &sm30_uncached_gld_transactions,
   1187    &sm30_warps_launched,
   1188 };
   1189 
   1190 /* ==== Compute capability 5.0 (GM107/GM108) ==== */
   /* sm50 variant of the ACTIVE_CTAS query config: one counter set up via _CB.
    * Listed in sm50_hw_sm_queries and reused at the start of
    * sm52_hw_sm_queries. */
   1191 static const struct nvc0_hw_sm_query_cfg
   1192 sm50_active_ctas =
   1193 {
   1194    .type         = NVC0_HW_SM_QUERY_ACTIVE_CTAS,
   1195    .ctr[0]       = _CB(0x003f, B6, 0x01, 0x29062080),
   1196    .num_counters = 1,
   1197    .norm         = { 1, 1 },
   1198 };
   1199 
   /* sm50 variant of the ACTIVE_CYCLES query config: one counter set up via
    * _CB.  Listed in sm50_hw_sm_queries and reused at the start of
    * sm52_hw_sm_queries. */
   1200 static const struct nvc0_hw_sm_query_cfg
   1201 sm50_active_cycles =
   1202 {
   1203    .type         = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
   1204    .ctr[0]       = _CB(0x0001, B6, 0x00, 0x00000004),
   1205    .num_counters = 1,
   1206    .norm         = { 1, 1 },
   1207 };
   1208 
   /* sm50 variant of the ACTIVE_WARPS query config: one counter set up via
    * _CB.  Its norm is { 1, 1 }, whereas sm30_active_warps uses { 2, 1 }.
    * Listed in sm50_hw_sm_queries and reused at the start of
    * sm52_hw_sm_queries. */
   1209 static const struct nvc0_hw_sm_query_cfg
   1210 sm50_active_warps =
   1211 {
   1212    .type         = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
   1213    .ctr[0]       = _CB(0x003f, B6, 0x00, 0x398a4188),
   1214    .num_counters = 1,
   1215    .norm         = { 1, 1 },
   1216 };
   1217 
   /* sm50 variant of the ATOM_COUNT query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries; sm52_hw_sm_queries uses sm52_atom_count. */
   1218 static const struct nvc0_hw_sm_query_cfg
   1219 sm50_atom_count =
   1220 {
   1221    .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
   1222    .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000004),
   1223    .num_counters = 1,
   1224    .norm         = { 1, 1 },
   1225 };
   1226 
   /* sm50 variant of the BRANCH query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries and reused at the start of
    * sm52_hw_sm_queries. */
   1227 static const struct nvc0_hw_sm_query_cfg
   1228 sm50_branch =
   1229 {
   1230    .type         = NVC0_HW_SM_QUERY_BRANCH,
   1231    .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000010),
   1232    .num_counters = 1,
   1233    .norm         = { 1, 1 },
   1234 };
   1235 
   /* sm50 variant of the DIVERGENT_BRANCH query config: same _CA arguments as
    * sm50_branch except the last (0x04 instead of 0x10).  Listed in
    * sm50_hw_sm_queries and reused at the start of sm52_hw_sm_queries. */
   1236 static const struct nvc0_hw_sm_query_cfg
   1237 sm50_divergent_branch =
   1238 {
   1239    .type         = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
   1240    .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000004),
   1241    .num_counters = 1,
   1242    .norm         = { 1, 1 },
   1243 };
   1244 
   /* sm50 variant of the GLOBAL_ATOM_CAS query config: one counter set up via
    * _CA.  Listed in sm50_hw_sm_queries; sm52_hw_sm_queries uses
    * sm52_global_atom_cas. */
   1245 static const struct nvc0_hw_sm_query_cfg
   1246 sm50_global_atom_cas =
   1247 {
   1248    .type         = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
   1249    .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000000),
   1250    .num_counters = 1,
   1251    .norm         = { 1, 1 },
   1252 };
   1253 
   /* sm50 variant of the GLOBAL_LD query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries; sm52_hw_sm_queries uses sm52_global_ld. */
   1254 static const struct nvc0_hw_sm_query_cfg
   1255 sm50_global_ld =
   1256 {
   1257    .type         = NVC0_HW_SM_QUERY_GLOBAL_LD,
   1258    .ctr[0]       = _CA(0x0001, B6, 0x14, 0x0000000c),
   1259    .num_counters = 1,
   1260    .norm         = { 1, 1 },
   1261 };
   1262 
   /* sm50 variant of the GLOBAL_ST query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries; sm52_hw_sm_queries uses sm52_global_st. */
   1263 static const struct nvc0_hw_sm_query_cfg
   1264 sm50_global_st =
   1265 {
   1266    .type         = NVC0_HW_SM_QUERY_GLOBAL_ST,
   1267    .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000010),
   1268    .num_counters = 1,
   1269    .norm         = { 1, 1 },
   1270 };
   1271 
   /* sm50 variant of the GRED_COUNT query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries; sm52_hw_sm_queries uses sm52_gred_count. */
   1272 static const struct nvc0_hw_sm_query_cfg
   1273 sm50_gred_count =
   1274 {
   1275    .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
   1276    .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000008),
   1277    .num_counters = 1,
   1278    .norm         = { 1, 1 },
   1279 };
   1280 
   /* sm50 variant of the INST_EXECUTED query config: one counter set up via
    * _CA, with 0x0003 as the first argument where most single-counter configs
    * pass 0x0001.  Listed in sm50_hw_sm_queries; sm52_hw_sm_queries uses
    * sm52_inst_executed. */
   1281 static const struct nvc0_hw_sm_query_cfg
   1282 sm50_inst_executed =
   1283 {
   1284    .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
   1285    .ctr[0]       = _CA(0x0003, B6, 0x02, 0x00000398),
   1286    .num_counters = 1,
   1287    .norm         = { 1, 1 },
   1288 };
   1289 
   /* sm50 variant of the INST_ISSUED0 query config: one counter set up via
    * _CA.  Listed in sm50_hw_sm_queries. */
   1290 static const struct nvc0_hw_sm_query_cfg
   1291 sm50_inst_issued0 =
   1292 {
   1293    .type         = NVC0_HW_SM_QUERY_INST_ISSUED0,
   1294    .ctr[0]       = _CA(0x0001, B6, 0x02, 0x0000000c),
   1295    .num_counters = 1,
   1296    .norm         = { 1, 1 },
   1297 };
   1298 
   /* sm50 variant of the INST_ISSUED1 query config: same _CA arguments as
    * sm50_inst_issued0 except the last (0x10 instead of 0x0c).  Listed in
    * sm50_hw_sm_queries. */
   1299 static const struct nvc0_hw_sm_query_cfg
   1300 sm50_inst_issued1 =
   1301 {
   1302    .type         = NVC0_HW_SM_QUERY_INST_ISSUED1,
   1303    .ctr[0]       = _CA(0x0001, B6, 0x02, 0x00000010),
   1304    .num_counters = 1,
   1305    .norm         = { 1, 1 },
   1306 };
   1307 
   /* sm50 variant of the INST_ISSUED2 query config: same _CA arguments as
    * sm50_inst_issued0 except the last (0x14 instead of 0x0c).  Listed in
    * sm50_hw_sm_queries. */
   1308 static const struct nvc0_hw_sm_query_cfg
   1309 sm50_inst_issued2 =
   1310 {
   1311    .type         = NVC0_HW_SM_QUERY_INST_ISSUED2,
   1312    .ctr[0]       = _CA(0x0001, B6, 0x02, 0x00000014),
   1313    .num_counters = 1,
   1314    .norm         = { 1, 1 },
   1315 };
   1316 
   /* sm50 variant of the LOCAL_LD query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries. */
   1317 static const struct nvc0_hw_sm_query_cfg
   1318 sm50_local_ld =
   1319 {
   1320    .type         = NVC0_HW_SM_QUERY_LOCAL_LD,
   1321    .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000004),
   1322    .num_counters = 1,
   1323    .norm         = { 1, 1 },
   1324 };
   1325 
   /* sm50 variant of the LOCAL_ST query config: same _CA arguments as
    * sm50_local_ld except the last (0x00 instead of 0x04).  Listed in
    * sm50_hw_sm_queries. */
   1326 static const struct nvc0_hw_sm_query_cfg
   1327 sm50_local_st =
   1328 {
   1329    .type         = NVC0_HW_SM_QUERY_LOCAL_ST,
   1330    .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000000),
   1331    .num_counters = 1,
   1332    .norm         = { 1, 1 },
   1333 };
   1334 
   /* sm50 variant of the NOT_PRED_OFF_INST_EXECUTED query config.  Differs from
    * sm50_th_inst_executed only in .type and the third _CA argument (0x05 vs
    * 0x04).  Listed in sm50_hw_sm_queries. */
   1335 static const struct nvc0_hw_sm_query_cfg
   1336 sm50_not_pred_off_inst_executed =
   1337 {
   1338    .type         = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
   1339    .ctr[0]       = _CA(0x003f, B6, 0x05, 0x29062080),
   1340    .num_counters = 1,
   1341    .norm         = { 1, 1 },
   1342 };
   1343 
   /* sm50 variant of the PROF_TRIGGER_0 query config.  The eight
    * sm50_prof_trigger_N configs differ only in .type and in the last _CA
    * argument, which is 4 * N (0x00 here).  Listed in sm50_hw_sm_queries. */
   1344 static const struct nvc0_hw_sm_query_cfg
   1345 sm50_prof_trigger_0 =
   1346 {
   1347    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
   1348    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000000),
   1349    .num_counters = 1,
   1350    .norm         = { 1, 1 },
   1351 };
   1352 
   /* sm50 variant of the PROF_TRIGGER_1 query config (last _CA argument
    * 0x04).  Listed in sm50_hw_sm_queries. */
   1353 static const struct nvc0_hw_sm_query_cfg
   1354 sm50_prof_trigger_1 =
   1355 {
   1356    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
   1357    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000004),
   1358    .num_counters = 1,
   1359    .norm         = { 1, 1 },
   1360 };
   1361 
   /* sm50 variant of the PROF_TRIGGER_2 query config (last _CA argument
    * 0x08).  Listed in sm50_hw_sm_queries. */
   1362 static const struct nvc0_hw_sm_query_cfg
   1363 sm50_prof_trigger_2 =
   1364 {
   1365    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
   1366    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000008),
   1367    .num_counters = 1,
   1368    .norm         = { 1, 1 },
   1369 };
   1370 
   /* sm50 variant of the PROF_TRIGGER_3 query config (last _CA argument
    * 0x0c).  Listed in sm50_hw_sm_queries. */
   1371 static const struct nvc0_hw_sm_query_cfg
   1372 sm50_prof_trigger_3 =
   1373 {
   1374    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
   1375    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x0000000c),
   1376    .num_counters = 1,
   1377    .norm         = { 1, 1 },
   1378 };
   1379 
   /* sm50 variant of the PROF_TRIGGER_4 query config (last _CA argument
    * 0x10).  Listed in sm50_hw_sm_queries. */
   1380 static const struct nvc0_hw_sm_query_cfg
   1381 sm50_prof_trigger_4 =
   1382 {
   1383    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
   1384    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000010),
   1385    .num_counters = 1,
   1386    .norm         = { 1, 1 },
   1387 };
   1388 
   /* sm50 variant of the PROF_TRIGGER_5 query config (last _CA argument
    * 0x14).  Listed in sm50_hw_sm_queries. */
   1389 static const struct nvc0_hw_sm_query_cfg
   1390 sm50_prof_trigger_5 =
   1391 {
   1392    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
   1393    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000014),
   1394    .num_counters = 1,
   1395    .norm         = { 1, 1 },
   1396 };
   1397 
   /* sm50 variant of the PROF_TRIGGER_6 query config (last _CA argument
    * 0x18).  Listed in sm50_hw_sm_queries. */
   1398 static const struct nvc0_hw_sm_query_cfg
   1399 sm50_prof_trigger_6 =
   1400 {
   1401    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
   1402    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000018),
   1403    .num_counters = 1,
   1404    .norm         = { 1, 1 },
   1405 };
   1406 
   /* sm50 variant of the PROF_TRIGGER_7 query config (last _CA argument
    * 0x1c).  Listed in sm50_hw_sm_queries. */
   1407 static const struct nvc0_hw_sm_query_cfg
   1408 sm50_prof_trigger_7 =
   1409 {
   1410    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
   1411    .ctr[0]       = _CA(0x0001, B6, 0x00, 0x0000001c),
   1412    .num_counters = 1,
   1413    .norm         = { 1, 1 },
   1414 };
   1415 
   /* sm50 variant of the SHARED_ATOM query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries. */
   1416 static const struct nvc0_hw_sm_query_cfg
   1417 sm50_shared_atom =
   1418 {
   1419    .type         = NVC0_HW_SM_QUERY_SHARED_ATOM,
   1420    .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000014),
   1421    .num_counters = 1,
   1422    .norm         = { 1, 1 },
   1423 };
   1424 
   /* sm50 variant of the SHARED_ATOM_CAS query config: same _CA arguments as
    * sm50_shared_atom except the last (0x10 instead of 0x14).  Listed in
    * sm50_hw_sm_queries. */
   1425 static const struct nvc0_hw_sm_query_cfg
   1426 sm50_shared_atom_cas =
   1427 {
   1428    .type         = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
   1429    .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000010),
   1430    .num_counters = 1,
   1431    .norm         = { 1, 1 },
   1432 };
   1433 
   /* sm50 variant of the SHARED_LD query config: one counter set up via _CA.
    * Listed in sm50_hw_sm_queries. */
   1434 static const struct nvc0_hw_sm_query_cfg
   1435 sm50_shared_ld =
   1436 {
   1437    .type         = NVC0_HW_SM_QUERY_SHARED_LD,
   1438    .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000008),
   1439    .num_counters = 1,
   1440    .norm         = { 1, 1 },
   1441 };
   1442 
   /* sm50 config for the SHARED_LD_BANK_CONFLICT query: one counter set up via
    * _CB.  Listed in sm50_hw_sm_queries. */
   1443 static const struct nvc0_hw_sm_query_cfg
   1444 sm50_shared_ld_bank_conflict =
   1445 {
   1446    .type         = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
   1447    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000000),
   1448    .num_counters = 1,
   1449    .norm         = { 1, 1 },
   1450 };
   1451 
   /* sm50 config for the SHARED_LD_TRANSACTIONS query: same _CB arguments as
    * sm50_shared_ld_bank_conflict except the last (0x08 instead of 0x00).
    * Listed in sm50_hw_sm_queries. */
   1452 static const struct nvc0_hw_sm_query_cfg
   1453 sm50_shared_ld_transactions =
   1454 {
   1455    .type         = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
   1456    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000008),
   1457    .num_counters = 1,
   1458    .norm         = { 1, 1 },
   1459 };
   1460 
   /* sm50 variant of the SHARED_ST query config: same _CA arguments as
    * sm50_shared_ld except the last (0x0c instead of 0x08).  Listed in
    * sm50_hw_sm_queries. */
   1461 static const struct nvc0_hw_sm_query_cfg
   1462 sm50_shared_st =
   1463 {
   1464    .type         = NVC0_HW_SM_QUERY_SHARED_ST,
   1465    .ctr[0]       = _CA(0x0001, B6, 0x13, 0x0000000c),
   1466    .num_counters = 1,
   1467    .norm         = { 1, 1 },
   1468 };
   1469 
   /* sm50 config for the SHARED_ST_BANK_CONFLICT query: same _CB arguments as
    * sm50_shared_ld_bank_conflict except the last (0x04 instead of 0x00).
    * Listed in sm50_hw_sm_queries. */
   1470 static const struct nvc0_hw_sm_query_cfg
   1471 sm50_shared_st_bank_conflict =
   1472 {
   1473    .type         = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
   1474    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000004),
   1475    .num_counters = 1,
   1476    .norm         = { 1, 1 },
   1477 };
   1478 
   /* sm50 config for the SHARED_ST_TRANSACTIONS query: same _CB arguments as
    * sm50_shared_ld_transactions except the last (0x0c instead of 0x08).
    * Listed in sm50_hw_sm_queries. */
   1479 static const struct nvc0_hw_sm_query_cfg
   1480 sm50_shared_st_transactions =
   1481 {
   1482    .type         = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
   1483    .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x0000000c),
   1484    .num_counters = 1,
   1485    .norm         = { 1, 1 },
   1486 };
   1487 
   /* sm50 variant of the SM_CTA_LAUNCHED query config: one counter set up via
    * _CB.  Listed in sm50_hw_sm_queries. */
   1488 static const struct nvc0_hw_sm_query_cfg
   1489 sm50_sm_cta_launched =
   1490 {
   1491    .type         = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
   1492    .ctr[0]       = _CB(0x0001, B6, 0x01, 0x00000018),
   1493    .num_counters = 1,
   1494    .norm         = { 1, 1 },
   1495 };
   1496 
   /* sm50 variant of the TH_INST_EXECUTED query config.  Differs from
    * sm50_not_pred_off_inst_executed only in .type and the third _CA argument
    * (0x04 vs 0x05).  Listed in sm50_hw_sm_queries. */
   1497 static const struct nvc0_hw_sm_query_cfg
   1498 sm50_th_inst_executed =
   1499 {
   1500    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
   1501    .ctr[0]       = _CA(0x003f, B6, 0x04, 0x29062080),
   1502    .num_counters = 1,
   1503    .norm         = { 1, 1 },
   1504 };
   1505 
   /* sm50 variant of the WARPS_LAUNCHED query config: one counter set up via
    * _CA.  Listed in sm50_hw_sm_queries. */
   1506 static const struct nvc0_hw_sm_query_cfg
   1507 sm50_warps_launched =
   1508 {
   1509    .type         = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   1510    .ctr[0]       = _CA(0x0001, B6, 0x02, 0x00000008),
   1511    .num_counters = 1,
   1512    .norm         = { 1, 1 },
   1513 };
   1514 
   /* Table of pointers to the sm50_* query configs defined above; every entry
    * in this list is an sm50_* constant. */
   1515 static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] =
   1516 {
   1517    &sm50_active_ctas,
   1518    &sm50_active_cycles,
   1519    &sm50_active_warps,
   1520    &sm50_atom_count,
   1521    &sm50_branch,
   1522    &sm50_divergent_branch,
   1523    &sm50_global_atom_cas,
   1524    &sm50_global_ld,
   1525    &sm50_global_st,
   1526    &sm50_gred_count,
   1527    &sm50_inst_executed,
   1528    &sm50_inst_issued0,
   1529    &sm50_inst_issued1,
   1530    &sm50_inst_issued2,
   1531    &sm50_local_ld,
   1532    &sm50_local_st,
   1533    &sm50_not_pred_off_inst_executed,
   1534    &sm50_prof_trigger_0,
   1535    &sm50_prof_trigger_1,
   1536    &sm50_prof_trigger_2,
   1537    &sm50_prof_trigger_3,
   1538    &sm50_prof_trigger_4,
   1539    &sm50_prof_trigger_5,
   1540    &sm50_prof_trigger_6,
   1541    &sm50_prof_trigger_7,
   1542    &sm50_shared_atom,
   1543    &sm50_shared_atom_cas,
   1544    &sm50_shared_ld,
   1545    &sm50_shared_ld_bank_conflict,
   1546    &sm50_shared_ld_transactions,
   1547    &sm50_shared_st,
   1548    &sm50_shared_st_bank_conflict,
   1549    &sm50_shared_st_transactions,
   1550    &sm50_sm_cta_launched,
   1551    &sm50_th_inst_executed,
   1552    &sm50_warps_launched,
   1553 };
   1554 
   1555 /* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */
   /* sm52 variant of the ATOM_COUNT query config: one counter set up via _CA.
    * Used by sm52_hw_sm_queries in place of sm50_atom_count. */
   1556 static const struct nvc0_hw_sm_query_cfg
   1557 sm52_atom_count =
   1558 {
   1559    .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
   1560    .ctr[0]       = _CA(0x0001, B6, 0x0a, 0x0000001c),
   1561    .num_counters = 1,
   1562    .norm         = { 1, 1 },
   1563 };
   1564 
   /* sm52 variant of the GLOBAL_ATOM_CAS query config: same _CA arguments as
    * sm52_atom_count except the last (0x18 instead of 0x1c).  Used by
    * sm52_hw_sm_queries in place of sm50_global_atom_cas. */
   1565 static const struct nvc0_hw_sm_query_cfg
   1566 sm52_global_atom_cas =
   1567 {
   1568    .type         = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
   1569    .ctr[0]       = _CA(0x0001, B6, 0x0a, 0x00000018),
   1570    .num_counters = 1,
   1571    .norm         = { 1, 1 },
   1572 };
   1573 
   /* sm52 variant of the GLOBAL_LD query config: one counter set up via _CA.
    * Used by sm52_hw_sm_queries in place of sm50_global_ld. */
   1574 static const struct nvc0_hw_sm_query_cfg
   1575 sm52_global_ld =
   1576 {
   1577    .type         = NVC0_HW_SM_QUERY_GLOBAL_LD,
   1578    .ctr[0]       = _CA(0x0001, B6, 0x0b, 0x00000018),
   1579    .num_counters = 1,
   1580    .norm         = { 1, 1 },
   1581 };
   1582 
   /* sm52 variant of the GLOBAL_ST query config: same _CA arguments as
    * sm52_global_ld except the last (0x1c instead of 0x18).  Used by
    * sm52_hw_sm_queries in place of sm50_global_st. */
   1583 static const struct nvc0_hw_sm_query_cfg
   1584 sm52_global_st =
   1585 {
   1586    .type         = NVC0_HW_SM_QUERY_GLOBAL_ST,
   1587    .ctr[0]       = _CA(0x0001, B6, 0x0b, 0x0000001c),
   1588    .num_counters = 1,
   1589    .norm         = { 1, 1 },
   1590 };
   1591 
   /* sm52 variant of the GRED_COUNT query config: one counter set up via _CA.
    * Used by sm52_hw_sm_queries in place of sm50_gred_count. */
   1592 static const struct nvc0_hw_sm_query_cfg
   1593 sm52_gred_count =
   1594 {
   1595    .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
   1596    .ctr[0]       = _CA(0x0001, B6, 0x0f, 0x00000018),
   1597    .num_counters = 1,
   1598    .norm         = { 1, 1 },
   1599 };
   1600 
   /* sm52 variant of the INST_EXECUTED query config: one counter set up via
    * _CA, with 0x0003 as the first argument where most single-counter configs
    * pass 0x0001.  Used by sm52_hw_sm_queries in place of sm50_inst_executed. */
   1601 static const struct nvc0_hw_sm_query_cfg
   1602 sm52_inst_executed =
   1603 {
   1604    .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
   1605    .ctr[0]       = _CA(0x0003, B6, 0x03, 0x0000020c),
   1606    .num_counters = 1,
   1607    .norm         = { 1, 1 },
   1608 };
   1609 
   /* sm52 variant of the INST_ISSUED0 query config: one counter set up via
    * _CA.  The sm52_inst_issued0/1/2 configs differ only in .type and the last
    * _CA argument (0x00, 0x04, 0x08). */
   1610 static const struct nvc0_hw_sm_query_cfg
   1611 sm52_inst_issued0 =
   1612 {
   1613    .type         = NVC0_HW_SM_QUERY_INST_ISSUED0,
   1614    .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000000),
   1615    .num_counters = 1,
   1616    .norm         = { 1, 1 },
   1617 };
   1618 
   /* sm52 variant of the INST_ISSUED1 query config: same _CA arguments as
    * sm52_inst_issued0 except the last (0x04 instead of 0x00). */
   1619 static const struct nvc0_hw_sm_query_cfg
   1620 sm52_inst_issued1 =
   1621 {
   1622    .type         = NVC0_HW_SM_QUERY_INST_ISSUED1,
   1623    .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000004),
   1624    .num_counters = 1,
   1625    .norm         = { 1, 1 },
   1626 };
   1627 
   /* sm52 variant of the INST_ISSUED2 query config: same _CA arguments as
    * sm52_inst_issued0 except the last (0x08 instead of 0x00). */
   1628 static const struct nvc0_hw_sm_query_cfg
   1629 sm52_inst_issued2 =
   1630 {
   1631    .type         = NVC0_HW_SM_QUERY_INST_ISSUED2,
   1632    .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000008),
   1633    .num_counters = 1,
   1634    .norm         = { 1, 1 },
   1635 };
   1636 
   /* sm52 variant of the LOCAL_LD query config: one counter set up via _CA. */
   1637 static const struct nvc0_hw_sm_query_cfg
   1638 sm52_local_ld =
   1639 {
   1640    .type         = NVC0_HW_SM_QUERY_LOCAL_LD,
   1641    .ctr[0]       = _CA(0x0001, B6, 0x06, 0x0000001c),
   1642    .num_counters = 1,
   1643    .norm         = { 1, 1 },
   1644 };
   1645 
   /* sm52 variant of the LOCAL_ST query config: same _CA arguments as
    * sm52_local_ld except the last (0x18 instead of 0x1c). */
   1646 static const struct nvc0_hw_sm_query_cfg
   1647 sm52_local_st =
   1648 {
   1649    .type         = NVC0_HW_SM_QUERY_LOCAL_ST,
   1650    .ctr[0]       = _CA(0x0001, B6, 0x06, 0x00000018),
   1651    .num_counters = 1,
   1652    .norm         = { 1, 1 },
   1653 };
   1654 
   1655 static const struct nvc0_hw_sm_query_cfg
   1656 sm52_shared_atom =
   1657 {
   1658    .type         = NVC0_HW_SM_QUERY_SHARED_ATOM,
   1659    .ctr[0]       = _CA(0x0001, B6, 0x08, 0x0000001c),
   1660    .num_counters = 1,
   1661    .norm         = { 1, 1 },
   1662 };
   1663 
   1664 static const struct nvc0_hw_sm_query_cfg
   1665 sm52_shared_atom_cas =
   1666 {
   1667    .type         = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
   1668    .ctr[0]       = _CA(0x0001, B6, 0x08, 0x00000018),
   1669    .num_counters = 1,
   1670    .norm         = { 1, 1 },
   1671 };
   1672 
   1673 static const struct nvc0_hw_sm_query_cfg
   1674 sm52_shared_ld =
   1675 {
   1676    .type         = NVC0_HW_SM_QUERY_SHARED_LD,
   1677    .ctr[0]       = _CA(0x0001, B6, 0x07, 0x00000018),
   1678    .num_counters = 1,
   1679    .norm         = { 1, 1 },
   1680 };
   1681 
   1682 static const struct nvc0_hw_sm_query_cfg
   1683 sm52_shared_st =
   1684 {
   1685    .type         = NVC0_HW_SM_QUERY_SHARED_ST,
   1686    .ctr[0]       = _CA(0x0001, B6, 0x07, 0x0000001c),
   1687    .num_counters = 1,
   1688    .norm         = { 1, 1 },
   1689 };
   1690 
   1691 static const struct nvc0_hw_sm_query_cfg
   1692 sm52_warps_launched =
   1693 {
   1694    .type         = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   1695    .ctr[0]       = _CA(0x0001, B6, 0x02, 0x0000001c),
   1696    .num_counters = 1,
   1697    .norm         = { 1, 1 },
   1698 };
   1699 
   1700 static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =
   1701 {
   1702    &sm50_active_ctas,
   1703    &sm50_active_cycles,
   1704    &sm50_active_warps,
   1705    &sm52_atom_count,
   1706    &sm50_branch,
   1707    &sm50_divergent_branch,
   1708    &sm52_global_atom_cas,
   1709    &sm52_global_ld,
   1710    &sm52_global_st,
   1711    &sm52_gred_count,
   1712    &sm52_inst_executed,
   1713    &sm52_inst_issued0,
   1714    &sm52_inst_issued1,
   1715    &sm52_inst_issued2,
   1716    &sm52_local_ld,
   1717    &sm52_local_st,
   1718    &sm50_not_pred_off_inst_executed,
   1719    &sm50_prof_trigger_0,
   1720    &sm50_prof_trigger_1,
   1721    &sm50_prof_trigger_2,
   1722    &sm50_prof_trigger_3,
   1723    &sm50_prof_trigger_4,
   1724    &sm50_prof_trigger_5,
   1725    &sm50_prof_trigger_6,
   1726    &sm50_prof_trigger_7,
   1727    &sm52_shared_atom,
   1728    &sm52_shared_atom_cas,
   1729    &sm52_shared_ld,
   1730    &sm50_shared_ld_bank_conflict,
   1731    &sm50_shared_ld_transactions,
   1732    &sm52_shared_st,
   1733    &sm50_shared_st_bank_conflict,
   1734    &sm50_shared_st_transactions,
   1735    &sm50_sm_cta_launched,
   1736    &sm50_th_inst_executed,
   1737    &sm52_warps_launched,
   1738 };
   1739 
   1740 #undef _Q
   1741 #undef _CA
   1742 #undef _CB
   1743 
   1744 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
   1745 /* NOTES:
   1746  * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
   1747  *   because there is a context-switch problem that we need to fix.
   1748  *   Results might be wrong sometimes, be careful!
   1749  */
   1750 static const uint64_t nvc0_read_hw_sm_counters_code[] =
   1751 {
   1752    /* mov b32 $r8 $tidx
   1753     * mov b32 $r9 $physid
   1754     * mov b32 $r0 $pm0
   1755     * mov b32 $r1 $pm1
   1756     * mov b32 $r2 $pm2
   1757     * mov b32 $r3 $pm3
   1758     * mov b32 $r4 $pm4
   1759     * mov b32 $r5 $pm5
   1760     * mov b32 $r6 $pm6
   1761     * mov b32 $r7 $pm7
   1762     * set $p0 0x1 eq u32 $r8 0x0
   1763     * mov b32 $r10 c15[0x620]
   1764     * mov b32 $r11 c15[0x624]
   1765     * ext u32 $r8 $r9 0x414
   1766     * (not $p0) exit
   1767     * mul $r8 u32 $r8 u32 48
   1768     * add b32 $r10 $c $r10 $r8
   1769     * add b32 $r11 $r11 0x0 $c
   1770     * mov b32 $r8 c15[0x628]
   1771     * st b128 wt g[$r10d+0x00] $r0q
   1772     * st b128 wt g[$r10d+0x10] $r4q
   1773     * st b32 wt g[$r10d+0x20] $r8
   1774     * exit */
   1775    0x2c00000084021c04ULL,
   1776    0x2c0000000c025c04ULL,
   1777    0x2c00000010001c04ULL,
   1778    0x2c00000014005c04ULL,
   1779    0x2c00000018009c04ULL,
   1780    0x2c0000001c00dc04ULL,
   1781    0x2c00000020011c04ULL,
   1782    0x2c00000024015c04ULL,
   1783    0x2c00000028019c04ULL,
   1784    0x2c0000002c01dc04ULL,
   1785    0x190e0000fc81dc03ULL,
   1786    0x28007c1880029de4ULL,
   1787    0x28007c189002dde4ULL,
   1788    0x7000c01050921c03ULL,
   1789    0x80000000000021e7ULL,
   1790    0x10000000c0821c02ULL,
   1791    0x4801000020a29c03ULL,
   1792    0x0800000000b2dc42ULL,
   1793    0x28007c18a0021de4ULL,
   1794    0x9400000000a01fc5ULL,
   1795    0x9400000040a11fc5ULL,
   1796    0x9400000080a21f85ULL,
   1797    0x8000000000001de7ULL
   1798 };
   1799 
   1800 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
   1801 
   1802 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
   1803 static const struct nvc0_hw_sm_query_cfg
   1804 sm20_active_cycles =
   1805 {
   1806    .type         = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
   1807    .ctr[0]       = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
   1808    .num_counters = 1,
   1809    .norm         = { 1, 1 },
   1810 };
   1811 
   1812 static const struct nvc0_hw_sm_query_cfg
   1813 sm20_active_warps =
   1814 {
   1815    .type         = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
   1816    .ctr[0]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
   1817    .ctr[1]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
   1818    .ctr[2]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
   1819    .ctr[3]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
   1820    .ctr[4]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
   1821    .ctr[5]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
   1822    .num_counters = 6,
   1823    .norm         = { 1, 1 },
   1824 };
   1825 
   1826 static const struct nvc0_hw_sm_query_cfg
   1827 sm20_atom_count =
   1828 {
   1829    .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
   1830    .ctr[0]       = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
   1831    .num_counters = 1,
   1832    .norm         = { 1, 1 },
   1833 };
   1834 
   1835 static const struct nvc0_hw_sm_query_cfg
   1836 sm20_branch =
   1837 {
   1838    .type         = NVC0_HW_SM_QUERY_BRANCH,
   1839    .ctr[0]       = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
   1840    .ctr[1]       = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
   1841    .num_counters = 2,
   1842    .norm         = { 1, 1 },
   1843 };
   1844 
   1845 static const struct nvc0_hw_sm_query_cfg
   1846 sm20_divergent_branch =
   1847 {
   1848    .type         = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
   1849    .ctr[0]       = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
   1850    .ctr[1]       = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
   1851    .num_counters = 2,
   1852    .norm         = { 1, 1 },
   1853 };
   1854 
   1855 static const struct nvc0_hw_sm_query_cfg
   1856 sm20_gld_request =
   1857 {
   1858    .type         = NVC0_HW_SM_QUERY_GLD_REQUEST,
   1859    .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
   1860    .num_counters = 1,
   1861    .norm         = { 1, 1 },
   1862 };
   1863 
   1864 static const struct nvc0_hw_sm_query_cfg
   1865 sm20_gred_count =
   1866 {
   1867    .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
   1868    .ctr[0]       = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
   1869    .num_counters = 1,
   1870    .norm         = { 1, 1 },
   1871 };
   1872 
   1873 static const struct nvc0_hw_sm_query_cfg
   1874 sm20_gst_request =
   1875 {
   1876    .type         = NVC0_HW_SM_QUERY_GST_REQUEST,
   1877    .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
   1878    .num_counters = 1,
   1879    .norm         = { 1, 1 },
   1880 };
   1881 
   1882 static const struct nvc0_hw_sm_query_cfg
   1883 sm20_inst_executed =
   1884 {
   1885    .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
   1886    .ctr[0]       = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
   1887    .ctr[1]       = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
   1888    .num_counters = 2,
   1889    .norm         = { 1, 1 },
   1890 };
   1891 
   1892 static const struct nvc0_hw_sm_query_cfg
   1893 sm20_inst_issued =
   1894 {
   1895    .type         = NVC0_HW_SM_QUERY_INST_ISSUED,
   1896    .ctr[0]       = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
   1897    .ctr[1]       = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
   1898    .num_counters = 2,
   1899    .norm         = { 1, 1 },
   1900 };
   1901 
   1902 static const struct nvc0_hw_sm_query_cfg
   1903 sm20_local_ld =
   1904 {
   1905    .type         = NVC0_HW_SM_QUERY_LOCAL_LD,
   1906    .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
   1907    .num_counters = 1,
   1908    .norm         = { 1, 1 },
   1909 };
   1910 
   1911 static const struct nvc0_hw_sm_query_cfg
   1912 sm20_local_st =
   1913 {
   1914    .type         = NVC0_HW_SM_QUERY_LOCAL_ST,
   1915    .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
   1916    .num_counters = 1,
   1917    .norm         = { 1, 1 },
   1918 };
   1919 
   1920 static const struct nvc0_hw_sm_query_cfg
   1921 sm20_prof_trigger_0 =
   1922 {
   1923    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
   1924    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
   1925    .num_counters = 1,
   1926    .norm         = { 1, 1 },
   1927 };
   1928 
   1929 static const struct nvc0_hw_sm_query_cfg
   1930 sm20_prof_trigger_1 =
   1931 {
   1932    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
   1933    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
   1934    .num_counters = 1,
   1935    .norm         = { 1, 1 },
   1936 };
   1937 
   1938 static const struct nvc0_hw_sm_query_cfg
   1939 sm20_prof_trigger_2 =
   1940 {
   1941    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
   1942    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
   1943    .num_counters = 1,
   1944    .norm         = { 1, 1 },
   1945 };
   1946 
   1947 static const struct nvc0_hw_sm_query_cfg
   1948 sm20_prof_trigger_3 =
   1949 {
   1950    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
   1951    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
   1952    .num_counters = 1,
   1953    .norm         = { 1, 1 },
   1954 };
   1955 
   1956 static const struct nvc0_hw_sm_query_cfg
   1957 sm20_prof_trigger_4 =
   1958 {
   1959    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
   1960    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
   1961    .num_counters = 1,
   1962    .norm         = { 1, 1 },
   1963 };
   1964 
   1965 static const struct nvc0_hw_sm_query_cfg
   1966 sm20_prof_trigger_5 =
   1967 {
   1968    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
   1969    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
   1970    .num_counters = 1,
   1971    .norm         = { 1, 1 },
   1972 };
   1973 
   1974 static const struct nvc0_hw_sm_query_cfg
   1975 sm20_prof_trigger_6 =
   1976 {
   1977    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
   1978    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
   1979    .num_counters = 1,
   1980    .norm         = { 1, 1 },
   1981 };
   1982 
   1983 static const struct nvc0_hw_sm_query_cfg
   1984 sm20_prof_trigger_7 =
   1985 {
   1986    .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
   1987    .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
   1988    .num_counters = 1,
   1989    .norm         = { 1, 1 },
   1990 };
   1991 
   1992 static const struct nvc0_hw_sm_query_cfg
   1993 sm20_shared_ld =
   1994 {
   1995    .type         = NVC0_HW_SM_QUERY_SHARED_LD,
   1996    .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
   1997    .num_counters = 1,
   1998    .norm         = { 1, 1 },
   1999 };
   2000 
   2001 static const struct nvc0_hw_sm_query_cfg
   2002 sm20_shared_st =
   2003 {
   2004    .type         = NVC0_HW_SM_QUERY_SHARED_ST,
   2005    .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
   2006    .num_counters = 1,
   2007    .norm         = { 1, 1 },
   2008 };
   2009 
   2010 static const struct nvc0_hw_sm_query_cfg
   2011 sm20_threads_launched =
   2012 {
   2013    .type         = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
   2014    .ctr[0]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
   2015    .ctr[1]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
   2016    .ctr[2]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
   2017    .ctr[3]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
   2018    .ctr[4]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
   2019    .ctr[5]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
   2020    .num_counters = 6,
   2021    .norm         = { 1, 1 },
   2022 };
   2023 
   2024 static const struct nvc0_hw_sm_query_cfg
   2025 sm20_th_inst_executed_0 =
   2026 {
   2027    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
   2028    .ctr[0]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
   2029    .ctr[1]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
   2030    .ctr[2]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
   2031    .ctr[3]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
   2032    .ctr[4]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
   2033    .ctr[5]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
   2034    .num_counters = 6,
   2035    .norm         = { 1, 1 },
   2036 };
   2037 
   2038 static const struct nvc0_hw_sm_query_cfg
   2039 sm20_th_inst_executed_1 =
   2040 {
   2041    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
   2042    .ctr[0]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
   2043    .ctr[1]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
   2044    .ctr[2]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
   2045    .ctr[3]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
   2046    .ctr[4]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
   2047    .ctr[5]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
   2048    .num_counters = 6,
   2049    .norm         = { 1, 1 },
   2050 };
   2051 
   2052 static const struct nvc0_hw_sm_query_cfg
   2053 sm20_warps_launched =
   2054 {
   2055    .type         = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   2056    .ctr[0]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
   2057    .num_counters = 1,
   2058    .norm         = { 1, 1 },
   2059 };
   2060 
   2061 static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
   2062 {
   2063    &sm20_active_cycles,
   2064    &sm20_active_warps,
   2065    &sm20_atom_count,
   2066    &sm20_branch,
   2067    &sm20_divergent_branch,
   2068    &sm20_gld_request,
   2069    &sm20_gred_count,
   2070    &sm20_gst_request,
   2071    &sm20_inst_executed,
   2072    &sm20_inst_issued,
   2073    &sm20_local_ld,
   2074    &sm20_local_st,
   2075    &sm20_prof_trigger_0,
   2076    &sm20_prof_trigger_1,
   2077    &sm20_prof_trigger_2,
   2078    &sm20_prof_trigger_3,
   2079    &sm20_prof_trigger_4,
   2080    &sm20_prof_trigger_5,
   2081    &sm20_prof_trigger_6,
   2082    &sm20_prof_trigger_7,
   2083    &sm20_shared_ld,
   2084    &sm20_shared_st,
   2085    &sm20_threads_launched,
   2086    &sm20_th_inst_executed_0,
   2087    &sm20_th_inst_executed_1,
   2088    &sm20_warps_launched,
   2089 };
   2090 
   2091 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
   2092 static const struct nvc0_hw_sm_query_cfg
   2093 sm21_inst_executed =
   2094 {
   2095    .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
   2096    .ctr[0]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
   2097    .ctr[1]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
   2098    .ctr[2]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
   2099    .num_counters = 3,
   2100    .norm         = { 1, 1 },
   2101 };
   2102 
   2103 static const struct nvc0_hw_sm_query_cfg
   2104 sm21_inst_issued1_0 =
   2105 {
   2106    .type         = NVC0_HW_SM_QUERY_INST_ISSUED1_0,
   2107    .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
   2108    .num_counters = 1,
   2109    .norm         = { 1, 1 },
   2110 };
   2111 
   2112 static const struct nvc0_hw_sm_query_cfg
   2113 sm21_inst_issued1_1 =
   2114 {
   2115    .type         = NVC0_HW_SM_QUERY_INST_ISSUED1_1,
   2116    .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
   2117    .num_counters = 1,
   2118    .norm         = { 1, 1 },
   2119 };
   2120 
   2121 static const struct nvc0_hw_sm_query_cfg
   2122 sm21_inst_issued2_0 =
   2123 {
   2124    .type         = NVC0_HW_SM_QUERY_INST_ISSUED2_0,
   2125    .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
   2126    .num_counters = 1,
   2127    .norm         = { 1, 1 },
   2128 };
   2129 
   2130 static const struct nvc0_hw_sm_query_cfg
   2131 sm21_inst_issued2_1 =
   2132 {
   2133    .type         = NVC0_HW_SM_QUERY_INST_ISSUED2_1,
   2134    .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
   2135    .num_counters = 1,
   2136    .norm         = { 1, 1 },
   2137 };
   2138 
   2139 static const struct nvc0_hw_sm_query_cfg
   2140 sm21_th_inst_executed_0 =
   2141 {
   2142    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
   2143    .ctr[0]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
   2144    .ctr[1]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
   2145    .ctr[2]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
   2146    .ctr[3]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
   2147    .ctr[4]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
   2148    .ctr[5]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
   2149    .num_counters = 6,
   2150    .norm         = { 1, 1 },
   2151 };
   2152 
   2153 static const struct nvc0_hw_sm_query_cfg
   2154 sm21_th_inst_executed_1 =
   2155 {
   2156    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
   2157    .ctr[0]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
   2158    .ctr[1]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
   2159    .ctr[2]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
   2160    .ctr[3]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
   2161    .ctr[4]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
   2162    .ctr[5]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
   2163    .num_counters = 6,
   2164    .norm         = { 1, 1 },
   2165 };
   2166 
   2167 static const struct nvc0_hw_sm_query_cfg
   2168 sm21_th_inst_executed_2 =
   2169 {
   2170    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
   2171    .ctr[0]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
   2172    .ctr[1]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
   2173    .ctr[2]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
   2174    .ctr[3]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
   2175    .ctr[4]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
   2176    .ctr[5]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
   2177    .num_counters = 6,
   2178    .norm         = { 1, 1 },
   2179 };
   2180 
   2181 static const struct nvc0_hw_sm_query_cfg
   2182 sm21_th_inst_executed_3 =
   2183 {
   2184    .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
   2185    .ctr[0]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
   2186    .ctr[1]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
   2187    .ctr[2]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
   2188    .ctr[3]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
   2189    .ctr[4]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
   2190    .ctr[5]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
   2191    .num_counters = 6,
   2192    .norm         = { 1, 1 },
   2193 };
   2194 
   2195 static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
   2196 {
   2197    &sm20_active_cycles,
   2198    &sm20_active_warps,
   2199    &sm20_atom_count,
   2200    &sm20_branch,
   2201    &sm20_divergent_branch,
   2202    &sm20_gld_request,
   2203    &sm20_gred_count,
   2204    &sm20_gst_request,
   2205    &sm21_inst_executed,
   2206    &sm21_inst_issued1_0,
   2207    &sm21_inst_issued1_1,
   2208    &sm21_inst_issued2_0,
   2209    &sm21_inst_issued2_1,
   2210    &sm20_local_ld,
   2211    &sm20_local_st,
   2212    &sm20_prof_trigger_0,
   2213    &sm20_prof_trigger_1,
   2214    &sm20_prof_trigger_2,
   2215    &sm20_prof_trigger_3,
   2216    &sm20_prof_trigger_4,
   2217    &sm20_prof_trigger_5,
   2218    &sm20_prof_trigger_6,
   2219    &sm20_prof_trigger_7,
   2220    &sm20_shared_ld,
   2221    &sm20_shared_st,
   2222    &sm20_threads_launched,
   2223    &sm21_th_inst_executed_0,
   2224    &sm21_th_inst_executed_1,
   2225    &sm21_th_inst_executed_2,
   2226    &sm21_th_inst_executed_3,
   2227    &sm20_warps_launched,
   2228 };
   2229 
   2230 #undef _C
   2231 
   2232 static inline const struct nvc0_hw_sm_query_cfg **
   2233 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
   2234 {
   2235    struct nouveau_device *dev = screen->base.device;
   2236 
   2237    switch (screen->base.class_3d) {
   2238    case GM200_3D_CLASS:
   2239       return sm52_hw_sm_queries;
   2240    case GM107_3D_CLASS:
   2241       return sm50_hw_sm_queries;
   2242    case NVF0_3D_CLASS:
   2243       return sm35_hw_sm_queries;
   2244    case NVE4_3D_CLASS:
   2245       return sm30_hw_sm_queries;
   2246    default:
   2247       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
   2248          return sm20_hw_sm_queries;
   2249       return sm21_hw_sm_queries;
   2250    }
   2251    assert(0);
   2252    return NULL;
   2253 }
   2254 
   2255 unsigned
   2256 nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
   2257 {
   2258    struct nouveau_device *dev = screen->base.device;
   2259 
   2260    switch (screen->base.class_3d) {
   2261    case GM200_3D_CLASS:
   2262       return ARRAY_SIZE(sm52_hw_sm_queries);
   2263    case GM107_3D_CLASS:
   2264       return ARRAY_SIZE(sm50_hw_sm_queries);
   2265    case NVF0_3D_CLASS:
   2266       return ARRAY_SIZE(sm35_hw_sm_queries);
   2267    case NVE4_3D_CLASS:
   2268       return ARRAY_SIZE(sm30_hw_sm_queries);
   2269    default:
   2270       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
   2271          return ARRAY_SIZE(sm20_hw_sm_queries);
   2272       return ARRAY_SIZE(sm21_hw_sm_queries);
   2273    }
   2274    return 0;
   2275 }
   2276 
   2277 static const struct nvc0_hw_sm_query_cfg *
   2278 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   2279 {
   2280    const struct nvc0_hw_sm_query_cfg **queries;
   2281    struct nvc0_screen *screen = nvc0->screen;
   2282    struct nvc0_query *q = &hq->base;
   2283    unsigned num_queries;
   2284    unsigned i;
   2285 
   2286    num_queries = nvc0_hw_sm_get_num_queries(screen);
   2287    queries = nvc0_hw_sm_get_queries(screen);
   2288 
   2289    for (i = 0; i < num_queries; i++) {
   2290       if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type)
   2291          return queries[i];
   2292    }
   2293    assert(0);
   2294    return NULL;
   2295 }
   2296 
   2297 static void
   2298 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   2299 {
   2300    struct nvc0_query *q = &hq->base;
   2301    nvc0_hw_query_allocate(nvc0, q, 0);
   2302    nouveau_fence_ref(NULL, &hq->fence);
   2303    FREE(hq);
   2304 }
   2305 
   2306 static boolean
   2307 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   2308 {
   2309    struct nvc0_screen *screen = nvc0->screen;
   2310    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   2311    struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   2312    const struct nvc0_hw_sm_query_cfg *cfg;
   2313    unsigned i, c;
   2314    unsigned num_ab[2] = { 0, 0 };
   2315 
   2316    cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
   2317 
   2318    /* check if we have enough free counter slots */
   2319    for (i = 0; i < cfg->num_counters; ++i)
   2320       num_ab[cfg->ctr[i].sig_dom]++;
   2321 
   2322    if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
   2323        screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
   2324       NOUVEAU_ERR("Not enough free MP counter slots !\n");
   2325       return false;
   2326    }
   2327 
   2328    assert(cfg->num_counters <= 4);
   2329    PUSH_SPACE(push, 4 * 8 * + 6);
   2330 
   2331    if (!screen->pm.mp_counters_enabled) {
   2332       screen->pm.mp_counters_enabled = true;
   2333       BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
   2334       PUSH_DATA (push, 0x1fcb);
   2335    }
   2336 
   2337    /* set sequence field to 0 (used to check if result is available) */
   2338    for (i = 0; i < screen->mp_count; ++i)
   2339       hq->data[i * 10 + 10] = 0;
   2340    hq->sequence++;
   2341 
   2342    for (i = 0; i < cfg->num_counters; ++i) {
   2343       const unsigned d = cfg->ctr[i].sig_dom;
   2344 
   2345       if (!screen->pm.num_hw_sm_active[d]) {
   2346          uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
   2347          if (screen->pm.num_hw_sm_active[!d])
   2348             m |= 1 << (7 + (8 * d));
   2349          BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
   2350          PUSH_DATA (push, m);
   2351       }
   2352       screen->pm.num_hw_sm_active[d]++;
   2353 
   2354       for (c = d * 4; c < (d * 4 + 4); ++c) {
   2355          if (!screen->pm.mp_counter[c]) {
   2356             hsq->ctr[i] = c;
   2357             screen->pm.mp_counter[c] = hsq;
   2358             break;
   2359          }
   2360       }
   2361       assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
   2362 
   2363       /* configure and reset the counter(s) */
   2364      if (d == 0)
   2365         BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1);
   2366      else
   2367         BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1);
   2368      PUSH_DATA (push, cfg->ctr[i].sig_sel);
   2369      BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1);
   2370      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
   2371      BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1);
   2372      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
   2373      BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
   2374      PUSH_DATA (push, 0);
   2375    }
   2376 
   2377    if (screen->base.class_3d >= GM107_3D_CLASS) {
   2378       /* Enable mask for counters, it's 8-bits value where 0:3 is for domain A
   2379        * and 4:7 for domain B. For example, the mask for active_warps should be
   2380        * 0x70 because it uses 3 counters in domain B. However, let's always
   2381        * enable all counters because we don't want to track which ones is
   2382        * enabled or not, and this allows to monitor multiple queries at the
   2383        * same time. */
   2384       BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);
   2385       PUSH_DATA (push, 0xff);
   2386    }
   2387 
   2388    return true;
   2389 }
   2390 
   2391 static boolean
   2392 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   2393 {
   2394    struct nvc0_screen *screen = nvc0->screen;
   2395    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   2396    struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   2397    const struct nvc0_hw_sm_query_cfg *cfg;
   2398    unsigned i, c;
   2399 
   2400    if (screen->base.class_3d >= NVE4_3D_CLASS)
   2401       return nve4_hw_sm_begin_query(nvc0, hq);
   2402 
   2403    cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
   2404 
   2405    /* check if we have enough free counter slots */
   2406    if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
   2407       NOUVEAU_ERR("Not enough free MP counter slots !\n");
   2408       return false;
   2409    }
   2410 
   2411    assert(cfg->num_counters <= 8);
   2412    PUSH_SPACE(push, 8 * 8 + 2);
   2413 
   2414    /* set sequence field to 0 (used to check if result is available) */
   2415    for (i = 0; i < screen->mp_count; ++i) {
   2416       const unsigned b = (0x30 / 4) * i;
   2417       hq->data[b + 8] = 0;
   2418    }
   2419    hq->sequence++;
   2420 
   2421    for (i = 0; i < cfg->num_counters; ++i) {
   2422       uint32_t mask_sel = 0x00000000;
   2423 
   2424       if (!screen->pm.num_hw_sm_active[0]) {
   2425          BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
   2426          PUSH_DATA (push, 0x80000000);
   2427       }
   2428       screen->pm.num_hw_sm_active[0]++;
   2429 
   2430       for (c = 0; c < 8; ++c) {
   2431          if (!screen->pm.mp_counter[c]) {
   2432             hsq->ctr[i] = c;
   2433             screen->pm.mp_counter[c] = hsq;
   2434             break;
   2435          }
   2436       }
   2437 
   2438       /* Oddly-enough, the signal id depends on the slot selected on Fermi but
   2439        * not on Kepler. Fortunately, the signal ids are just offseted by the
   2440        * slot id! */
   2441       mask_sel |= c;
   2442       mask_sel |= (c << 8);
   2443       mask_sel |= (c << 16);
   2444       mask_sel |= (c << 24);
   2445       mask_sel &= cfg->ctr[i].src_mask;
   2446 
   2447       /* configure and reset the counter(s) */
   2448       BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1);
   2449       PUSH_DATA (push, cfg->ctr[i].sig_sel);
   2450       BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1);
   2451       PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
   2452       BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1);
   2453       PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
   2454       BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1);
   2455       PUSH_DATA (push, 0);
   2456    }
   2457    return true;
   2458 }
   2459 
   2460 static inline struct nvc0_program *
   2461 nvc0_hw_sm_get_program(struct nvc0_screen *screen)
   2462 {
   2463    struct nvc0_program *prog;
   2464 
   2465    prog = CALLOC_STRUCT(nvc0_program);
   2466    if (!prog)
   2467       return NULL;
   2468 
   2469    prog->type = PIPE_SHADER_COMPUTE;
   2470    prog->translated = true;
   2471    prog->parm_size = 12;
   2472 
   2473    if (screen->base.class_3d >= GM107_3D_CLASS) {
   2474       prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;
   2475       prog->code_size = sizeof(gm107_read_hw_sm_counters_code);
   2476       prog->num_gprs = 14;
   2477    } else
   2478    if (screen->base.class_3d == NVE4_3D_CLASS ||
   2479        screen->base.class_3d == NVF0_3D_CLASS) {
   2480       if (screen->base.class_3d == NVE4_3D_CLASS) {
   2481          prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
   2482          prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
   2483       } else {
   2484          prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code;
   2485          prog->code_size = sizeof(nvf0_read_hw_sm_counters_code);
   2486       }
   2487       prog->num_gprs = 14;
   2488    } else {
   2489       prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
   2490       prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
   2491       prog->num_gprs = 12;
   2492    }
   2493    return prog;
   2494 }
   2495 
   2496 static inline void
   2497 nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   2498 {
   2499    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   2500    struct nvc0_screen *screen = nvc0->screen;
   2501    uint64_t address;
   2502    const int s = 5;
   2503 
   2504    address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
   2505 
   2506    PUSH_SPACE(push, 11);
   2507 
   2508    if (screen->base.class_3d >= NVE4_3D_CLASS) {
   2509       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   2510       PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO);
   2511       PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO);
   2512       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   2513       PUSH_DATA (push, 3 * 4);
   2514       PUSH_DATA (push, 0x1);
   2515       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3);
   2516       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   2517    } else {
   2518       BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
   2519       PUSH_DATA (push, NVC0_CB_AUX_SIZE);
   2520       PUSH_DATAh(push, address);
   2521       PUSH_DATA (push, address);
   2522       BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3);
   2523       PUSH_DATA (push, NVC0_CB_AUX_MP_INFO);
   2524    }
   2525    PUSH_DATA (push, (hq->bo->offset + hq->base_offset));
   2526    PUSH_DATAh(push, (hq->bo->offset + hq->base_offset));
   2527    PUSH_DATA (push, hq->sequence);
   2528 }
   2529 
   2530 static void
   2531 nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   2532 {
   2533    struct nvc0_screen *screen = nvc0->screen;
   2534    struct pipe_context *pipe = &nvc0->base.pipe;
   2535    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   2536    const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   2537    struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   2538    struct nvc0_program *old = nvc0->compprog;
   2539    struct pipe_grid_info info = {};
   2540    uint32_t mask;
   2541    uint32_t input[3];
   2542    const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   2543    const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   2544    unsigned c, i;
   2545 
   2546    if (unlikely(!screen->pm.prog))
   2547       screen->pm.prog = nvc0_hw_sm_get_program(screen);
   2548 
   2549    /* disable all counting */
   2550    PUSH_SPACE(push, 8);
   2551    for (c = 0; c < 8; ++c)
   2552       if (screen->pm.mp_counter[c]) {
   2553          if (is_nve4) {
   2554             IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0);
   2555          } else {
   2556             IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0);
   2557          }
   2558       }
   2559    /* release counters for this query */
   2560    for (c = 0; c < 8; ++c) {
   2561       if (screen->pm.mp_counter[c] == hsq) {
   2562          uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
   2563          screen->pm.num_hw_sm_active[d]--;
   2564          screen->pm.mp_counter[c] = NULL;
   2565       }
   2566    }
   2567 
   2568    if (screen->base.class_3d >= GM107_3D_CLASS)
   2569       IMMED_NVC0(push, SUBC_CP(0x33e0), 0);
   2570 
   2571    BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
   2572                 hq->bo);
   2573 
   2574    PUSH_SPACE(push, 1);
   2575    IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
   2576 
   2577    /* upload input data for the compute shader which reads MP counters */
   2578    nvc0_hw_sm_upload_input(nvc0, hq);
   2579 
   2580    pipe->bind_compute_state(pipe, screen->pm.prog);
   2581    for (i = 0; i < 3; i++) {
   2582       info.block[i] = block[i];
   2583       info.grid[i] = grid[i];
   2584    }
   2585    info.pc = 0;
   2586    info.input = input;
   2587    pipe->launch_grid(pipe, &info);
   2588    pipe->bind_compute_state(pipe, old);
   2589 
   2590    nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
   2591 
   2592    /* re-activate other counters */
   2593    PUSH_SPACE(push, 16);
   2594    mask = 0;
   2595    for (c = 0; c < 8; ++c) {
   2596       const struct nvc0_hw_sm_query_cfg *cfg;
   2597       unsigned i;
   2598 
   2599       hsq = screen->pm.mp_counter[c];
   2600       if (!hsq)
   2601          continue;
   2602 
   2603       cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
   2604       for (i = 0; i < cfg->num_counters; ++i) {
   2605          if (mask & (1 << hsq->ctr[i]))
   2606             break;
   2607          mask |= 1 << hsq->ctr[i];
   2608          if (is_nve4) {
   2609             BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1);
   2610          } else {
   2611             BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1);
   2612          }
   2613          PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
   2614       }
   2615    }
   2616 }
   2617 
   2618 static inline bool
   2619 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
   2620                            struct nvc0_context *nvc0, bool wait,
   2621                            struct nvc0_hw_query *hq,
   2622                            const struct nvc0_hw_sm_query_cfg *cfg,
   2623                            unsigned mp_count)
   2624 {
   2625    struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   2626    unsigned p, c;
   2627 
   2628    for (p = 0; p < mp_count; ++p) {
   2629       const unsigned b = (0x30 / 4) * p;
   2630 
   2631       for (c = 0; c < cfg->num_counters; ++c) {
   2632          if (hq->data[b + 8] != hq->sequence) {
   2633             if (!wait)
   2634                return false;
   2635             if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
   2636                return false;
   2637          }
   2638          count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
   2639       }
   2640    }
   2641    return true;
   2642 }
   2643 
   2644 static inline bool
   2645 nve4_hw_sm_query_read_data(uint32_t count[32][8],
   2646                            struct nvc0_context *nvc0, bool wait,
   2647                            struct nvc0_hw_query *hq,
   2648                            const struct nvc0_hw_sm_query_cfg *cfg,
   2649                            unsigned mp_count)
   2650 {
   2651    struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   2652    unsigned p, c, d;
   2653 
   2654    for (p = 0; p < mp_count; ++p) {
   2655       const unsigned b = (0x60 / 4) * p;
   2656 
   2657       for (c = 0; c < cfg->num_counters; ++c) {
   2658          count[p][c] = 0;
   2659          for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
   2660             if (hq->data[b + 20 + d] != hq->sequence) {
   2661                if (!wait)
   2662                   return false;
   2663                if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
   2664                   return false;
   2665             }
   2666             if (hsq->ctr[c] & ~0x3)
   2667                count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
   2668             else
   2669                count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
   2670          }
   2671       }
   2672    }
   2673    return true;
   2674 }
   2675 
   2676 static boolean
   2677 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
   2678                             boolean wait, union pipe_query_result *result)
   2679 {
   2680    uint32_t count[32][8];
   2681    uint64_t value = 0;
   2682    unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
   2683    unsigned p, c;
   2684    const struct nvc0_hw_sm_query_cfg *cfg;
   2685    bool ret;
   2686 
   2687    cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
   2688 
   2689    if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
   2690       ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
   2691    else
   2692       ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
   2693    if (!ret)
   2694       return false;
   2695 
   2696    for (c = 0; c < cfg->num_counters; ++c)
   2697       for (p = 0; p < mp_count; ++p)
   2698          value += count[p][c];
   2699    value = (value * cfg->norm[0]) / cfg->norm[1];
   2700 
   2701    *(uint64_t *)result = value;
   2702    return true;
   2703 }
   2704 
   2705 static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   2706    .destroy_query = nvc0_hw_sm_destroy_query,
   2707    .begin_query = nvc0_hw_sm_begin_query,
   2708    .end_query = nvc0_hw_sm_end_query,
   2709    .get_query_result = nvc0_hw_sm_get_query_result,
   2710 };
   2711 
   2712 struct nvc0_hw_query *
   2713 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
   2714 {
   2715    struct nvc0_screen *screen = nvc0->screen;
   2716    struct nvc0_hw_sm_query *hsq;
   2717    struct nvc0_hw_query *hq;
   2718    unsigned space;
   2719 
   2720    if (nvc0->screen->base.drm->version < 0x01000101)
   2721       return NULL;
   2722 
   2723    if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)
   2724       return NULL;
   2725 
   2726    hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
   2727    if (!hsq)
   2728       return NULL;
   2729 
   2730    hq = &hsq->base;
   2731    hq->funcs = &hw_sm_query_funcs;
   2732    hq->base.type = type;
   2733 
   2734    if (screen->base.class_3d >= NVE4_3D_CLASS) {
   2735        /* for each MP:
   2736         * [00] = WS0.C0
   2737         * [04] = WS0.C1
   2738         * [08] = WS0.C2
   2739         * [0c] = WS0.C3
   2740         * [10] = WS1.C0
   2741         * [14] = WS1.C1
   2742         * [18] = WS1.C2
   2743         * [1c] = WS1.C3
   2744         * [20] = WS2.C0
   2745         * [24] = WS2.C1
   2746         * [28] = WS2.C2
   2747         * [2c] = WS2.C3
   2748         * [30] = WS3.C0
   2749         * [34] = WS3.C1
   2750         * [38] = WS3.C2
   2751         * [3c] = WS3.C3
   2752         * [40] = MP.C4
   2753         * [44] = MP.C5
   2754         * [48] = MP.C6
   2755         * [4c] = MP.C7
   2756         * [50] = WS0.sequence
   2757         * [54] = WS1.sequence
   2758         * [58] = WS2.sequence
   2759         * [5c] = WS3.sequence
   2760         */
   2761        space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
   2762    } else {
   2763       /*
   2764        * Note that padding is used to align memory access to 128 bits.
   2765        *
   2766        * for each MP:
   2767        * [00] = MP.C0
   2768        * [04] = MP.C1
   2769        * [08] = MP.C2
   2770        * [0c] = MP.C3
   2771        * [10] = MP.C4
   2772        * [14] = MP.C5
   2773        * [18] = MP.C6
   2774        * [1c] = MP.C7
   2775        * [20] = MP.sequence
   2776        * [24] = padding
   2777        * [28] = padding
   2778        * [2c] = padding
   2779        */
   2780       space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
   2781    }
   2782 
   2783    if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
   2784       FREE(hq);
   2785       return NULL;
   2786    }
   2787 
   2788    return hq;
   2789 }
   2790 
   2791 int
   2792 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
   2793                                  struct pipe_driver_query_info *info)
   2794 {
   2795    int count = 0;
   2796 
   2797    if (screen->base.drm->version >= 0x01000101) {
   2798       if (screen->compute)
   2799          count = nvc0_hw_sm_get_num_queries(screen);
   2800    }
   2801 
   2802    if (!info)
   2803       return count;
   2804 
   2805    if (id < count) {
   2806       if (screen->compute) {
   2807          if (screen->base.class_3d <= GM200_3D_CLASS) {
   2808             const struct nvc0_hw_sm_query_cfg **queries =
   2809                nvc0_hw_sm_get_queries(screen);
   2810 
   2811             info->name = nvc0_hw_sm_query_get_name(queries[id]->type);
   2812             info->query_type = NVC0_HW_SM_QUERY(queries[id]->type);
   2813             info->group_id = NVC0_HW_SM_QUERY_GROUP;
   2814             return 1;
   2815          }
   2816       }
   2817    }
   2818    return 0;
   2819 }
   2820