1 /* 2 * Copyright 2011 Christoph Bumiller 3 * Copyright 2015 Samuel Pitoiset 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING 25 26 #include "nvc0/nvc0_context.h" 27 #include "nvc0/nvc0_query_hw_sm.h" 28 29 #include "nv_object.xml.h" 30 #include "nvc0/nve4_compute.xml.h" 31 #include "nvc0/nvc0_compute.xml.h" 32 33 /* NOTE: intentionally using the same names as NV */ 34 #define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d } 35 static const struct { 36 unsigned type; 37 const char *name; 38 const char *desc; 39 } nvc0_hw_sm_queries[] = { 40 _Q(ACTIVE_CTAS, 41 "active_ctas", 42 "Accumulated number of active blocks per cycle. For every cycle it " 43 "increments by the number of active blocks in the cycle which can be in " 44 "the range 0 to 32."), 45 46 _Q(ACTIVE_CYCLES, 47 "active_cycles", 48 "Number of cycles a multiprocessor has at least one active warp"), 49 50 _Q(ACTIVE_WARPS, 51 "active_warps", 52 "Accumulated number of active warps per cycle. For every cycle it " 53 "increments by the number of active warps in the cycle which can be in " 54 "the range 0 to 64"), 55 56 _Q(ATOM_CAS_COUNT, 57 "atom_cas_count", 58 "Number of warps executing atomic compare and swap operations. Increments " 59 "by one if at least one thread in a warp executes the instruction."), 60 61 _Q(ATOM_COUNT, 62 "atom_count", 63 "Number of warps executing atomic reduction operations. Increments by one " 64 "if at least one thread in a warp executes the instruction"), 65 66 _Q(BRANCH, 67 "branch", 68 "Number of branch instructions executed per warp on a multiprocessor"), 69 70 _Q(DIVERGENT_BRANCH, 71 "divergent_branch", 72 "Number of divergent branches within a warp. This counter will be " 73 "incremented by one if at least one thread in a warp diverges (that is, " 74 "follows a different execution path) via a conditional branch"), 75 76 _Q(GLD_REQUEST, 77 "gld_request", 78 "Number of executed load instructions where the state space is not " 79 "specified and hence generic addressing is used, increments per warp on a " 80 "multiprocessor. It can include the load operations from global,local and " 81 "shared state space"), 82 83 _Q(GLD_MEM_DIV_REPLAY, 84 "global_ld_mem_divergence_replays", 85 "Number of instruction replays for global memory loads. Instruction is " 86 "replayed if the instruction is accessing more than one cache line of " 87 "128 bytes. For each extra cache line access the counter is incremented " 88 "by 1"), 89 90 _Q(GLOBAL_ATOM_CAS, 91 "global_atom_cas", 92 "Number of ATOM.CAS instructions executed per warp."), 93 94 _Q(GLOBAL_LD, 95 "global_load", 96 "Number of executed load instructions where state space is specified as " 97 "global, increments per warp on a multiprocessor."), 98 99 _Q(GLOBAL_ST, 100 "global_store", 101 "Number of executed store instructions where state space is specified as " 102 "global, increments per warp on a multiprocessor."), 103 104 _Q(GST_TRANSACTIONS, 105 "global_store_transaction", 106 "Number of global store transactions. Increments by 1 per transaction. " 107 "Transaction can be 32/64/96/128B"), 108 109 _Q(GST_MEM_DIV_REPLAY, 110 "global_st_mem_divergence_replays", 111 "Number of instruction replays for global memory stores. Instruction is " 112 "replayed if the instruction is accessing more than one cache line of " 113 "128 bytes. For each extra cache line access the counter is incremented " 114 "by 1"), 115 116 _Q(GRED_COUNT, 117 "gred_count", 118 "Number of warps executing reduction operations on global memory. " 119 "Increments by one if at least one thread in a warp executes the " 120 "instruction"), 121 122 _Q(GST_REQUEST, 123 "gst_request", 124 "Number of executed store instructions where the state space is not " 125 "specified and hence generic addressing is used, increments per warp on a " 126 "multiprocessor. It can include the store operations to global,local and " 127 "shared state space"), 128 129 _Q(INST_EXECUTED, 130 "inst_executed", 131 "Number of instructions executed, do not include replays"), 132 133 _Q(INST_ISSUED, 134 "inst_issued", 135 "Number of instructions issued including replays"), 136 137 _Q(INST_ISSUED0, 138 "inst_issued0", 139 "Number of cycles that did not issue any instruction, increments per " 140 "warp."), 141 142 _Q(INST_ISSUED1, 143 "inst_issued1", 144 "Number of single instruction issued per cycle"), 145 146 _Q(INST_ISSUED2, 147 "inst_issued2", 148 "Number of dual instructions issued per cycle"), 149 150 _Q(INST_ISSUED1_0, 151 "inst_issued1_0", 152 "Number of single instruction issued per cycle in pipeline 0"), 153 154 _Q(INST_ISSUED1_1, 155 "inst_issued1_1", 156 "Number of single instruction issued per cycle in pipeline 1"), 157 158 _Q(INST_ISSUED2_0, 159 "inst_issued2_0", 160 "Number of dual instructions issued per cycle in pipeline 0"), 161 162 _Q(INST_ISSUED2_1, 163 "inst_issued2_1", 164 "Number of dual instructions issued per cycle in pipeline 1"), 165 166 _Q(L1_GLD_HIT, 167 "l1_global_load_hit", 168 "Number of cache lines that hit in L1 cache for global memory load " 169 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " 170 "32, 64 and 128 bit accesses by a warp respectively"), 171 172 _Q(L1_GLD_MISS, 173 "l1_global_load_miss", 174 "Number of cache lines that miss in L1 cache for global memory load " 175 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " 176 "32, 64 and 128 bit accesses by a warp respectively"), 177 178 _Q(L1_GLD_TRANSACTIONS, 179 "__l1_global_load_transactions", 180 "Number of global load transactions from L1 cache. Increments by 1 per " 181 "transaction. Transaction can be 32/64/96/128B"), 182 183 _Q(L1_GST_TRANSACTIONS, 184 "__l1_global_store_transactions", 185 "Number of global store transactions from L1 cache. Increments by 1 per " 186 "transaction. Transaction can be 32/64/96/128B"), 187 188 _Q(L1_LOCAL_LD_HIT, 189 "l1_local_load_hit", 190 "Number of cache lines that hit in L1 cache for local memory load " 191 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " 192 "32, 64 and 128 bit accesses by a warp respectively"), 193 194 _Q(L1_LOCAL_LD_MISS, 195 "l1_local_load_miss", 196 "Number of cache lines that miss in L1 cache for local memory load " 197 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " 198 "32, 64 and 128 bit accesses by a warp respectively"), 199 200 _Q(L1_LOCAL_ST_HIT, 201 "l1_local_store_hit", 202 "Number of cache lines that hit in L1 cache for local memory store " 203 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " 204 "32, 64 and 128 bit accesses by a warp respectively"), 205 206 _Q(L1_LOCAL_ST_MISS, 207 "l1_local_store_miss", 208 "Number of cache lines that miss in L1 cache for local memory store " 209 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for " 210 "32,64 and 128 bit accesses by a warp respectively"), 211 212 _Q(L1_SHARED_LD_TRANSACTIONS, 213 "l1_shared_load_transactions", 214 "Number of shared load transactions. Increments by 1 per transaction. " 215 "Transaction can be 32/64/96/128B"), 216 217 _Q(L1_SHARED_ST_TRANSACTIONS, 218 "l1_shared_store_transactions", 219 "Number of shared store transactions. Increments by 1 per transaction. " 220 "Transaction can be 32/64/96/128B"), 221 222 _Q(LOCAL_LD, 223 "local_load", 224 "Number of executed load instructions where state space is specified as " 225 "local, increments per warp on a multiprocessor"), 226 227 _Q(LOCAL_LD_TRANSACTIONS, 228 "local_load_transactions", 229 "Number of local load transactions from L1 cache. Increments by 1 per " 230 "transaction. Transaction can be 32/64/96/128B"), 231 232 _Q(LOCAL_ST, 233 "local_store", 234 "Number of executed store instructions where state space is specified as " 235 "local, increments per warp on a multiprocessor"), 236 237 _Q(LOCAL_ST_TRANSACTIONS, 238 "local_store_transactions", 239 "Number of local store transactions to L1 cache. Increments by 1 per " 240 "transaction. Transaction can be 32/64/96/128B."), 241 242 _Q(NOT_PRED_OFF_INST_EXECUTED, 243 "not_predicated_off_thread_inst_executed", 244 "Number of not predicated off instructions executed by all threads, does " 245 "not include replays. For each instruction it increments by the number of " 246 "threads that execute this instruction"), 247 248 _Q(PROF_TRIGGER_0, 249 "prof_trigger_00", 250 "User profiled generic trigger that can be inserted in any place of the " 251 "code to collect the related information. Increments per warp."), 252 253 _Q(PROF_TRIGGER_1, 254 "prof_trigger_01", 255 "User profiled generic trigger that can be inserted in any place of the " 256 "code to collect the related information. Increments per warp."), 257 258 _Q(PROF_TRIGGER_2, 259 "prof_trigger_02", 260 "User profiled generic trigger that can be inserted in any place of the " 261 "code to collect the related information. Increments per warp."), 262 263 _Q(PROF_TRIGGER_3, 264 "prof_trigger_03", 265 "User profiled generic trigger that can be inserted in any place of the " 266 "code to collect the related information. Increments per warp."), 267 268 _Q(PROF_TRIGGER_4, 269 "prof_trigger_04", 270 "User profiled generic trigger that can be inserted in any place of the " 271 "code to collect the related information. Increments per warp."), 272 273 _Q(PROF_TRIGGER_5, 274 "prof_trigger_05", 275 "User profiled generic trigger that can be inserted in any place of the " 276 "code to collect the related information. Increments per warp."), 277 278 _Q(PROF_TRIGGER_6, 279 "prof_trigger_06", 280 "User profiled generic trigger that can be inserted in any place of the " 281 "code to collect the related information. Increments per warp."), 282 283 _Q(PROF_TRIGGER_7, 284 "prof_trigger_07", 285 "User profiled generic trigger that can be inserted in any place of the " 286 "code to collect the related information. Increments per warp."), 287 288 _Q(SHARED_ATOM, 289 "shared_atom", 290 "Number of ATOMS instructions executed per warp."), 291 292 _Q(SHARED_ATOM_CAS, 293 "shared_atom_cas", 294 "Number of ATOMS.CAS instructions executed per warp."), 295 296 _Q(SHARED_LD, 297 "shared_load", 298 "Number of executed load instructions where state space is specified as " 299 "shared, increments per warp on a multiprocessor"), 300 301 _Q(SHARED_LD_BANK_CONFLICT, 302 "shared_load_bank_conflict", 303 "Number of shared load bank conflict generated when the addresses for " 304 "two or more shared memory load requests fall in the same memory bank."), 305 306 _Q(SHARED_LD_REPLAY, 307 "shared_load_replay", 308 "Replays caused due to shared load bank conflict (when the addresses for " 309 "two or more shared memory load requests fall in the same memory bank) or " 310 "when there is no conflict but the total number of words accessed by all " 311 "threads in the warp executing that instruction exceed the number of words " 312 "that can be loaded in one cycle (256 bytes)"), 313 314 _Q(SHARED_LD_TRANSACTIONS, 315 "shared_ld_transactions", 316 "Number of transactions for shared load accesses. Maximum transaction " 317 "size in maxwell is 128 bytes, any warp accessing more that 128 bytes " 318 "will cause multiple transactions for a shared load instruction. This " 319 "also includes extra transactions caused by shared bank conflicts."), 320 321 _Q(SHARED_ST, 322 "shared_store", 323 "Number of executed store instructions where state space is specified as " 324 "shared, increments per warp on a multiprocessor"), 325 326 _Q(SHARED_ST_BANK_CONFLICT, 327 "shared_store_bank_conflict", 328 "Number of shared store bank conflict generated when the addresses for " 329 "two or more shared memory store requests fall in the same memory bank."), 330 331 _Q(SHARED_ST_REPLAY, 332 "shared_store_replay", 333 "Replays caused due to shared store bank conflict (when the addresses for " 334 "two or more shared memory store requests fall in the same memory bank) or " 335 "when there is no conflict but the total number of words accessed by all " 336 "threads in the warp executing that instruction exceed the number of words " 337 "that can be stored in one cycle"), 338 339 _Q(SHARED_ST_TRANSACTIONS, 340 "shared_st_transactions", 341 "Number of transactions for shared store accesses. Maximum transaction " 342 "size in maxwell is 128 bytes, any warp accessing more that 128 bytes " 343 "will cause multiple transactions for a shared store instruction. This " 344 "also includes extra transactions caused by shared bank conflicts."), 345 346 _Q(SM_CTA_LAUNCHED, 347 "sm_cta_launched", 348 "Number of thread blocks launched on a multiprocessor"), 349 350 _Q(THREADS_LAUNCHED, 351 "threads_launched", 352 "Number of threads launched on a multiprocessor"), 353 354 _Q(TH_INST_EXECUTED, 355 "thread_inst_executed", 356 "Number of instructions executed by all threads, does not include " 357 "replays. For each instruction it increments by the number of threads in " 358 "the warp that execute the instruction"), 359 360 _Q(TH_INST_EXECUTED_0, 361 "thread_inst_executed_0", 362 "Number of instructions executed by all threads, does not include " 363 "replays. For each instruction it increments by the number of threads in " 364 "the warp that execute the instruction in pipeline 0"), 365 366 _Q(TH_INST_EXECUTED_1, 367 "thread_inst_executed_1", 368 "Number of instructions executed by all threads, does not include " 369 "replays. For each instruction it increments by the number of threads in " 370 "the warp that execute the instruction in pipeline 1"), 371 372 _Q(TH_INST_EXECUTED_2, 373 "thread_inst_executed_2", 374 "Number of instructions executed by all threads, does not include " 375 "replays. For each instruction it increments by the number of threads in " 376 "the warp that execute the instruction in pipeline 2"), 377 378 _Q(TH_INST_EXECUTED_3, 379 "thread_inst_executed_3", 380 "Number of instructions executed by all threads, does not include " 381 "replays. For each instruction it increments by the number of threads in " 382 "the warp that execute the instruction in pipeline 3"), 383 384 _Q(UNCACHED_GLD_TRANSACTIONS, 385 "uncached_global_load_transaction", 386 "Number of uncached global load transactions. Increments by 1 per " 387 "transaction. Transaction can be 32/64/96/128B."), 388 389 _Q(WARPS_LAUNCHED, 390 "warps_launched", 391 "Number of warps launched on a multiprocessor"), 392 }; 393 394 #undef _Q 395 396 static inline const char * 397 nvc0_hw_sm_query_get_name(unsigned query_type) 398 { 399 unsigned i; 400 401 for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) { 402 if (nvc0_hw_sm_queries[i].type == query_type) 403 return nvc0_hw_sm_queries[i].name; 404 } 405 assert(0); 406 return NULL; 407 } 408 409 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ 410 411 /* Code to read out MP counters: They are accessible via mmio, too, but let's 412 * just avoid mapping registers in userspace. We'd have to know which MPs are 413 * enabled/present, too, and that information is not presently exposed. 414 * We could add a kernel interface for it, but reading the counters like this 415 * has the advantage of being async (if get_result isn't called immediately). 416 */ 417 static const uint64_t nve4_read_hw_sm_counters_code[] = 418 { 419 /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 420 * mov b32 $r8 $tidx 421 * mov b32 $r12 $physid 422 * mov b32 $r0 $pm0 423 * mov b32 $r1 $pm1 424 * mov b32 $r2 $pm2 425 * mov b32 $r3 $pm3 426 * mov b32 $r4 $pm4 427 * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b 428 * mov b32 $r5 $pm5 429 * mov b32 $r6 $pm6 430 * mov b32 $r7 $pm7 431 * set $p0 0x1 eq u32 $r8 0x0 432 * mov b32 $r10 c7[0x620] 433 * ext u32 $r8 $r12 0x414 434 * mov b32 $r11 c7[0x624] 435 * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 436 * ext u32 $r9 $r12 0x208 437 * (not $p0) exit 438 * set $p1 0x1 eq u32 $r9 0x0 439 * mul $r8 u32 $r8 u32 96 440 * mul $r12 u32 $r9 u32 16 441 * mul $r13 u32 $r9 u32 4 442 * add b32 $r9 $r8 $r13 443 * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c 444 * add b32 $r8 $r8 $r12 445 * mov b32 $r12 $r10 446 * add b32 $r10 $c $r10 $r8 447 * mov b32 $r13 $r11 448 * add b32 $r11 $r11 0x0 $c 449 * add b32 $r12 $c $r12 $r9 450 * st b128 wt g[$r10d] $r0q 451 * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 452 * mov b32 $r0 c7[0x628] 453 * add b32 $r13 $r13 0x0 $c 454 * $p1 st b128 wt g[$r12d+0x40] $r4q 455 * st b32 wt g[$r12d+0x50] $r0 456 * exit */ 457 0x2202020202020207ULL, 458 0x2c00000084021c04ULL, 459 0x2c0000000c031c04ULL, 460 0x2c00000010001c04ULL, 461 0x2c00000014005c04ULL, 462 0x2c00000018009c04ULL, 463 0x2c0000001c00dc04ULL, 464 0x2c00000020011c04ULL, 465 0x22b0420042320207ULL, 466 0x2c00000024015c04ULL, 467 0x2c00000028019c04ULL, 468 0x2c0000002c01dc04ULL, 469 0x190e0000fc81dc03ULL, 470 0x28005c1880029de4ULL, 471 0x7000c01050c21c03ULL, 472 0x28005c189002dde4ULL, 473 0x204282020042e047ULL, 474 0x7000c00820c25c03ULL, 475 0x80000000000021e7ULL, 476 0x190e0000fc93dc03ULL, 477 0x1000000180821c02ULL, 478 0x1000000040931c02ULL, 479 0x1000000010935c02ULL, 480 0x4800000034825c03ULL, 481 0x22c042c042c04287ULL, 482 0x4800000030821c03ULL, 483 0x2800000028031de4ULL, 484 0x4801000020a29c03ULL, 485 0x280000002c035de4ULL, 486 0x0800000000b2dc42ULL, 487 0x4801000024c31c03ULL, 488 0x9400000000a01fc5ULL, 489 0x200002e04202c047ULL, 490 0x28005c18a0001de4ULL, 491 0x0800000000d35c42ULL, 492 0x9400000100c107c5ULL, 493 0x9400000140c01f85ULL, 494 0x8000000000001de7ULL 495 }; 496 497 static const uint64_t nvf0_read_hw_sm_counters_code[] = 498 { 499 /* Same kernel as GK104 */ 500 0x0880808080808080ULL, 501 0x86400000109c0022ULL, 502 0x86400000019c0032ULL, 503 0x86400000021c0002ULL, 504 0x86400000029c0006ULL, 505 0x86400000031c000aULL, 506 0x86400000039c000eULL, 507 0x86400000041c0012ULL, 508 0x08ac1080108c8080ULL, 509 0x86400000049c0016ULL, 510 0x86400000051c001aULL, 511 0x86400000059c001eULL, 512 0xdb201c007f9c201eULL, 513 0x64c03ce0c41c002aULL, 514 0xc00000020a1c3021ULL, 515 0x64c03ce0c49c002eULL, 516 0x0810a0808010b810ULL, 517 0xc0000001041c3025ULL, 518 0x180000000020003cULL, 519 0xdb201c007f9c243eULL, 520 0xc1c00000301c2021ULL, 521 0xc1c00000081c2431ULL, 522 0xc1c00000021c2435ULL, 523 0xe0800000069c2026ULL, 524 0x08b010b010b010a0ULL, 525 0xe0800000061c2022ULL, 526 0xe4c03c00051c0032ULL, 527 0xe0840000041c282aULL, 528 0xe4c03c00059c0036ULL, 529 0xe08040007f9c2c2eULL, 530 0xe0840000049c3032ULL, 531 0xfe800000001c2800ULL, 532 0x080000b81080b010ULL, 533 0x64c03ce0c51c0002ULL, 534 0xe08040007f9c3436ULL, 535 0xfe80000020043010ULL, 536 0xfc800000281c3000ULL, 537 0x18000000001c003cULL, 538 }; 539 540 static const uint64_t gm107_read_hw_sm_counters_code[] = 541 { 542 0x001d0400e4200701ULL, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2) */ 543 0xf0c8000002170008ULL, /* mov $r8 $tidx */ 544 0xf0c800000037000cULL, /* mov $r12 $virtid */ 545 0xf0c8000000470000ULL, /* mov $r0 $pm0 */ 546 0x001e8400f0200761ULL, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wr 0x5) */ 547 0xf0c8000000570001ULL, /* mov $r1 $pm1 */ 548 0xf0c8000000670002ULL, /* mov $r2 $pm2 */ 549 0xf0c8000000770003ULL, /* mov $r3 $pm3 */ 550 0x001e8400f42007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wr 0x5) (st 0x1 wr 0x5) */ 551 0xf0c8000000870004ULL, /* mov $r4 $pm4 */ 552 0xf0c8000000970005ULL, /* mov $r5 $pm5 */ 553 0xf0c8000000a70006ULL, /* mov $r6 $pm6 */ 554 0x001f8401fc2007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wt 0x1) (st 0x1) */ 555 0xf0c8000000b70007ULL, /* mov $r7 $pm7 */ 556 0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1 */ 557 0x4c98079c1887000aULL, /* mov $r10 c7[0x620] 0xf */ 558 0x001fa400fc2017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x1) (st 0x9) */ 559 0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914 */ 560 0x4c98079c1897000bULL, /* mov $r11 c7[0x624] 0xf */ 561 0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208 */ 562 0x001c1800fc2007edULL, /* sched (st 0xd) (st 0x1) (st 0x6 wr 0x0) */ 563 0xe30000000008000fULL, /* not $p0 exit */ 564 0x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1 */ 565 0x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60 */ 566 0x003f8400e0c00726ULL, /* sched (st 0x6 wr 0x1) (st 0x6 wr 0x0) (st 0x1 wt 0x1) */ 567 0x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10 */ 568 0x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4 */ 569 0x5c10000000d70809ULL, /* iadd $r9 $r8 $r13 */ 570 0x001f8400fcc017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x6) (st 0x1) */ 571 0x5c10000000c70808ULL, /* iadd $r8 $r8 $r12 */ 572 0x5c98078000a7000cULL, /* mov $r12 $r10 0xf */ 573 0x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8 */ 574 0x001f8400fc2007e6ULL, /* sched (st 0x6) (st 0x1) (st 0x1) */ 575 0x5c98078000b7000dULL, /* mov $r13 $r11 0xf */ 576 0x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0 */ 577 0x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9 */ 578 0x003f983c1c4007e1ULL, /* sched (st 0x1) (st 0x2 rd 0x0 wt 0x3c) (st 0x6 wt 0x1) */ 579 0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0 */ 580 0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1 */ 581 0x4c98079c18a70000ULL, /* mov $r0 c7[0x628] 0xf */ 582 0x001fbc00fc2007e6ULL, /* sched (st 0x1) (st 0x1) (st 0xf) */ 583 0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1 */ 584 0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1 */ 585 0xe30000000007000fULL, /* exit */ 586 }; 587 588 /* For simplicity, we will allocate as many group slots as we allocate counter 589 * slots. This means that a single counter which wants to source from 2 groups 590 * will have to be declared as using 2 counter slots. This shouldn't really be 591 * a problem because such queries don't make much sense ... (unless someone is 592 * really creative). 593 */ 594 struct nvc0_hw_sm_counter_cfg 595 { 596 uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */ 597 uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */ 598 uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */ 599 uint32_t sig_sel : 8; /* signal group */ 600 uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */ 601 uint32_t src_sel; /* signal selection for up to 4 sources */ 602 }; 603 604 struct nvc0_hw_sm_query_cfg 605 { 606 unsigned type; 607 struct nvc0_hw_sm_counter_cfg ctr[8]; 608 uint8_t num_counters; 609 uint8_t norm[2]; /* normalization num,denom */ 610 }; 611 612 #define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, g, 0, s } 613 #define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, g, 0, s } 614 #define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c 615 616 /* ==== Compute capability 3.0 (GK104:GK110) ==== */ 617 static const struct nvc0_hw_sm_query_cfg 618 sm30_active_cycles = 619 { 620 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, 621 .ctr[0] = _CB(0x0001, B6, 0x02, 0x00000000), 622 .num_counters = 1, 623 .norm = { 1, 1 }, 624 }; 625 626 static const struct nvc0_hw_sm_query_cfg 627 sm30_active_warps = 628 { 629 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, 630 .ctr[0] = _CB(0x003f, B6, 0x02, 0x31483104), 631 .num_counters = 1, 632 .norm = { 2, 1 }, 633 }; 634 635 static const struct nvc0_hw_sm_query_cfg 636 sm30_atom_cas_count = 637 { 638 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, 639 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x000000004), 640 .num_counters = 1, 641 .norm = { 1, 1 }, 642 }; 643 644 static const struct nvc0_hw_sm_query_cfg 645 sm30_atom_count = 646 { 647 .type = NVC0_HW_SM_QUERY_ATOM_COUNT, 648 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000000), 649 .num_counters = 1, 650 .norm = { 1, 1 }, 651 }; 652 653 static const struct nvc0_hw_sm_query_cfg 654 sm30_branch = 655 { 656 .type = NVC0_HW_SM_QUERY_BRANCH, 657 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x0000000c), 658 .num_counters = 1, 659 .norm = { 1, 1 }, 660 }; 661 662 static const struct nvc0_hw_sm_query_cfg 663 sm30_divergent_branch = 664 { 665 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, 666 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000010), 667 .num_counters = 1, 668 .norm = { 1, 1 }, 669 }; 670 671 static const struct nvc0_hw_sm_query_cfg 672 sm30_gld_request = 673 { 674 .type = NVC0_HW_SM_QUERY_GLD_REQUEST, 675 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000010), 676 .num_counters = 1, 677 .norm = { 1, 1 }, 678 }; 679 680 static const struct nvc0_hw_sm_query_cfg 681 sm30_gld_mem_div_replay = 682 { 683 .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, 684 .ctr[0] = _CB(0x0001, B6, 0x08, 0x00000010), 685 .num_counters = 1, 686 .norm = { 1, 1 }, 687 }; 688 689 static const struct nvc0_hw_sm_query_cfg 690 sm30_gst_transactions = 691 { 692 .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS, 693 .ctr[0] = _CB(0x0001, B6, 0x11, 0x00000004), 694 .num_counters = 1, 695 .norm = { 1, 1 }, 696 }; 697 698 static const struct nvc0_hw_sm_query_cfg 699 sm30_gst_mem_div_replay = 700 { 701 .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, 702 .ctr[0] = _CB(0x0001, B6, 0x08, 0x00000014), 703 .num_counters = 1, 704 .norm = { 1, 1 }, 705 }; 706 707 static const struct nvc0_hw_sm_query_cfg 708 sm30_gred_count = 709 { 710 .type = NVC0_HW_SM_QUERY_GRED_COUNT, 711 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000008), 712 .num_counters = 1, 713 .norm = { 1, 1 }, 714 }; 715 716 static const struct nvc0_hw_sm_query_cfg 717 sm30_gst_request = 718 { 719 .type = NVC0_HW_SM_QUERY_GST_REQUEST, 720 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000014), 721 .num_counters = 1, 722 .norm = { 1, 1 }, 723 }; 724 725 static const struct nvc0_hw_sm_query_cfg 726 sm30_inst_executed = 727 { 728 .type = NVC0_HW_SM_QUERY_INST_EXECUTED, 729 .ctr[0] = _CA(0x0003, B6, 0x04, 0x00000398), 730 .num_counters = 1, 731 .norm = { 1, 1 }, 732 }; 733 734 static const struct nvc0_hw_sm_query_cfg 735 sm30_inst_issued1 = 736 { 737 .type = NVC0_HW_SM_QUERY_INST_ISSUED1, 738 .ctr[0] = _CA(0x0001, B6, 0x05, 0x00000004), 739 .num_counters = 1, 740 .norm = { 1, 1 }, 741 }; 742 743 static const struct nvc0_hw_sm_query_cfg 744 sm30_inst_issued2 = 745 { 746 .type = NVC0_HW_SM_QUERY_INST_ISSUED2, 747 .ctr[0] = _CA(0x0001, B6, 0x05, 0x00000008), 748 .num_counters = 1, 749 .norm = { 1, 1 }, 750 }; 751 752 static const struct nvc0_hw_sm_query_cfg 753 sm30_l1_gld_hit = 754 { 755 .type = NVC0_HW_SM_QUERY_L1_GLD_HIT, 756 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000010), 757 .num_counters = 1, 758 .norm = { 1, 1 }, 759 }; 760 761 static const struct nvc0_hw_sm_query_cfg 762 sm30_l1_gld_miss = 763 { 764 .type = NVC0_HW_SM_QUERY_L1_GLD_MISS, 765 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000014), 766 .num_counters = 1, 767 .norm = { 1, 1 }, 768 }; 769 770 static const struct nvc0_hw_sm_query_cfg 771 sm30_l1_gld_transactions = 772 { 773 .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, 774 .ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000000), 775 .num_counters = 1, 776 .norm = { 1, 1 }, 777 }; 778 779 static const struct nvc0_hw_sm_query_cfg 780 sm30_l1_gst_transactions = 781 { 782 .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, 783 .ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000004), 784 .num_counters = 1, 785 .norm = { 1, 1 }, 786 }; 787 788 static const struct nvc0_hw_sm_query_cfg 789 sm30_l1_local_ld_hit = 790 { 791 .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, 792 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000000), 793 .num_counters = 1, 794 .norm = { 1, 1 }, 795 }; 796 797 static const struct nvc0_hw_sm_query_cfg 798 sm30_l1_local_ld_miss = 799 { 800 .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, 801 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000004), 802 .num_counters = 1, 803 .norm = { 1, 1 }, 804 }; 805 806 static const struct nvc0_hw_sm_query_cfg 807 sm30_l1_local_st_hit = 808 { 809 .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, 810 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000008), 811 .num_counters = 1, 812 .norm = { 1, 1 }, 813 }; 814 815 static const struct nvc0_hw_sm_query_cfg 816 sm30_l1_local_st_miss = 817 { 818 .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, 819 .ctr[0] = _CB(0x0001, B6, 0x10, 0x0000000c), 820 .num_counters = 1, 821 .norm = { 1, 1 }, 822 }; 823 824 static const struct nvc0_hw_sm_query_cfg 825 sm30_l1_shared_ld_transactions = 826 { 827 .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, 828 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008), 829 .num_counters = 1, 830 .norm = { 1, 1 }, 831 }; 832 833 static const struct nvc0_hw_sm_query_cfg 834 sm30_l1_shared_st_transactions = 835 { 836 .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, 837 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c), 838 .num_counters = 1, 839 .norm = { 1, 1 }, 840 }; 841 842 static const struct nvc0_hw_sm_query_cfg 843 sm30_local_ld = 844 { 845 .type = NVC0_HW_SM_QUERY_LOCAL_LD, 846 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000008), 847 .num_counters = 1, 848 .norm = { 1, 1 }, 849 }; 850 851 static const struct nvc0_hw_sm_query_cfg 852 sm30_local_ld_transactions = 853 { 854 .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, 855 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000), 856 .num_counters = 1, 857 .norm = { 1, 1 }, 858 }; 859 860 static const struct nvc0_hw_sm_query_cfg 861 sm30_local_st = 862 { 863 .type = NVC0_HW_SM_QUERY_LOCAL_ST, 864 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x0000000c), 865 .num_counters = 1, 866 .norm = { 1, 1 }, 867 }; 868 869 static const struct nvc0_hw_sm_query_cfg 870 sm30_local_st_transactions = 871 { 872 .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, 873 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004), 874 .num_counters = 1, 875 .norm = { 1, 1 }, 876 }; 877 878 static const struct nvc0_hw_sm_query_cfg 879 sm30_prof_trigger_0 = 880 { 881 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, 882 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000000), 883 .num_counters = 1, 884 .norm = { 1, 1 }, 885 }; 886 887 static const struct nvc0_hw_sm_query_cfg 888 sm30_prof_trigger_1 = 889 { 890 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, 891 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000004), 892 .num_counters = 1, 893 .norm = { 1, 1 }, 894 }; 895 896 static const struct nvc0_hw_sm_query_cfg 897 sm30_prof_trigger_2 = 898 { 899 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, 900 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000008), 901 .num_counters = 1, 902 .norm = { 1, 1 }, 903 }; 904 905 static const struct nvc0_hw_sm_query_cfg 906 sm30_prof_trigger_3 = 907 { 908 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, 909 .ctr[0] = _CA(0x0001, B6, 0x01, 0x0000000c), 910 .num_counters = 1, 911 .norm = { 1, 1 }, 912 }; 913 914 static const struct nvc0_hw_sm_query_cfg 915 sm30_prof_trigger_4 = 916 { 917 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, 918 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000010), 919 .num_counters = 1, 920 .norm = { 1, 1 }, 921 }; 922 923 static const struct nvc0_hw_sm_query_cfg 924 sm30_prof_trigger_5 = 925 { 926 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, 927 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000014), 928 .num_counters = 1, 929 .norm = { 1, 1 }, 930 }; 931 932 static const struct nvc0_hw_sm_query_cfg 933 sm30_prof_trigger_6 = 934 { 935 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, 936 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000018), 937 .num_counters = 1, 938 .norm = { 1, 1 }, 939 }; 940 941 static const struct nvc0_hw_sm_query_cfg 942 sm30_prof_trigger_7 = 943 { 944 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, 945 .ctr[0] = _CA(0x0001, B6, 0x01, 0x0000001c), 946 .num_counters = 1, 947 .norm = { 1, 1 }, 948 }; 949 950 static const struct nvc0_hw_sm_query_cfg 951 sm30_shared_ld = 952 { 953 .type = NVC0_HW_SM_QUERY_SHARED_LD, 954 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000000), 955 .num_counters = 1, 956 .norm = { 1, 1 }, 957 }; 958 959 static const struct nvc0_hw_sm_query_cfg 960 sm30_shared_ld_replay = 961 { 962 .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, 963 .ctr[0] = _CB(0x0001, B6, 0x08, 0x00000008), 964 .num_counters = 1, 965 .norm = { 1, 1 }, 966 }; 967 968 static const struct nvc0_hw_sm_query_cfg 969 sm30_shared_st = 970 { 971 .type = NVC0_HW_SM_QUERY_SHARED_ST, 972 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000004), 973 .num_counters = 1, 974 .norm = { 1, 1 }, 975 }; 976 977 static const struct nvc0_hw_sm_query_cfg 978 sm30_shared_st_replay = 979 { 980 .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, 981 .ctr[0] = _CB(0x0001, B6, 0x08, 0x0000000c), 982 .num_counters = 1, 983 .norm = { 1, 1 }, 984 }; 985 986 static const struct nvc0_hw_sm_query_cfg 987 sm30_sm_cta_launched = 988 { 989 .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, 990 .ctr[0] = _CB(0x0001, B6, 0x02, 0x0000001c), 991 .num_counters = 1, 992 .norm = { 1, 1 }, 993 }; 994 995 static const struct nvc0_hw_sm_query_cfg 996 sm30_threads_launched = 997 { 998 .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, 999 .ctr[0] = _CA(0x003f, B6, 0x03, 0x398a4188), 1000 .num_counters = 1, 1001 .norm = { 1, 1 }, 1002 }; 1003 1004 static const struct nvc0_hw_sm_query_cfg 1005 sm30_uncached_gld_transactions = 1006 { 1007 .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, 1008 .ctr[0] = _CB(0x0001, B6, 0x11, 0x00000000), 1009 .num_counters = 1, 1010 .norm = { 1, 1 }, 1011 }; 1012 1013 static const struct nvc0_hw_sm_query_cfg 1014 sm30_warps_launched = 1015 { 1016 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, 1017 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004), 1018 .num_counters = 1, 1019 .norm = { 1, 1 }, 1020 }; 1021 1022 /* NOTES: 1023 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps 1024 * inst_executed etc.: we only count a single warp scheduler 1025 */ 1026 static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] = 1027 { 1028 &sm30_active_cycles, 1029 &sm30_active_warps, 1030 &sm30_atom_cas_count, 1031 &sm30_atom_count, 1032 &sm30_branch, 1033 &sm30_divergent_branch, 1034 &sm30_gld_request, 1035 &sm30_gld_mem_div_replay, 1036 &sm30_gst_transactions, 1037 &sm30_gst_mem_div_replay, 1038 &sm30_gred_count, 1039 &sm30_gst_request, 1040 &sm30_inst_executed, 1041 &sm30_inst_issued1, 1042 &sm30_inst_issued2, 1043 &sm30_l1_gld_hit, 1044 &sm30_l1_gld_miss, 1045 &sm30_l1_gld_transactions, 1046 &sm30_l1_gst_transactions, 1047 &sm30_l1_local_ld_hit, 1048 &sm30_l1_local_ld_miss, 1049 &sm30_l1_local_st_hit, 1050 &sm30_l1_local_st_miss, 1051 &sm30_l1_shared_ld_transactions, 1052 &sm30_l1_shared_st_transactions, 1053 &sm30_local_ld, 1054 &sm30_local_ld_transactions, 1055 &sm30_local_st, 1056 &sm30_local_st_transactions, 1057 &sm30_prof_trigger_0, 1058 &sm30_prof_trigger_1, 1059 &sm30_prof_trigger_2, 1060 &sm30_prof_trigger_3, 1061 &sm30_prof_trigger_4, 1062 &sm30_prof_trigger_5, 1063 &sm30_prof_trigger_6, 1064 &sm30_prof_trigger_7, 1065 &sm30_shared_ld, 1066 &sm30_shared_ld_replay, 1067 &sm30_shared_st, 1068 &sm30_shared_st_replay, 1069 &sm30_sm_cta_launched, 1070 &sm30_threads_launched, 1071 &sm30_uncached_gld_transactions, 1072 &sm30_warps_launched, 1073 }; 1074 1075 /* ==== Compute capability 3.5 (GK110/GK208) ==== */ 1076 static const struct nvc0_hw_sm_query_cfg 1077 sm35_atom_cas_count = 1078 { 1079 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, 1080 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000014), 1081 .num_counters = 1, 1082 .norm = { 1, 1 }, 1083 }; 1084 1085 static const struct nvc0_hw_sm_query_cfg 1086 sm35_atom_count = 1087 { 1088 .type = NVC0_HW_SM_QUERY_ATOM_COUNT, 1089 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010), 1090 .num_counters = 1, 1091 .norm = { 1, 1 }, 1092 }; 1093 1094 static const struct nvc0_hw_sm_query_cfg 1095 sm35_gred_count = 1096 { 1097 .type = NVC0_HW_SM_QUERY_GRED_COUNT, 1098 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000018), 1099 .num_counters = 1, 1100 .norm = { 1, 1 }, 1101 }; 1102 1103 static const struct nvc0_hw_sm_query_cfg 1104 sm35_not_pred_off_inst_executed = 1105 { 1106 .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, 1107 .ctr[0] = _CA(0x003f, B6, 0x14, 0x29062080), 1108 .num_counters = 1, 1109 .norm = { 1, 1 }, 1110 }; 1111 1112 static const struct nvc0_hw_sm_query_cfg 1113 sm35_shared_ld_replay = 1114 { 1115 .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, 1116 .ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018), 1117 .ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x00000151), 1118 .num_counters = 2, 1119 .norm = { 1, 1 }, 1120 }; 1121 1122 static const struct nvc0_hw_sm_query_cfg 1123 sm35_shared_st_replay = 1124 { 1125 .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, 1126 .ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018), 1127 .ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x000001d1), 1128 .num_counters = 2, 1129 .norm = { 1, 1 }, 1130 }; 1131 1132 static const struct nvc0_hw_sm_query_cfg 1133 sm35_th_inst_executed = 1134 { 1135 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED, 1136 .ctr[0] = _CA(0x003f, B6, 0x11, 0x29062080), 1137 .num_counters = 1, 1138 .norm = { 1, 1 }, 1139 }; 1140 1141 static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] = 1142 { 1143 &sm30_active_cycles, 1144 &sm30_active_warps, 1145 &sm35_atom_cas_count, 1146 &sm35_atom_count, 1147 &sm30_gld_request, 1148 &sm30_gld_mem_div_replay, 1149 &sm30_gst_transactions, 1150 &sm30_gst_mem_div_replay, 1151 &sm35_gred_count, 1152 &sm30_gst_request, 1153 &sm30_inst_executed, 1154 &sm30_inst_issued1, 1155 &sm30_inst_issued2, 1156 &sm30_l1_gld_hit, 1157 &sm30_l1_gld_miss, 1158 &sm30_l1_gld_transactions, 1159 &sm30_l1_gst_transactions, 1160 &sm30_l1_local_ld_hit, 1161 &sm30_l1_local_ld_miss, 1162 &sm30_l1_local_st_hit, 1163 &sm30_l1_local_st_miss, 1164 &sm30_l1_shared_ld_transactions, 1165 &sm30_l1_shared_st_transactions, 1166 &sm30_local_ld, 1167 &sm30_local_ld_transactions, 1168 &sm30_local_st, 1169 &sm30_local_st_transactions, 1170 &sm35_not_pred_off_inst_executed, 1171 &sm30_prof_trigger_0, 1172 &sm30_prof_trigger_1, 1173 &sm30_prof_trigger_2, 1174 &sm30_prof_trigger_3, 1175 &sm30_prof_trigger_4, 1176 &sm30_prof_trigger_5, 1177 &sm30_prof_trigger_6, 1178 &sm30_prof_trigger_7, 1179 &sm30_shared_ld, 1180 &sm35_shared_ld_replay, 1181 &sm30_shared_st, 1182 &sm35_shared_st_replay, 1183 &sm30_sm_cta_launched, 1184 &sm35_th_inst_executed, 1185 &sm30_threads_launched, 1186 &sm30_uncached_gld_transactions, 1187 &sm30_warps_launched, 1188 }; 1189 1190 /* ==== Compute capability 5.0 (GM107/GM108) ==== */ 1191 static const struct nvc0_hw_sm_query_cfg 1192 sm50_active_ctas = 1193 { 1194 .type = NVC0_HW_SM_QUERY_ACTIVE_CTAS, 1195 .ctr[0] = _CB(0x003f, B6, 0x01, 0x29062080), 1196 .num_counters = 1, 1197 .norm = { 1, 1 }, 1198 }; 1199 1200 static const struct nvc0_hw_sm_query_cfg 1201 sm50_active_cycles = 1202 { 1203 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, 1204 .ctr[0] = _CB(0x0001, B6, 0x00, 0x00000004), 1205 .num_counters = 1, 1206 .norm = { 1, 1 }, 1207 }; 1208 1209 static const struct nvc0_hw_sm_query_cfg 1210 sm50_active_warps = 1211 { 1212 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, 1213 .ctr[0] = _CB(0x003f, B6, 0x00, 0x398a4188), 1214 .num_counters = 1, 1215 .norm = { 1, 1 }, 1216 }; 1217 1218 static const struct nvc0_hw_sm_query_cfg 1219 sm50_atom_count = 1220 { 1221 .type = NVC0_HW_SM_QUERY_ATOM_COUNT, 1222 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000004), 1223 .num_counters = 1, 1224 .norm = { 1, 1 }, 1225 }; 1226 1227 static const struct nvc0_hw_sm_query_cfg 1228 sm50_branch = 1229 { 1230 .type = NVC0_HW_SM_QUERY_BRANCH, 1231 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010), 1232 .num_counters = 1, 1233 .norm = { 1, 1 }, 1234 }; 1235 1236 static const struct nvc0_hw_sm_query_cfg 1237 sm50_divergent_branch = 1238 { 1239 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, 1240 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000004), 1241 .num_counters = 1, 1242 .norm = { 1, 1 }, 1243 }; 1244 1245 static const struct nvc0_hw_sm_query_cfg 1246 sm50_global_atom_cas = 1247 { 1248 .type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS, 1249 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000000), 1250 .num_counters = 1, 1251 .norm = { 1, 1 }, 1252 }; 1253 1254 static const struct nvc0_hw_sm_query_cfg 1255 sm50_global_ld = 1256 { 1257 .type = NVC0_HW_SM_QUERY_GLOBAL_LD, 1258 .ctr[0] = _CA(0x0001, B6, 0x14, 0x0000000c), 1259 .num_counters = 1, 1260 .norm = { 1, 1 }, 1261 }; 1262 1263 static const struct nvc0_hw_sm_query_cfg 1264 sm50_global_st = 1265 { 1266 .type = NVC0_HW_SM_QUERY_GLOBAL_ST, 1267 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000010), 1268 .num_counters = 1, 1269 .norm = { 1, 1 }, 1270 }; 1271 1272 static const struct nvc0_hw_sm_query_cfg 1273 sm50_gred_count = 1274 { 1275 .type = NVC0_HW_SM_QUERY_GRED_COUNT, 1276 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000008), 1277 .num_counters = 1, 1278 .norm = { 1, 1 }, 1279 }; 1280 1281 static const struct nvc0_hw_sm_query_cfg 1282 sm50_inst_executed = 1283 { 1284 .type = NVC0_HW_SM_QUERY_INST_EXECUTED, 1285 .ctr[0] = _CA(0x0003, B6, 0x02, 0x00000398), 1286 .num_counters = 1, 1287 .norm = { 1, 1 }, 1288 }; 1289 1290 static const struct nvc0_hw_sm_query_cfg 1291 sm50_inst_issued0 = 1292 { 1293 .type = NVC0_HW_SM_QUERY_INST_ISSUED0, 1294 .ctr[0] = _CA(0x0001, B6, 0x02, 0x0000000c), 1295 .num_counters = 1, 1296 .norm = { 1, 1 }, 1297 }; 1298 1299 static const struct nvc0_hw_sm_query_cfg 1300 sm50_inst_issued1 = 1301 { 1302 .type = NVC0_HW_SM_QUERY_INST_ISSUED1, 1303 .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000010), 1304 .num_counters = 1, 1305 .norm = { 1, 1 }, 1306 }; 1307 1308 static const struct nvc0_hw_sm_query_cfg 1309 sm50_inst_issued2 = 1310 { 1311 .type = NVC0_HW_SM_QUERY_INST_ISSUED2, 1312 .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000014), 1313 .num_counters = 1, 1314 .norm = { 1, 1 }, 1315 }; 1316 1317 static const struct nvc0_hw_sm_query_cfg 1318 sm50_local_ld = 1319 { 1320 .type = NVC0_HW_SM_QUERY_LOCAL_LD, 1321 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000004), 1322 .num_counters = 1, 1323 .norm = { 1, 1 }, 1324 }; 1325 1326 static const struct nvc0_hw_sm_query_cfg 1327 sm50_local_st = 1328 { 1329 .type = NVC0_HW_SM_QUERY_LOCAL_ST, 1330 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000000), 1331 .num_counters = 1, 1332 .norm = { 1, 1 }, 1333 }; 1334 1335 static const struct nvc0_hw_sm_query_cfg 1336 sm50_not_pred_off_inst_executed = 1337 { 1338 .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, 1339 .ctr[0] = _CA(0x003f, B6, 0x05, 0x29062080), 1340 .num_counters = 1, 1341 .norm = { 1, 1 }, 1342 }; 1343 1344 static const struct nvc0_hw_sm_query_cfg 1345 sm50_prof_trigger_0 = 1346 { 1347 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, 1348 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000000), 1349 .num_counters = 1, 1350 .norm = { 1, 1 }, 1351 }; 1352 1353 static const struct nvc0_hw_sm_query_cfg 1354 sm50_prof_trigger_1 = 1355 { 1356 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, 1357 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000004), 1358 .num_counters = 1, 1359 .norm = { 1, 1 }, 1360 }; 1361 1362 static const struct nvc0_hw_sm_query_cfg 1363 sm50_prof_trigger_2 = 1364 { 1365 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, 1366 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000008), 1367 .num_counters = 1, 1368 .norm = { 1, 1 }, 1369 }; 1370 1371 static const struct nvc0_hw_sm_query_cfg 1372 sm50_prof_trigger_3 = 1373 { 1374 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, 1375 .ctr[0] = _CA(0x0001, B6, 0x00, 0x0000000c), 1376 .num_counters = 1, 1377 .norm = { 1, 1 }, 1378 }; 1379 1380 static const struct nvc0_hw_sm_query_cfg 1381 sm50_prof_trigger_4 = 1382 { 1383 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, 1384 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000010), 1385 .num_counters = 1, 1386 .norm = { 1, 1 }, 1387 }; 1388 1389 static const struct nvc0_hw_sm_query_cfg 1390 sm50_prof_trigger_5 = 1391 { 1392 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, 1393 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000014), 1394 .num_counters = 1, 1395 .norm = { 1, 1 }, 1396 }; 1397 1398 static const struct nvc0_hw_sm_query_cfg 1399 sm50_prof_trigger_6 = 1400 { 1401 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, 1402 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000018), 1403 .num_counters = 1, 1404 .norm = { 1, 1 }, 1405 }; 1406 1407 static const struct nvc0_hw_sm_query_cfg 1408 sm50_prof_trigger_7 = 1409 { 1410 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, 1411 .ctr[0] = _CA(0x0001, B6, 0x00, 0x0000001c), 1412 .num_counters = 1, 1413 .norm = { 1, 1 }, 1414 }; 1415 1416 static const struct nvc0_hw_sm_query_cfg 1417 sm50_shared_atom = 1418 { 1419 .type = NVC0_HW_SM_QUERY_SHARED_ATOM, 1420 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000014), 1421 .num_counters = 1, 1422 .norm = { 1, 1 }, 1423 }; 1424 1425 static const struct nvc0_hw_sm_query_cfg 1426 sm50_shared_atom_cas = 1427 { 1428 .type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS, 1429 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000010), 1430 .num_counters = 1, 1431 .norm = { 1, 1 }, 1432 }; 1433 1434 static const struct nvc0_hw_sm_query_cfg 1435 sm50_shared_ld = 1436 { 1437 .type = NVC0_HW_SM_QUERY_SHARED_LD, 1438 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000008), 1439 .num_counters = 1, 1440 .norm = { 1, 1 }, 1441 }; 1442 1443 static const struct nvc0_hw_sm_query_cfg 1444 sm50_shared_ld_bank_conflict = 1445 { 1446 .type = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT, 1447 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000), 1448 .num_counters = 1, 1449 .norm = { 1, 1 }, 1450 }; 1451 1452 static const struct nvc0_hw_sm_query_cfg 1453 sm50_shared_ld_transactions = 1454 { 1455 .type = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS, 1456 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008), 1457 .num_counters = 1, 1458 .norm = { 1, 1 }, 1459 }; 1460 1461 static const struct nvc0_hw_sm_query_cfg 1462 sm50_shared_st = 1463 { 1464 .type = NVC0_HW_SM_QUERY_SHARED_ST, 1465 .ctr[0] = _CA(0x0001, B6, 0x13, 0x0000000c), 1466 .num_counters = 1, 1467 .norm = { 1, 1 }, 1468 }; 1469 1470 static const struct nvc0_hw_sm_query_cfg 1471 sm50_shared_st_bank_conflict = 1472 { 1473 .type = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT, 1474 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004), 1475 .num_counters = 1, 1476 .norm = { 1, 1 }, 1477 }; 1478 1479 static const struct nvc0_hw_sm_query_cfg 1480 sm50_shared_st_transactions = 1481 { 1482 .type = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS, 1483 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c), 1484 .num_counters = 1, 1485 .norm = { 1, 1 }, 1486 }; 1487 1488 static const struct nvc0_hw_sm_query_cfg 1489 sm50_sm_cta_launched = 1490 { 1491 .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, 1492 .ctr[0] = _CB(0x0001, B6, 0x01, 0x00000018), 1493 .num_counters = 1, 1494 .norm = { 1, 1 }, 1495 }; 1496 1497 static const struct nvc0_hw_sm_query_cfg 1498 sm50_th_inst_executed = 1499 { 1500 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED, 1501 .ctr[0] = _CA(0x003f, B6, 0x04, 0x29062080), 1502 .num_counters = 1, 1503 .norm = { 1, 1 }, 1504 }; 1505 1506 static const struct nvc0_hw_sm_query_cfg 1507 sm50_warps_launched = 1508 { 1509 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, 1510 .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000008), 1511 .num_counters = 1, 1512 .norm = { 1, 1 }, 1513 }; 1514 1515 static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] = 1516 { 1517 &sm50_active_ctas, 1518 &sm50_active_cycles, 1519 &sm50_active_warps, 1520 &sm50_atom_count, 1521 &sm50_branch, 1522 &sm50_divergent_branch, 1523 &sm50_global_atom_cas, 1524 &sm50_global_ld, 1525 &sm50_global_st, 1526 &sm50_gred_count, 1527 &sm50_inst_executed, 1528 &sm50_inst_issued0, 1529 &sm50_inst_issued1, 1530 &sm50_inst_issued2, 1531 &sm50_local_ld, 1532 &sm50_local_st, 1533 &sm50_not_pred_off_inst_executed, 1534 &sm50_prof_trigger_0, 1535 &sm50_prof_trigger_1, 1536 &sm50_prof_trigger_2, 1537 &sm50_prof_trigger_3, 1538 &sm50_prof_trigger_4, 1539 &sm50_prof_trigger_5, 1540 &sm50_prof_trigger_6, 1541 &sm50_prof_trigger_7, 1542 &sm50_shared_atom, 1543 &sm50_shared_atom_cas, 1544 &sm50_shared_ld, 1545 &sm50_shared_ld_bank_conflict, 1546 &sm50_shared_ld_transactions, 1547 &sm50_shared_st, 1548 &sm50_shared_st_bank_conflict, 1549 &sm50_shared_st_transactions, 1550 &sm50_sm_cta_launched, 1551 &sm50_th_inst_executed, 1552 &sm50_warps_launched, 1553 }; 1554 1555 /* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */ 1556 static const struct nvc0_hw_sm_query_cfg 1557 sm52_atom_count = 1558 { 1559 .type = NVC0_HW_SM_QUERY_ATOM_COUNT, 1560 .ctr[0] = _CA(0x0001, B6, 0x0a, 0x0000001c), 1561 .num_counters = 1, 1562 .norm = { 1, 1 }, 1563 }; 1564 1565 static const struct nvc0_hw_sm_query_cfg 1566 sm52_global_atom_cas = 1567 { 1568 .type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS, 1569 .ctr[0] = _CA(0x0001, B6, 0x0a, 0x00000018), 1570 .num_counters = 1, 1571 .norm = { 1, 1 }, 1572 }; 1573 1574 static const struct nvc0_hw_sm_query_cfg 1575 sm52_global_ld = 1576 { 1577 .type = NVC0_HW_SM_QUERY_GLOBAL_LD, 1578 .ctr[0] = _CA(0x0001, B6, 0x0b, 0x00000018), 1579 .num_counters = 1, 1580 .norm = { 1, 1 }, 1581 }; 1582 1583 static const struct nvc0_hw_sm_query_cfg 1584 sm52_global_st = 1585 { 1586 .type = NVC0_HW_SM_QUERY_GLOBAL_ST, 1587 .ctr[0] = _CA(0x0001, B6, 0x0b, 0x0000001c), 1588 .num_counters = 1, 1589 .norm = { 1, 1 }, 1590 }; 1591 1592 static const struct nvc0_hw_sm_query_cfg 1593 sm52_gred_count = 1594 { 1595 .type = NVC0_HW_SM_QUERY_GRED_COUNT, 1596 .ctr[0] = _CA(0x0001, B6, 0x0f, 0x00000018), 1597 .num_counters = 1, 1598 .norm = { 1, 1 }, 1599 }; 1600 1601 static const struct nvc0_hw_sm_query_cfg 1602 sm52_inst_executed = 1603 { 1604 .type = NVC0_HW_SM_QUERY_INST_EXECUTED, 1605 .ctr[0] = _CA(0x0003, B6, 0x03, 0x0000020c), 1606 .num_counters = 1, 1607 .norm = { 1, 1 }, 1608 }; 1609 1610 static const struct nvc0_hw_sm_query_cfg 1611 sm52_inst_issued0 = 1612 { 1613 .type = NVC0_HW_SM_QUERY_INST_ISSUED0, 1614 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000000), 1615 .num_counters = 1, 1616 .norm = { 1, 1 }, 1617 }; 1618 1619 static const struct nvc0_hw_sm_query_cfg 1620 sm52_inst_issued1 = 1621 { 1622 .type = NVC0_HW_SM_QUERY_INST_ISSUED1, 1623 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004), 1624 .num_counters = 1, 1625 .norm = { 1, 1 }, 1626 }; 1627 1628 static const struct nvc0_hw_sm_query_cfg 1629 sm52_inst_issued2 = 1630 { 1631 .type = NVC0_HW_SM_QUERY_INST_ISSUED2, 1632 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000008), 1633 .num_counters = 1, 1634 .norm = { 1, 1 }, 1635 }; 1636 1637 static const struct nvc0_hw_sm_query_cfg 1638 sm52_local_ld = 1639 { 1640 .type = NVC0_HW_SM_QUERY_LOCAL_LD, 1641 .ctr[0] = _CA(0x0001, B6, 0x06, 0x0000001c), 1642 .num_counters = 1, 1643 .norm = { 1, 1 }, 1644 }; 1645 1646 static const struct nvc0_hw_sm_query_cfg 1647 sm52_local_st = 1648 { 1649 .type = NVC0_HW_SM_QUERY_LOCAL_ST, 1650 .ctr[0] = _CA(0x0001, B6, 0x06, 0x00000018), 1651 .num_counters = 1, 1652 .norm = { 1, 1 }, 1653 }; 1654 1655 static const struct nvc0_hw_sm_query_cfg 1656 sm52_shared_atom = 1657 { 1658 .type = NVC0_HW_SM_QUERY_SHARED_ATOM, 1659 .ctr[0] = _CA(0x0001, B6, 0x08, 0x0000001c), 1660 .num_counters = 1, 1661 .norm = { 1, 1 }, 1662 }; 1663 1664 static const struct nvc0_hw_sm_query_cfg 1665 sm52_shared_atom_cas = 1666 { 1667 .type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS, 1668 .ctr[0] = _CA(0x0001, B6, 0x08, 0x00000018), 1669 .num_counters = 1, 1670 .norm = { 1, 1 }, 1671 }; 1672 1673 static const struct nvc0_hw_sm_query_cfg 1674 sm52_shared_ld = 1675 { 1676 .type = NVC0_HW_SM_QUERY_SHARED_LD, 1677 .ctr[0] = _CA(0x0001, B6, 0x07, 0x00000018), 1678 .num_counters = 1, 1679 .norm = { 1, 1 }, 1680 }; 1681 1682 static const struct nvc0_hw_sm_query_cfg 1683 sm52_shared_st = 1684 { 1685 .type = NVC0_HW_SM_QUERY_SHARED_ST, 1686 .ctr[0] = _CA(0x0001, B6, 0x07, 0x0000001c), 1687 .num_counters = 1, 1688 .norm = { 1, 1 }, 1689 }; 1690 1691 static const struct nvc0_hw_sm_query_cfg 1692 sm52_warps_launched = 1693 { 1694 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, 1695 .ctr[0] = _CA(0x0001, B6, 0x02, 0x0000001c), 1696 .num_counters = 1, 1697 .norm = { 1, 1 }, 1698 }; 1699 1700 static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] = 1701 { 1702 &sm50_active_ctas, 1703 &sm50_active_cycles, 1704 &sm50_active_warps, 1705 &sm52_atom_count, 1706 &sm50_branch, 1707 &sm50_divergent_branch, 1708 &sm52_global_atom_cas, 1709 &sm52_global_ld, 1710 &sm52_global_st, 1711 &sm52_gred_count, 1712 &sm52_inst_executed, 1713 &sm52_inst_issued0, 1714 &sm52_inst_issued1, 1715 &sm52_inst_issued2, 1716 &sm52_local_ld, 1717 &sm52_local_st, 1718 &sm50_not_pred_off_inst_executed, 1719 &sm50_prof_trigger_0, 1720 &sm50_prof_trigger_1, 1721 &sm50_prof_trigger_2, 1722 &sm50_prof_trigger_3, 1723 &sm50_prof_trigger_4, 1724 &sm50_prof_trigger_5, 1725 &sm50_prof_trigger_6, 1726 &sm50_prof_trigger_7, 1727 &sm52_shared_atom, 1728 &sm52_shared_atom_cas, 1729 &sm52_shared_ld, 1730 &sm50_shared_ld_bank_conflict, 1731 &sm50_shared_ld_transactions, 1732 &sm52_shared_st, 1733 &sm50_shared_st_bank_conflict, 1734 &sm50_shared_st_transactions, 1735 &sm50_sm_cta_launched, 1736 &sm50_th_inst_executed, 1737 &sm52_warps_launched, 1738 }; 1739 1740 #undef _Q 1741 #undef _CA 1742 #undef _CB 1743 1744 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ 1745 /* NOTES: 1746 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy 1747 * because there is a context-switch problem that we need to fix. 1748 * Results might be wrong sometimes, be careful! 1749 */ 1750 static const uint64_t nvc0_read_hw_sm_counters_code[] = 1751 { 1752 /* mov b32 $r8 $tidx 1753 * mov b32 $r9 $physid 1754 * mov b32 $r0 $pm0 1755 * mov b32 $r1 $pm1 1756 * mov b32 $r2 $pm2 1757 * mov b32 $r3 $pm3 1758 * mov b32 $r4 $pm4 1759 * mov b32 $r5 $pm5 1760 * mov b32 $r6 $pm6 1761 * mov b32 $r7 $pm7 1762 * set $p0 0x1 eq u32 $r8 0x0 1763 * mov b32 $r10 c15[0x620] 1764 * mov b32 $r11 c15[0x624] 1765 * ext u32 $r8 $r9 0x414 1766 * (not $p0) exit 1767 * mul $r8 u32 $r8 u32 48 1768 * add b32 $r10 $c $r10 $r8 1769 * add b32 $r11 $r11 0x0 $c 1770 * mov b32 $r8 c15[0x628] 1771 * st b128 wt g[$r10d+0x00] $r0q 1772 * st b128 wt g[$r10d+0x10] $r4q 1773 * st b32 wt g[$r10d+0x20] $r8 1774 * exit */ 1775 0x2c00000084021c04ULL, 1776 0x2c0000000c025c04ULL, 1777 0x2c00000010001c04ULL, 1778 0x2c00000014005c04ULL, 1779 0x2c00000018009c04ULL, 1780 0x2c0000001c00dc04ULL, 1781 0x2c00000020011c04ULL, 1782 0x2c00000024015c04ULL, 1783 0x2c00000028019c04ULL, 1784 0x2c0000002c01dc04ULL, 1785 0x190e0000fc81dc03ULL, 1786 0x28007c1880029de4ULL, 1787 0x28007c189002dde4ULL, 1788 0x7000c01050921c03ULL, 1789 0x80000000000021e7ULL, 1790 0x10000000c0821c02ULL, 1791 0x4801000020a29c03ULL, 1792 0x0800000000b2dc42ULL, 1793 0x28007c18a0021de4ULL, 1794 0x9400000000a01fc5ULL, 1795 0x9400000040a11fc5ULL, 1796 0x9400000080a21f85ULL, 1797 0x8000000000001de7ULL 1798 }; 1799 1800 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } 1801 1802 /* ==== Compute capability 2.0 (GF100/GF110) ==== */ 1803 static const struct nvc0_hw_sm_query_cfg 1804 sm20_active_cycles = 1805 { 1806 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, 1807 .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), 1808 .num_counters = 1, 1809 .norm = { 1, 1 }, 1810 }; 1811 1812 static const struct nvc0_hw_sm_query_cfg 1813 sm20_active_warps = 1814 { 1815 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, 1816 .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), 1817 .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), 1818 .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), 1819 .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040), 1820 .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050), 1821 .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060), 1822 .num_counters = 6, 1823 .norm = { 1, 1 }, 1824 }; 1825 1826 static const struct nvc0_hw_sm_query_cfg 1827 sm20_atom_count = 1828 { 1829 .type = NVC0_HW_SM_QUERY_ATOM_COUNT, 1830 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), 1831 .num_counters = 1, 1832 .norm = { 1, 1 }, 1833 }; 1834 1835 static const struct nvc0_hw_sm_query_cfg 1836 sm20_branch = 1837 { 1838 .type = NVC0_HW_SM_QUERY_BRANCH, 1839 .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), 1840 .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), 1841 .num_counters = 2, 1842 .norm = { 1, 1 }, 1843 }; 1844 1845 static const struct nvc0_hw_sm_query_cfg 1846 sm20_divergent_branch = 1847 { 1848 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, 1849 .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), 1850 .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), 1851 .num_counters = 2, 1852 .norm = { 1, 1 }, 1853 }; 1854 1855 static const struct nvc0_hw_sm_query_cfg 1856 sm20_gld_request = 1857 { 1858 .type = NVC0_HW_SM_QUERY_GLD_REQUEST, 1859 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030), 1860 .num_counters = 1, 1861 .norm = { 1, 1 }, 1862 }; 1863 1864 static const struct nvc0_hw_sm_query_cfg 1865 sm20_gred_count = 1866 { 1867 .type = NVC0_HW_SM_QUERY_GRED_COUNT, 1868 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040), 1869 .num_counters = 1, 1870 .norm = { 1, 1 }, 1871 }; 1872 1873 static const struct nvc0_hw_sm_query_cfg 1874 sm20_gst_request = 1875 { 1876 .type = NVC0_HW_SM_QUERY_GST_REQUEST, 1877 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), 1878 .num_counters = 1, 1879 .norm = { 1, 1 }, 1880 }; 1881 1882 static const struct nvc0_hw_sm_query_cfg 1883 sm20_inst_executed = 1884 { 1885 .type = NVC0_HW_SM_QUERY_INST_EXECUTED, 1886 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), 1887 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), 1888 .num_counters = 2, 1889 .norm = { 1, 1 }, 1890 }; 1891 1892 static const struct nvc0_hw_sm_query_cfg 1893 sm20_inst_issued = 1894 { 1895 .type = NVC0_HW_SM_QUERY_INST_ISSUED, 1896 .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060), 1897 .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), 1898 .num_counters = 2, 1899 .norm = { 1, 1 }, 1900 }; 1901 1902 static const struct nvc0_hw_sm_query_cfg 1903 sm20_local_ld = 1904 { 1905 .type = NVC0_HW_SM_QUERY_LOCAL_LD, 1906 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), 1907 .num_counters = 1, 1908 .norm = { 1, 1 }, 1909 }; 1910 1911 static const struct nvc0_hw_sm_query_cfg 1912 sm20_local_st = 1913 { 1914 .type = NVC0_HW_SM_QUERY_LOCAL_ST, 1915 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), 1916 .num_counters = 1, 1917 .norm = { 1, 1 }, 1918 }; 1919 1920 static const struct nvc0_hw_sm_query_cfg 1921 sm20_prof_trigger_0 = 1922 { 1923 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, 1924 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), 1925 .num_counters = 1, 1926 .norm = { 1, 1 }, 1927 }; 1928 1929 static const struct nvc0_hw_sm_query_cfg 1930 sm20_prof_trigger_1 = 1931 { 1932 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, 1933 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), 1934 .num_counters = 1, 1935 .norm = { 1, 1 }, 1936 }; 1937 1938 static const struct nvc0_hw_sm_query_cfg 1939 sm20_prof_trigger_2 = 1940 { 1941 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, 1942 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), 1943 .num_counters = 1, 1944 .norm = { 1, 1 }, 1945 }; 1946 1947 static const struct nvc0_hw_sm_query_cfg 1948 sm20_prof_trigger_3 = 1949 { 1950 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, 1951 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), 1952 .num_counters = 1, 1953 .norm = { 1, 1 }, 1954 }; 1955 1956 static const struct nvc0_hw_sm_query_cfg 1957 sm20_prof_trigger_4 = 1958 { 1959 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, 1960 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), 1961 .num_counters = 1, 1962 .norm = { 1, 1 }, 1963 }; 1964 1965 static const struct nvc0_hw_sm_query_cfg 1966 sm20_prof_trigger_5 = 1967 { 1968 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, 1969 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), 1970 .num_counters = 1, 1971 .norm = { 1, 1 }, 1972 }; 1973 1974 static const struct nvc0_hw_sm_query_cfg 1975 sm20_prof_trigger_6 = 1976 { 1977 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, 1978 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), 1979 .num_counters = 1, 1980 .norm = { 1, 1 }, 1981 }; 1982 1983 static const struct nvc0_hw_sm_query_cfg 1984 sm20_prof_trigger_7 = 1985 { 1986 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, 1987 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), 1988 .num_counters = 1, 1989 .norm = { 1, 1 }, 1990 }; 1991 1992 static const struct nvc0_hw_sm_query_cfg 1993 sm20_shared_ld = 1994 { 1995 .type = NVC0_HW_SM_QUERY_SHARED_LD, 1996 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), 1997 .num_counters = 1, 1998 .norm = { 1, 1 }, 1999 }; 2000 2001 static const struct nvc0_hw_sm_query_cfg 2002 sm20_shared_st = 2003 { 2004 .type = NVC0_HW_SM_QUERY_SHARED_ST, 2005 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), 2006 .num_counters = 1, 2007 .norm = { 1, 1 }, 2008 }; 2009 2010 static const struct nvc0_hw_sm_query_cfg 2011 sm20_threads_launched = 2012 { 2013 .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, 2014 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), 2015 .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), 2016 .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), 2017 .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040), 2018 .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050), 2019 .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060), 2020 .num_counters = 6, 2021 .norm = { 1, 1 }, 2022 }; 2023 2024 static const struct nvc0_hw_sm_query_cfg 2025 sm20_th_inst_executed_0 = 2026 { 2027 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, 2028 .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), 2029 .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010), 2030 .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), 2031 .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030), 2032 .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040), 2033 .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050), 2034 .num_counters = 6, 2035 .norm = { 1, 1 }, 2036 }; 2037 2038 static const struct nvc0_hw_sm_query_cfg 2039 sm20_th_inst_executed_1 = 2040 { 2041 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, 2042 .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), 2043 .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), 2044 .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), 2045 .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030), 2046 .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040), 2047 .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050), 2048 .num_counters = 6, 2049 .norm = { 1, 1 }, 2050 }; 2051 2052 static const struct nvc0_hw_sm_query_cfg 2053 sm20_warps_launched = 2054 { 2055 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, 2056 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), 2057 .num_counters = 1, 2058 .norm = { 1, 1 }, 2059 }; 2060 2061 static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = 2062 { 2063 &sm20_active_cycles, 2064 &sm20_active_warps, 2065 &sm20_atom_count, 2066 &sm20_branch, 2067 &sm20_divergent_branch, 2068 &sm20_gld_request, 2069 &sm20_gred_count, 2070 &sm20_gst_request, 2071 &sm20_inst_executed, 2072 &sm20_inst_issued, 2073 &sm20_local_ld, 2074 &sm20_local_st, 2075 &sm20_prof_trigger_0, 2076 &sm20_prof_trigger_1, 2077 &sm20_prof_trigger_2, 2078 &sm20_prof_trigger_3, 2079 &sm20_prof_trigger_4, 2080 &sm20_prof_trigger_5, 2081 &sm20_prof_trigger_6, 2082 &sm20_prof_trigger_7, 2083 &sm20_shared_ld, 2084 &sm20_shared_st, 2085 &sm20_threads_launched, 2086 &sm20_th_inst_executed_0, 2087 &sm20_th_inst_executed_1, 2088 &sm20_warps_launched, 2089 }; 2090 2091 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ 2092 static const struct nvc0_hw_sm_query_cfg 2093 sm21_inst_executed = 2094 { 2095 .type = NVC0_HW_SM_QUERY_INST_EXECUTED, 2096 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), 2097 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), 2098 .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), 2099 .num_counters = 3, 2100 .norm = { 1, 1 }, 2101 }; 2102 2103 static const struct nvc0_hw_sm_query_cfg 2104 sm21_inst_issued1_0 = 2105 { 2106 .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0, 2107 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), 2108 .num_counters = 1, 2109 .norm = { 1, 1 }, 2110 }; 2111 2112 static const struct nvc0_hw_sm_query_cfg 2113 sm21_inst_issued1_1 = 2114 { 2115 .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1, 2116 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), 2117 .num_counters = 1, 2118 .norm = { 1, 1 }, 2119 }; 2120 2121 static const struct nvc0_hw_sm_query_cfg 2122 sm21_inst_issued2_0 = 2123 { 2124 .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0, 2125 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), 2126 .num_counters = 1, 2127 .norm = { 1, 1 }, 2128 }; 2129 2130 static const struct nvc0_hw_sm_query_cfg 2131 sm21_inst_issued2_1 = 2132 { 2133 .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1, 2134 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), 2135 .num_counters = 1, 2136 .norm = { 1, 1 }, 2137 }; 2138 2139 static const struct nvc0_hw_sm_query_cfg 2140 sm21_th_inst_executed_0 = 2141 { 2142 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, 2143 .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), 2144 .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), 2145 .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), 2146 .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030), 2147 .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040), 2148 .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050), 2149 .num_counters = 6, 2150 .norm = { 1, 1 }, 2151 }; 2152 2153 static const struct nvc0_hw_sm_query_cfg 2154 sm21_th_inst_executed_1 = 2155 { 2156 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, 2157 .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), 2158 .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), 2159 .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), 2160 .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030), 2161 .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040), 2162 .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050), 2163 .num_counters = 6, 2164 .norm = { 1, 1 }, 2165 }; 2166 2167 static const struct nvc0_hw_sm_query_cfg 2168 sm21_th_inst_executed_2 = 2169 { 2170 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, 2171 .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), 2172 .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), 2173 .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), 2174 .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030), 2175 .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040), 2176 .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050), 2177 .num_counters = 6, 2178 .norm = { 1, 1 }, 2179 }; 2180 2181 static const struct nvc0_hw_sm_query_cfg 2182 sm21_th_inst_executed_3 = 2183 { 2184 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, 2185 .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), 2186 .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), 2187 .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), 2188 .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030), 2189 .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040), 2190 .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050), 2191 .num_counters = 6, 2192 .norm = { 1, 1 }, 2193 }; 2194 2195 static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = 2196 { 2197 &sm20_active_cycles, 2198 &sm20_active_warps, 2199 &sm20_atom_count, 2200 &sm20_branch, 2201 &sm20_divergent_branch, 2202 &sm20_gld_request, 2203 &sm20_gred_count, 2204 &sm20_gst_request, 2205 &sm21_inst_executed, 2206 &sm21_inst_issued1_0, 2207 &sm21_inst_issued1_1, 2208 &sm21_inst_issued2_0, 2209 &sm21_inst_issued2_1, 2210 &sm20_local_ld, 2211 &sm20_local_st, 2212 &sm20_prof_trigger_0, 2213 &sm20_prof_trigger_1, 2214 &sm20_prof_trigger_2, 2215 &sm20_prof_trigger_3, 2216 &sm20_prof_trigger_4, 2217 &sm20_prof_trigger_5, 2218 &sm20_prof_trigger_6, 2219 &sm20_prof_trigger_7, 2220 &sm20_shared_ld, 2221 &sm20_shared_st, 2222 &sm20_threads_launched, 2223 &sm21_th_inst_executed_0, 2224 &sm21_th_inst_executed_1, 2225 &sm21_th_inst_executed_2, 2226 &sm21_th_inst_executed_3, 2227 &sm20_warps_launched, 2228 }; 2229 2230 #undef _C 2231 2232 static inline const struct nvc0_hw_sm_query_cfg ** 2233 nvc0_hw_sm_get_queries(struct nvc0_screen *screen) 2234 { 2235 struct nouveau_device *dev = screen->base.device; 2236 2237 switch (screen->base.class_3d) { 2238 case GM200_3D_CLASS: 2239 return sm52_hw_sm_queries; 2240 case GM107_3D_CLASS: 2241 return sm50_hw_sm_queries; 2242 case NVF0_3D_CLASS: 2243 return sm35_hw_sm_queries; 2244 case NVE4_3D_CLASS: 2245 return sm30_hw_sm_queries; 2246 default: 2247 if (dev->chipset == 0xc0 || dev->chipset == 0xc8) 2248 return sm20_hw_sm_queries; 2249 return sm21_hw_sm_queries; 2250 } 2251 assert(0); 2252 return NULL; 2253 } 2254 2255 unsigned 2256 nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen) 2257 { 2258 struct nouveau_device *dev = screen->base.device; 2259 2260 switch (screen->base.class_3d) { 2261 case GM200_3D_CLASS: 2262 return ARRAY_SIZE(sm52_hw_sm_queries); 2263 case GM107_3D_CLASS: 2264 return ARRAY_SIZE(sm50_hw_sm_queries); 2265 case NVF0_3D_CLASS: 2266 return ARRAY_SIZE(sm35_hw_sm_queries); 2267 case NVE4_3D_CLASS: 2268 return ARRAY_SIZE(sm30_hw_sm_queries); 2269 default: 2270 if (dev->chipset == 0xc0 || dev->chipset == 0xc8) 2271 return ARRAY_SIZE(sm20_hw_sm_queries); 2272 return ARRAY_SIZE(sm21_hw_sm_queries); 2273 } 2274 return 0; 2275 } 2276 2277 static const struct nvc0_hw_sm_query_cfg * 2278 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) 2279 { 2280 const struct nvc0_hw_sm_query_cfg **queries; 2281 struct nvc0_screen *screen = nvc0->screen; 2282 struct nvc0_query *q = &hq->base; 2283 unsigned num_queries; 2284 unsigned i; 2285 2286 num_queries = nvc0_hw_sm_get_num_queries(screen); 2287 queries = nvc0_hw_sm_get_queries(screen); 2288 2289 for (i = 0; i < num_queries; i++) { 2290 if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type) 2291 return queries[i]; 2292 } 2293 assert(0); 2294 return NULL; 2295 } 2296 2297 static void 2298 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) 2299 { 2300 struct nvc0_query *q = &hq->base; 2301 nvc0_hw_query_allocate(nvc0, q, 0); 2302 nouveau_fence_ref(NULL, &hq->fence); 2303 FREE(hq); 2304 } 2305 2306 static boolean 2307 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) 2308 { 2309 struct nvc0_screen *screen = nvc0->screen; 2310 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 2311 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); 2312 const struct nvc0_hw_sm_query_cfg *cfg; 2313 unsigned i, c; 2314 unsigned num_ab[2] = { 0, 0 }; 2315 2316 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); 2317 2318 /* check if we have enough free counter slots */ 2319 for (i = 0; i < cfg->num_counters; ++i) 2320 num_ab[cfg->ctr[i].sig_dom]++; 2321 2322 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 || 2323 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) { 2324 NOUVEAU_ERR("Not enough free MP counter slots !\n"); 2325 return false; 2326 } 2327 2328 assert(cfg->num_counters <= 4); 2329 PUSH_SPACE(push, 4 * 8 * + 6); 2330 2331 if (!screen->pm.mp_counters_enabled) { 2332 screen->pm.mp_counters_enabled = true; 2333 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1); 2334 PUSH_DATA (push, 0x1fcb); 2335 } 2336 2337 /* set sequence field to 0 (used to check if result is available) */ 2338 for (i = 0; i < screen->mp_count; ++i) 2339 hq->data[i * 10 + 10] = 0; 2340 hq->sequence++; 2341 2342 for (i = 0; i < cfg->num_counters; ++i) { 2343 const unsigned d = cfg->ctr[i].sig_dom; 2344 2345 if (!screen->pm.num_hw_sm_active[d]) { 2346 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d))); 2347 if (screen->pm.num_hw_sm_active[!d]) 2348 m |= 1 << (7 + (8 * d)); 2349 BEGIN_NVC0(push, SUBC_SW(0x0600), 1); 2350 PUSH_DATA (push, m); 2351 } 2352 screen->pm.num_hw_sm_active[d]++; 2353 2354 for (c = d * 4; c < (d * 4 + 4); ++c) { 2355 if (!screen->pm.mp_counter[c]) { 2356 hsq->ctr[i] = c; 2357 screen->pm.mp_counter[c] = hsq; 2358 break; 2359 } 2360 } 2361 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */ 2362 2363 /* configure and reset the counter(s) */ 2364 if (d == 0) 2365 BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1); 2366 else 2367 BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1); 2368 PUSH_DATA (push, cfg->ctr[i].sig_sel); 2369 BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1); 2370 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3)); 2371 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1); 2372 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); 2373 BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1); 2374 PUSH_DATA (push, 0); 2375 } 2376 2377 if (screen->base.class_3d >= GM107_3D_CLASS) { 2378 /* Enable mask for counters, it's 8-bits value where 0:3 is for domain A 2379 * and 4:7 for domain B. For example, the mask for active_warps should be 2380 * 0x70 because it uses 3 counters in domain B. However, let's always 2381 * enable all counters because we don't want to track which ones is 2382 * enabled or not, and this allows to monitor multiple queries at the 2383 * same time. */ 2384 BEGIN_NVC0(push, SUBC_CP(0x33e0), 1); 2385 PUSH_DATA (push, 0xff); 2386 } 2387 2388 return true; 2389 } 2390 2391 static boolean 2392 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) 2393 { 2394 struct nvc0_screen *screen = nvc0->screen; 2395 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 2396 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); 2397 const struct nvc0_hw_sm_query_cfg *cfg; 2398 unsigned i, c; 2399 2400 if (screen->base.class_3d >= NVE4_3D_CLASS) 2401 return nve4_hw_sm_begin_query(nvc0, hq); 2402 2403 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); 2404 2405 /* check if we have enough free counter slots */ 2406 if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) { 2407 NOUVEAU_ERR("Not enough free MP counter slots !\n"); 2408 return false; 2409 } 2410 2411 assert(cfg->num_counters <= 8); 2412 PUSH_SPACE(push, 8 * 8 + 2); 2413 2414 /* set sequence field to 0 (used to check if result is available) */ 2415 for (i = 0; i < screen->mp_count; ++i) { 2416 const unsigned b = (0x30 / 4) * i; 2417 hq->data[b + 8] = 0; 2418 } 2419 hq->sequence++; 2420 2421 for (i = 0; i < cfg->num_counters; ++i) { 2422 uint32_t mask_sel = 0x00000000; 2423 2424 if (!screen->pm.num_hw_sm_active[0]) { 2425 BEGIN_NVC0(push, SUBC_SW(0x0600), 1); 2426 PUSH_DATA (push, 0x80000000); 2427 } 2428 screen->pm.num_hw_sm_active[0]++; 2429 2430 for (c = 0; c < 8; ++c) { 2431 if (!screen->pm.mp_counter[c]) { 2432 hsq->ctr[i] = c; 2433 screen->pm.mp_counter[c] = hsq; 2434 break; 2435 } 2436 } 2437 2438 /* Oddly-enough, the signal id depends on the slot selected on Fermi but 2439 * not on Kepler. Fortunately, the signal ids are just offseted by the 2440 * slot id! */ 2441 mask_sel |= c; 2442 mask_sel |= (c << 8); 2443 mask_sel |= (c << 16); 2444 mask_sel |= (c << 24); 2445 mask_sel &= cfg->ctr[i].src_mask; 2446 2447 /* configure and reset the counter(s) */ 2448 BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1); 2449 PUSH_DATA (push, cfg->ctr[i].sig_sel); 2450 BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1); 2451 PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel); 2452 BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1); 2453 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); 2454 BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1); 2455 PUSH_DATA (push, 0); 2456 } 2457 return true; 2458 } 2459 2460 static inline struct nvc0_program * 2461 nvc0_hw_sm_get_program(struct nvc0_screen *screen) 2462 { 2463 struct nvc0_program *prog; 2464 2465 prog = CALLOC_STRUCT(nvc0_program); 2466 if (!prog) 2467 return NULL; 2468 2469 prog->type = PIPE_SHADER_COMPUTE; 2470 prog->translated = true; 2471 prog->parm_size = 12; 2472 2473 if (screen->base.class_3d >= GM107_3D_CLASS) { 2474 prog->code = (uint32_t *)gm107_read_hw_sm_counters_code; 2475 prog->code_size = sizeof(gm107_read_hw_sm_counters_code); 2476 prog->num_gprs = 14; 2477 } else 2478 if (screen->base.class_3d == NVE4_3D_CLASS || 2479 screen->base.class_3d == NVF0_3D_CLASS) { 2480 if (screen->base.class_3d == NVE4_3D_CLASS) { 2481 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; 2482 prog->code_size = sizeof(nve4_read_hw_sm_counters_code); 2483 } else { 2484 prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code; 2485 prog->code_size = sizeof(nvf0_read_hw_sm_counters_code); 2486 } 2487 prog->num_gprs = 14; 2488 } else { 2489 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; 2490 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); 2491 prog->num_gprs = 12; 2492 } 2493 return prog; 2494 } 2495 2496 static inline void 2497 nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) 2498 { 2499 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 2500 struct nvc0_screen *screen = nvc0->screen; 2501 uint64_t address; 2502 const int s = 5; 2503 2504 address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); 2505 2506 PUSH_SPACE(push, 11); 2507 2508 if (screen->base.class_3d >= NVE4_3D_CLASS) { 2509 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); 2510 PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO); 2511 PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO); 2512 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); 2513 PUSH_DATA (push, 3 * 4); 2514 PUSH_DATA (push, 0x1); 2515 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3); 2516 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); 2517 } else { 2518 BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); 2519 PUSH_DATA (push, NVC0_CB_AUX_SIZE); 2520 PUSH_DATAh(push, address); 2521 PUSH_DATA (push, address); 2522 BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3); 2523 PUSH_DATA (push, NVC0_CB_AUX_MP_INFO); 2524 } 2525 PUSH_DATA (push, (hq->bo->offset + hq->base_offset)); 2526 PUSH_DATAh(push, (hq->bo->offset + hq->base_offset)); 2527 PUSH_DATA (push, hq->sequence); 2528 } 2529 2530 static void 2531 nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) 2532 { 2533 struct nvc0_screen *screen = nvc0->screen; 2534 struct pipe_context *pipe = &nvc0->base.pipe; 2535 struct nouveau_pushbuf *push = nvc0->base.pushbuf; 2536 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS; 2537 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); 2538 struct nvc0_program *old = nvc0->compprog; 2539 struct pipe_grid_info info = {}; 2540 uint32_t mask; 2541 uint32_t input[3]; 2542 const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 }; 2543 const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 }; 2544 unsigned c, i; 2545 2546 if (unlikely(!screen->pm.prog)) 2547 screen->pm.prog = nvc0_hw_sm_get_program(screen); 2548 2549 /* disable all counting */ 2550 PUSH_SPACE(push, 8); 2551 for (c = 0; c < 8; ++c) 2552 if (screen->pm.mp_counter[c]) { 2553 if (is_nve4) { 2554 IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0); 2555 } else { 2556 IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0); 2557 } 2558 } 2559 /* release counters for this query */ 2560 for (c = 0; c < 8; ++c) { 2561 if (screen->pm.mp_counter[c] == hsq) { 2562 uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */ 2563 screen->pm.num_hw_sm_active[d]--; 2564 screen->pm.mp_counter[c] = NULL; 2565 } 2566 } 2567 2568 if (screen->base.class_3d >= GM107_3D_CLASS) 2569 IMMED_NVC0(push, SUBC_CP(0x33e0), 0); 2570 2571 BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR, 2572 hq->bo); 2573 2574 PUSH_SPACE(push, 1); 2575 IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0); 2576 2577 /* upload input data for the compute shader which reads MP counters */ 2578 nvc0_hw_sm_upload_input(nvc0, hq); 2579 2580 pipe->bind_compute_state(pipe, screen->pm.prog); 2581 for (i = 0; i < 3; i++) { 2582 info.block[i] = block[i]; 2583 info.grid[i] = grid[i]; 2584 } 2585 info.pc = 0; 2586 info.input = input; 2587 pipe->launch_grid(pipe, &info); 2588 pipe->bind_compute_state(pipe, old); 2589 2590 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY); 2591 2592 /* re-activate other counters */ 2593 PUSH_SPACE(push, 16); 2594 mask = 0; 2595 for (c = 0; c < 8; ++c) { 2596 const struct nvc0_hw_sm_query_cfg *cfg; 2597 unsigned i; 2598 2599 hsq = screen->pm.mp_counter[c]; 2600 if (!hsq) 2601 continue; 2602 2603 cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base); 2604 for (i = 0; i < cfg->num_counters; ++i) { 2605 if (mask & (1 << hsq->ctr[i])) 2606 break; 2607 mask |= 1 << hsq->ctr[i]; 2608 if (is_nve4) { 2609 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1); 2610 } else { 2611 BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1); 2612 } 2613 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode); 2614 } 2615 } 2616 } 2617 2618 static inline bool 2619 nvc0_hw_sm_query_read_data(uint32_t count[32][8], 2620 struct nvc0_context *nvc0, bool wait, 2621 struct nvc0_hw_query *hq, 2622 const struct nvc0_hw_sm_query_cfg *cfg, 2623 unsigned mp_count) 2624 { 2625 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); 2626 unsigned p, c; 2627 2628 for (p = 0; p < mp_count; ++p) { 2629 const unsigned b = (0x30 / 4) * p; 2630 2631 for (c = 0; c < cfg->num_counters; ++c) { 2632 if (hq->data[b + 8] != hq->sequence) { 2633 if (!wait) 2634 return false; 2635 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) 2636 return false; 2637 } 2638 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c); 2639 } 2640 } 2641 return true; 2642 } 2643 2644 static inline bool 2645 nve4_hw_sm_query_read_data(uint32_t count[32][8], 2646 struct nvc0_context *nvc0, bool wait, 2647 struct nvc0_hw_query *hq, 2648 const struct nvc0_hw_sm_query_cfg *cfg, 2649 unsigned mp_count) 2650 { 2651 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq); 2652 unsigned p, c, d; 2653 2654 for (p = 0; p < mp_count; ++p) { 2655 const unsigned b = (0x60 / 4) * p; 2656 2657 for (c = 0; c < cfg->num_counters; ++c) { 2658 count[p][c] = 0; 2659 for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) { 2660 if (hq->data[b + 20 + d] != hq->sequence) { 2661 if (!wait) 2662 return false; 2663 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client)) 2664 return false; 2665 } 2666 if (hsq->ctr[c] & ~0x3) 2667 count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)]; 2668 else 2669 count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]]; 2670 } 2671 } 2672 } 2673 return true; 2674 } 2675 2676 static boolean 2677 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq, 2678 boolean wait, union pipe_query_result *result) 2679 { 2680 uint32_t count[32][8]; 2681 uint64_t value = 0; 2682 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32); 2683 unsigned p, c; 2684 const struct nvc0_hw_sm_query_cfg *cfg; 2685 bool ret; 2686 2687 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq); 2688 2689 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) 2690 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count); 2691 else 2692 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count); 2693 if (!ret) 2694 return false; 2695 2696 for (c = 0; c < cfg->num_counters; ++c) 2697 for (p = 0; p < mp_count; ++p) 2698 value += count[p][c]; 2699 value = (value * cfg->norm[0]) / cfg->norm[1]; 2700 2701 *(uint64_t *)result = value; 2702 return true; 2703 } 2704 2705 static const struct nvc0_hw_query_funcs hw_sm_query_funcs = { 2706 .destroy_query = nvc0_hw_sm_destroy_query, 2707 .begin_query = nvc0_hw_sm_begin_query, 2708 .end_query = nvc0_hw_sm_end_query, 2709 .get_query_result = nvc0_hw_sm_get_query_result, 2710 }; 2711 2712 struct nvc0_hw_query * 2713 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) 2714 { 2715 struct nvc0_screen *screen = nvc0->screen; 2716 struct nvc0_hw_sm_query *hsq; 2717 struct nvc0_hw_query *hq; 2718 unsigned space; 2719 2720 if (nvc0->screen->base.drm->version < 0x01000101) 2721 return NULL; 2722 2723 if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST) 2724 return NULL; 2725 2726 hsq = CALLOC_STRUCT(nvc0_hw_sm_query); 2727 if (!hsq) 2728 return NULL; 2729 2730 hq = &hsq->base; 2731 hq->funcs = &hw_sm_query_funcs; 2732 hq->base.type = type; 2733 2734 if (screen->base.class_3d >= NVE4_3D_CLASS) { 2735 /* for each MP: 2736 * [00] = WS0.C0 2737 * [04] = WS0.C1 2738 * [08] = WS0.C2 2739 * [0c] = WS0.C3 2740 * [10] = WS1.C0 2741 * [14] = WS1.C1 2742 * [18] = WS1.C2 2743 * [1c] = WS1.C3 2744 * [20] = WS2.C0 2745 * [24] = WS2.C1 2746 * [28] = WS2.C2 2747 * [2c] = WS2.C3 2748 * [30] = WS3.C0 2749 * [34] = WS3.C1 2750 * [38] = WS3.C2 2751 * [3c] = WS3.C3 2752 * [40] = MP.C4 2753 * [44] = MP.C5 2754 * [48] = MP.C6 2755 * [4c] = MP.C7 2756 * [50] = WS0.sequence 2757 * [54] = WS1.sequence 2758 * [58] = WS2.sequence 2759 * [5c] = WS3.sequence 2760 */ 2761 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t); 2762 } else { 2763 /* 2764 * Note that padding is used to align memory access to 128 bits. 2765 * 2766 * for each MP: 2767 * [00] = MP.C0 2768 * [04] = MP.C1 2769 * [08] = MP.C2 2770 * [0c] = MP.C3 2771 * [10] = MP.C4 2772 * [14] = MP.C5 2773 * [18] = MP.C6 2774 * [1c] = MP.C7 2775 * [20] = MP.sequence 2776 * [24] = padding 2777 * [28] = padding 2778 * [2c] = padding 2779 */ 2780 space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t); 2781 } 2782 2783 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) { 2784 FREE(hq); 2785 return NULL; 2786 } 2787 2788 return hq; 2789 } 2790 2791 int 2792 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, 2793 struct pipe_driver_query_info *info) 2794 { 2795 int count = 0; 2796 2797 if (screen->base.drm->version >= 0x01000101) { 2798 if (screen->compute) 2799 count = nvc0_hw_sm_get_num_queries(screen); 2800 } 2801 2802 if (!info) 2803 return count; 2804 2805 if (id < count) { 2806 if (screen->compute) { 2807 if (screen->base.class_3d <= GM200_3D_CLASS) { 2808 const struct nvc0_hw_sm_query_cfg **queries = 2809 nvc0_hw_sm_get_queries(screen); 2810 2811 info->name = nvc0_hw_sm_query_get_name(queries[id]->type); 2812 info->query_type = NVC0_HW_SM_QUERY(queries[id]->type); 2813 info->group_id = NVC0_HW_SM_QUERY_GROUP; 2814 return 1; 2815 } 2816 } 2817 } 2818 return 0; 2819 } 2820