/*
 * Copyright 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file brw_performance_query.c
 *
 * Implementation of the GL_INTEL_performance_query extension.
 *
 * Currently there are two possible counter sources exposed here:
 *
 * On Gen6+ hardware we have numerous 64bit Pipeline Statistics Registers
 * that we can snapshot at the beginning and end of a query.
 *
 * On Gen7.5+ we have Observability Architecture counters which are
 * covered in a separate document from the rest of the PRMs. It is available
 * at: https://01.org/linuxgraphics/documentation/driver-documentation-prms
 * => 2013 Intel Core Processor Family => Observability Performance Counters
 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell,
 * though notably we currently only support OA counters for Haswell+)
 */

#include <limits.h>
#include <dirent.h>

/* put before sys/types.h to silence glibc warnings */
#ifdef MAJOR_IN_MKDEV
#include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#include <sys/sysmacros.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

#include <xf86drm.h>
#include <i915_drm.h>

#include "main/hash.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "main/performance_query.h"

#include "util/bitset.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "util/list.h"

#include "brw_context.h"
#include "brw_defines.h"
#include "brw_performance_query.h"
#include "brw_oa_hsw.h"
#include "brw_oa_bdw.h"
#include "brw_oa_chv.h"
#include "brw_oa_sklgt2.h"
#include "brw_oa_sklgt3.h"
#include "brw_oa_sklgt4.h"
#include "brw_oa_bxt.h"
#include "brw_oa_kblgt2.h"
#include "brw_oa_kblgt3.h"
#include "brw_oa_glk.h"
#include "brw_oa_cflgt2.h"
#include "brw_oa_cflgt3.h"
#include "intel_batchbuffer.h"

#define FILE_DEBUG_FLAG DEBUG_PERFMON

/*
 * The largest OA formats we can use include:
 * For Haswell:
 *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
 * For Gen8+:
 *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
 */
#define MAX_OA_REPORT_COUNTERS 62
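/* (For reference: the Haswell layout is the larger of the two, needing
 * 1 + 45 + 8 + 8 = 62 accumulator slots, while the Gen8+ layout needs
 * 1 + 1 + 36 + 8 + 8 = 54.)
 */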
#define OAREPORT_REASON_MASK           0x3f
#define OAREPORT_REASON_SHIFT          19
#define OAREPORT_REASON_TIMER          (1<<0)
#define OAREPORT_REASON_TRIGGER1       (1<<1)
#define OAREPORT_REASON_TRIGGER2       (1<<2)
#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
#define OAREPORT_REASON_GO_TRANSITION  (1<<4)

#define I915_PERF_OA_SAMPLE_SIZE (8 +   /* drm_i915_perf_record_header */ \
                                  256)  /* OA counter report */

/**
 * Periodic OA samples are read() into these buffer structures via the
 * i915 perf kernel interface and appended to the
 * brw->perfquery.sample_buffers linked list. When we process the
 * results of an OA metrics query we need to consider all the periodic
 * samples between the Begin and End MI_REPORT_PERF_COUNT command
 * markers.
 *
 * 'Periodic' is a simplification as there are other automatic reports
 * written by the hardware also buffered here.
 *
 * Considering three queries, A, B and C:
 *
 *  Time ---->
 *                ________________A_________________
 *                |                                 |
 *                | ________B_________ _____C___________
 *                | |                | |           |   |
 *
 * And an illustration of sample buffers read over this time frame:
 * [HEAD ][     ][     ][     ][     ][     ][     ][     ][TAIL ]
 *
 * These nodes may hold samples for query A:
 * [     ][     ][  A  ][  A  ][  A  ][  A  ][  A  ][     ][     ]
 *
 * These nodes may hold samples for query B:
 * [     ][     ][  B  ][  B  ][  B  ][     ][     ][     ][     ]
 *
 * These nodes may hold samples for query C:
 * [     ][     ][     ][     ][     ][  C  ][  C  ][  C  ][     ]
 *
 * The illustration assumes we have an even distribution of periodic
 * samples so all nodes have the same size plotted against time.
 *
 * Note, to simplify code, the list is never empty.
 *
 * With overlapping queries we can see that periodic OA reports may
 * relate to multiple queries and care needs to be taken to keep
 * track of sample buffers until there are no queries that might
 * depend on their contents.
 *
 * We use a node ref counting system where a reference ensures that a
 * node and all following nodes can't be freed/recycled until the
 * reference drops to zero.
 *
 * E.g. with a ref of one here:
 * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
 *
 * These nodes could be freed or recycled ("reaped"):
 * [ 0 ][ 0 ]
 *
 * These must be preserved until the leading ref drops to zero:
 *           [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
 *
 * When a query starts we take a reference on the current tail of
 * the list, knowing that no already-buffered samples can possibly
 * relate to the newly-started query. A pointer to this node is
 * also saved in the query object's ->oa.samples_head.
 *
 * E.g. starting query A while there are two nodes in .sample_buffers:
 *                ________________A________
 *                |
 *
 * [ 0 ][ 1 ]
 *        ^_______ Add a reference and store pointer to node in
 *                 A->oa.samples_head
 *
 * Moving forward to when the B query starts with no new buffer nodes:
 * (for reference, i915 perf reads() are only done when queries finish)
 *                ________________A_______
 *                | ________B___
 *                | |
 *
 * [ 0 ][ 2 ]
 *        ^_______ Add a reference and store pointer to
 *                 node in B->oa.samples_head
 *
 * Once a query is finished, after an OA query has become 'Ready',
 * once the End OA report has landed and after we have processed
 * all the intermediate periodic samples then we drop the
 * ->oa.samples_head reference we took at the start.
 *
 * So when the B query has finished we have:
 *                ________________A________
 *                | ______B___________
 *                | |                |
 * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
 *                       ^_______ Drop B->oa.samples_head reference
 *
 * We still can't free these due to the A->oa.samples_head ref:
 *        [ 1 ][ 0 ][ 0 ][ 0 ]
 *
 * When the A query finishes: (note there's a new ref for C's samples_head)
 *                ________________A_________________
 *                |                                 |
 *                |                    _____C_________
 *                |                    |             |
 * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
 *   ^_______ Drop A->oa.samples_head reference
 *
 * And we can now reap these nodes up to the C->oa.samples_head:
 * [ X ][ X ][ X ][ X ]
 *                  keeping -> [ 1 ][ 0 ][ 0 ]
 *
 * We reap old sample buffers each time we finish processing an OA
 * query by iterating the sample_buffers list from the head until we
 * find a referenced node and stop.
 *
 * Reaped buffers move to a perfquery.free_sample_buffers list and
 * when we come to read() we first look to recycle a buffer from the
 * free_sample_buffers list before allocating a new buffer.
 */
struct brw_oa_sample_buf {
   struct exec_node link;
   int refcount;
   int len;
   uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
   uint32_t last_timestamp;
};
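/* (Sizing note: each buffer node as declared above holds at most ten raw
 * records from a single read(), and last_timestamp caches the timestamp
 * of the newest report in ->buf so read_oa_samples_until() can cheaply
 * track how far the stream has progressed.)
 */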
/**
 * i965 representation of a performance query object.
 *
 * NB: We want to keep this structure relatively lean considering that
 * applications may expect to allocate enough objects to be able to
 * query around all draw calls in a frame.
 */
struct brw_perf_query_object
{
   struct gl_perf_query_object base;

   const struct brw_perf_query_info *query;

   /* See query->kind to know which state below is in use... */
   union {
      struct {

         /**
          * BO containing OA counter snapshots at query Begin/End time.
          */
         struct brw_bo *bo;

         /**
          * Address of the mapped @bo
          */
         void *map;

         /**
          * The MI_REPORT_PERF_COUNT command lets us specify a unique
          * ID that will be reflected in the resulting OA report
          * that's written by the GPU. This is the ID we're expecting
          * in the begin report, and the end report should be
          * @begin_report_id + 1.
          */
         int begin_report_id;

         /**
          * Reference the head of the brw->perfquery.sample_buffers
          * list at the time that the query started (so we only need
          * to look at nodes after this point when looking for samples
          * related to this query)
          *
          * (See struct brw_oa_sample_buf description for more details)
          */
         struct exec_node *samples_head;

         /**
          * Storage for the final accumulated OA counters.
          */
         uint64_t accumulator[MAX_OA_REPORT_COUNTERS];

         /**
          * false while in the unaccumulated_elements list, and set to
          * true when the final, end MI_RPC snapshot has been
          * accumulated.
          */
         bool results_accumulated;

      } oa;

      struct {
         /**
          * BO containing starting and ending snapshots for the
          * statistics counters.
          */
         struct brw_bo *bo;
      } pipeline_stats;
   };
};

/** Downcasting convenience macro. */
static inline struct brw_perf_query_object *
brw_perf_query(struct gl_perf_query_object *o)
{
   return (struct brw_perf_query_object *) o;
}

#define STATS_BO_SIZE               4096
#define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
#define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)

#define MI_RPC_BO_SIZE              4096
#define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)

/******************************************************************************/

static bool
brw_is_perf_query_ready(struct gl_context *ctx,
                        struct gl_perf_query_object *o);

static void
dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
{
   struct gl_context *ctx = brw_void;
   struct gl_perf_query_object *o = query_void;
   struct brw_perf_query_object *obj = query_void;

   switch (obj->query->kind) {
   case OA_COUNTERS:
      DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
          id,
          o->Used ? "Dirty," : "New,",
          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
          obj->oa.bo ? "yes," : "no,",
          brw_is_perf_query_ready(ctx, o) ? "ready," : "not ready,",
          obj->oa.results_accumulated ? "accumulated" : "not accumulated");
      break;
   case PIPELINE_STATS:
      DBG("%4d: %-6s %-8s BO: %-4s\n",
          id,
          o->Used ? "Dirty," : "New,",
          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
          obj->pipeline_stats.bo ? "yes" : "no");
      break;
   }
}

static void
dump_perf_queries(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   DBG("Queries: (Open queries = %d, OA users = %d)\n",
       brw->perfquery.n_active_oa_queries, brw->perfquery.n_oa_users);
   _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
}

/******************************************************************************/

static struct brw_oa_sample_buf *
get_free_sample_buf(struct brw_context *brw)
{
   struct exec_node *node = exec_list_pop_head(&brw->perfquery.free_sample_buffers);
   struct brw_oa_sample_buf *buf;

   if (node)
      buf = exec_node_data(struct brw_oa_sample_buf, node, link);
   else {
      buf = ralloc_size(brw, sizeof(*buf));

      exec_node_init(&buf->link);
      buf->refcount = 0;
      buf->len = 0;
   }

   return buf;
}
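/**
 * Return no-longer-referenced sample buffers at the head of the list to
 * the free list (see the refcounting scheme described above struct
 * brw_oa_sample_buf).
 */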
static void
reap_old_sample_buffers(struct brw_context *brw)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&brw->perfquery.sample_buffers);
   struct brw_oa_sample_buf *tail_buf =
      exec_node_data(struct brw_oa_sample_buf, tail_node, link);

   /* Remove all old, unreferenced sample buffers walking forward from
    * the head of the list, except always leave at least one node in
    * the list so we always have a node to reference when we Begin
    * a new query.
    */
   foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
                           &brw->perfquery.sample_buffers)
   {
      if (buf->refcount == 0 && buf != tail_buf) {
         exec_node_remove(&buf->link);
         exec_list_push_head(&brw->perfquery.free_sample_buffers, &buf->link);
      } else
         return;
   }
}

static void
free_sample_bufs(struct brw_context *brw)
{
   foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
                           &brw->perfquery.free_sample_buffers)
      ralloc_free(buf);

   exec_list_make_empty(&brw->perfquery.free_sample_buffers);
}

/******************************************************************************/

/**
 * Driver hook for glGetPerfQueryInfoINTEL().
 */
static void
brw_get_perf_query_info(struct gl_context *ctx,
                        unsigned query_index,
                        const char **name,
                        GLuint *data_size,
                        GLuint *n_counters,
                        GLuint *n_active)
{
   struct brw_context *brw = brw_context(ctx);
   const struct brw_perf_query_info *query =
      &brw->perfquery.queries[query_index];

   *name = query->name;
   *data_size = query->data_size;
   *n_counters = query->n_counters;

   switch (query->kind) {
   case OA_COUNTERS:
      *n_active = brw->perfquery.n_active_oa_queries;
      break;

   case PIPELINE_STATS:
      *n_active = brw->perfquery.n_active_pipeline_stats_queries;
      break;
   }
}

/**
 * Driver hook for glGetPerfCounterInfoINTEL().
 */
static void
brw_get_perf_counter_info(struct gl_context *ctx,
                          unsigned query_index,
                          unsigned counter_index,
                          const char **name,
                          const char **desc,
                          GLuint *offset,
                          GLuint *data_size,
                          GLuint *type_enum,
                          GLuint *data_type_enum,
                          GLuint64 *raw_max)
{
   struct brw_context *brw = brw_context(ctx);
   const struct brw_perf_query_info *query =
      &brw->perfquery.queries[query_index];
   const struct brw_perf_query_counter *counter =
      &query->counters[counter_index];

   *name = counter->name;
   *desc = counter->desc;
   *offset = counter->offset;
   *data_size = counter->size;
   *type_enum = counter->type;
   *data_type_enum = counter->data_type;
   *raw_max = counter->raw_max;
}

/******************************************************************************/

/**
 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
 * pipeline statistics for the performance query object.
 */
static void
snapshot_statistics_registers(struct brw_context *brw,
                              struct brw_perf_query_object *obj,
                              uint32_t offset_in_bytes)
{
   const struct brw_perf_query_info *query = obj->query;
   const int n_counters = query->n_counters;

   for (int i = 0; i < n_counters; i++) {
      const struct brw_perf_query_counter *counter = &query->counters[i];

      assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);

      brw_store_register_mem64(brw, obj->pipeline_stats.bo,
                               counter->pipeline_stat.reg,
                               offset_in_bytes + i * sizeof(uint64_t));
   }
}
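/* (Layout note: Begin snapshots are written at offset 0 of the stats BO
 * and End snapshots at STATS_BO_END_OFFSET_BYTES; get_pipeline_stats_data()
 * later reports end[i] - start[i] for each counter.)
 */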
/**
 * Add a query to the global list of "unaccumulated queries."
 *
 * Queries are tracked here until all the associated OA reports have
 * been accumulated via accumulate_oa_reports() after the end
 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
 */
static void
add_to_unaccumulated_query_list(struct brw_context *brw,
                                struct brw_perf_query_object *obj)
{
   if (brw->perfquery.unaccumulated_elements >=
       brw->perfquery.unaccumulated_array_size)
   {
      brw->perfquery.unaccumulated_array_size *= 1.5;
      brw->perfquery.unaccumulated =
         reralloc(brw, brw->perfquery.unaccumulated,
                  struct brw_perf_query_object *,
                  brw->perfquery.unaccumulated_array_size);
   }

   brw->perfquery.unaccumulated[brw->perfquery.unaccumulated_elements++] = obj;
}

/**
 * Remove a query from the global list of unaccumulated queries once
 * the OA reports associated with the query have been accumulated in
 * accumulate_oa_reports() or when discarding unwanted query results.
 */
static void
drop_from_unaccumulated_query_list(struct brw_context *brw,
                                   struct brw_perf_query_object *obj)
{
   for (int i = 0; i < brw->perfquery.unaccumulated_elements; i++) {
      if (brw->perfquery.unaccumulated[i] == obj) {
         int last_elt = --brw->perfquery.unaccumulated_elements;

         if (i == last_elt)
            brw->perfquery.unaccumulated[i] = NULL;
         else {
            brw->perfquery.unaccumulated[i] =
               brw->perfquery.unaccumulated[last_elt];
         }

         break;
      }
   }

   /* Drop our samples_head reference so that associated periodic
    * sample data buffers can potentially be reaped if they aren't
    * referenced by any other queries...
    */

   struct brw_oa_sample_buf *buf =
      exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);

   assert(buf->refcount > 0);
   buf->refcount--;

   obj->oa.samples_head = NULL;

   reap_old_sample_buffers(brw);
}

static uint64_t
timebase_scale(struct brw_context *brw, uint32_t u32_time_delta)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint64_t tmp = ((uint64_t)u32_time_delta) * 1000000000ull;

   return tmp ? tmp / devinfo->timestamp_frequency : 0;
}

static void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   *accumulator += (uint32_t)(*report1 - *report0);
}

static void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
   uint64_t value0 = report0[a_index + 4] | high0;
   uint64_t value1 = report1[a_index + 4] | high1;
   uint64_t delta;

   if (value0 > value1)
      delta = (1ULL << 40) + value1 - value0;
   else
      delta = value1 - value0;

   *accumulator += delta;
}
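/* (For reference: in the A32u40 report layout handled above, the low 32
 * bits of A-counter i live in dword 4 + i and the high 8 bits are packed
 * one byte per counter starting at dword 40, i.e. byte offset 160. The
 * delta computation tolerates at most one wrap of a 40 bit counter
 * between the two reports.)
 */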
/**
 * Given pointers to starting and ending OA snapshots, add the deltas for each
 * counter to the results.
 */
static void
add_deltas(struct brw_context *brw,
           struct brw_perf_query_object *obj,
           const uint32_t *start,
           const uint32_t *end)
{
   const struct brw_perf_query_info *query = obj->query;
   uint64_t *accumulator = obj->oa.accumulator;
   int idx = 0;
   int i;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
      accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++)
         accumulate_uint40(i, start, end, accumulator + idx++);

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++)
         accumulate_uint32(start + 36 + i, end + 36 + i, accumulator + idx++);

      /* 8x 32bit B counters + 8x 32bit C counters... */
      for (i = 0; i < 16; i++)
         accumulate_uint32(start + 48 + i, end + 48 + i, accumulator + idx++);

      break;
   case I915_OA_FORMAT_A45_B8_C8:
      accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */

      for (i = 0; i < 61; i++)
         accumulate_uint32(start + 3 + i, end + 3 + i, accumulator + 1 + i);

      break;
   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}

static bool
inc_n_oa_users(struct brw_context *brw)
{
   if (brw->perfquery.n_oa_users == 0 &&
       drmIoctl(brw->perfquery.oa_stream_fd,
                I915_PERF_IOCTL_ENABLE, 0) < 0)
   {
      return false;
   }
   ++brw->perfquery.n_oa_users;

   return true;
}

static void
dec_n_oa_users(struct brw_context *brw)
{
   /* Disabling the i915 perf stream will effectively disable the OA
    * counters. Note it's important to be sure there are no outstanding
    * MI_RPC commands at this point since they could stall the CS
    * indefinitely once OACONTROL is disabled.
    */
   --brw->perfquery.n_oa_users;
   if (brw->perfquery.n_oa_users == 0 &&
       drmIoctl(brw->perfquery.oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
   {
      DBG("WARNING: Error disabling i915 perf stream: %m\n");
   }
}
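/* (inc_n_oa_users()/dec_n_oa_users() refcount the stream's enabled state:
 * the fd is opened with I915_PERF_FLAG_DISABLED and only enabled while at
 * least one in-flight query still needs the OA unit to be reporting.)
 */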
/* In general, if we see anything spurious while accumulating results we
 * don't try to continue accumulating the current query, hoping for the
 * best; we scrap anything outstanding and then hope for the best with
 * new queries.
 */
static void
discard_all_queries(struct brw_context *brw)
{
   while (brw->perfquery.unaccumulated_elements) {
      struct brw_perf_query_object *obj = brw->perfquery.unaccumulated[0];

      obj->oa.results_accumulated = true;
      drop_from_unaccumulated_query_list(brw, brw->perfquery.unaccumulated[0]);

      dec_n_oa_users(brw);
   }
}

enum OaReadStatus {
   OA_READ_STATUS_ERROR,
   OA_READ_STATUS_UNFINISHED,
   OA_READ_STATUS_FINISHED,
};

static enum OaReadStatus
read_oa_samples_until(struct brw_context *brw,
                      uint32_t start_timestamp,
                      uint32_t end_timestamp)
{
   struct exec_node *tail_node =
      exec_list_get_tail(&brw->perfquery.sample_buffers);
   struct brw_oa_sample_buf *tail_buf =
      exec_node_data(struct brw_oa_sample_buf, tail_node, link);
   uint32_t last_timestamp = tail_buf->last_timestamp;

   while (1) {
      struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
      uint32_t offset;
      int len;

      while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
                         sizeof(buf->buf))) < 0 && errno == EINTR)
         ;

      if (len <= 0) {
         exec_list_push_tail(&brw->perfquery.free_sample_buffers, &buf->link);

         if (len < 0) {
            if (errno == EAGAIN)
               return ((last_timestamp - start_timestamp) >=
                       (end_timestamp - start_timestamp)) ?
                      OA_READ_STATUS_FINISHED :
                      OA_READ_STATUS_UNFINISHED;
            else {
               DBG("Error reading i915 perf samples: %m\n");
            }
         } else
            DBG("Spurious EOF reading i915 perf samples\n");

         return OA_READ_STATUS_ERROR;
      }

      buf->len = len;
      exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);

      /* Go through the reports and update the last timestamp. */
      offset = 0;
      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *) &buf->buf[offset];
         uint32_t *report = (uint32_t *) (header + 1);

         if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
            last_timestamp = report[1];

         offset += header->size;
      }

      buf->last_timestamp = last_timestamp;
   }

   unreachable("not reached");
   return OA_READ_STATUS_ERROR;
}
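/* (N.B. the unsigned (last - start) >= (end - start) comparison above is
 * deliberate: it still gives the right answer when the 32 bit OA
 * timestamps wrap, provided the query spans less than one wrap period.)
 */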
/**
 * Try to read all the reports until either the delimiting timestamp
 * or an error arises.
 */
static bool
read_oa_samples_for_query(struct brw_context *brw,
                          struct brw_perf_query_object *obj)
{
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;

   /* We need the MI_REPORT_PERF_COUNT to land before we can start
    * accumulating. */
   assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
          !brw_bo_busy(obj->oa.bo));

   /* Map the BO once here and let accumulate_oa_reports() unmap
    * it. */
   if (obj->oa.map == NULL)
      obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);

   start = last = obj->oa.map;
   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != obj->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      return true;
   }
   if (end[0] != (obj->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      return true;
   }

   /* Read the reports until the end timestamp. */
   switch (read_oa_samples_until(brw, start[1], end[1])) {
   case OA_READ_STATUS_ERROR:
      /* Fallthrough and let accumulate_oa_reports() deal with the
       * error. */
   case OA_READ_STATUS_FINISHED:
      return true;
   case OA_READ_STATUS_UNFINISHED:
      return false;
   }

   unreachable("invalid read status");
   return false;
}

/**
 * Accumulate raw OA counter values based on deltas between pairs of
 * OA reports.
 *
 * Accumulation starts from the first report captured via
 * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
 * last MI_RPC report requested by brw_end_perf_query(). Between these
 * two reports there may also be some number of periodically sampled OA
 * reports collected via the i915 perf interface - depending on the
 * duration of the query.
 *
 * These periodic snapshots help to ensure we handle counter overflow
 * correctly by being frequent enough to ensure we don't miss multiple
 * overflows of a counter between snapshots. For Gen8+ the i915 perf
 * snapshots provide the extra context-switch reports that let us
 * subtract out the progress of counters associated with other
 * contexts running on the system.
 */
static void
accumulate_oa_reports(struct brw_context *brw,
                      struct brw_perf_query_object *obj)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_perf_query_object *o = &obj->base;
   uint32_t *start;
   uint32_t *last;
   uint32_t *end;
   struct exec_node *first_samples_node;
   bool in_ctx = true;
   uint32_t ctx_id;
   int out_duration = 0;

   assert(o->Ready);
   assert(obj->oa.map != NULL);

   start = last = obj->oa.map;
   end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;

   if (start[0] != obj->oa.begin_report_id) {
      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
      goto error;
   }
   if (end[0] != (obj->oa.begin_report_id + 1)) {
      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
      goto error;
   }

   ctx_id = start[2];

   /* See if we have any periodic reports to accumulate too... */

   /* N.B. The oa.samples_head was set when the query began and
    * pointed to the tail of the brw->perfquery.sample_buffers list at
    * the time the query started. Since the buffer existed before the
    * first MI_REPORT_PERF_COUNT command was emitted we therefore know
    * that no data in this particular node's buffer can possibly be
    * associated with the query - so skip ahead one...
    */
   first_samples_node = obj->oa.samples_head->next;

   foreach_list_typed_from(struct brw_oa_sample_buf, buf, link,
                           &brw->perfquery.sample_buffers,
                           first_samples_node)
   {
      int offset = 0;

      while (offset < buf->len) {
         const struct drm_i915_perf_record_header *header =
            (const struct drm_i915_perf_record_header *)(buf->buf + offset);

         assert(header->size != 0);
         assert(header->size <= buf->len);

         offset += header->size;

         switch (header->type) {
         case DRM_I915_PERF_RECORD_SAMPLE: {
            uint32_t *report = (uint32_t *)(header + 1);
            bool add = true;

            /* Ignore reports that come before the start marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (timebase_scale(brw, report[1] - start[1]) > 5000000000)
               continue;
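            /* (A scaled delta of more than 5 seconds is interpreted as a
             * wrapped, i.e. negative, 32 bit timestamp difference; queries
             * are assumed to be far shorter than the wrap period.)
             */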
            /* Ignore reports that come after the end marker.
             * (Note: takes care to allow overflow of 32bit timestamps)
             */
            if (timebase_scale(brw, report[1] - end[1]) <= 5000000000)
               goto end;

            /* For Gen8+ since the counters continue while other
             * contexts are running we need to discount any unrelated
             * deltas. The hardware automatically generates a report
             * on context switch which gives us a new reference point
             * to continue adding deltas from.
             *
             * For Haswell we can rely on the HW to stop the progress
             * of OA counters while any other context is active.
             */
            if (devinfo->gen >= 8) {
               if (in_ctx && report[2] != ctx_id) {
                  DBG("i915 perf: Switch AWAY (observed by ID change)\n");
                  in_ctx = false;
                  out_duration = 0;
               } else if (in_ctx == false && report[2] == ctx_id) {
                  DBG("i915 perf: Switch TO\n");
                  in_ctx = true;

                  /* From experimentation in IGT, we found that the OA unit
                   * might label some report as "idle" (using an invalid
                   * context ID), right after a report for a given context.
                   * Deltas generated by those reports actually belong to the
                   * previous context, even though they're not labelled as
                   * such.
                   *
                   * We didn't *really* Switch AWAY in the case that we e.g.
                   * saw a single periodic report while idle...
                   */
                  if (out_duration >= 1)
                     add = false;
               } else if (in_ctx) {
                  assert(report[2] == ctx_id);
                  DBG("i915 perf: Continuation IN\n");
               } else {
                  assert(report[2] != ctx_id);
                  DBG("i915 perf: Continuation OUT\n");
                  add = false;
                  out_duration++;
               }
            }

            if (add)
               add_deltas(brw, obj, last, report);

            last = report;

            break;
         }

         case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
            DBG("i915 perf: OA error: all reports lost\n");
            goto error;
         case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
            DBG("i915 perf: OA report lost\n");
            break;
         }
      }
   }

end:

   add_deltas(brw, obj, last, end);

   DBG("Marking %d accumulated - results gathered\n", o->Id);

   brw_bo_unmap(obj->oa.bo);
   obj->oa.map = NULL;
   obj->oa.results_accumulated = true;
   drop_from_unaccumulated_query_list(brw, obj);
   dec_n_oa_users(brw);

   return;

error:

   brw_bo_unmap(obj->oa.bo);
   obj->oa.map = NULL;
   discard_all_queries(brw);
}

/******************************************************************************/
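/**
 * Open an i915 perf stream for the given metric set and report format,
 * scoped to our GL context's hardware context and sampling periodically
 * at the given exponent. The stream starts out disabled
 * (I915_PERF_FLAG_DISABLED) and is only enabled once a query actually
 * needs it (see inc_n_oa_users()).
 */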
static bool
open_i915_perf_oa_stream(struct brw_context *brw,
                         int metrics_set_id,
                         int report_format,
                         int period_exponent,
                         int drm_fd,
                         uint32_t ctx_id)
{
   uint64_t properties[] = {
      /* Single context sampling */
      DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,

      /* Include OA reports in samples */
      DRM_I915_PERF_PROP_SAMPLE_OA, true,

      /* OA unit configuration */
      DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
      DRM_I915_PERF_PROP_OA_FORMAT, report_format,
      DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
   };
   struct drm_i915_perf_open_param param = {
      .flags = I915_PERF_FLAG_FD_CLOEXEC |
               I915_PERF_FLAG_FD_NONBLOCK |
               I915_PERF_FLAG_DISABLED,
      .num_properties = ARRAY_SIZE(properties) / 2,
      .properties_ptr = (uintptr_t) properties,
   };
   int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
   if (fd == -1) {
      DBG("Error opening i915 perf OA stream: %m\n");
      return false;
   }

   brw->perfquery.oa_stream_fd = fd;

   brw->perfquery.current_oa_metrics_set_id = metrics_set_id;
   brw->perfquery.current_oa_format = report_format;

   return true;
}

static void
close_perf(struct brw_context *brw)
{
   if (brw->perfquery.oa_stream_fd != -1) {
      close(brw->perfquery.oa_stream_fd);
      brw->perfquery.oa_stream_fd = -1;
   }
}

/**
 * Driver hook for glBeginPerfQueryINTEL().
 */
static bool
brw_begin_perf_query(struct gl_context *ctx,
                     struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   const struct brw_perf_query_info *query = obj->query;

   /* We can assume the frontend hides mistaken attempts to Begin a
    * query object multiple times before its End. Similarly if an
    * application reuses a query object before results have arrived
    * the frontend will wait for prior results so we don't need
    * to support abandoning in-flight results.
    */
   assert(!o->Active);
   assert(!o->Used || o->Ready); /* no in-flight query to worry about */

   DBG("Begin(%d)\n", o->Id);

   /* XXX: We have to consider that the command parser unit that parses batch
    * buffer commands and is used to capture begin/end counter snapshots isn't
    * implicitly synchronized with what's currently running across other GPU
    * units (such as the EUs running shaders) that the performance counters are
    * associated with.
    *
    * The intention of performance queries is to measure the work associated
    * with commands between the begin/end delimiters and so for that to be the
    * case we need to explicitly synchronize the parsing of commands to capture
    * Begin/End counter snapshots with what's running across other parts of the
    * GPU.
    *
    * When the command parser reaches a Begin marker it effectively needs to
    * drain everything currently running on the GPU until the hardware is idle
    * before capturing the first snapshot of counters - otherwise the results
    * would also be measuring the effects of earlier commands.
    *
    * When the command parser reaches an End marker it needs to stall until
    * everything currently running on the GPU has finished before capturing the
    * end snapshot - otherwise the results won't be a complete representation
    * of the work.
    *
    * Theoretically there could be opportunities to minimize how much of the
    * GPU pipeline is drained, or that we stall for, when we know what specific
    * units the performance counters being queried relate to but we don't
    * currently attempt to be clever here.
    *
    * Note: with our current simple approach, for back-to-back queries we will
    * redundantly emit duplicate commands to synchronize the command streamer
    * with the rest of the GPU pipeline, but we assume that in HW the second
    * synchronization is effectively a NOOP.
    *
    * N.B. The final results are based on deltas of counters between (inside)
    * Begin/End markers so even though the total wall clock time of the
    * workload is stretched by larger pipeline bubbles the bubbles themselves
    * are generally invisible to the query results. Whether that's a good or a
    * bad thing depends on the use case. For a lower real-time impact while
    * capturing metrics, periodic sampling may be a better choice than
    * INTEL_performance_query.
    *
    * This is our Begin synchronization point to drain current work on the
    * GPU before we capture our first counter snapshot...
    */
   brw_emit_mi_flush(brw);

   switch (query->kind) {
   case OA_COUNTERS:

      /* Opening an i915 perf stream implies exclusive access to the OA unit
       * which will generate counter reports for a specific counter set with a
       * specific layout/format so we can't begin any OA based queries that
       * require a different counter set or format unless we get an opportunity
       * to close the stream and open a new one...
       */
      if (brw->perfquery.oa_stream_fd != -1 &&
          brw->perfquery.current_oa_metrics_set_id !=
          query->oa_metrics_set_id) {

         if (brw->perfquery.n_oa_users != 0)
            return false;
         else
            close_perf(brw);
      }

      /* If the OA counters aren't already on, enable them. */
      if (brw->perfquery.oa_stream_fd == -1) {
         __DRIscreen *screen = brw->screen->driScrnPriv;
         const struct gen_device_info *devinfo = &brw->screen->devinfo;

         /* The period_exponent gives a sampling period as follows:
          *   sample_period = timestamp_period * 2^(period_exponent + 1)
          *
          * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
          * ~83ns (GEN8/9).
          *
          * The counter overflow period is derived from the EuActive counter
          * which reads a counter that increments by the number of clock
          * cycles multiplied by the number of EUs. It can be calculated as:
          *
          * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
          *
          * (E.g. 40 EUs @ 1GHz = ~53ms)
          *
          * We select a sampling period below that overflow period to
          * ensure we cannot see more than 1 counter overflow, otherwise we
          * could lose information.
          */

         int a_counter_in_bits = 32;
         if (devinfo->gen >= 8)
            a_counter_in_bits = 40;

         uint64_t overflow_period = pow(2, a_counter_in_bits) /
            (brw->perfquery.sys_vars.n_eus *
             /* drop 1GHz freq to have units in nanoseconds */
             2);

         DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
             overflow_period, overflow_period / 1000000ul, brw->perfquery.sys_vars.n_eus);
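         /* (Worked example, illustrative only: with the ~53ms overflow
          * period above and an 80ns timestamp period on Haswell, the
          * largest power-of-two sampling period still below the overflow
          * period is 2^19 * 80ns ~= 42ms, which is what the loop below
          * settles on.)
          */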
         int period_exponent = 0;
         uint64_t prev_sample_period, next_sample_period;
         for (int e = 0; e < 30; e++) {
            prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
            next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;

            /* Take the previous sampling period, lower than the overflow
             * period.
             */
            if (prev_sample_period < overflow_period &&
                next_sample_period > overflow_period)
               period_exponent = e + 1;
         }

         if (period_exponent == 0) {
            DBG("WARNING: unable to find a sampling exponent\n");
            return false;
         }

         DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
             prev_sample_period / 1000000ul);

         if (!open_i915_perf_oa_stream(brw,
                                       query->oa_metrics_set_id,
                                       query->oa_format,
                                       period_exponent,
                                       screen->fd, /* drm fd */
                                       brw->hw_ctx))
            return false;
      } else {
         assert(brw->perfquery.current_oa_metrics_set_id ==
                query->oa_metrics_set_id &&
                brw->perfquery.current_oa_format ==
                query->oa_format);
      }

      if (!inc_n_oa_users(brw)) {
         DBG("WARNING: Error enabling i915 perf stream: %m\n");
         return false;
      }

      if (obj->oa.bo) {
         brw_bo_unreference(obj->oa.bo);
         obj->oa.bo = NULL;
      }

      obj->oa.bo =
         brw_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo",
                      MI_RPC_BO_SIZE, 64);
#ifdef DEBUG
      /* Pre-filling the BO helps debug whether writes landed. */
      void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
      memset(map, 0x80, MI_RPC_BO_SIZE);
      brw_bo_unmap(obj->oa.bo);
#endif

      obj->oa.begin_report_id = brw->perfquery.next_query_start_report_id;
      brw->perfquery.next_query_start_report_id += 2;

      /* We flush the batchbuffer here to minimize the chances that MI_RPC
       * delimiting commands end up in different batchbuffers. If that's the
       * case, the measurement will include the time it takes for the kernel
       * scheduler to load a new request into the hardware. This is manifested
       * in tools like frameretrace by spikes in the "GPU Core Clocks" counter.
       */
      intel_batchbuffer_flush(brw);

      /* Take a starting OA counter snapshot. */
      brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
                                          obj->oa.begin_report_id);
      ++brw->perfquery.n_active_oa_queries;

      /* No already-buffered samples can possibly be associated with this query
       * so create a marker within the list of sample buffers enabling us to
       * easily ignore earlier samples when processing this query after
       * completion.
       */
      assert(!exec_list_is_empty(&brw->perfquery.sample_buffers));
      obj->oa.samples_head = exec_list_get_tail(&brw->perfquery.sample_buffers);

      struct brw_oa_sample_buf *buf =
         exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);

      /* This reference will ensure that future/following sample
       * buffers (that may relate to this query) can't be freed until
       * this drops to zero.
       */
      buf->refcount++;

      memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
      obj->oa.results_accumulated = false;

      add_to_unaccumulated_query_list(brw, obj);
      break;

   case PIPELINE_STATS:
      if (obj->pipeline_stats.bo) {
         brw_bo_unreference(obj->pipeline_stats.bo);
         obj->pipeline_stats.bo = NULL;
      }

      obj->pipeline_stats.bo =
         brw_bo_alloc(brw->bufmgr, "perf. query pipeline stats bo",
                      STATS_BO_SIZE, 64);

      /* Take starting snapshots. */
      snapshot_statistics_registers(brw, obj, 0);

      ++brw->perfquery.n_active_pipeline_stats_queries;
      break;
   }

   if (INTEL_DEBUG & DEBUG_PERFMON)
      dump_perf_queries(brw);

   return true;
}
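/*
 * For reference, the application-side flow that calls back into these
 * Begin/End hooks (sketch only; query id enumeration and error handling
 * omitted):
 *
 *    GLuint qid, handle, bytes;
 *    char data[512];
 *
 *    glGetPerfQueryIdByNameINTEL("...", &qid);
 *    glCreatePerfQueryINTEL(qid, &handle);
 *    glBeginPerfQueryINTEL(handle);
 *    ... issue the draw calls to be measured ...
 *    glEndPerfQueryINTEL(handle);
 *    glGetPerfQueryDataINTEL(handle, GL_PERFQUERY_WAIT_INTEL,
 *                            sizeof(data), data, &bytes);
 */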
/**
 * Driver hook for glEndPerfQueryINTEL().
 */
static void
brw_end_perf_query(struct gl_context *ctx,
                   struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);

   DBG("End(%d)\n", o->Id);

   /* Ensure that the work associated with the queried commands will have
    * finished before taking our query end counter readings.
    *
    * For more details see comment in brw_begin_perf_query for
    * corresponding flush.
    */
   brw_emit_mi_flush(brw);

   switch (obj->query->kind) {
   case OA_COUNTERS:

      /* NB: It's possible that the query will have already been marked
       * as 'accumulated' if an error was seen while reading samples
       * from perf. In this case we mustn't try to emit a closing
       * MI_RPC command in case the OA unit has already been disabled
       */
      if (!obj->oa.results_accumulated) {
         /* Take an ending OA counter snapshot. */
         brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
                                             MI_RPC_BO_END_OFFSET_BYTES,
                                             obj->oa.begin_report_id + 1);
      }

      --brw->perfquery.n_active_oa_queries;

      /* NB: even though the query has now ended, it can't be accumulated
       * until the end MI_REPORT_PERF_COUNT snapshot has been written
       * to query->oa.bo
       */
      break;

   case PIPELINE_STATS:
      snapshot_statistics_registers(brw, obj,
                                    STATS_BO_END_OFFSET_BYTES);
      --brw->perfquery.n_active_pipeline_stats_queries;
      break;
   }
}
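/**
 * Driver hook used by the frontend to block until the results of a
 * query are available (e.g. for glGetPerfQueryDataINTEL() with
 * GL_PERFQUERY_WAIT_INTEL).
 */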
static void
brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   struct brw_bo *bo = NULL;

   assert(!o->Ready);

   switch (obj->query->kind) {
   case OA_COUNTERS:
      bo = obj->oa.bo;
      break;

   case PIPELINE_STATS:
      bo = obj->pipeline_stats.bo;
      break;
   }

   if (bo == NULL)
      return;

   /* If the current batch references our results bo then we need to
    * flush first...
    */
   if (brw_batch_references(&brw->batch, bo))
      intel_batchbuffer_flush(brw);

   brw_bo_wait_rendering(bo);

   /* Due to a race condition between the OA unit signaling report
    * availability and the report actually being written into memory,
    * we need to wait for all the reports to come in before we can
    * read them.
    */
   if (obj->query->kind == OA_COUNTERS) {
      while (!read_oa_samples_for_query(brw, obj))
         ;
   }
}

static bool
brw_is_perf_query_ready(struct gl_context *ctx,
                        struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);

   if (o->Ready)
      return true;

   switch (obj->query->kind) {
   case OA_COUNTERS:
      return (obj->oa.results_accumulated ||
              (obj->oa.bo &&
               !brw_batch_references(&brw->batch, obj->oa.bo) &&
               !brw_bo_busy(obj->oa.bo) &&
               read_oa_samples_for_query(brw, obj)));
   case PIPELINE_STATS:
      return (obj->pipeline_stats.bo &&
              !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
              !brw_bo_busy(obj->pipeline_stats.bo));
   }

   unreachable("missing ready check for unknown query kind");
   return false;
}

static int
get_oa_counter_data(struct brw_context *brw,
                    struct brw_perf_query_object *obj,
                    size_t data_size,
                    uint8_t *data)
{
   const struct brw_perf_query_info *query = obj->query;
   int n_counters = query->n_counters;
   int written = 0;

   if (!obj->oa.results_accumulated) {
      accumulate_oa_reports(brw, obj);
      assert(obj->oa.results_accumulated);
   }

   for (int i = 0; i < n_counters; i++) {
      const struct brw_perf_query_counter *counter = &query->counters[i];
      uint64_t *out_uint64;
      float *out_float;

      if (counter->size) {
         switch (counter->data_type) {
         case GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL:
            out_uint64 = (uint64_t *)(data + counter->offset);
            *out_uint64 = counter->oa_counter_read_uint64(brw, query,
                                                          obj->oa.accumulator);
            break;
         case GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL:
            out_float = (float *)(data + counter->offset);
            *out_float = counter->oa_counter_read_float(brw, query,
                                                        obj->oa.accumulator);
            break;
         default:
            /* So far we aren't using uint32, double or bool32... */
            unreachable("unexpected counter data type");
         }
         written = counter->offset + counter->size;
      }
   }

   return written;
}

static int
get_pipeline_stats_data(struct brw_context *brw,
                        struct brw_perf_query_object *obj,
                        size_t data_size,
                        uint8_t *data)

{
   const struct brw_perf_query_info *query = obj->query;
   int n_counters = obj->query->n_counters;
   uint8_t *p = data;

   uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
   uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));

   for (int i = 0; i < n_counters; i++) {
      const struct brw_perf_query_counter *counter = &query->counters[i];
      uint64_t value = end[i] - start[i];

      if (counter->pipeline_stat.numerator !=
          counter->pipeline_stat.denominator) {
         value *= counter->pipeline_stat.numerator;
         value /= counter->pipeline_stat.denominator;
      }

      *((uint64_t *)p) = value;
      p += 8;
   }

   brw_bo_unmap(obj->pipeline_stats.bo);

   return p - data;
}

/**
 * Driver hook for glGetPerfQueryDataINTEL().
 */
static void
brw_get_perf_query_data(struct gl_context *ctx,
                        struct gl_perf_query_object *o,
                        GLsizei data_size,
                        GLuint *data,
                        GLuint *bytes_written)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);
   int written = 0;

   assert(brw_is_perf_query_ready(ctx, o));

   DBG("GetData(%d)\n", o->Id);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      dump_perf_queries(brw);

   /* We expect that the frontend only calls this hook when it knows
    * that results are available.
    */
   assert(o->Ready);

   switch (obj->query->kind) {
   case OA_COUNTERS:
      written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
      break;

   case PIPELINE_STATS:
      written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
      break;
   }

   if (bytes_written)
      *bytes_written = written;
}

static struct gl_perf_query_object *
brw_new_perf_query_object(struct gl_context *ctx, unsigned query_index)
{
   struct brw_context *brw = brw_context(ctx);
   const struct brw_perf_query_info *query =
      &brw->perfquery.queries[query_index];
   struct brw_perf_query_object *obj =
      calloc(1, sizeof(struct brw_perf_query_object));

   if (!obj)
      return NULL;

   obj->query = query;

   brw->perfquery.n_query_instances++;

   return &obj->base;
}

/**
 * Driver hook for glDeletePerfQueryINTEL().
 */
static void
brw_delete_perf_query(struct gl_context *ctx,
                      struct gl_perf_query_object *o)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_perf_query_object *obj = brw_perf_query(o);

   /* We can assume that the frontend waits for a query to complete
    * before ever calling into here, so we don't have to worry about
    * deleting an in-flight query object.
    */
   assert(!o->Active);
   assert(!o->Used || o->Ready);

   DBG("Delete(%d)\n", o->Id);

   switch (obj->query->kind) {
   case OA_COUNTERS:
      if (obj->oa.bo) {
         if (!obj->oa.results_accumulated) {
            drop_from_unaccumulated_query_list(brw, obj);
            dec_n_oa_users(brw);
         }

         brw_bo_unreference(obj->oa.bo);
         obj->oa.bo = NULL;
      }

      obj->oa.results_accumulated = false;
      break;

   case PIPELINE_STATS:
      if (obj->pipeline_stats.bo) {
         brw_bo_unreference(obj->pipeline_stats.bo);
         obj->pipeline_stats.bo = NULL;
      }
      break;
   }

   free(obj);

   /* As an indication that the INTEL_performance_query extension is no
    * longer in use, it's a good time to free our cache of sample
    * buffers and close any current i915-perf stream.
    */
   if (--brw->perfquery.n_query_instances == 0) {
      free_sample_bufs(brw);
      close_perf(brw);
   }
}

/******************************************************************************/

static struct brw_perf_query_info *
append_query_info(struct brw_context *brw)
{
   brw->perfquery.queries =
      reralloc(brw, brw->perfquery.queries,
               struct brw_perf_query_info, ++brw->perfquery.n_queries);

   return &brw->perfquery.queries[brw->perfquery.n_queries - 1];
}

static void
add_stat_reg(struct brw_perf_query_info *query,
             uint32_t reg,
             uint32_t numerator,
             uint32_t denominator,
             const char *name,
             const char *description)
{
   struct brw_perf_query_counter *counter;

   assert(query->n_counters < MAX_STAT_COUNTERS);

   counter = &query->counters[query->n_counters];
   counter->name = name;
   counter->desc = description;
   counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
   counter->size = sizeof(uint64_t);
   counter->offset = sizeof(uint64_t) * query->n_counters;
   counter->pipeline_stat.reg = reg;
   counter->pipeline_stat.numerator = numerator;
   counter->pipeline_stat.denominator = denominator;

   query->n_counters++;
}

static void
add_basic_stat_reg(struct brw_perf_query_info *query,
                   uint32_t reg, const char *name)
{
   add_stat_reg(query, reg, 1, 1, name, name);
}

static void
init_pipeline_statistic_query_registers(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_perf_query_info *query = append_query_info(brw);

   query->kind = PIPELINE_STATS;
   query->name = "Pipeline Statistics Registers";
   query->n_counters = 0;
   query->counters =
      rzalloc_array(brw, struct brw_perf_query_counter, MAX_STAT_COUNTERS);

   add_basic_stat_reg(query, IA_VERTICES_COUNT,
                      "N vertices submitted");
   add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
                      "N primitives submitted");
   add_basic_stat_reg(query, VS_INVOCATION_COUNT,
                      "N vertex shader invocations");

   if (devinfo->gen == 6) {
      add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
                   "SO_PRIM_STORAGE_NEEDED",
                   "N geometry shader stream-out primitives (total)");
      add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
                   "SO_NUM_PRIMS_WRITTEN",
                   "N geometry shader stream-out primitives (written)");
   } else {
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 0)",
                   "N stream-out (stream 0) primitives (total)");
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 1)",
                   "N stream-out (stream 1) primitives (total)");
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 2)",
                   "N stream-out (stream 2) primitives (total)");
      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
                   "SO_PRIM_STORAGE_NEEDED (Stream 3)",
                   "N stream-out (stream 3) primitives (total)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 0)",
                   "N stream-out (stream 0) primitives (written)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 1)",
                   "N stream-out (stream 1) primitives (written)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 2)",
                   "N stream-out (stream 2) primitives (written)");
      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
                   "SO_NUM_PRIMS_WRITTEN (Stream 3)",
                   "N stream-out (stream 3) primitives (written)");
   }

   add_basic_stat_reg(query, HS_INVOCATION_COUNT,
                      "N TCS shader invocations");
   add_basic_stat_reg(query, DS_INVOCATION_COUNT,
                      "N TES shader invocations");

   add_basic_stat_reg(query, GS_INVOCATION_COUNT,
                      "N geometry shader invocations");
   add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
                      "N geometry shader primitives emitted");

   add_basic_stat_reg(query, CL_INVOCATION_COUNT,
                      "N primitives entering clipping");
   add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
                      "N primitives leaving clipping");

   if (devinfo->is_haswell || devinfo->gen == 8)
      add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
                   "N fragment shader invocations",
                   "N fragment shader invocations");
   else
      add_basic_stat_reg(query, PS_INVOCATION_COUNT,
                         "N fragment shader invocations");

   add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");

   if (devinfo->gen >= 7)
      add_basic_stat_reg(query, CS_INVOCATION_COUNT,
                         "N compute shader invocations");

   query->data_size = sizeof(uint64_t) * query->n_counters;
}

static bool
read_file_uint64(const char *file, uint64_t *val)
{
   char buf[32];
   int fd, n;

   fd = open(file, 0);
   if (fd < 0)
      return false;
   while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
          errno == EINTR);
   close(fd);
   if (n < 0)
      return false;

   buf[n] = '\0';
   *val = strtoull(buf, NULL, 0);

   return true;
}

static void
register_oa_config(struct brw_context *brw,
                   const struct brw_perf_query_info *query,
                   uint64_t config_id)
{
   struct brw_perf_query_info *registered_query = append_query_info(brw);
   *registered_query = *query;
   registered_query->oa_metrics_set_id = config_id;
   DBG("metric set registered: id = %" PRIu64", guid = %s\n",
       registered_query->oa_metrics_set_id, query->guid);
}
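/**
 * Walk the metrics/ directory under the device's sysfs node and register
 * every metric set the kernel advertises that Mesa also knows about
 * (matched by GUID in brw->perfquery.oa_metrics_table).
 */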
static void
enumerate_sysfs_metrics(struct brw_context *brw, const char *sysfs_dev_dir)
{
   char buf[256];
   DIR *metricsdir = NULL;
   struct dirent *metric_entry;
   int len;

   len = snprintf(buf, sizeof(buf), "%s/metrics", sysfs_dev_dir);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate path to sysfs metrics/ directory\n");
      return;
   }

   metricsdir = opendir(buf);
   if (!metricsdir) {
      DBG("Failed to open %s: %m\n", buf);
      return;
   }

   while ((metric_entry = readdir(metricsdir))) {
      struct hash_entry *entry;

      if ((metric_entry->d_type != DT_DIR &&
           metric_entry->d_type != DT_LNK) ||
          metric_entry->d_name[0] == '.')
         continue;

      DBG("metric set: %s\n", metric_entry->d_name);
      entry = _mesa_hash_table_search(brw->perfquery.oa_metrics_table,
                                      metric_entry->d_name);
      if (entry) {
         uint64_t id;

         len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
                        sysfs_dev_dir, metric_entry->d_name);
         if (len < 0 || len >= sizeof(buf)) {
            DBG("Failed to concatenate path to sysfs metric id file\n");
            continue;
         }

         if (!read_file_uint64(buf, &id)) {
            DBG("Failed to read metric set id from %s: %m\n", buf);
            continue;
         }

         register_oa_config(brw,
                            (const struct brw_perf_query_info *)entry->data,
                            id);
      } else
         DBG("metric set not known by mesa (skipping)\n");
   }

   closedir(metricsdir);
}

static bool
read_sysfs_drm_device_file_uint64(struct brw_context *brw,
                                  const char *sysfs_dev_dir,
                                  const char *file,
                                  uint64_t *value)
{
   char buf[512];
   int len;

   len = snprintf(buf, sizeof(buf), "%s/%s", sysfs_dev_dir, file);
   if (len < 0 || len >= sizeof(buf)) {
      DBG("Failed to concatenate sys filename to read u64 from\n");
      return false;
   }

   return read_file_uint64(buf, value);
}

static bool
kernel_has_dynamic_config_support(struct brw_context *brw,
                                  const char *sysfs_dev_dir)
{
   __DRIscreen *screen = brw->screen->driScrnPriv;
   struct hash_entry *entry;

   hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
      struct brw_perf_query_info *query = entry->data;
      char config_path[256];
      uint64_t config_id;

      snprintf(config_path, sizeof(config_path),
               "%s/metrics/%s/id", sysfs_dev_dir, query->guid);

      /* Look for the test config, which we know we can't replace. */
      if (read_file_uint64(config_path, &config_id) && config_id == 1) {
         /* Attempting to remove the (unremovable) test config tells us
          * whether the kernel implements the config ioctls at all: a
          * kernel with DRM_IOCTL_I915_PERF_REMOVE_CONFIG fails the config
          * lookup with ENOENT, while an older kernel rejects the unknown
          * ioctl outright with a different errno.
          */
         if (drmIoctl(screen->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
                      &config_id) < 0 &&
             errno == ENOENT)
            return true;

         break;
      }
   }

   return false;
}
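
/* Load Mesa's metric set configurations into the kernel. On success,
 * DRM_IOCTL_I915_PERF_ADD_CONFIG returns the id allocated for the new
 * config (the same value exposed via the sysfs "id" file), which is why
 * the ioctl's return value is passed straight to register_oa_config()
 * below.
 */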
static void
init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
{
   __DRIscreen *screen = brw->screen->driScrnPriv;
   struct hash_entry *entry;

   hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
      const struct brw_perf_query_info *query = entry->data;
      struct drm_i915_perf_oa_config config;
      char config_path[256];
      uint64_t config_id;
      int ret;

      snprintf(config_path, sizeof(config_path),
               "%s/metrics/%s/id", sysfs_dev_dir, query->guid);

      /* Don't recreate already loaded configs. */
      if (read_file_uint64(config_path, &config_id)) {
         register_oa_config(brw, query, config_id);
         continue;
      }

      memset(&config, 0, sizeof(config));

      memcpy(config.uuid, query->guid, sizeof(config.uuid));

      config.n_mux_regs = query->n_mux_regs;
      config.mux_regs_ptr = (uintptr_t) query->mux_regs;

      config.n_boolean_regs = query->n_b_counter_regs;
      config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;

      config.n_flex_regs = query->n_flex_regs;
      config.flex_regs_ptr = (uintptr_t) query->flex_regs;

      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
      if (ret < 0) {
         DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
             query->name, query->guid, strerror(errno));
         continue;
      }

      register_oa_config(brw, query, ret);
   }
}
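
/* The system variables gathered here back the $-prefixed variables (for
 * example $EuCoresTotalCount, or the $SubsliceMask discussed below)
 * referenced by the counter normalization equations in the generated
 * brw_oa_*.c metric set descriptions. Note that sysfs reports the GT
 * frequencies in MHz, so they are converted to Hz here.
 */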
static bool
init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
   __DRIscreen *screen = brw->screen->driScrnPriv;

   if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
                                          "gt_min_freq_mhz",
                                          &min_freq_mhz))
      return false;

   if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
                                          "gt_max_freq_mhz",
                                          &max_freq_mhz))
      return false;

   brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
   brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
   brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;

   brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
   /* Assuming a uniform distribution of subslices per slice. */
   brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];

   if (devinfo->is_haswell) {
      brw->perfquery.sys_vars.slice_mask = 0;
      brw->perfquery.sys_vars.subslice_mask = 0;

      for (int s = 0; s < devinfo->num_slices; s++)
         brw->perfquery.sys_vars.slice_mask |= 1U << s;
      for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
         brw->perfquery.sys_vars.subslice_mask |= 1U << ss;

      /* The kernel doesn't report slice/subslice masks for Haswell, but
       * the EU count is fixed per GT, so hard-code it.
       */
      if (devinfo->gt == 1) {
         brw->perfquery.sys_vars.n_eus = 10;
      } else if (devinfo->gt == 2) {
         brw->perfquery.sys_vars.n_eus = 20;
      } else if (devinfo->gt == 3) {
         brw->perfquery.sys_vars.n_eus = 40;
      } else
         unreachable("not reached");
   } else {
      drm_i915_getparam_t gp;
      int ret;
      int slice_mask = 0;
      int ss_mask = 0;
      /* maximum number of slices */
      int s_max = devinfo->num_slices;
      /* maximum number of subslices per slice (assuming uniform subslices
       * per slice)
       */
      int ss_max = devinfo->num_subslices[0];
      uint64_t subslice_mask = 0;
      int s;

      gp.param = I915_PARAM_SLICE_MASK;
      gp.value = &slice_mask;
      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
      if (ret)
         return false;

      gp.param = I915_PARAM_SUBSLICE_MASK;
      gp.value = &ss_mask;
      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
      if (ret)
         return false;

      brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
      brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
      brw->perfquery.sys_vars.slice_mask = slice_mask;

      /* Note: the I915_PARAM_SUBSLICE_MASK param only reports a global
       * subslice mask which applies to all slices.
       *
       * Note: some of the metrics we have (as described in XML) are
       * conditional on a $SubsliceMask variable which is expected to also
       * reflect the slice mask by packing together the subslice masks of
       * each slice into one value.
       */
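
      /* E.g. (illustrative numbers): with ss_max = 3, slice_mask = 0x3
       * and ss_mask = 0x7, the loop below packs the per-slice subslice
       * masks as 0x7 | (0x7 << 3) = 0x3f.
       */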
      for (s = 0; s < s_max; s++) {
         if (slice_mask & (1<<s)) {
            subslice_mask |= ss_mask << (ss_max * s);
         }
      }

      brw->perfquery.sys_vars.subslice_mask = subslice_mask;
      brw->perfquery.sys_vars.n_eu_sub_slices =
         __builtin_popcount(subslice_mask);
   }

   brw->perfquery.sys_vars.eu_threads_count =
      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;

   return true;
}

static bool
get_sysfs_dev_dir(struct brw_context *brw,
                  char *path_buf,
                  int path_buf_len)
{
   __DRIscreen *screen = brw->screen->driScrnPriv;
   struct stat sb;
   int min, maj;
   DIR *drmdir;
   struct dirent *drm_entry;
   int len;

   assert(path_buf);
   assert(path_buf_len);
   path_buf[0] = '\0';

   if (fstat(screen->fd, &sb)) {
      DBG("Failed to stat DRM fd\n");
      return false;
   }

   maj = major(sb.st_rdev);
   min = minor(sb.st_rdev);

   if (!S_ISCHR(sb.st_mode)) {
      DBG("DRM fd is not a character device as expected\n");
      return false;
   }

   len = snprintf(path_buf, path_buf_len,
                  "/sys/dev/char/%d:%d/device/drm", maj, min);
   if (len < 0 || len >= path_buf_len) {
      DBG("Failed to concatenate sysfs path to drm device\n");
      return false;
   }

   drmdir = opendir(path_buf);
   if (!drmdir) {
      DBG("Failed to open %s: %m\n", path_buf);
      return false;
   }

   while ((drm_entry = readdir(drmdir))) {
      if ((drm_entry->d_type == DT_DIR ||
           drm_entry->d_type == DT_LNK) &&
          strncmp(drm_entry->d_name, "card", 4) == 0)
      {
         len = snprintf(path_buf, path_buf_len,
                        "/sys/dev/char/%d:%d/device/drm/%s",
                        maj, min, drm_entry->d_name);
         closedir(drmdir);
         if (len < 0 || len >= path_buf_len)
            return false;
         else
            return true;
      }
   }

   closedir(drmdir);

   DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
       maj, min);

   return false;
}

typedef void (*perf_register_oa_queries_t)(struct brw_context *);

static perf_register_oa_queries_t
get_register_queries_function(const struct gen_device_info *devinfo)
{
   if (devinfo->is_haswell)
      return brw_oa_register_queries_hsw;
   if (devinfo->is_cherryview)
      return brw_oa_register_queries_chv;
   if (devinfo->is_broadwell)
      return brw_oa_register_queries_bdw;
   if (devinfo->is_broxton)
      return brw_oa_register_queries_bxt;
   if (devinfo->is_skylake) {
      if (devinfo->gt == 2)
         return brw_oa_register_queries_sklgt2;
      if (devinfo->gt == 3)
         return brw_oa_register_queries_sklgt3;
      if (devinfo->gt == 4)
         return brw_oa_register_queries_sklgt4;
   }
   if (devinfo->is_kabylake) {
      if (devinfo->gt == 2)
         return brw_oa_register_queries_kblgt2;
      if (devinfo->gt == 3)
         return brw_oa_register_queries_kblgt3;
   }
   if (devinfo->is_geminilake)
      return brw_oa_register_queries_glk;
   if (devinfo->is_coffeelake) {
      if (devinfo->gt == 2)
         return brw_oa_register_queries_cflgt2;
      if (devinfo->gt == 3)
         return brw_oa_register_queries_cflgt3;
   }

   return NULL;
}
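
/* The brw_oa_register_queries_*() entry points come from the brw_oa_*.h
 * headers, which are generated at build time from Intel's XML metric set
 * descriptions. Returning NULL here simply means no OA metric sets are
 * available for the current platform.
 */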
static unsigned
brw_init_perf_query_info(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   bool i915_perf_oa_available = false;
   struct stat sb;
   char sysfs_dev_dir[128];
   perf_register_oa_queries_t oa_register;

   if (brw->perfquery.n_queries)
      return brw->perfquery.n_queries;

   init_pipeline_statistic_query_registers(brw);

   oa_register = get_register_queries_function(devinfo);

   /* The existence of this sysctl parameter implies the kernel supports
    * the i915 perf interface.
    */
   if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {

      /* If perf_stream_paranoid == 1 then on Gen8+ we won't be able to
       * access OA metrics unless running as root.
       */
      if (devinfo->is_haswell)
         i915_perf_oa_available = true;
      else {
         uint64_t paranoid = 1;

         read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid",
                          &paranoid);

         if (paranoid == 0 || geteuid() == 0)
            i915_perf_oa_available = true;
      }
   }

   if (i915_perf_oa_available &&
       oa_register &&
       get_sysfs_dev_dir(brw, sysfs_dev_dir, sizeof(sysfs_dev_dir)) &&
       init_oa_sys_vars(brw, sysfs_dev_dir))
   {
      brw->perfquery.oa_metrics_table =
         _mesa_hash_table_create(NULL, _mesa_key_hash_string,
                                 _mesa_key_string_equal);

      /* Index all the metric sets mesa knows about before looking to see
       * what the kernel is advertising.
       */
      oa_register(brw);

      if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
          kernel_has_dynamic_config_support(brw, sysfs_dev_dir))
         init_oa_configs(brw, sysfs_dev_dir);
      else
         enumerate_sysfs_metrics(brw, sysfs_dev_dir);
   }

   brw->perfquery.unaccumulated =
      ralloc_array(brw, struct brw_perf_query_object *, 2);
   brw->perfquery.unaccumulated_elements = 0;
   brw->perfquery.unaccumulated_array_size = 2;

   exec_list_make_empty(&brw->perfquery.sample_buffers);
   exec_list_make_empty(&brw->perfquery.free_sample_buffers);

   /* It's convenient to guarantee that this linked list of sample
    * buffers is never empty, so add an empty head; when we begin an OA
    * query we can then always take a reference on a buffer in this list.
    */
   struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
   exec_list_push_head(&brw->perfquery.sample_buffers, &buf->link);

   brw->perfquery.oa_stream_fd = -1;

   brw->perfquery.next_query_start_report_id = 1000;

   return brw->perfquery.n_queries;
}

void
brw_init_performance_queries(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   ctx->Driver.InitPerfQueryInfo = brw_init_perf_query_info;
   ctx->Driver.GetPerfQueryInfo = brw_get_perf_query_info;
   ctx->Driver.GetPerfCounterInfo = brw_get_perf_counter_info;
   ctx->Driver.NewPerfQueryObject = brw_new_perf_query_object;
   ctx->Driver.DeletePerfQuery = brw_delete_perf_query;
   ctx->Driver.BeginPerfQuery = brw_begin_perf_query;
   ctx->Driver.EndPerfQuery = brw_end_perf_query;
   ctx->Driver.WaitPerfQuery = brw_wait_perf_query;
   ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
   ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
}