/*
 Copyright (C)  Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 26 **********************************************************************/ 27 /* 28 * Authors: 29 * Keith Whitwell <keithw (at) vmware.com> 30 */ 31 32 #include <pthread.h> 33 #include "main/imports.h" 34 #include "program/prog_parameter.h" 35 #include "program/prog_print.h" 36 #include "program/prog_to_nir.h" 37 #include "program/program.h" 38 #include "program/programopt.h" 39 #include "tnl/tnl.h" 40 #include "util/ralloc.h" 41 #include "compiler/glsl/ir.h" 42 #include "compiler/glsl/glsl_to_nir.h" 43 #include "compiler/nir/nir_serialize.h" 44 45 #include "brw_program.h" 46 #include "brw_context.h" 47 #include "compiler/brw_nir.h" 48 #include "brw_defines.h" 49 #include "intel_batchbuffer.h" 50 51 static bool 52 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar) 53 { 54 if (is_scalar) { 55 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, 56 type_size_scalar_bytes); 57 return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0); 58 } else { 59 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, 60 type_size_vec4_bytes); 61 return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0); 62 } 63 } 64 65 nir_shader * 66 brw_create_nir(struct brw_context *brw, 67 const struct gl_shader_program *shader_prog, 68 struct gl_program *prog, 69 gl_shader_stage stage, 70 bool is_scalar) 71 { 72 struct gl_context *ctx = &brw->ctx; 73 const nir_shader_compiler_options *options = 74 ctx->Const.ShaderCompilerOptions[stage].NirOptions; 75 nir_shader *nir; 76 77 /* First, lower the GLSL IR or Mesa IR to NIR */ 78 if (shader_prog) { 79 nir = glsl_to_nir(shader_prog, stage, options); 80 nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out); 81 nir_lower_returns(nir); 82 nir_validate_shader(nir); 83 NIR_PASS_V(nir, nir_lower_io_to_temporaries, 84 nir_shader_get_entrypoint(nir), true, false); 85 } else { 86 nir = prog_to_nir(prog, options); 87 NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */ 88 } 89 
nir_validate_shader(nir); 90 91 /* Lower PatchVerticesIn from system value to uniform. This needs to 92 * happen before brw_preprocess_nir, since that will lower system values 93 * to intrinsics. 94 * 95 * We only do this for TES if no TCS is present, since otherwise we know 96 * the number of vertices in the patch at link time and we can lower it 97 * directly to a constant. We do this in nir_lower_patch_vertices, which 98 * needs to run after brw_nir_preprocess has turned the system values 99 * into intrinsics. 100 */ 101 const bool lower_patch_vertices_in_to_uniform = 102 (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) || 103 (stage == MESA_SHADER_TESS_EVAL && 104 !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]); 105 106 if (lower_patch_vertices_in_to_uniform) 107 brw_nir_lower_patch_vertices_in_to_uniform(nir); 108 109 nir = brw_preprocess_nir(brw->screen->compiler, nir); 110 111 if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) { 112 assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]); 113 struct gl_linked_shader *linked_tcs = 114 shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]; 115 uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out; 116 nir_lower_tes_patch_vertices(nir, patch_vertices); 117 } 118 119 if (stage == MESA_SHADER_FRAGMENT) { 120 static const struct nir_lower_wpos_ytransform_options wpos_options = { 121 .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0}, 122 .fs_coord_pixel_center_integer = 1, 123 .fs_coord_origin_upper_left = 1, 124 }; 125 126 bool progress = false; 127 NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options); 128 if (progress) { 129 _mesa_add_state_reference(prog->Parameters, 130 (gl_state_index *) wpos_options.state_tokens); 131 } 132 } 133 134 NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar); 135 136 return nir; 137 } 138 139 void 140 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog) 141 { 142 
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); 143 144 /* Copy the info we just generated back into the gl_program */ 145 const char *prog_name = prog->info.name; 146 const char *prog_label = prog->info.label; 147 prog->info = nir->info; 148 prog->info.name = prog_name; 149 prog->info.label = prog_label; 150 } 151 152 static unsigned 153 get_new_program_id(struct intel_screen *screen) 154 { 155 return p_atomic_inc_return(&screen->program_id); 156 } 157 158 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target, 159 GLuint id, bool is_arb_asm) 160 { 161 struct brw_context *brw = brw_context(ctx); 162 struct brw_program *prog = rzalloc(NULL, struct brw_program); 163 164 if (prog) { 165 prog->id = get_new_program_id(brw->screen); 166 167 return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm); 168 } 169 170 return NULL; 171 } 172 173 static void brwDeleteProgram( struct gl_context *ctx, 174 struct gl_program *prog ) 175 { 176 struct brw_context *brw = brw_context(ctx); 177 178 /* Beware! prog's refcount has reached zero, and it's about to be freed. 179 * 180 * In brw_upload_pipeline_state(), we compare brw->programs[i] to 181 * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the 182 * pointer has changed. 183 * 184 * We cannot leave brw->programs[i] as a dangling pointer to the dead 185 * program. malloc() may allocate the same memory for a new gl_program, 186 * causing us to see matching pointers...but totally different programs. 187 * 188 * We cannot set brw->programs[i] to NULL, either. If we've deleted the 189 * active program, Mesa may set ctx->FooProgram._Current to NULL. That 190 * would cause us to see matching pointers (NULL == NULL), and fail to 191 * detect that a program has changed since our last draw. 192 * 193 * So, set it to a bogus gl_program pointer that will never match, 194 * causing us to properly reevaluate the state on our next draw. 
195 * 196 * Getting this wrong causes heisenbugs which are very hard to catch, 197 * as you need a very specific allocation pattern to hit the problem. 198 */ 199 static const struct gl_program deleted_program; 200 201 for (int i = 0; i < MESA_SHADER_STAGES; i++) { 202 if (brw->programs[i] == prog) 203 brw->programs[i] = (struct gl_program *) &deleted_program; 204 } 205 206 _mesa_delete_program( ctx, prog ); 207 } 208 209 210 static GLboolean 211 brwProgramStringNotify(struct gl_context *ctx, 212 GLenum target, 213 struct gl_program *prog) 214 { 215 assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant); 216 217 struct brw_context *brw = brw_context(ctx); 218 const struct brw_compiler *compiler = brw->screen->compiler; 219 220 switch (target) { 221 case GL_FRAGMENT_PROGRAM_ARB: { 222 struct brw_program *newFP = brw_program(prog); 223 const struct brw_program *curFP = 224 brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]); 225 226 if (newFP == curFP) 227 brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM; 228 newFP->id = get_new_program_id(brw->screen); 229 230 prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true); 231 232 brw_shader_gather_info(prog->nir, prog); 233 234 brw_fs_precompile(ctx, prog); 235 break; 236 } 237 case GL_VERTEX_PROGRAM_ARB: { 238 struct brw_program *newVP = brw_program(prog); 239 const struct brw_program *curVP = 240 brw_program_const(brw->programs[MESA_SHADER_VERTEX]); 241 242 if (newVP == curVP) 243 brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM; 244 if (newVP->program.arb.IsPositionInvariant) { 245 _mesa_insert_mvp_code(ctx, &newVP->program); 246 } 247 newVP->id = get_new_program_id(brw->screen); 248 249 /* Also tell tnl about it: 250 */ 251 _tnl_program_string(ctx, target, prog); 252 253 prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX, 254 compiler->scalar_stage[MESA_SHADER_VERTEX]); 255 256 brw_shader_gather_info(prog->nir, prog); 257 258 brw_vs_precompile(ctx, prog); 259 
break; 260 } 261 default: 262 /* 263 * driver->ProgramStringNotify is only called for ARB programs, fixed 264 * function vertex programs, and ir_to_mesa (which isn't used by the 265 * i965 back-end). Therefore, even after geometry shaders are added, 266 * this function should only ever be called with a target of 267 * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB. 268 */ 269 unreachable("Unexpected target in brwProgramStringNotify"); 270 } 271 272 return true; 273 } 274 275 static void 276 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers) 277 { 278 struct brw_context *brw = brw_context(ctx); 279 const struct gen_device_info *devinfo = &brw->screen->devinfo; 280 unsigned bits = (PIPE_CONTROL_DATA_CACHE_FLUSH | 281 PIPE_CONTROL_NO_WRITE | 282 PIPE_CONTROL_CS_STALL); 283 assert(devinfo->gen >= 7 && devinfo->gen <= 10); 284 285 if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | 286 GL_ELEMENT_ARRAY_BARRIER_BIT | 287 GL_COMMAND_BARRIER_BIT)) 288 bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 289 290 if (barriers & GL_UNIFORM_BARRIER_BIT) 291 bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | 292 PIPE_CONTROL_CONST_CACHE_INVALIDATE); 293 294 if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT) 295 bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 296 297 if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT | 298 GL_PIXEL_BUFFER_BARRIER_BIT)) 299 bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | 300 PIPE_CONTROL_RENDER_TARGET_FLUSH); 301 302 if (barriers & GL_FRAMEBUFFER_BARRIER_BIT) 303 bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | 304 PIPE_CONTROL_RENDER_TARGET_FLUSH); 305 306 /* Typed surface messages are handled by the render cache on IVB, so we 307 * need to flush it too. 
308 */ 309 if (devinfo->gen == 7 && !devinfo->is_haswell) 310 bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH; 311 312 brw_emit_pipe_control_flush(brw, bits); 313 } 314 315 static void 316 brw_blend_barrier(struct gl_context *ctx) 317 { 318 struct brw_context *brw = brw_context(ctx); 319 const struct gen_device_info *devinfo = &brw->screen->devinfo; 320 321 if (!ctx->Extensions.MESA_shader_framebuffer_fetch) { 322 if (devinfo->gen >= 6) { 323 brw_emit_pipe_control_flush(brw, 324 PIPE_CONTROL_RENDER_TARGET_FLUSH | 325 PIPE_CONTROL_CS_STALL); 326 brw_emit_pipe_control_flush(brw, 327 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); 328 } else { 329 brw_emit_pipe_control_flush(brw, 330 PIPE_CONTROL_RENDER_TARGET_FLUSH); 331 } 332 } 333 } 334 335 void 336 brw_get_scratch_bo(struct brw_context *brw, 337 struct brw_bo **scratch_bo, int size) 338 { 339 struct brw_bo *old_bo = *scratch_bo; 340 341 if (old_bo && old_bo->size < size) { 342 brw_bo_unreference(old_bo); 343 old_bo = NULL; 344 } 345 346 if (!old_bo) { 347 *scratch_bo = brw_bo_alloc(brw->bufmgr, "scratch bo", size, 4096); 348 } 349 } 350 351 /** 352 * Reserve enough scratch space for the given stage to hold \p per_thread_size 353 * bytes times the given \p thread_count. 
354 */ 355 void 356 brw_alloc_stage_scratch(struct brw_context *brw, 357 struct brw_stage_state *stage_state, 358 unsigned per_thread_size) 359 { 360 if (stage_state->per_thread_scratch >= per_thread_size) 361 return; 362 363 stage_state->per_thread_scratch = per_thread_size; 364 365 if (stage_state->scratch_bo) 366 brw_bo_unreference(stage_state->scratch_bo); 367 368 const struct gen_device_info *devinfo = &brw->screen->devinfo; 369 unsigned thread_count; 370 switch(stage_state->stage) { 371 case MESA_SHADER_VERTEX: 372 thread_count = devinfo->max_vs_threads; 373 break; 374 case MESA_SHADER_TESS_CTRL: 375 thread_count = devinfo->max_tcs_threads; 376 break; 377 case MESA_SHADER_TESS_EVAL: 378 thread_count = devinfo->max_tes_threads; 379 break; 380 case MESA_SHADER_GEOMETRY: 381 thread_count = devinfo->max_gs_threads; 382 break; 383 case MESA_SHADER_FRAGMENT: 384 thread_count = devinfo->max_wm_threads; 385 break; 386 case MESA_SHADER_COMPUTE: { 387 unsigned subslices = MAX2(brw->screen->subslice_total, 1); 388 389 /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says: 390 * 391 * "Scratch Space per slice is computed based on 4 sub-slices. SW must 392 * allocate scratch space enough so that each slice has 4 slices 393 * allowed." 394 * 395 * According to the other driver team, this applies to compute shaders 396 * as well. This is not currently documented at all. 397 * 398 * brw->screen->subslice_total is the TOTAL number of subslices 399 * and we wish to view that there are 4 subslices per slice 400 * instead of the actual number of subslices per slice. 401 */ 402 if (devinfo->gen >= 9) 403 subslices = 4 * brw->screen->devinfo.num_slices; 404 405 unsigned scratch_ids_per_subslice; 406 if (devinfo->is_haswell) { 407 /* WaCSScratchSize:hsw 408 * 409 * Haswell's scratch space address calculation appears to be sparse 410 * rather than tightly packed. 
The Thread ID has bits indicating 411 * which subslice, EU within a subslice, and thread within an EU it 412 * is. There's a maximum of two slices and two subslices, so these 413 * can be stored with a single bit. Even though there are only 10 EUs 414 * per subslice, this is stored in 4 bits, so there's an effective 415 * maximum value of 16 EUs. Similarly, although there are only 7 416 * threads per EU, this is stored in a 3 bit number, giving an 417 * effective maximum value of 8 threads per EU. 418 * 419 * This means that we need to use 16 * 8 instead of 10 * 7 for the 420 * number of threads per subslice. 421 */ 422 scratch_ids_per_subslice = 16 * 8; 423 } else if (devinfo->is_cherryview) { 424 /* Cherryview devices have either 6 or 8 EUs per subslice, and each 425 * EU has 7 threads. The 6 EU devices appear to calculate thread IDs 426 * as if it had 8 EUs. 427 */ 428 scratch_ids_per_subslice = 8 * 7; 429 } else { 430 scratch_ids_per_subslice = devinfo->max_cs_threads; 431 } 432 433 thread_count = scratch_ids_per_subslice * subslices; 434 break; 435 } 436 default: 437 unreachable("Unsupported stage!"); 438 } 439 440 stage_state->scratch_bo = 441 brw_bo_alloc(brw->bufmgr, "shader scratch space", 442 per_thread_size * thread_count, 4096); 443 } 444 445 void brwInitFragProgFuncs( struct dd_function_table *functions ) 446 { 447 assert(functions->ProgramStringNotify == _tnl_program_string); 448 449 functions->NewProgram = brwNewProgram; 450 functions->DeleteProgram = brwDeleteProgram; 451 functions->ProgramStringNotify = brwProgramStringNotify; 452 453 functions->LinkShader = brw_link_shader; 454 455 functions->MemoryBarrier = brw_memory_barrier; 456 functions->BlendBarrier = brw_blend_barrier; 457 } 458 459 struct shader_times { 460 uint64_t time; 461 uint64_t written; 462 uint64_t reset; 463 }; 464 465 void 466 brw_init_shader_time(struct brw_context *brw) 467 { 468 const int max_entries = 2048; 469 brw->shader_time.bo = 470 brw_bo_alloc(brw->bufmgr, "shader 
time", 471 max_entries * BRW_SHADER_TIME_STRIDE * 3, 4096); 472 brw->shader_time.names = rzalloc_array(brw, const char *, max_entries); 473 brw->shader_time.ids = rzalloc_array(brw, int, max_entries); 474 brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type, 475 max_entries); 476 brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times, 477 max_entries); 478 brw->shader_time.max_entries = max_entries; 479 } 480 481 static int 482 compare_time(const void *a, const void *b) 483 { 484 uint64_t * const *a_val = a; 485 uint64_t * const *b_val = b; 486 487 /* We don't just subtract because we're turning the value to an int. */ 488 if (**a_val < **b_val) 489 return -1; 490 else if (**a_val == **b_val) 491 return 0; 492 else 493 return 1; 494 } 495 496 static void 497 print_shader_time_line(const char *stage, const char *name, 498 int shader_num, uint64_t time, uint64_t total) 499 { 500 fprintf(stderr, "%-6s%-18s", stage, name); 501 502 if (shader_num != 0) 503 fprintf(stderr, "%4d: ", shader_num); 504 else 505 fprintf(stderr, " : "); 506 507 fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n", 508 (long long)time, 509 (double)time / 1000000000.0, 510 (double)time / total * 100.0); 511 } 512 513 static void 514 brw_report_shader_time(struct brw_context *brw) 515 { 516 if (!brw->shader_time.bo || !brw->shader_time.num_entries) 517 return; 518 519 uint64_t scaled[brw->shader_time.num_entries]; 520 uint64_t *sorted[brw->shader_time.num_entries]; 521 uint64_t total_by_type[ST_CS + 1]; 522 memset(total_by_type, 0, sizeof(total_by_type)); 523 double total = 0; 524 for (int i = 0; i < brw->shader_time.num_entries; i++) { 525 uint64_t written = 0, reset = 0; 526 enum shader_time_shader_type type = brw->shader_time.types[i]; 527 528 sorted[i] = &scaled[i]; 529 530 switch (type) { 531 case ST_VS: 532 case ST_TCS: 533 case ST_TES: 534 case ST_GS: 535 case ST_FS8: 536 case ST_FS16: 537 case ST_CS: 538 written = brw->shader_time.cumulative[i].written; 539 
reset = brw->shader_time.cumulative[i].reset; 540 break; 541 542 default: 543 /* I sometimes want to print things that aren't the 3 shader times. 544 * Just print the sum in that case. 545 */ 546 written = 1; 547 reset = 0; 548 break; 549 } 550 551 uint64_t time = brw->shader_time.cumulative[i].time; 552 if (written) { 553 scaled[i] = time / written * (written + reset); 554 } else { 555 scaled[i] = time; 556 } 557 558 switch (type) { 559 case ST_VS: 560 case ST_TCS: 561 case ST_TES: 562 case ST_GS: 563 case ST_FS8: 564 case ST_FS16: 565 case ST_CS: 566 total_by_type[type] += scaled[i]; 567 break; 568 default: 569 break; 570 } 571 572 total += scaled[i]; 573 } 574 575 if (total == 0) { 576 fprintf(stderr, "No shader time collected yet\n"); 577 return; 578 } 579 580 qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time); 581 582 fprintf(stderr, "\n"); 583 fprintf(stderr, "type ID cycles spent %% of total\n"); 584 for (int s = 0; s < brw->shader_time.num_entries; s++) { 585 const char *stage; 586 /* Work back from the sorted pointers times to a time to print. 
*/ 587 int i = sorted[s] - scaled; 588 589 if (scaled[i] == 0) 590 continue; 591 592 int shader_num = brw->shader_time.ids[i]; 593 const char *shader_name = brw->shader_time.names[i]; 594 595 switch (brw->shader_time.types[i]) { 596 case ST_VS: 597 stage = "vs"; 598 break; 599 case ST_TCS: 600 stage = "tcs"; 601 break; 602 case ST_TES: 603 stage = "tes"; 604 break; 605 case ST_GS: 606 stage = "gs"; 607 break; 608 case ST_FS8: 609 stage = "fs8"; 610 break; 611 case ST_FS16: 612 stage = "fs16"; 613 break; 614 case ST_CS: 615 stage = "cs"; 616 break; 617 default: 618 stage = "other"; 619 break; 620 } 621 622 print_shader_time_line(stage, shader_name, shader_num, 623 scaled[i], total); 624 } 625 626 fprintf(stderr, "\n"); 627 print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total); 628 print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total); 629 print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total); 630 print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total); 631 print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total); 632 print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total); 633 print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total); 634 } 635 636 static void 637 brw_collect_shader_time(struct brw_context *brw) 638 { 639 if (!brw->shader_time.bo) 640 return; 641 642 /* This probably stalls on the last rendering. We could fix that by 643 * delaying reading the reports, but it doesn't look like it's a big 644 * overhead compared to the cost of tracking the time in the first place. 
645 */ 646 void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE); 647 648 for (int i = 0; i < brw->shader_time.num_entries; i++) { 649 uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE; 650 651 brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4]; 652 brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4]; 653 brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4]; 654 } 655 656 /* Zero the BO out to clear it out for our next collection. 657 */ 658 memset(bo_map, 0, brw->shader_time.bo->size); 659 brw_bo_unmap(brw->shader_time.bo); 660 } 661 662 void 663 brw_collect_and_report_shader_time(struct brw_context *brw) 664 { 665 brw_collect_shader_time(brw); 666 667 if (brw->shader_time.report_time == 0 || 668 get_time() - brw->shader_time.report_time >= 1.0) { 669 brw_report_shader_time(brw); 670 brw->shader_time.report_time = get_time(); 671 } 672 } 673 674 /** 675 * Chooses an index in the shader_time buffer and sets up tracking information 676 * for our printouts. 677 * 678 * Note that this holds on to references to the underlying programs, which may 679 * change their lifetimes compared to normal operation. 680 */ 681 int 682 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog, 683 enum shader_time_shader_type type, bool is_glsl_sh) 684 { 685 int shader_time_index = brw->shader_time.num_entries++; 686 assert(shader_time_index < brw->shader_time.max_entries); 687 brw->shader_time.types[shader_time_index] = type; 688 689 const char *name; 690 if (prog->Id == 0) { 691 name = "ff"; 692 } else if (is_glsl_sh) { 693 name = prog->info.label ? 
694 ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl"; 695 } else { 696 name = "prog"; 697 } 698 699 brw->shader_time.names[shader_time_index] = name; 700 brw->shader_time.ids[shader_time_index] = prog->Id; 701 702 return shader_time_index; 703 } 704 705 void 706 brw_destroy_shader_time(struct brw_context *brw) 707 { 708 brw_bo_unreference(brw->shader_time.bo); 709 brw->shader_time.bo = NULL; 710 } 711 712 void 713 brw_stage_prog_data_free(const void *p) 714 { 715 struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p; 716 717 ralloc_free(prog_data->param); 718 ralloc_free(prog_data->pull_param); 719 } 720 721 void 722 brw_dump_arb_asm(const char *stage, struct gl_program *prog) 723 { 724 fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n", 725 stage, prog->Id, stage); 726 _mesa_print_program(prog); 727 } 728 729 void 730 brw_setup_tex_for_precompile(struct brw_context *brw, 731 struct brw_sampler_prog_key_data *tex, 732 struct gl_program *prog) 733 { 734 const struct gen_device_info *devinfo = &brw->screen->devinfo; 735 const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8; 736 unsigned sampler_count = util_last_bit(prog->SamplersUsed); 737 for (unsigned i = 0; i < sampler_count; i++) { 738 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) { 739 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */ 740 tex->swizzles[i] = 741 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE); 742 } else { 743 /* Color sampler: assume no swizzling. */ 744 tex->swizzles[i] = SWIZZLE_XYZW; 745 } 746 } 747 } 748 749 /** 750 * Sets up the starting offsets for the groups of binding table entries 751 * common to all pipeline stages. 752 * 753 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're 754 * unused but also make sure that addition of small offsets to them will 755 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES. 
756 */ 757 uint32_t 758 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo, 759 const struct gl_program *prog, 760 struct brw_stage_prog_data *stage_prog_data, 761 uint32_t next_binding_table_offset) 762 { 763 int num_textures = util_last_bit(prog->SamplersUsed); 764 765 stage_prog_data->binding_table.texture_start = next_binding_table_offset; 766 next_binding_table_offset += num_textures; 767 768 if (prog->info.num_ubos) { 769 assert(prog->info.num_ubos <= BRW_MAX_UBO); 770 stage_prog_data->binding_table.ubo_start = next_binding_table_offset; 771 next_binding_table_offset += prog->info.num_ubos; 772 } else { 773 stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0; 774 } 775 776 if (prog->info.num_ssbos || prog->info.num_abos) { 777 assert(prog->info.num_abos <= BRW_MAX_ABO); 778 assert(prog->info.num_ssbos <= BRW_MAX_SSBO); 779 stage_prog_data->binding_table.ssbo_start = next_binding_table_offset; 780 next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos; 781 } else { 782 stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0; 783 } 784 785 if (INTEL_DEBUG & DEBUG_SHADER_TIME) { 786 stage_prog_data->binding_table.shader_time_start = next_binding_table_offset; 787 next_binding_table_offset++; 788 } else { 789 stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0; 790 } 791 792 if (prog->info.uses_texture_gather) { 793 if (devinfo->gen >= 8) { 794 stage_prog_data->binding_table.gather_texture_start = 795 stage_prog_data->binding_table.texture_start; 796 } else { 797 stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset; 798 next_binding_table_offset += num_textures; 799 } 800 } else { 801 stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0; 802 } 803 804 if (prog->info.num_images) { 805 stage_prog_data->binding_table.image_start = next_binding_table_offset; 806 next_binding_table_offset += prog->info.num_images; 807 } else { 808 stage_prog_data->binding_table.image_start = 
0xd0d0d0d0; 809 } 810 811 /* This may or may not be used depending on how the compile goes. */ 812 stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset; 813 next_binding_table_offset++; 814 815 /* Plane 0 is just the regular texture section */ 816 stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start; 817 818 stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset; 819 next_binding_table_offset += num_textures; 820 821 stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset; 822 next_binding_table_offset += num_textures; 823 824 /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */ 825 826 assert(next_binding_table_offset <= BRW_MAX_SURFACES); 827 return next_binding_table_offset; 828 } 829 830 void 831 brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog) 832 { 833 struct blob writer; 834 blob_init(&writer); 835 nir_serialize(&writer, prog->nir); 836 prog->driver_cache_blob = ralloc_size(NULL, writer.size); 837 memcpy(prog->driver_cache_blob, writer.data, writer.size); 838 prog->driver_cache_blob_size = writer.size; 839 blob_finish(&writer); 840 } 841 842 void 843 brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog, 844 gl_shader_stage stage) 845 { 846 if (!prog->nir) { 847 assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0); 848 const struct nir_shader_compiler_options *options = 849 ctx->Const.ShaderCompilerOptions[stage].NirOptions; 850 struct blob_reader reader; 851 blob_reader_init(&reader, prog->driver_cache_blob, 852 prog->driver_cache_blob_size); 853 prog->nir = nir_deserialize(NULL, options, &reader); 854 } 855 856 if (prog->driver_cache_blob) { 857 ralloc_free(prog->driver_cache_blob); 858 prog->driver_cache_blob = NULL; 859 prog->driver_cache_blob_size = 0; 860 } 861 } 862