/*
 * Copyright 2010 - 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_COMPILER_H
#define BRW_COMPILER_H

/* Shared data structures and entry points for the i965 shader compiler:
 * per-stage program keys, per-stage prog_data results, and the
 * brw_compile_*() functions that produce final assembly.
 */

#include <stdio.h>
#include "common/gen_device_info.h"
#include "main/macros.h"
#include "util/ralloc.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ra_regs;
struct nir_shader;
struct brw_program;

/** Compiler context shared by all shader compilations for one device. */
struct brw_compiler {
   const struct gen_device_info *devinfo;

   /* Register sets for the vec4 back-end's register allocator. */
   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used.
       */
      int *classes;

      /**
       * Mapping for register-allocated objects in *regs to the first
       * GRF for that object.
       */
      uint8_t *ra_reg_to_grf;
   } vec4_reg_set;

   /* Register sets for the scalar (FS) back-end's register allocator. */
   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      int classes[16];

      /**
       * Mapping from classes to ra_reg ranges.  Each of the per-size
       * classes corresponds to a range of ra_reg nodes.  This array stores
       * those ranges in the form of first ra_reg in each class and the
       * total number of ra_reg elements in the last array element.  This
       * way the range of the i'th class is given by:
       * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
       */
      int class_to_ra_reg_range[17];

      /**
       * Mapping for register-allocated objects in *regs to the first
       * GRF for that object.
       */
      uint8_t *ra_reg_to_grf;

      /**
       * ra class for the aligned pairs we use for PLN, which doesn't
       * appear in *classes.
       */
      int aligned_pairs_class;
   } fs_reg_sets[3];

   /* Driver-supplied logging callbacks; first argument is the opaque
    * log_data pointer passed to the brw_compile_*() entry points.
    */
   void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
   void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);

   bool scalar_stage[MESA_SHADER_STAGES];
   struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State
    * Base Address?  (If not, it's a normal GPU address.)
    */
   bool constant_buffer_0_is_relative;

   /**
    * Whether or not the driver supports pull constants.  If not, the compiler
    * will attempt to push everything.
    */
   bool supports_pull_constants;
};


/**
 * Program key structures.
 *
 * When drawing, we look for the currently bound shaders in the program
 * cache.  This is essentially a hash table lookup, and these are the keys.
 *
 * Sometimes OpenGL features specified as state need to be simulated via
 * shader code, due to a mismatch between the API and the hardware.  This
 * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
 * in the program key so it's considered when searching for a program.  If
 * we haven't seen a particular combination before, we have to recompile a
 * new specialized version.
 *
 * Shader compilation should not look up state in gl_context directly, but
 * instead use the copy in the program key.  This guarantees recompiles will
 * happen correctly.
 *
 * @{
 */

enum PACKED gen6_gather_sampler_wa {
   WA_SIGN = 1,   /* whether we need to sign extend */
   WA_8BIT = 2,   /* if we have an 8bit format needing wa */
   WA_16BIT = 4,  /* if we have a 16bit format needing wa */
};

/**
 * Sampler information needed by VS, WM, and GS program cache keys.
 */
struct brw_sampler_prog_key_data {
   /**
    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
    */
   uint16_t swizzles[MAX_SAMPLERS];

   uint32_t gl_clamp_mask[3];

   /**
    * For RG32F, gather4's channel select is broken.
    */
   uint32_t gather_channel_quirk_mask;

   /**
    * Whether this sampler uses the compressed multisample surface layout.
    */
   uint32_t compressed_multisample_layout_mask;

   /**
    * Whether this sampler is using 16x multisampling.  If so fetching from
    * this sampler will be handled with a different instruction, ld2dms_w
    * instead of ld2dms.
    */
   uint32_t msaa_16;

   /**
    * For Sandybridge, which shader w/a we need for gather quirks.
    */
   enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];

   /**
    * Texture units that have a YUV image bound.
    */
   uint32_t y_u_v_image_mask;
   uint32_t y_uv_image_mask;
   uint32_t yx_xuxv_image_mask;
   uint32_t xy_uxvx_image_mask;
};

/**
 * The VF can't natively handle certain types of attributes, such as GL_FIXED
 * or most 10_10_10_2 types.  These flags enable various VS workarounds to
 * "fix" attributes at the beginning of shaders.
 */
#define BRW_ATTRIB_WA_COMPONENT_MASK    7  /* mask for GL_FIXED scale channel count */
#define BRW_ATTRIB_WA_NORMALIZE     8  /* normalize in shader */
#define BRW_ATTRIB_WA_BGRA          16 /* swap r/b channels in shader */
#define BRW_ATTRIB_WA_SIGN          32 /* interpret as signed in shader */
#define BRW_ATTRIB_WA_SCALE         64 /* interpret as scaled in shader */

/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes.  In Vulkan, we expose up to 28 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB     VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB     (VERT_ATTRIB_GENERIC0 + 28)

/** The program key for Vertex Shaders. */
struct brw_vs_prog_key {
   unsigned program_string_id;

   /**
    * Per-attribute workaround flags
    *
    * For each attribute, a combination of BRW_ATTRIB_WA_*.
    *
    * For OpenGL, where we expose a maximum of 16 user input attributes
    * we only need up to VERT_ATTRIB_MAX slots, however, in Vulkan
    * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can
    * expose up to 28 user input vertex attributes that are mapped to slots
    * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large
    * enough to hold this many slots.
    */
   uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)];

   bool copy_edgeflag:1;

   bool clamp_vertex_color:1;

   /**
    * How many user clipping planes are being uploaded to the vertex shader as
    * push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;

   /**
    * For pre-Gen6 hardware, a bitfield indicating which texture coordinates
    * are going to be replaced with point coordinates (as a consequence of a
    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
    * our SF thread requires exact matching between VS outputs and FS inputs,
    * these texture coordinates will need to be unconditionally included in
    * the VUE, even if they aren't written by the vertex shader.
    */
   uint8_t point_coord_replace;

   struct brw_sampler_prog_key_data tex;
};

/** The program key for Tessellation Control Shaders. */
struct brw_tcs_prog_key
{
   unsigned program_string_id;

   GLenum tes_primitive_mode;

   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   bool quads_workaround;

   struct brw_sampler_prog_key_data tex;
};

/** The program key for Tessellation Evaluation Shaders. */
struct brw_tes_prog_key
{
   unsigned program_string_id;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   struct brw_sampler_prog_key_data tex;
};

/** The program key for Geometry Shaders.
 */
struct brw_gs_prog_key
{
   unsigned program_string_id;

   struct brw_sampler_prog_key_data tex;
};

enum brw_sf_primitive {
   BRW_SF_PRIM_POINTS = 0,
   BRW_SF_PRIM_LINES = 1,
   BRW_SF_PRIM_TRIANGLES = 2,
   BRW_SF_PRIM_UNFILLED_TRIS = 3,
};

/** The program key for the fixed-function SF (strips & fans) program. */
struct brw_sf_prog_key {
   uint64_t attrs;
   bool contains_flat_varying;
   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
   uint8_t point_sprite_coord_replace;
   enum brw_sf_primitive primitive:2;
   bool do_twoside_color:1;
   bool frontface_ccw:1;
   bool do_point_sprite:1;
   bool do_point_coord:1;
   bool sprite_origin_lower_left:1;
   bool userclip_active:1;
};

enum brw_clip_mode {
   BRW_CLIP_MODE_NORMAL             = 0,
   BRW_CLIP_MODE_CLIP_ALL           = 1,
   BRW_CLIP_MODE_CLIP_NON_REJECTED  = 2,
   BRW_CLIP_MODE_REJECT_ALL         = 3,
   BRW_CLIP_MODE_ACCEPT_ALL         = 4,
   BRW_CLIP_MODE_KERNEL_CLIP        = 5,
};

enum brw_clip_fill_mode {
   BRW_CLIP_FILL_MODE_LINE = 0,
   BRW_CLIP_FILL_MODE_POINT = 1,
   BRW_CLIP_FILL_MODE_FILL = 2,
   BRW_CLIP_FILL_MODE_CULL = 3,
};

/* Note that if unfilled primitives are being emitted, we have to fix
 * up polygon offset and flatshading at this point:
 */
struct brw_clip_prog_key {
   uint64_t attrs;
   bool contains_flat_varying;
   bool contains_noperspective_varying;
   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
   unsigned primitive:4;
   unsigned nr_userclip:4;
   bool pv_first:1;
   bool do_unfilled:1;
   enum brw_clip_fill_mode fill_cw:2;  /* includes cull information */
   enum brw_clip_fill_mode fill_ccw:2; /* includes cull information */
   bool offset_cw:1;
   bool offset_ccw:1;
   bool copy_bfc_cw:1;
   bool copy_bfc_ccw:1;
   enum brw_clip_mode clip_mode:3;

   float offset_factor;
   float offset_units;
   float offset_clamp;
};

/* A big lookup table is used to figure out which and how many
 * additional regs will be inserted before the main payload in the WM
 * program execution.  These mainly relate to depth and stencil
 * processing and the early-depth-test optimization.
 */
enum brw_wm_iz_bits {
   BRW_WM_IZ_PS_KILL_ALPHATEST_BIT     = 0x1,
   BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT     = 0x2,
   BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT    = 0x4,
   BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT     = 0x8,
   BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT  = 0x10,
   BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT   = 0x20,
   BRW_WM_IZ_BIT_MAX                   = 0x40
};

enum brw_wm_aa_enable {
   BRW_WM_AA_NEVER,
   BRW_WM_AA_SOMETIMES,
   BRW_WM_AA_ALWAYS
};

/** The program key for Fragment/Pixel Shaders. */
struct brw_wm_prog_key {
   /* Some collection of BRW_WM_IZ_* */
   uint8_t iz_lookup;
   bool stats_wm:1;
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool replicate_alpha:1;
   bool clamp_fragment_color:1;
   bool persample_interp:1;
   bool multisample_fbo:1;
   bool frag_coord_adds_sample_pos:1;
   enum brw_wm_aa_enable line_aa:2;
   bool high_quality_derivatives:1;
   bool force_dual_color_blend:1;
   bool coherent_fb_fetch:1;

   uint64_t input_slots_valid;
   unsigned program_string_id;
   GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */
   float alpha_test_ref;

   struct brw_sampler_prog_key_data tex;
};

/** The program key for Compute Shaders. */
struct brw_cs_prog_key {
   uint32_t program_string_id;
   struct brw_sampler_prog_key_data tex;
};

/* brw_any_prog_key is any of the keys that map to an API stage */
union brw_any_prog_key {
   struct brw_vs_prog_key vs;
   struct brw_tcs_prog_key tcs;
   struct brw_tes_prog_key tes;
   struct brw_gs_prog_key gs;
   struct brw_wm_prog_key wm;
   struct brw_cs_prog_key cs;
};

/*
 * Image metadata structure as laid out in the shader parameter
 * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
 * able to use them.
 * That's okay because the padding and any unused
 * entries [most of them except when we're doing untyped surface
 * access] will be removed by the uniform packing pass.
 */
#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
#define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
#define BRW_IMAGE_PARAM_SIZE_OFFSET             8
#define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
#define BRW_IMAGE_PARAM_TILING_OFFSET           16
#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
#define BRW_IMAGE_PARAM_SIZE                    24

struct brw_image_param {
   /** Surface binding table index. */
   uint32_t surface_idx;

   /** Offset applied to the X and Y surface coordinates. */
   uint32_t offset[2];

   /** Surface X, Y and Z dimensions. */
   uint32_t size[3];

   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
    * pixels, vertical slice stride in pixels.
    */
   uint32_t stride[4];

   /** Log2 of the tiling modulus in the X, Y and Z dimension. */
   uint32_t tiling[3];

   /**
    * Right shift to apply for bit 6 address swizzling.  Two different
    * swizzles can be specified and will be applied one after the other.  The
    * resulting address will be:
    *
    *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
    *                              (addr >> swizzling[1])))
    *
    * Use \c 0xff if any of the swizzles is not required.
    */
   uint32_t swizzling[2];
};

/** Max number of render targets in a shader */
#define BRW_MAX_DRAW_BUFFERS 8

/**
 * Max number of binding table entries used for stream output.
 *
 * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
 * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
 *
 * On Gen6, the size of transform feedback data is limited not by the number
 * of components but by the number of binding table entries we set aside.  We
 * use one binding table entry for a float, one entry for a vector, and one
 * entry per matrix column.  Since the only way we can communicate our
 * transform feedback capabilities to the client is via
 * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
 * worst case, in which all the varyings are floats, so we use up one binding
 * table entry per component.  Therefore we need to set aside at least 64
 * binding table entries for use by transform feedback.
 *
 * Note: since we don't currently pack varyings, it is currently impossible
 * for the client to actually use up all of these binding table entries--if
 * all of their varyings were floats, they would run out of varying slots and
 * fail to link.  But that's a bug, so it seems prudent to go ahead and
 * allocate the number of binding table entries we will need once the bug is
 * fixed.
 */
#define BRW_MAX_SOL_BINDINGS 64

/**
 * Binding table index for the first gen6 SOL binding.
 */
#define BRW_GEN6_SOL_BINDING_START 0

/**
 * Stride in bytes between shader_time entries.
 *
 * We separate entries by a cacheline to reduce traffic between EUs writing to
 * different entries.
 */
#define BRW_SHADER_TIME_STRIDE 64

/** A contiguous range of one UBO that gets pushed as constants. */
struct brw_ubo_range
{
   uint16_t block;
   uint8_t start;
   uint8_t length;
};

/* We reserve the first 2^16 values for builtins */
#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

enum brw_param_builtin {
   BRW_PARAM_BUILTIN_ZERO,

   BRW_PARAM_BUILTIN_CLIP_PLANE_0_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_W,

   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   BRW_PARAM_BUILTIN_SUBGROUP_ID,
};

/* Compose/decompose clip-plane builtin values; the _X..._W enum values for
 * plane N are laid out contiguously, four components per plane.
 */
#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param)         \
   ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X &&     \
    (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)

/** Compilation results common to every shader stage. */
struct brw_stage_prog_data {
   struct {
      /** size of our binding table. */
      uint32_t size_bytes;

      /** @{
       * surface indices for the various groups of surfaces
       */
      uint32_t pull_constants_start;
      uint32_t texture_start;
      uint32_t gather_texture_start;
      uint32_t ubo_start;
      uint32_t ssbo_start;
      uint32_t image_start;
      uint32_t shader_time_start;
      uint32_t plane_start[3];
      /** @} */
   } binding_table;

   struct brw_ubo_range ubo_ranges[4];

   GLuint nr_params;       /**< number of float params/constants */
   GLuint nr_pull_params;

   unsigned curb_read_length;
   unsigned total_scratch;
   unsigned total_shared;

   unsigned program_size;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */

   /* 32-bit identifiers for all push/pull parameters.  These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them.  The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the brw_param_builtin enum defined
    * above.
    */
   uint32_t *param;
   uint32_t *pull_param;
};

/**
 * Grow prog_data->param by nr_new_params entries and return a pointer to
 * the first newly added slot.  The array is reallocated in place with the
 * same ralloc parent; the new entries are uninitialized.
 */
static inline uint32_t *
brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data,
                               unsigned nr_new_params)
{
   unsigned old_nr_params = prog_data->nr_params;
   prog_data->nr_params += nr_new_params;
   prog_data->param = reralloc(ralloc_parent(prog_data->param),
                               prog_data->param, uint32_t,
                               prog_data->nr_params);
   return prog_data->param + old_nr_params;
}

/**
 * Record that binding table entry surf_index is in use, growing the
 * recorded binding table size if needed (4 bytes per entry).
 */
static inline void
brw_mark_surface_used(struct brw_stage_prog_data *prog_data,
                      unsigned surf_index)
{
   /* A binding table index is 8 bits and the top 3 values are reserved for
    * special things (stateless and SLM).
    */
   assert(surf_index <= 252);

   prog_data->binding_table.size_bytes =
      MAX2(prog_data->binding_table.size_bytes, (surf_index + 1) * 4);
}

enum brw_barycentric_mode {
   BRW_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   BRW_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   BRW_BARYCENTRIC_MODE_COUNT              = 6
};
#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
   ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))

enum brw_pixel_shader_computed_depth_mode {
   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};

/* Data about a particular attempt to compile a program.
 * Note that there can be many of these, each in a different GL state
 * corresponding to a different brw_wm_prog_key struct, with different
 * compiled programs.
 */
struct brw_wm_prog_data {
   struct brw_stage_prog_data base;

   GLuint num_varying_inputs;

   uint8_t reg_blocks_0;
   uint8_t reg_blocks_2;

   uint8_t dispatch_grf_start_reg_2;
   uint32_t prog_offset_2;

   struct {
      /** @{
       * surface indices the WM-specific surfaces
       */
      uint32_t render_target_read_start;
      /** @} */
   } binding_table;

   uint8_t computed_depth_mode;
   bool computed_stencil;

   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   bool dispatch_8;
   bool dispatch_16;
   bool dual_src_blend;
   bool persample_dispatch;
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_sample_mask;
   bool has_render_target_reads;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Used in hardware setup on gen6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source.  This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /* Mapping of VUE slots to interpolation modes.
    * Used by the Gen4-5 clip/sf/wm stages.
    */
   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];
};

struct brw_push_const_block {
   unsigned dwords; /* Dword count, not reg aligned */
   unsigned regs;
   unsigned size; /* Bytes, register aligned */
};

struct brw_cs_prog_data {
   struct brw_stage_prog_data base;

   unsigned local_size[3];
   unsigned simd_size;
   unsigned threads;
   bool uses_barrier;
   bool uses_num_work_groups;

   struct {
      struct brw_push_const_block cross_thread;
      struct brw_push_const_block per_thread;
      struct brw_push_const_block total;
   } push;

   struct {
      /** @{
       * surface indices the CS-specific surfaces
       */
      uint32_t work_groups_start;
      /** @} */
   } binding_table;
};

/**
 * Enum representing the i965-specific vertex results that don't correspond
 * exactly to any element of gl_varying_slot.  The values of this enum are
 * assigned such that they don't conflict with gl_varying_slot.
 */
typedef enum
{
   BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
   BRW_VARYING_SLOT_PAD,
   /**
    * Technically this is not a varying but just a placeholder that
    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
    * builtin variable to be compiled correctly.  see compile_sf_prog() for
    * more info.
    */
   BRW_VARYING_SLOT_PNTC,
   BRW_VARYING_SLOT_COUNT
} brw_varying_slot;

/**
 * We always program SF to start reading at an offset of 1 (2 varying slots)
 * from the start of the vertex URB entry.  This causes it to skip:
 * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
 * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gen6+
 */
#define BRW_SF_URB_ENTRY_READ_OFFSET 1

/**
 * Bitmask indicating which fragment shader inputs represent varyings (and
 * hence have to be delivered to the fragment shader by the SF/SBE stage).
 */
#define BRW_FS_VARYING_INPUT_MASK \
   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)

/**
 * Data structure recording the relationship between the gl_varying_slot enum
 * and "slots" within the vertex URB entry (VUE).  A "slot" is defined as a
 * single octaword within the VUE (128 bits).
 *
 * Note that each BRW register contains 256 bits (2 octawords), so when
 * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
 * consecutive VUE slots.  When accessing the VUE in URB_INTERLEAVED mode (as
 * in a vertex shader), each register corresponds to a single VUE slot, since
 * it contains data for two separate vertices.
 */
struct brw_vue_map {
   /**
    * Bitfield representing all varying slots that are (a) stored in this VUE
    * map, and (b) actually written by the shader.  Does not include any of
    * the additional varying slots defined in brw_varying_slot.
    */
   uint64_t slots_valid;

   /**
    * Is this VUE map for a separate shader pipeline?
    *
    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
    * without the linker having a chance to dead code eliminate unused varyings.
    *
    * This means that we have to use a fixed slot layout, based on the output's
    * location field, rather than assigning slots in a compact contiguous block.
    */
   bool separate;

   /**
    * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
    * not stored in a slot (because they are not written, or because
    * additional processing is applied before storing them in the VUE), the
    * value is -1.
    */
   signed char varying_to_slot[VARYING_SLOT_TESS_MAX];

   /**
    * Map from VUE slot to gl_varying_slot value.  For slots that do not
    * directly correspond to a gl_varying_slot, the value comes from
    * brw_varying_slot.
    *
    * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
    */
   signed char slot_to_varying[VARYING_SLOT_TESS_MAX];

   /**
    * Total number of VUE slots in use
    */
   int num_slots;

   /**
    * Number of per-patch VUE slots.  Only valid for tessellation control
    * shader outputs and tessellation evaluation shader inputs.
    */
   int num_per_patch_slots;

   /**
    * Number of per-vertex VUE slots.  Only valid for tessellation control
    * shader outputs and tessellation evaluation shader inputs.
    */
   int num_per_vertex_slots;
};

void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);

/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */
static inline GLuint brw_vue_slot_to_offset(GLuint slot)
{
   return 16*slot;
}

/**
 * Convert a vertex output (brw_varying_slot) into a byte offset within the
 * VUE.
 */
static inline
GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
{
   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
}

void brw_compute_vue_map(const struct gen_device_info *devinfo,
                         struct brw_vue_map *vue_map,
                         uint64_t slots_valid,
                         bool separate_shader);

void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map,
                              uint64_t slots_valid,
                              uint32_t is_patch);

/* brw_interpolation_map.c */
void brw_setup_vue_interpolation(struct brw_vue_map *vue_map,
                                 struct nir_shader *nir,
                                 struct brw_wm_prog_data *prog_data,
                                 const struct gen_device_info *devinfo);

enum shader_dispatch_mode {
   DISPATCH_MODE_4X1_SINGLE = 0,
   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
   DISPATCH_MODE_SIMD8 = 3,
};

/**
 * @defgroup Tessellator parameter enumerations.
910 * 911 * These correspond to the hardware values in 3DSTATE_TE, and are provided 912 * as part of the tessellation evaluation shader. 913 * 914 * @{ 915 */ 916 enum brw_tess_partitioning { 917 BRW_TESS_PARTITIONING_INTEGER = 0, 918 BRW_TESS_PARTITIONING_ODD_FRACTIONAL = 1, 919 BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2, 920 }; 921 922 enum brw_tess_output_topology { 923 BRW_TESS_OUTPUT_TOPOLOGY_POINT = 0, 924 BRW_TESS_OUTPUT_TOPOLOGY_LINE = 1, 925 BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2, 926 BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3, 927 }; 928 929 enum brw_tess_domain { 930 BRW_TESS_DOMAIN_QUAD = 0, 931 BRW_TESS_DOMAIN_TRI = 1, 932 BRW_TESS_DOMAIN_ISOLINE = 2, 933 }; 934 /** @} */ 935 936 struct brw_vue_prog_data { 937 struct brw_stage_prog_data base; 938 struct brw_vue_map vue_map; 939 940 /** Should the hardware deliver input VUE handles for URB pull loads? */ 941 bool include_vue_handles; 942 943 GLuint urb_read_length; 944 GLuint total_grf; 945 946 uint32_t clip_distance_mask; 947 uint32_t cull_distance_mask; 948 949 /* Used for calculating urb partitions. In the VS, this is the size of the 950 * URB entry used for both input and output to the thread. In the GS, this 951 * is the size of the URB entry used for output. 
952 */ 953 GLuint urb_entry_size; 954 955 enum shader_dispatch_mode dispatch_mode; 956 }; 957 958 struct brw_vs_prog_data { 959 struct brw_vue_prog_data base; 960 961 GLbitfield64 inputs_read; 962 GLbitfield64 double_inputs_read; 963 964 unsigned nr_attribute_slots; 965 966 bool uses_vertexid; 967 bool uses_instanceid; 968 bool uses_basevertex; 969 bool uses_baseinstance; 970 bool uses_drawid; 971 }; 972 973 struct brw_tcs_prog_data 974 { 975 struct brw_vue_prog_data base; 976 977 /** Number vertices in output patch */ 978 int instances; 979 }; 980 981 982 struct brw_tes_prog_data 983 { 984 struct brw_vue_prog_data base; 985 986 enum brw_tess_partitioning partitioning; 987 enum brw_tess_output_topology output_topology; 988 enum brw_tess_domain domain; 989 }; 990 991 struct brw_gs_prog_data 992 { 993 struct brw_vue_prog_data base; 994 995 unsigned vertices_in; 996 997 /** 998 * Size of an output vertex, measured in HWORDS (32 bytes). 999 */ 1000 unsigned output_vertex_size_hwords; 1001 1002 unsigned output_topology; 1003 1004 /** 1005 * Size of the control data (cut bits or StreamID bits), in hwords (32 1006 * bytes). 0 if there is no control data. 1007 */ 1008 unsigned control_data_header_size_hwords; 1009 1010 /** 1011 * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 1012 * if the control data is StreamID bits, or 1013 * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). 1014 * Ignored if control_data_header_size is 0. 1015 */ 1016 unsigned control_data_format; 1017 1018 bool include_primitive_id; 1019 1020 /** 1021 * The number of vertices emitted, if constant - otherwise -1. 1022 */ 1023 int static_vertex_count; 1024 1025 int invocations; 1026 1027 /** 1028 * Gen6: Provoking vertex convention for odd-numbered triangles 1029 * in tristrips. 1030 */ 1031 GLuint pv_first:1; 1032 1033 /** 1034 * Gen6: Number of varyings that are output to transform feedback. 
    */
   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */

   /**
    * Gen6: Map from the index of a transform feedback binding table entry
    * to the gl_varying_slot that should be streamed out through that
    * binding table entry.
    */
   unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */];

   /**
    * Gen6: Map from the index of a transform feedback binding table entry
    * to the swizzles that should be used when streaming out data through
    * that binding table entry.
    */
   unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
};

/** Program data for the fixed-function strips-and-fans (SF) shader. */
struct brw_sf_prog_data {
   uint32_t urb_read_length;
   uint32_t total_grf;

   /* Each vertex may have up to 12 attributes, 4 components each,
    * except WPOS which requires only 2.  (11*4 + 2) == 44 ==> 11
    * rows.
    *
    * Actually we use 4 for each, so call it 12 rows.
    */
   unsigned urb_entry_size;
};

/** Program data for the fixed-function clipper shader. */
struct brw_clip_prog_data {
   uint32_t curb_read_length; /* user planes?
                               */
   uint32_t clip_mode;
   uint32_t urb_read_length;
   uint32_t total_grf;
};

/* brw_any_prog_data is prog_data for any stage that maps to an API stage */
union brw_any_prog_data {
   struct brw_stage_prog_data base;
   struct brw_vue_prog_data vue;
   struct brw_vs_prog_data vs;
   struct brw_tcs_prog_data tcs;
   struct brw_tes_prog_data tes;
   struct brw_gs_prog_data gs;
   struct brw_wm_prog_data wm;
   struct brw_cs_prog_data cs;
};

/* Generate a downcast helper brw_<stage>_prog_data() that converts a
 * brw_stage_prog_data pointer to the stage-specific prog_data struct
 * (which embeds the base struct as its first member).
 */
#define DEFINE_PROG_DATA_DOWNCAST(stage)                       \
static inline struct brw_##stage##_prog_data *                 \
brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \
{                                                              \
   return (struct brw_##stage##_prog_data *) prog_data;        \
}
DEFINE_PROG_DATA_DOWNCAST(vue)
DEFINE_PROG_DATA_DOWNCAST(vs)
DEFINE_PROG_DATA_DOWNCAST(tcs)
DEFINE_PROG_DATA_DOWNCAST(tes)
DEFINE_PROG_DATA_DOWNCAST(gs)
DEFINE_PROG_DATA_DOWNCAST(wm)
DEFINE_PROG_DATA_DOWNCAST(cs)
DEFINE_PROG_DATA_DOWNCAST(ff_gs)
DEFINE_PROG_DATA_DOWNCAST(clip)
DEFINE_PROG_DATA_DOWNCAST(sf)
#undef DEFINE_PROG_DATA_DOWNCAST

/** @} */

/** Create a compiler context for the given device, allocated from mem_ctx. */
struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);

/* Size in bytes of the prog_data struct for the given stage. */
unsigned
brw_prog_data_size(gl_shader_stage stage);

/* Size in bytes of the prog_key struct for the given stage. */
unsigned
brw_prog_key_size(gl_shader_stage stage);

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_vs_prog_key *key,
               struct brw_vs_prog_data *prog_data,
               const struct nir_shader *shader,
               int shader_time_index,
               char **error_str);

/**
 * Compile a tessellation control shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                void *log_data,
                void *mem_ctx,
                const struct brw_tcs_prog_key *key,
                struct brw_tcs_prog_data *prog_data,
                const struct nir_shader *nir,
                int shader_time_index,
                char **error_str);

/**
 * Compile a tessellation evaluation shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_tes(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
                const struct brw_tes_prog_key *key,
                const struct brw_vue_map *input_vue_map,
                struct brw_tes_prog_data *prog_data,
                const struct nir_shader *shader,
                struct gl_program *prog,
                int shader_time_index,
                char **error_str);

/**
 * Compile a geometry shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_gs_prog_key *key,
               struct brw_gs_prog_data *prog_data,
               const struct nir_shader *shader,
               struct gl_program *prog,
               int shader_time_index,
               char **error_str);

/**
 * Compile a strips and fans shader.
 *
 * This is a fixed-function shader determined entirely by the shader key and
 * a VUE map.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_sf(const struct brw_compiler *compiler,
               void *mem_ctx,
               const struct brw_sf_prog_key *key,
               struct brw_sf_prog_data *prog_data,
               struct brw_vue_map *vue_map,
               unsigned *final_assembly_size);

/**
 * Compile a clipper shader.
 *
 * This is a fixed-function shader determined entirely by the shader key and
 * a VUE map.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_clip(const struct brw_compiler *compiler,
                 void *mem_ctx,
                 const struct brw_clip_prog_key *key,
                 struct brw_clip_prog_data *prog_data,
                 struct brw_vue_map *vue_map,
                 unsigned *final_assembly_size);

/**
 * Compile a fragment shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               const struct nir_shader *shader,
               struct gl_program *prog,
               int shader_time_index8,
               int shader_time_index16,
               bool allow_spilling,
               bool use_rep_send, struct brw_vue_map *vue_map,
               char **error_str);

/**
 * Compile a compute shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_cs_prog_key *key,
               struct brw_cs_prog_data *prog_data,
               const struct nir_shader *shader,
               int shader_time_index,
               char **error_str);

/**
 * Encode a Shared Local Memory size in bytes into the hardware
 * representation used by INTERFACE_DESCRIPTOR_DATA for the given gen.
 */
static inline uint32_t
encode_slm_size(unsigned gen, uint32_t bytes)
{
   uint32_t slm_size = 0;

   /* Shared Local Memory is specified as powers of two, and encoded in
    * INTERFACE_DESCRIPTOR_DATA with the following representations:
    *
    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
    * -------------------------------------------------------------------
    * Gen7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
    * -------------------------------------------------------------------
    * Gen9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
    */
   assert(bytes <= 64 * 1024);

   if (bytes > 0) {
      /* Shared Local Memory Size is specified as powers of two.
       */
      slm_size = util_next_power_of_two(bytes);

      if (gen >= 9) {
         /* Use a minimum of 1kB; turn an exponent of 10 (1024 B) into 1. */
         slm_size = ffs(MAX2(slm_size, 1024)) - 10;
      } else {
         /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */
         slm_size = MAX2(slm_size, 4096) / 4096;
      }
   }

   return slm_size;
}

/**
 * Return true if the given shader stage is dispatched contiguously by the
 * relevant fixed function starting from channel 0 of the SIMD thread, which
 * implies that the dispatch mask of a thread can be assumed to have the form
 * '2^n - 1' for some n.
 */
static inline bool
brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo,
                              gl_shader_stage stage,
                              const struct brw_stage_prog_data *prog_data)
{
   /* The code below makes assumptions about the hardware's thread dispatch
    * behavior that could be proven wrong in future generations -- Make sure
    * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
    * the NIR front-end before changing this assertion.
    */
   assert(devinfo->gen <= 10);

   switch (stage) {
   case MESA_SHADER_FRAGMENT: {
      /* The PSD discards subspans coming in with no lit samples, which in the
       * per-pixel shading case implies that each subspan will either be fully
       * lit (due to the VMask being used to allow derivative computations),
       * or not dispatched at all.  In per-sample dispatch mode individual
       * samples from the same subspan have a fixed relative location within
       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
       * general and we should return false.
       */
      const struct brw_wm_prog_data *wm_prog_data =
         (const struct brw_wm_prog_data *)prog_data;
      return !wm_prog_data->persample_dispatch;
   }
   case MESA_SHADER_COMPUTE:
      /* Compute shaders will be spawned with either a fully enabled dispatch
       * mask or with whatever bottom/right execution mask was given to the
       * GPGPU walker command to be used along the workgroup edges -- In both
       * cases the dispatch mask is required to be tightly packed for our
       * invocation index calculations to work.
       */
      return true;
   default:
      /* Most remaining fixed functions are limited to use a packed dispatch
       * mask due to the hardware representation of the dispatch mask as a
       * single counter representing the number of enabled channels.
       */
      return true;
   }
}

/**
 * Computes the first varying slot in the URB produced by the previous stage
 * that is used in the next stage.  We do this by testing the varying slots
 * in the previous stage's vue map against the inputs read in the next stage.
 *
 * Note that:
 *
 * - Each URB offset contains two varying slots and we can only skip a
 *   full offset if both slots are unused, so the value we return here is
 *   always rounded down to the closest multiple of two.
 *
 * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they
 *   are part of the vue header, so if these are read we can't skip anything.
 */
static inline int
brw_compute_first_urb_slot_required(uint64_t inputs_read,
                                    const struct brw_vue_map *prev_stage_vue_map)
{
   /* gl_Layer and gl_ViewportIndex live in the VUE header, so if either is
    * read nothing can be skipped and we must return 0.
    */
   if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) == 0) {
      for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
         int varying = prev_stage_vue_map->slot_to_varying[i];
         if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0)
            /* Each URB offset holds two slots; round down to a full offset. */
            return ROUND_DOWN_TO(i, 2);
      }
   }

   return 0;
}

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* BRW_COMPILER_H */