1 /* 2 * Copyright 2010 - 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24 #pragma once 25 26 #include <stdio.h> 27 #include "common/gen_device_info.h" 28 #include "main/mtypes.h" 29 #include "main/macros.h" 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 struct ra_regs; 36 struct nir_shader; 37 struct brw_program; 38 union gl_constant_value; 39 40 struct brw_compiler { 41 const struct gen_device_info *devinfo; 42 43 struct { 44 struct ra_regs *regs; 45 46 /** 47 * Array of the ra classes for the unaligned contiguous register 48 * block sizes used. 49 */ 50 int *classes; 51 52 /** 53 * Mapping for register-allocated objects in *regs to the first 54 * GRF for that object. 55 */ 56 uint8_t *ra_reg_to_grf; 57 } vec4_reg_set; 58 59 struct { 60 struct ra_regs *regs; 61 62 /** 63 * Array of the ra classes for the unaligned contiguous register 64 * block sizes used, indexed by register size. 65 */ 66 int classes[16]; 67 68 /** 69 * Mapping from classes to ra_reg ranges. Each of the per-size 70 * classes corresponds to a range of ra_reg nodes. This array stores 71 * those ranges in the form of first ra_reg in each class and the 72 * total number of ra_reg elements in the last array element. This 73 * way the range of the i'th class is given by: 74 * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] ) 75 */ 76 int class_to_ra_reg_range[17]; 77 78 /** 79 * Mapping for register-allocated objects in *regs to the first 80 * GRF for that object. 81 */ 82 uint8_t *ra_reg_to_grf; 83 84 /** 85 * ra class for the aligned pairs we use for PLN, which doesn't 86 * appear in *classes. 87 */ 88 int aligned_pairs_class; 89 } fs_reg_sets[3]; 90 91 void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); 92 void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3); 93 94 bool scalar_stage[MESA_SHADER_STAGES]; 95 struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; 96 97 /** 98 * Apply workarounds for SIN and COS output range problems. 99 * This can negatively impact performance. 100 */ 101 bool precise_trig; 102 }; 103 104 105 /** 106 * Program key structures. 107 * 108 * When drawing, we look for the currently bound shaders in the program 109 * cache. This is essentially a hash table lookup, and these are the keys. 110 * 111 * Sometimes OpenGL features specified as state need to be simulated via 112 * shader code, due to a mismatch between the API and the hardware. This 113 * is often referred to as "non-orthagonal state" or "NOS". We store NOS 114 * in the program key so it's considered when searching for a program. If 115 * we haven't seen a particular combination before, we have to recompile a 116 * new specialized version. 117 * 118 * Shader compilation should not look up state in gl_context directly, but 119 * instead use the copy in the program key. This guarantees recompiles will 120 * happen correctly. 121 * 122 * @{ 123 */ 124 125 enum PACKED gen6_gather_sampler_wa { 126 WA_SIGN = 1, /* whether we need to sign extend */ 127 WA_8BIT = 2, /* if we have an 8bit format needing wa */ 128 WA_16BIT = 4, /* if we have a 16bit format needing wa */ 129 }; 130 131 /** 132 * Sampler information needed by VS, WM, and GS program cache keys. 133 */ 134 struct brw_sampler_prog_key_data { 135 /** 136 * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles. 137 */ 138 uint16_t swizzles[MAX_SAMPLERS]; 139 140 uint32_t gl_clamp_mask[3]; 141 142 /** 143 * For RG32F, gather4's channel select is broken. 144 */ 145 uint32_t gather_channel_quirk_mask; 146 147 /** 148 * Whether this sampler uses the compressed multisample surface layout. 149 */ 150 uint32_t compressed_multisample_layout_mask; 151 152 /** 153 * Whether this sampler is using 16x multisampling. If so fetching from 154 * this sampler will be handled with a different instruction, ld2dms_w 155 * instead of ld2dms. 156 */ 157 uint32_t msaa_16; 158 159 /** 160 * For Sandybridge, which shader w/a we need for gather quirks. 161 */ 162 enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; 163 164 /** 165 * Texture units that have a YUV image bound. 166 */ 167 uint32_t y_u_v_image_mask; 168 uint32_t y_uv_image_mask; 169 uint32_t yx_xuxv_image_mask; 170 }; 171 172 173 /** The program key for Vertex Shaders. */ 174 struct brw_vs_prog_key { 175 unsigned program_string_id; 176 177 /* 178 * Per-attribute workaround flags 179 */ 180 uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX]; 181 182 bool copy_edgeflag:1; 183 184 bool clamp_vertex_color:1; 185 186 /** 187 * How many user clipping planes are being uploaded to the vertex shader as 188 * push constants. 189 * 190 * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to 191 * clip distances. 192 */ 193 unsigned nr_userclip_plane_consts:4; 194 195 /** 196 * For pre-Gen6 hardware, a bitfield indicating which texture coordinates 197 * are going to be replaced with point coordinates (as a consequence of a 198 * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because 199 * our SF thread requires exact matching between VS outputs and FS inputs, 200 * these texture coordinates will need to be unconditionally included in 201 * the VUE, even if they aren't written by the vertex shader. 202 */ 203 uint8_t point_coord_replace; 204 205 struct brw_sampler_prog_key_data tex; 206 }; 207 208 /** The program key for Tessellation Control Shaders. */ 209 struct brw_tcs_prog_key 210 { 211 unsigned program_string_id; 212 213 GLenum tes_primitive_mode; 214 215 unsigned input_vertices; 216 217 /** A bitfield of per-patch outputs written. */ 218 uint32_t patch_outputs_written; 219 220 /** A bitfield of per-vertex outputs written. */ 221 uint64_t outputs_written; 222 223 bool quads_workaround; 224 225 struct brw_sampler_prog_key_data tex; 226 }; 227 228 /** The program key for Tessellation Evaluation Shaders. */ 229 struct brw_tes_prog_key 230 { 231 unsigned program_string_id; 232 233 /** A bitfield of per-patch inputs read. */ 234 uint32_t patch_inputs_read; 235 236 /** A bitfield of per-vertex inputs read. */ 237 uint64_t inputs_read; 238 239 struct brw_sampler_prog_key_data tex; 240 }; 241 242 /** The program key for Geometry Shaders. */ 243 struct brw_gs_prog_key 244 { 245 unsigned program_string_id; 246 247 struct brw_sampler_prog_key_data tex; 248 }; 249 250 /** The program key for Fragment/Pixel Shaders. */ 251 struct brw_wm_prog_key { 252 uint8_t iz_lookup; 253 bool stats_wm:1; 254 bool flat_shade:1; 255 unsigned nr_color_regions:5; 256 bool replicate_alpha:1; 257 bool clamp_fragment_color:1; 258 bool persample_interp:1; 259 bool multisample_fbo:1; 260 unsigned line_aa:2; 261 bool high_quality_derivatives:1; 262 bool force_dual_color_blend:1; 263 bool coherent_fb_fetch:1; 264 265 uint16_t drawable_height; 266 uint64_t input_slots_valid; 267 unsigned program_string_id; 268 GLenum alpha_test_func; /* < For Gen4/5 MRT alpha test */ 269 float alpha_test_ref; 270 271 struct brw_sampler_prog_key_data tex; 272 }; 273 274 struct brw_cs_prog_key { 275 uint32_t program_string_id; 276 struct brw_sampler_prog_key_data tex; 277 }; 278 279 /* 280 * Image metadata structure as laid out in the shader parameter 281 * buffer. Entries have to be 16B-aligned for the vec4 back-end to be 282 * able to use them. That's okay because the padding and any unused 283 * entries [most of them except when we're doing untyped surface 284 * access] will be removed by the uniform packing pass. 285 */ 286 #define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET 0 287 #define BRW_IMAGE_PARAM_OFFSET_OFFSET 4 288 #define BRW_IMAGE_PARAM_SIZE_OFFSET 8 289 #define BRW_IMAGE_PARAM_STRIDE_OFFSET 12 290 #define BRW_IMAGE_PARAM_TILING_OFFSET 16 291 #define BRW_IMAGE_PARAM_SWIZZLING_OFFSET 20 292 #define BRW_IMAGE_PARAM_SIZE 24 293 294 struct brw_image_param { 295 /** Surface binding table index. */ 296 uint32_t surface_idx; 297 298 /** Offset applied to the X and Y surface coordinates. */ 299 uint32_t offset[2]; 300 301 /** Surface X, Y and Z dimensions. */ 302 uint32_t size[3]; 303 304 /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in 305 * pixels, vertical slice stride in pixels. 306 */ 307 uint32_t stride[4]; 308 309 /** Log2 of the tiling modulus in the X, Y and Z dimension. */ 310 uint32_t tiling[3]; 311 312 /** 313 * Right shift to apply for bit 6 address swizzling. Two different 314 * swizzles can be specified and will be applied one after the other. The 315 * resulting address will be: 316 * 317 * addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^ 318 * (addr >> swizzling[1]))) 319 * 320 * Use \c 0xff if any of the swizzles is not required. 321 */ 322 uint32_t swizzling[2]; 323 }; 324 325 struct brw_stage_prog_data { 326 struct { 327 /** size of our binding table. */ 328 uint32_t size_bytes; 329 330 /** @{ 331 * surface indices for the various groups of surfaces 332 */ 333 uint32_t pull_constants_start; 334 uint32_t texture_start; 335 uint32_t gather_texture_start; 336 uint32_t ubo_start; 337 uint32_t ssbo_start; 338 uint32_t abo_start; 339 uint32_t image_start; 340 uint32_t shader_time_start; 341 uint32_t plane_start[3]; 342 /** @} */ 343 } binding_table; 344 345 GLuint nr_params; /**< number of float params/constants */ 346 GLuint nr_pull_params; 347 unsigned nr_image_params; 348 349 unsigned curb_read_length; 350 unsigned total_scratch; 351 unsigned total_shared; 352 353 /** 354 * Register where the thread expects to find input data from the URB 355 * (typically uniforms, followed by vertex or fragment attributes). 356 */ 357 unsigned dispatch_grf_start_reg; 358 359 bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */ 360 361 /* Pointers to tracked values (only valid once 362 * _mesa_load_state_parameters has been called at runtime). 363 */ 364 const union gl_constant_value **param; 365 const union gl_constant_value **pull_param; 366 367 /** Image metadata passed to the shader as uniforms. */ 368 struct brw_image_param *image_param; 369 }; 370 371 /* Data about a particular attempt to compile a program. Note that 372 * there can be many of these, each in a different GL state 373 * corresponding to a different brw_wm_prog_key struct, with different 374 * compiled programs. 375 */ 376 struct brw_wm_prog_data { 377 struct brw_stage_prog_data base; 378 379 GLuint num_varying_inputs; 380 381 uint8_t reg_blocks_0; 382 uint8_t reg_blocks_2; 383 384 uint8_t dispatch_grf_start_reg_2; 385 uint32_t prog_offset_2; 386 387 struct { 388 /** @{ 389 * surface indices the WM-specific surfaces 390 */ 391 uint32_t render_target_start; 392 uint32_t render_target_read_start; 393 /** @} */ 394 } binding_table; 395 396 uint8_t computed_depth_mode; 397 bool computed_stencil; 398 399 bool early_fragment_tests; 400 bool post_depth_coverage; 401 bool inner_coverage; 402 bool dispatch_8; 403 bool dispatch_16; 404 bool dual_src_blend; 405 bool persample_dispatch; 406 bool uses_pos_offset; 407 bool uses_omask; 408 bool uses_kill; 409 bool uses_src_depth; 410 bool uses_src_w; 411 bool uses_sample_mask; 412 bool has_side_effects; 413 bool pulls_bary; 414 415 bool contains_flat_varying; 416 bool contains_noperspective_varying; 417 418 /** 419 * Mask of which interpolation modes are required by the fragment shader. 420 * Used in hardware setup on gen6+. 421 */ 422 uint32_t barycentric_interp_modes; 423 424 /** 425 * Mask of which FS inputs are marked flat by the shader source. This is 426 * needed for setting up 3DSTATE_SF/SBE. 427 */ 428 uint32_t flat_inputs; 429 430 /* Mapping of VUE slots to interpolation modes. 431 * Used by the Gen4-5 clip/sf/wm stages. 432 */ 433 unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */ 434 435 /** 436 * Map from gl_varying_slot to the position within the FS setup data 437 * payload where the varying's attribute vertex deltas should be delivered. 438 * For varying slots that are not used by the FS, the value is -1. 439 */ 440 int urb_setup[VARYING_SLOT_MAX]; 441 }; 442 443 struct brw_push_const_block { 444 unsigned dwords; /* Dword count, not reg aligned */ 445 unsigned regs; 446 unsigned size; /* Bytes, register aligned */ 447 }; 448 449 struct brw_cs_prog_data { 450 struct brw_stage_prog_data base; 451 452 GLuint dispatch_grf_start_reg_16; 453 unsigned local_size[3]; 454 unsigned simd_size; 455 unsigned threads; 456 bool uses_barrier; 457 bool uses_num_work_groups; 458 int thread_local_id_index; 459 460 struct { 461 struct brw_push_const_block cross_thread; 462 struct brw_push_const_block per_thread; 463 struct brw_push_const_block total; 464 } push; 465 466 struct { 467 /** @{ 468 * surface indices the CS-specific surfaces 469 */ 470 uint32_t work_groups_start; 471 /** @} */ 472 } binding_table; 473 }; 474 475 /** 476 * Enum representing the i965-specific vertex results that don't correspond 477 * exactly to any element of gl_varying_slot. The values of this enum are 478 * assigned such that they don't conflict with gl_varying_slot. 479 */ 480 typedef enum 481 { 482 BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX, 483 BRW_VARYING_SLOT_PAD, 484 /** 485 * Technically this is not a varying but just a placeholder that 486 * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord 487 * builtin variable to be compiled correctly. see compile_sf_prog() for 488 * more info. 489 */ 490 BRW_VARYING_SLOT_PNTC, 491 BRW_VARYING_SLOT_COUNT 492 } brw_varying_slot; 493 494 /** 495 * Data structure recording the relationship between the gl_varying_slot enum 496 * and "slots" within the vertex URB entry (VUE). A "slot" is defined as a 497 * single octaword within the VUE (128 bits). 498 * 499 * Note that each BRW register contains 256 bits (2 octawords), so when 500 * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two 501 * consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as 502 * in a vertex shader), each register corresponds to a single VUE slot, since 503 * it contains data for two separate vertices. 504 */ 505 struct brw_vue_map { 506 /** 507 * Bitfield representing all varying slots that are (a) stored in this VUE 508 * map, and (b) actually written by the shader. Does not include any of 509 * the additional varying slots defined in brw_varying_slot. 510 */ 511 GLbitfield64 slots_valid; 512 513 /** 514 * Is this VUE map for a separate shader pipeline? 515 * 516 * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched 517 * without the linker having a chance to dead code eliminate unused varyings. 518 * 519 * This means that we have to use a fixed slot layout, based on the output's 520 * location field, rather than assigning slots in a compact contiguous block. 521 */ 522 bool separate; 523 524 /** 525 * Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are 526 * not stored in a slot (because they are not written, or because 527 * additional processing is applied before storing them in the VUE), the 528 * value is -1. 529 */ 530 signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; 531 532 /** 533 * Map from VUE slot to gl_varying_slot value. For slots that do not 534 * directly correspond to a gl_varying_slot, the value comes from 535 * brw_varying_slot. 536 * 537 * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. 538 */ 539 signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; 540 541 /** 542 * Total number of VUE slots in use 543 */ 544 int num_slots; 545 546 /** 547 * Number of per-patch VUE slots. Only valid for tessellation control 548 * shader outputs and tessellation evaluation shader inputs. 549 */ 550 int num_per_patch_slots; 551 552 /** 553 * Number of per-vertex VUE slots. Only valid for tessellation control 554 * shader outputs and tessellation evaluation shader inputs. 555 */ 556 int num_per_vertex_slots; 557 }; 558 559 void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map); 560 561 /** 562 * Convert a VUE slot number into a byte offset within the VUE. 563 */ 564 static inline GLuint brw_vue_slot_to_offset(GLuint slot) 565 { 566 return 16*slot; 567 } 568 569 /** 570 * Convert a vertex output (brw_varying_slot) into a byte offset within the 571 * VUE. 572 */ 573 static inline 574 GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying) 575 { 576 return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]); 577 } 578 579 void brw_compute_vue_map(const struct gen_device_info *devinfo, 580 struct brw_vue_map *vue_map, 581 GLbitfield64 slots_valid, 582 bool separate_shader); 583 584 void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map, 585 const GLbitfield64 slots_valid, 586 const GLbitfield is_patch); 587 588 /* brw_interpolation_map.c */ 589 void brw_setup_vue_interpolation(struct brw_vue_map *vue_map, 590 struct nir_shader *nir, 591 struct brw_wm_prog_data *prog_data, 592 const struct gen_device_info *devinfo); 593 594 enum shader_dispatch_mode { 595 DISPATCH_MODE_4X1_SINGLE = 0, 596 DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, 597 DISPATCH_MODE_4X2_DUAL_OBJECT = 2, 598 DISPATCH_MODE_SIMD8 = 3, 599 }; 600 601 /** 602 * @defgroup Tessellator parameter enumerations. 603 * 604 * These correspond to the hardware values in 3DSTATE_TE, and are provided 605 * as part of the tessellation evaluation shader. 606 * 607 * @{ 608 */ 609 enum brw_tess_partitioning { 610 BRW_TESS_PARTITIONING_INTEGER = 0, 611 BRW_TESS_PARTITIONING_ODD_FRACTIONAL = 1, 612 BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2, 613 }; 614 615 enum brw_tess_output_topology { 616 BRW_TESS_OUTPUT_TOPOLOGY_POINT = 0, 617 BRW_TESS_OUTPUT_TOPOLOGY_LINE = 1, 618 BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2, 619 BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3, 620 }; 621 622 enum brw_tess_domain { 623 BRW_TESS_DOMAIN_QUAD = 0, 624 BRW_TESS_DOMAIN_TRI = 1, 625 BRW_TESS_DOMAIN_ISOLINE = 2, 626 }; 627 /** @} */ 628 629 struct brw_vue_prog_data { 630 struct brw_stage_prog_data base; 631 struct brw_vue_map vue_map; 632 633 /** Should the hardware deliver input VUE handles for URB pull loads? */ 634 bool include_vue_handles; 635 636 GLuint urb_read_length; 637 GLuint total_grf; 638 639 uint32_t clip_distance_mask; 640 uint32_t cull_distance_mask; 641 642 /* Used for calculating urb partitions. In the VS, this is the size of the 643 * URB entry used for both input and output to the thread. In the GS, this 644 * is the size of the URB entry used for output. 645 */ 646 GLuint urb_entry_size; 647 648 enum shader_dispatch_mode dispatch_mode; 649 }; 650 651 struct brw_vs_prog_data { 652 struct brw_vue_prog_data base; 653 654 GLbitfield64 inputs_read; 655 GLbitfield64 double_inputs_read; 656 657 unsigned nr_attributes; 658 unsigned nr_attribute_slots; 659 660 bool uses_vertexid; 661 bool uses_instanceid; 662 bool uses_basevertex; 663 bool uses_baseinstance; 664 bool uses_drawid; 665 }; 666 667 struct brw_tcs_prog_data 668 { 669 struct brw_vue_prog_data base; 670 671 /** Number vertices in output patch */ 672 int instances; 673 }; 674 675 676 struct brw_tes_prog_data 677 { 678 struct brw_vue_prog_data base; 679 680 enum brw_tess_partitioning partitioning; 681 enum brw_tess_output_topology output_topology; 682 enum brw_tess_domain domain; 683 }; 684 685 struct brw_gs_prog_data 686 { 687 struct brw_vue_prog_data base; 688 689 unsigned vertices_in; 690 691 /** 692 * Size of an output vertex, measured in HWORDS (32 bytes). 693 */ 694 unsigned output_vertex_size_hwords; 695 696 unsigned output_topology; 697 698 /** 699 * Size of the control data (cut bits or StreamID bits), in hwords (32 700 * bytes). 0 if there is no control data. 701 */ 702 unsigned control_data_header_size_hwords; 703 704 /** 705 * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID 706 * if the control data is StreamID bits, or 707 * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits). 708 * Ignored if control_data_header_size is 0. 709 */ 710 unsigned control_data_format; 711 712 bool include_primitive_id; 713 714 /** 715 * The number of vertices emitted, if constant - otherwise -1. 716 */ 717 int static_vertex_count; 718 719 int invocations; 720 721 /** 722 * Gen6: Provoking vertex convention for odd-numbered triangles 723 * in tristrips. 724 */ 725 GLuint pv_first:1; 726 727 /** 728 * Gen6: Number of varyings that are output to transform feedback. 729 */ 730 GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */ 731 732 /** 733 * Gen6: Map from the index of a transform feedback binding table entry to the 734 * gl_varying_slot that should be streamed out through that binding table 735 * entry. 736 */ 737 unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */]; 738 739 /** 740 * Gen6: Map from the index of a transform feedback binding table entry to the 741 * swizzles that should be used when streaming out data through that 742 * binding table entry. 743 */ 744 unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */]; 745 }; 746 747 #define DEFINE_PROG_DATA_DOWNCAST(stage) \ 748 static inline struct brw_##stage##_prog_data * \ 749 brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \ 750 { \ 751 return (struct brw_##stage##_prog_data *) prog_data; \ 752 } 753 DEFINE_PROG_DATA_DOWNCAST(vue) 754 DEFINE_PROG_DATA_DOWNCAST(vs) 755 DEFINE_PROG_DATA_DOWNCAST(tcs) 756 DEFINE_PROG_DATA_DOWNCAST(tes) 757 DEFINE_PROG_DATA_DOWNCAST(gs) 758 DEFINE_PROG_DATA_DOWNCAST(wm) 759 DEFINE_PROG_DATA_DOWNCAST(cs) 760 DEFINE_PROG_DATA_DOWNCAST(ff_gs) 761 DEFINE_PROG_DATA_DOWNCAST(clip) 762 DEFINE_PROG_DATA_DOWNCAST(sf) 763 #undef DEFINE_PROG_DATA_DOWNCAST 764 765 /** @} */ 766 767 struct brw_compiler * 768 brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo); 769 770 /** 771 * Compile a vertex shader. 772 * 773 * Returns the final assembly and the program's size. 774 */ 775 const unsigned * 776 brw_compile_vs(const struct brw_compiler *compiler, void *log_data, 777 void *mem_ctx, 778 const struct brw_vs_prog_key *key, 779 struct brw_vs_prog_data *prog_data, 780 const struct nir_shader *shader, 781 gl_clip_plane *clip_planes, 782 bool use_legacy_snorm_formula, 783 int shader_time_index, 784 unsigned *final_assembly_size, 785 char **error_str); 786 787 /** 788 * Compile a tessellation control shader. 789 * 790 * Returns the final assembly and the program's size. 791 */ 792 const unsigned * 793 brw_compile_tcs(const struct brw_compiler *compiler, 794 void *log_data, 795 void *mem_ctx, 796 const struct brw_tcs_prog_key *key, 797 struct brw_tcs_prog_data *prog_data, 798 const struct nir_shader *nir, 799 int shader_time_index, 800 unsigned *final_assembly_size, 801 char **error_str); 802 803 /** 804 * Compile a tessellation evaluation shader. 805 * 806 * Returns the final assembly and the program's size. 807 */ 808 const unsigned * 809 brw_compile_tes(const struct brw_compiler *compiler, void *log_data, 810 void *mem_ctx, 811 const struct brw_tes_prog_key *key, 812 const struct brw_vue_map *input_vue_map, 813 struct brw_tes_prog_data *prog_data, 814 const struct nir_shader *shader, 815 struct gl_program *prog, 816 int shader_time_index, 817 unsigned *final_assembly_size, 818 char **error_str); 819 820 /** 821 * Compile a vertex shader. 822 * 823 * Returns the final assembly and the program's size. 824 */ 825 const unsigned * 826 brw_compile_gs(const struct brw_compiler *compiler, void *log_data, 827 void *mem_ctx, 828 const struct brw_gs_prog_key *key, 829 struct brw_gs_prog_data *prog_data, 830 const struct nir_shader *shader, 831 struct gl_program *prog, 832 int shader_time_index, 833 unsigned *final_assembly_size, 834 char **error_str); 835 836 /** 837 * Compile a fragment shader. 838 * 839 * Returns the final assembly and the program's size. 840 */ 841 const unsigned * 842 brw_compile_fs(const struct brw_compiler *compiler, void *log_data, 843 void *mem_ctx, 844 const struct brw_wm_prog_key *key, 845 struct brw_wm_prog_data *prog_data, 846 const struct nir_shader *shader, 847 struct gl_program *prog, 848 int shader_time_index8, 849 int shader_time_index16, 850 bool allow_spilling, 851 bool use_rep_send, struct brw_vue_map *vue_map, 852 unsigned *final_assembly_size, 853 char **error_str); 854 855 /** 856 * Compile a compute shader. 857 * 858 * Returns the final assembly and the program's size. 859 */ 860 const unsigned * 861 brw_compile_cs(const struct brw_compiler *compiler, void *log_data, 862 void *mem_ctx, 863 const struct brw_cs_prog_key *key, 864 struct brw_cs_prog_data *prog_data, 865 const struct nir_shader *shader, 866 int shader_time_index, 867 unsigned *final_assembly_size, 868 char **error_str); 869 870 static inline uint32_t 871 encode_slm_size(unsigned gen, uint32_t bytes) 872 { 873 uint32_t slm_size = 0; 874 875 /* Shared Local Memory is specified as powers of two, and encoded in 876 * INTERFACE_DESCRIPTOR_DATA with the following representations: 877 * 878 * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | 879 * ------------------------------------------------------------------- 880 * Gen7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | 881 * ------------------------------------------------------------------- 882 * Gen9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 883 */ 884 assert(bytes <= 64 * 1024); 885 886 if (bytes > 0) { 887 /* Shared Local Memory Size is specified as powers of two. */ 888 slm_size = util_next_power_of_two(bytes); 889 890 if (gen >= 9) { 891 /* Use a minimum of 1kB; turn an exponent of 10 (1024 kB) into 1. */ 892 slm_size = ffs(MAX2(slm_size, 1024)) - 10; 893 } else { 894 /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */ 895 slm_size = MAX2(slm_size, 4096) / 4096; 896 } 897 } 898 899 return slm_size; 900 } 901 902 /** 903 * Return true if the given shader stage is dispatched contiguously by the 904 * relevant fixed function starting from channel 0 of the SIMD thread, which 905 * implies that the dispatch mask of a thread can be assumed to have the form 906 * '2^n - 1' for some n. 907 */ 908 static inline bool 909 brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo, 910 gl_shader_stage stage, 911 const struct brw_stage_prog_data *prog_data) 912 { 913 /* The code below makes assumptions about the hardware's thread dispatch 914 * behavior that could be proven wrong in future generations -- Make sure 915 * to do a full test run with brw_fs_test_dispatch_packing() hooked up to 916 * the NIR front-end before changing this assertion. 917 */ 918 assert(devinfo->gen <= 9); 919 920 switch (stage) { 921 case MESA_SHADER_FRAGMENT: { 922 /* The PSD discards subspans coming in with no lit samples, which in the 923 * per-pixel shading case implies that each subspan will either be fully 924 * lit (due to the VMask being used to allow derivative computations), 925 * or not dispatched at all. In per-sample dispatch mode individual 926 * samples from the same subspan have a fixed relative location within 927 * the SIMD thread, so dispatch of unlit samples cannot be avoided in 928 * general and we should return false. 929 */ 930 const struct brw_wm_prog_data *wm_prog_data = 931 (const struct brw_wm_prog_data *)prog_data; 932 return !wm_prog_data->persample_dispatch; 933 } 934 case MESA_SHADER_COMPUTE: 935 /* Compute shaders will be spawned with either a fully enabled dispatch 936 * mask or with whatever bottom/right execution mask was given to the 937 * GPGPU walker command to be used along the workgroup edges -- In both 938 * cases the dispatch mask is required to be tightly packed for our 939 * invocation index calculations to work. 940 */ 941 return true; 942 default: 943 /* Most remaining fixed functions are limited to use a packed dispatch 944 * mask due to the hardware representation of the dispatch mask as a 945 * single counter representing the number of enabled channels. 946 */ 947 return true; 948 } 949 } 950 951 #ifdef __cplusplus 952 } /* extern "C" */ 953 #endif 954