/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */ 23 24 #include "gallivm/lp_bld_const.h" 25 #include "gallivm/lp_bld_gather.h" 26 #include "gallivm/lp_bld_intr.h" 27 #include "gallivm/lp_bld_logic.h" 28 #include "gallivm/lp_bld_arit.h" 29 #include "gallivm/lp_bld_flow.h" 30 #include "gallivm/lp_bld_misc.h" 31 #include "util/u_memory.h" 32 #include "util/u_string.h" 33 #include "tgsi/tgsi_build.h" 34 #include "tgsi/tgsi_util.h" 35 #include "tgsi/tgsi_dump.h" 36 37 #include "ac_binary.h" 38 #include "ac_llvm_util.h" 39 #include "ac_exp_param.h" 40 #include "ac_shader_util.h" 41 #include "si_shader_internal.h" 42 #include "si_pipe.h" 43 #include "sid.h" 44 45 #include "compiler/nir/nir.h" 46 47 static const char *scratch_rsrc_dword0_symbol = 48 "SCRATCH_RSRC_DWORD0"; 49 50 static const char *scratch_rsrc_dword1_symbol = 51 "SCRATCH_RSRC_DWORD1"; 52 53 struct si_shader_output_values 54 { 55 LLVMValueRef values[4]; 56 unsigned semantic_name; 57 unsigned semantic_index; 58 ubyte vertex_stream[4]; 59 }; 60 61 /** 62 * Used to collect types and other info about arguments of the LLVM function 63 * before the function is created. 
64 */ 65 struct si_function_info { 66 LLVMTypeRef types[100]; 67 LLVMValueRef *assign[100]; 68 unsigned num_sgpr_params; 69 unsigned num_params; 70 }; 71 72 enum si_arg_regfile { 73 ARG_SGPR, 74 ARG_VGPR 75 }; 76 77 static void si_init_shader_ctx(struct si_shader_context *ctx, 78 struct si_screen *sscreen, 79 LLVMTargetMachineRef tm); 80 81 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 82 struct lp_build_tgsi_context *bld_base, 83 struct lp_build_emit_data *emit_data); 84 85 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader, 86 FILE *f); 87 88 static void si_build_vs_prolog_function(struct si_shader_context *ctx, 89 union si_shader_part_key *key); 90 static void si_build_tcs_epilog_function(struct si_shader_context *ctx, 91 union si_shader_part_key *key); 92 static void si_build_ps_prolog_function(struct si_shader_context *ctx, 93 union si_shader_part_key *key); 94 static void si_build_ps_epilog_function(struct si_shader_context *ctx, 95 union si_shader_part_key *key); 96 97 /* Ideally pass the sample mask input to the PS epilog as v14, which 98 * is its usual location, so that the shader doesn't have to add v_mov. 
99 */ 100 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 101 102 enum { 103 CONST_ADDR_SPACE = 2, 104 LOCAL_ADDR_SPACE = 3, 105 }; 106 107 static bool llvm_type_is_64bit(struct si_shader_context *ctx, 108 LLVMTypeRef type) 109 { 110 if (type == ctx->ac.i64 || type == ctx->ac.f64) 111 return true; 112 113 return false; 114 } 115 116 static bool is_merged_shader(struct si_shader *shader) 117 { 118 if (shader->selector->screen->info.chip_class <= VI) 119 return false; 120 121 return shader->key.as_ls || 122 shader->key.as_es || 123 shader->selector->type == PIPE_SHADER_TESS_CTRL || 124 shader->selector->type == PIPE_SHADER_GEOMETRY; 125 } 126 127 static void si_init_function_info(struct si_function_info *fninfo) 128 { 129 fninfo->num_params = 0; 130 fninfo->num_sgpr_params = 0; 131 } 132 133 static unsigned add_arg_assign(struct si_function_info *fninfo, 134 enum si_arg_regfile regfile, LLVMTypeRef type, 135 LLVMValueRef *assign) 136 { 137 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params); 138 139 unsigned idx = fninfo->num_params++; 140 assert(idx < ARRAY_SIZE(fninfo->types)); 141 142 if (regfile == ARG_SGPR) 143 fninfo->num_sgpr_params = fninfo->num_params; 144 145 fninfo->types[idx] = type; 146 fninfo->assign[idx] = assign; 147 return idx; 148 } 149 150 static unsigned add_arg(struct si_function_info *fninfo, 151 enum si_arg_regfile regfile, LLVMTypeRef type) 152 { 153 return add_arg_assign(fninfo, regfile, type, NULL); 154 } 155 156 static void add_arg_assign_checked(struct si_function_info *fninfo, 157 enum si_arg_regfile regfile, LLVMTypeRef type, 158 LLVMValueRef *assign, unsigned idx) 159 { 160 MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign); 161 assert(actual == idx); 162 } 163 164 static void add_arg_checked(struct si_function_info *fninfo, 165 enum si_arg_regfile regfile, LLVMTypeRef type, 166 unsigned idx) 167 { 168 add_arg_assign_checked(fninfo, regfile, type, NULL, idx); 169 } 170 171 /** 172 * Returns a 
unique index for a per-patch semantic name and index. The index 173 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs 174 * can be calculated. 175 */ 176 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index) 177 { 178 switch (semantic_name) { 179 case TGSI_SEMANTIC_TESSOUTER: 180 return 0; 181 case TGSI_SEMANTIC_TESSINNER: 182 return 1; 183 case TGSI_SEMANTIC_PATCH: 184 assert(index < 30); 185 return 2 + index; 186 187 default: 188 assert(!"invalid semantic name"); 189 return 0; 190 } 191 } 192 193 /** 194 * Returns a unique index for a semantic name and index. The index must be 195 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be 196 * calculated. 197 */ 198 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) 199 { 200 switch (semantic_name) { 201 case TGSI_SEMANTIC_POSITION: 202 return 0; 203 case TGSI_SEMANTIC_GENERIC: 204 /* Since some shader stages use the the highest used IO index 205 * to determine the size to allocate for inputs/outputs 206 * (in LDS, tess and GS rings). GENERIC should be placed right 207 * after POSITION to make that size as small as possible. 
208 */ 209 if (index < SI_MAX_IO_GENERIC) 210 return 1 + index; 211 212 assert(!"invalid generic index"); 213 return 0; 214 case TGSI_SEMANTIC_PSIZE: 215 return SI_MAX_IO_GENERIC + 1; 216 case TGSI_SEMANTIC_CLIPDIST: 217 assert(index <= 1); 218 return SI_MAX_IO_GENERIC + 2 + index; 219 case TGSI_SEMANTIC_FOG: 220 return SI_MAX_IO_GENERIC + 4; 221 case TGSI_SEMANTIC_LAYER: 222 return SI_MAX_IO_GENERIC + 5; 223 case TGSI_SEMANTIC_VIEWPORT_INDEX: 224 return SI_MAX_IO_GENERIC + 6; 225 case TGSI_SEMANTIC_PRIMID: 226 return SI_MAX_IO_GENERIC + 7; 227 case TGSI_SEMANTIC_COLOR: /* these alias */ 228 case TGSI_SEMANTIC_BCOLOR: 229 assert(index < 2); 230 return SI_MAX_IO_GENERIC + 8 + index; 231 case TGSI_SEMANTIC_TEXCOORD: 232 assert(index < 8); 233 assert(SI_MAX_IO_GENERIC + 10 + index < 64); 234 return SI_MAX_IO_GENERIC + 10 + index; 235 default: 236 assert(!"invalid semantic name"); 237 return 0; 238 } 239 } 240 241 /** 242 * Get the value of a shader input parameter and extract a bitfield. 243 */ 244 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, 245 LLVMValueRef value, unsigned rshift, 246 unsigned bitwidth) 247 { 248 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) 249 value = ac_to_integer(&ctx->ac, value); 250 251 if (rshift) 252 value = LLVMBuildLShr(ctx->ac.builder, value, 253 LLVMConstInt(ctx->i32, rshift, 0), ""); 254 255 if (rshift + bitwidth < 32) { 256 unsigned mask = (1 << bitwidth) - 1; 257 value = LLVMBuildAnd(ctx->ac.builder, value, 258 LLVMConstInt(ctx->i32, mask, 0), ""); 259 } 260 261 return value; 262 } 263 264 static LLVMValueRef unpack_param(struct si_shader_context *ctx, 265 unsigned param, unsigned rshift, 266 unsigned bitwidth) 267 { 268 LLVMValueRef value = LLVMGetParam(ctx->main_fn, param); 269 270 return unpack_llvm_param(ctx, value, rshift, bitwidth); 271 } 272 273 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) 274 { 275 switch (ctx->type) { 276 case PIPE_SHADER_TESS_CTRL: 277 return 
unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8); 278 279 case PIPE_SHADER_TESS_EVAL: 280 return LLVMGetParam(ctx->main_fn, 281 ctx->param_tes_rel_patch_id); 282 283 default: 284 assert(0); 285 return NULL; 286 } 287 } 288 289 /* Tessellation shaders pass outputs to the next shader using LDS. 290 * 291 * LS outputs = TCS inputs 292 * TCS outputs = TES inputs 293 * 294 * The LDS layout is: 295 * - TCS inputs for patch 0 296 * - TCS inputs for patch 1 297 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) 298 * - ... 299 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset 300 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset 301 * - TCS outputs for patch 1 302 * - Per-patch TCS outputs for patch 1 303 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) 304 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) 305 * - ... 306 * 307 * All three shaders VS(LS), TCS, TES share the same LDS space. 
308 */ 309 310 static LLVMValueRef 311 get_tcs_in_patch_stride(struct si_shader_context *ctx) 312 { 313 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13); 314 } 315 316 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) 317 { 318 assert(ctx->type == PIPE_SHADER_TESS_CTRL); 319 320 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) 321 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; 322 323 return util_last_bit64(ctx->shader->selector->outputs_written) * 4; 324 } 325 326 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) 327 { 328 unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); 329 330 return LLVMConstInt(ctx->i32, stride, 0); 331 } 332 333 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) 334 { 335 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) 336 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13); 337 338 const struct tgsi_shader_info *info = &ctx->shader->selector->info; 339 unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; 340 unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); 341 unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); 342 unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + 343 num_patch_outputs * 4; 344 return LLVMConstInt(ctx->i32, patch_dw_stride, 0); 345 } 346 347 static LLVMValueRef 348 get_tcs_out_patch0_offset(struct si_shader_context *ctx) 349 { 350 return lp_build_mul_imm(&ctx->bld_base.uint_bld, 351 unpack_param(ctx, 352 ctx->param_tcs_out_lds_offsets, 353 0, 16), 354 4); 355 } 356 357 static LLVMValueRef 358 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) 359 { 360 return lp_build_mul_imm(&ctx->bld_base.uint_bld, 361 unpack_param(ctx, 362 ctx->param_tcs_out_lds_offsets, 363 16, 16), 364 4); 365 } 366 367 static LLVMValueRef 368 get_tcs_in_current_patch_offset(struct 
si_shader_context *ctx) 369 { 370 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); 371 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 372 373 return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); 374 } 375 376 static LLVMValueRef 377 get_tcs_out_current_patch_offset(struct si_shader_context *ctx) 378 { 379 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); 380 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); 381 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 382 383 return LLVMBuildAdd(ctx->ac.builder, patch0_offset, 384 LLVMBuildMul(ctx->ac.builder, patch_stride, 385 rel_patch_id, ""), 386 ""); 387 } 388 389 static LLVMValueRef 390 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) 391 { 392 LLVMValueRef patch0_patch_data_offset = 393 get_tcs_out_patch0_patch_data_offset(ctx); 394 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); 395 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 396 397 return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset, 398 LLVMBuildMul(ctx->ac.builder, patch_stride, 399 rel_patch_id, ""), 400 ""); 401 } 402 403 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) 404 { 405 unsigned tcs_out_vertices = 406 ctx->shader->selector ? 407 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; 408 409 /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. 
*/ 410 if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) 411 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0); 412 413 return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6); 414 } 415 416 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) 417 { 418 unsigned stride; 419 420 switch (ctx->type) { 421 case PIPE_SHADER_VERTEX: 422 stride = util_last_bit64(ctx->shader->selector->outputs_written); 423 return LLVMConstInt(ctx->i32, stride * 4, 0); 424 425 case PIPE_SHADER_TESS_CTRL: 426 if (ctx->screen->info.chip_class >= GFX9 && 427 ctx->shader->is_monolithic) { 428 stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written); 429 return LLVMConstInt(ctx->i32, stride * 4, 0); 430 } 431 return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); 432 433 default: 434 assert(0); 435 return NULL; 436 } 437 } 438 439 static LLVMValueRef get_instance_index_for_fetch( 440 struct si_shader_context *ctx, 441 unsigned param_start_instance, LLVMValueRef divisor) 442 { 443 LLVMValueRef result = ctx->abi.instance_id; 444 445 /* The division must be done before START_INSTANCE is added. */ 446 if (divisor != ctx->i32_1) 447 result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, ""); 448 449 return LLVMBuildAdd(ctx->ac.builder, result, 450 LLVMGetParam(ctx->main_fn, param_start_instance), ""); 451 } 452 453 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert 454 * to float. 
*/ 455 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, 456 LLVMValueRef vec4, 457 unsigned double_index) 458 { 459 LLVMBuilderRef builder = ctx->ac.builder; 460 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context); 461 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, 462 LLVMVectorType(f64, 2), ""); 463 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); 464 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, ""); 465 return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); 466 } 467 468 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, 469 LLVMValueRef i32, unsigned index) 470 { 471 assert(index <= 1); 472 473 if (index == 1) 474 return LLVMBuildAShr(ctx->ac.builder, i32, 475 LLVMConstInt(ctx->i32, 16, 0), ""); 476 477 return LLVMBuildSExt(ctx->ac.builder, 478 LLVMBuildTrunc(ctx->ac.builder, i32, 479 ctx->ac.i16, ""), 480 ctx->i32, ""); 481 } 482 483 void si_llvm_load_input_vs( 484 struct si_shader_context *ctx, 485 unsigned input_index, 486 LLVMValueRef out[4]) 487 { 488 unsigned vs_blit_property = 489 ctx->shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; 490 491 if (vs_blit_property) { 492 LLVMValueRef vertex_id = ctx->abi.vertex_id; 493 LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, 494 LLVMIntULE, vertex_id, 495 ctx->i32_1, ""); 496 /* Use LLVMIntNE, because we have 3 vertices and only 497 * the middle one should use y2. 
498 */ 499 LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, 500 LLVMIntNE, vertex_id, 501 ctx->i32_1, ""); 502 503 if (input_index == 0) { 504 /* Position: */ 505 LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, 506 ctx->param_vs_blit_inputs); 507 LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, 508 ctx->param_vs_blit_inputs + 1); 509 510 LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); 511 LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); 512 LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); 513 LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); 514 515 LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, 516 x1, x2, ""); 517 LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, 518 y1, y2, ""); 519 520 out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, ""); 521 out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, ""); 522 out[2] = LLVMGetParam(ctx->main_fn, 523 ctx->param_vs_blit_inputs + 2); 524 out[3] = ctx->ac.f32_1; 525 return; 526 } 527 528 /* Color or texture coordinates: */ 529 assert(input_index == 1); 530 531 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 532 for (int i = 0; i < 4; i++) { 533 out[i] = LLVMGetParam(ctx->main_fn, 534 ctx->param_vs_blit_inputs + 3 + i); 535 } 536 } else { 537 assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); 538 LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, 539 ctx->param_vs_blit_inputs + 3); 540 LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, 541 ctx->param_vs_blit_inputs + 4); 542 LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, 543 ctx->param_vs_blit_inputs + 5); 544 LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, 545 ctx->param_vs_blit_inputs + 6); 546 547 out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, 548 x1, x2, ""); 549 out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, 550 y1, y2, ""); 551 out[2] = LLVMGetParam(ctx->main_fn, 552 ctx->param_vs_blit_inputs + 7); 553 out[3] = LLVMGetParam(ctx->main_fn, 554 ctx->param_vs_blit_inputs + 8); 555 } 556 return; 557 } 558 559 unsigned chan; 560 unsigned fix_fetch; 
561 unsigned num_fetches; 562 unsigned fetch_stride; 563 564 LLVMValueRef t_list_ptr; 565 LLVMValueRef t_offset; 566 LLVMValueRef t_list; 567 LLVMValueRef vertex_index; 568 LLVMValueRef input[3]; 569 570 /* Load the T list */ 571 t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); 572 573 t_offset = LLVMConstInt(ctx->i32, input_index, 0); 574 575 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); 576 577 vertex_index = LLVMGetParam(ctx->main_fn, 578 ctx->param_vertex_index0 + 579 input_index); 580 581 fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index]; 582 583 /* Do multiple loads for special formats. */ 584 switch (fix_fetch) { 585 case SI_FIX_FETCH_RGB_64_FLOAT: 586 num_fetches = 3; /* 3 2-dword loads */ 587 fetch_stride = 8; 588 break; 589 case SI_FIX_FETCH_RGBA_64_FLOAT: 590 num_fetches = 2; /* 2 4-dword loads */ 591 fetch_stride = 16; 592 break; 593 case SI_FIX_FETCH_RGB_8: 594 case SI_FIX_FETCH_RGB_8_INT: 595 num_fetches = 3; 596 fetch_stride = 1; 597 break; 598 case SI_FIX_FETCH_RGB_16: 599 case SI_FIX_FETCH_RGB_16_INT: 600 num_fetches = 3; 601 fetch_stride = 2; 602 break; 603 default: 604 num_fetches = 1; 605 fetch_stride = 0; 606 } 607 608 for (unsigned i = 0; i < num_fetches; i++) { 609 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); 610 611 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list, 612 vertex_index, voffset, 613 true); 614 } 615 616 /* Break up the vec4 into individual components */ 617 for (chan = 0; chan < 4; chan++) { 618 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); 619 out[chan] = LLVMBuildExtractElement(ctx->ac.builder, 620 input[0], llvm_chan, ""); 621 } 622 623 switch (fix_fetch) { 624 case SI_FIX_FETCH_A2_SNORM: 625 case SI_FIX_FETCH_A2_SSCALED: 626 case SI_FIX_FETCH_A2_SINT: { 627 /* The hardware returns an unsigned value; convert it to a 628 * signed one. 
629 */ 630 LLVMValueRef tmp = out[3]; 631 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); 632 633 /* First, recover the sign-extended signed integer value. */ 634 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) 635 tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, ""); 636 else 637 tmp = ac_to_integer(&ctx->ac, tmp); 638 639 /* For the integer-like cases, do a natural sign extension. 640 * 641 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 642 * and happen to contain 0, 1, 2, 3 as the two LSBs of the 643 * exponent. 644 */ 645 tmp = LLVMBuildShl(ctx->ac.builder, tmp, 646 fix_fetch == SI_FIX_FETCH_A2_SNORM ? 647 LLVMConstInt(ctx->i32, 7, 0) : c30, ""); 648 tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); 649 650 /* Convert back to the right type. */ 651 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { 652 LLVMValueRef clamp; 653 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); 654 tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); 655 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); 656 tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); 657 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { 658 tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); 659 } 660 661 out[3] = tmp; 662 break; 663 } 664 case SI_FIX_FETCH_RGBA_32_UNORM: 665 case SI_FIX_FETCH_RGBX_32_UNORM: 666 for (chan = 0; chan < 4; chan++) { 667 out[chan] = ac_to_integer(&ctx->ac, out[chan]); 668 out[chan] = LLVMBuildUIToFP(ctx->ac.builder, 669 out[chan], ctx->f32, ""); 670 out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], 671 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); 672 } 673 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. 
*/ 674 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM) 675 out[3] = LLVMConstReal(ctx->f32, 1); 676 break; 677 case SI_FIX_FETCH_RGBA_32_SNORM: 678 case SI_FIX_FETCH_RGBX_32_SNORM: 679 case SI_FIX_FETCH_RGBA_32_FIXED: 680 case SI_FIX_FETCH_RGBX_32_FIXED: { 681 double scale; 682 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED) 683 scale = 1.0 / 0x10000; 684 else 685 scale = 1.0 / INT_MAX; 686 687 for (chan = 0; chan < 4; chan++) { 688 out[chan] = ac_to_integer(&ctx->ac, out[chan]); 689 out[chan] = LLVMBuildSIToFP(ctx->ac.builder, 690 out[chan], ctx->f32, ""); 691 out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], 692 LLVMConstReal(ctx->f32, scale), ""); 693 } 694 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ 695 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM || 696 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED) 697 out[3] = LLVMConstReal(ctx->f32, 1); 698 break; 699 } 700 case SI_FIX_FETCH_RGBA_32_USCALED: 701 for (chan = 0; chan < 4; chan++) { 702 out[chan] = ac_to_integer(&ctx->ac, out[chan]); 703 out[chan] = LLVMBuildUIToFP(ctx->ac.builder, 704 out[chan], ctx->f32, ""); 705 } 706 break; 707 case SI_FIX_FETCH_RGBA_32_SSCALED: 708 for (chan = 0; chan < 4; chan++) { 709 out[chan] = ac_to_integer(&ctx->ac, out[chan]); 710 out[chan] = LLVMBuildSIToFP(ctx->ac.builder, 711 out[chan], ctx->f32, ""); 712 } 713 break; 714 case SI_FIX_FETCH_RG_64_FLOAT: 715 for (chan = 0; chan < 2; chan++) 716 out[chan] = extract_double_to_float(ctx, input[0], chan); 717 718 out[2] = LLVMConstReal(ctx->f32, 0); 719 out[3] = LLVMConstReal(ctx->f32, 1); 720 break; 721 case SI_FIX_FETCH_RGB_64_FLOAT: 722 for (chan = 0; chan < 3; chan++) 723 out[chan] = extract_double_to_float(ctx, input[chan], 0); 724 725 out[3] = LLVMConstReal(ctx->f32, 1); 726 break; 727 case SI_FIX_FETCH_RGBA_64_FLOAT: 728 for (chan = 0; chan < 4; chan++) { 729 out[chan] = extract_double_to_float(ctx, input[chan / 2], 730 chan % 2); 731 } 732 break; 733 case SI_FIX_FETCH_RGB_8: 734 case 
SI_FIX_FETCH_RGB_8_INT: 735 case SI_FIX_FETCH_RGB_16: 736 case SI_FIX_FETCH_RGB_16_INT: 737 for (chan = 0; chan < 3; chan++) { 738 out[chan] = LLVMBuildExtractElement(ctx->ac.builder, 739 input[chan], 740 ctx->i32_0, ""); 741 } 742 if (fix_fetch == SI_FIX_FETCH_RGB_8 || 743 fix_fetch == SI_FIX_FETCH_RGB_16) { 744 out[3] = LLVMConstReal(ctx->f32, 1); 745 } else { 746 out[3] = ac_to_float(&ctx->ac, ctx->i32_1); 747 } 748 break; 749 } 750 } 751 752 static void declare_input_vs( 753 struct si_shader_context *ctx, 754 unsigned input_index, 755 const struct tgsi_full_declaration *decl, 756 LLVMValueRef out[4]) 757 { 758 si_llvm_load_input_vs(ctx, input_index, out); 759 } 760 761 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx, 762 unsigned swizzle) 763 { 764 if (swizzle > 0) 765 return ctx->i32_0; 766 767 switch (ctx->type) { 768 case PIPE_SHADER_VERTEX: 769 return LLVMGetParam(ctx->main_fn, 770 ctx->param_vs_prim_id); 771 case PIPE_SHADER_TESS_CTRL: 772 return ctx->abi.tcs_patch_id; 773 case PIPE_SHADER_TESS_EVAL: 774 return ctx->abi.tes_patch_id; 775 case PIPE_SHADER_GEOMETRY: 776 return ctx->abi.gs_prim_id; 777 default: 778 assert(0); 779 return ctx->i32_0; 780 } 781 } 782 783 /** 784 * Return the value of tgsi_ind_register for indexing. 785 * This is the indirect index with the constant offset added to it. 786 */ 787 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx, 788 const struct tgsi_ind_register *ind, 789 unsigned addr_mul, 790 int rel_index) 791 { 792 LLVMValueRef result; 793 794 if (ind->File == TGSI_FILE_ADDRESS) { 795 result = ctx->addrs[ind->Index][ind->Swizzle]; 796 result = LLVMBuildLoad(ctx->ac.builder, result, ""); 797 } else { 798 struct tgsi_full_src_register src = {}; 799 800 src.Register.File = ind->File; 801 src.Register.Index = ind->Index; 802 803 /* Set the second index to 0 for constants. 
*/ 804 if (ind->File == TGSI_FILE_CONSTANT) 805 src.Register.Dimension = 1; 806 807 result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src, 808 TGSI_TYPE_SIGNED, 809 ind->Swizzle); 810 result = ac_to_integer(&ctx->ac, result); 811 } 812 813 if (addr_mul != 1) 814 result = LLVMBuildMul(ctx->ac.builder, result, 815 LLVMConstInt(ctx->i32, addr_mul, 0), ""); 816 result = LLVMBuildAdd(ctx->ac.builder, result, 817 LLVMConstInt(ctx->i32, rel_index, 0), ""); 818 return result; 819 } 820 821 /** 822 * Like si_get_indirect_index, but restricts the return value to a (possibly 823 * undefined) value inside [0..num). 824 */ 825 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, 826 const struct tgsi_ind_register *ind, 827 int rel_index, unsigned num) 828 { 829 LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index); 830 831 return si_llvm_bound_index(ctx, result, num); 832 } 833 834 static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, 835 LLVMValueRef vertex_dw_stride, 836 LLVMValueRef base_addr, 837 LLVMValueRef vertex_index, 838 LLVMValueRef param_index, 839 unsigned input_index, 840 ubyte *name, 841 ubyte *index, 842 bool is_patch) 843 { 844 if (vertex_dw_stride) { 845 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 846 LLVMBuildMul(ctx->ac.builder, vertex_index, 847 vertex_dw_stride, ""), ""); 848 } 849 850 if (param_index) { 851 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 852 LLVMBuildMul(ctx->ac.builder, param_index, 853 LLVMConstInt(ctx->i32, 4, 0), ""), ""); 854 } 855 856 int param = is_patch ? 857 si_shader_io_get_unique_index_patch(name[input_index], 858 index[input_index]) : 859 si_shader_io_get_unique_index(name[input_index], 860 index[input_index]); 861 862 /* Add the base address of the element. 
*/ 863 return LLVMBuildAdd(ctx->ac.builder, base_addr, 864 LLVMConstInt(ctx->i32, param * 4, 0), ""); 865 } 866 867 /** 868 * Calculate a dword address given an input or output register and a stride. 869 */ 870 static LLVMValueRef get_dw_address(struct si_shader_context *ctx, 871 const struct tgsi_full_dst_register *dst, 872 const struct tgsi_full_src_register *src, 873 LLVMValueRef vertex_dw_stride, 874 LLVMValueRef base_addr) 875 { 876 struct tgsi_shader_info *info = &ctx->shader->selector->info; 877 ubyte *name, *index, *array_first; 878 int input_index; 879 struct tgsi_full_dst_register reg; 880 LLVMValueRef vertex_index = NULL; 881 LLVMValueRef ind_index = NULL; 882 883 /* Set the register description. The address computation is the same 884 * for sources and destinations. */ 885 if (src) { 886 reg.Register.File = src->Register.File; 887 reg.Register.Index = src->Register.Index; 888 reg.Register.Indirect = src->Register.Indirect; 889 reg.Register.Dimension = src->Register.Dimension; 890 reg.Indirect = src->Indirect; 891 reg.Dimension = src->Dimension; 892 reg.DimIndirect = src->DimIndirect; 893 } else 894 reg = *dst; 895 896 /* If the register is 2-dimensional (e.g. an array of vertices 897 * in a primitive), calculate the base address of the vertex. */ 898 if (reg.Register.Dimension) { 899 if (reg.Dimension.Indirect) 900 vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, 901 1, reg.Dimension.Index); 902 else 903 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); 904 } 905 906 /* Get information about the register. 
*/ 907 if (reg.Register.File == TGSI_FILE_INPUT) { 908 name = info->input_semantic_name; 909 index = info->input_semantic_index; 910 array_first = info->input_array_first; 911 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 912 name = info->output_semantic_name; 913 index = info->output_semantic_index; 914 array_first = info->output_array_first; 915 } else { 916 assert(0); 917 return NULL; 918 } 919 920 if (reg.Register.Indirect) { 921 /* Add the relative address of the element. */ 922 if (reg.Indirect.ArrayID) 923 input_index = array_first[reg.Indirect.ArrayID]; 924 else 925 input_index = reg.Register.Index; 926 927 ind_index = si_get_indirect_index(ctx, ®.Indirect, 928 1, reg.Register.Index - input_index); 929 } else { 930 input_index = reg.Register.Index; 931 } 932 933 return get_dw_address_from_generic_indices(ctx, vertex_dw_stride, 934 base_addr, vertex_index, 935 ind_index, input_index, 936 name, index, 937 !reg.Register.Dimension); 938 } 939 940 /* The offchip buffer layout for TCS->TES is 941 * 942 * - attribute 0 of patch 0 vertex 0 943 * - attribute 0 of patch 0 vertex 1 944 * - attribute 0 of patch 0 vertex 2 945 * ... 946 * - attribute 0 of patch 1 vertex 0 947 * - attribute 0 of patch 1 vertex 1 948 * ... 949 * - attribute 1 of patch 0 vertex 0 950 * - attribute 1 of patch 0 vertex 1 951 * ... 952 * - per patch attribute 0 of patch 0 953 * - per patch attribute 0 of patch 1 954 * ... 955 * 956 * Note that every attribute has 4 components. 
957 */ 958 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, 959 LLVMValueRef rel_patch_id, 960 LLVMValueRef vertex_index, 961 LLVMValueRef param_index) 962 { 963 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; 964 LLVMValueRef param_stride, constant16; 965 966 vertices_per_patch = get_num_tcs_out_vertices(ctx); 967 num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6); 968 total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, 969 num_patches, ""); 970 971 constant16 = LLVMConstInt(ctx->i32, 16, 0); 972 if (vertex_index) { 973 base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id, 974 vertices_per_patch, ""); 975 976 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 977 vertex_index, ""); 978 979 param_stride = total_vertices; 980 } else { 981 base_addr = rel_patch_id; 982 param_stride = num_patches; 983 } 984 985 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 986 LLVMBuildMul(ctx->ac.builder, param_index, 987 param_stride, ""), ""); 988 989 base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); 990 991 if (!vertex_index) { 992 LLVMValueRef patch_data_offset = 993 unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20); 994 995 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, 996 patch_data_offset, ""); 997 } 998 return base_addr; 999 } 1000 1001 /* This is a generic helper that can be shared by the NIR and TGSI backends */ 1002 static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( 1003 struct si_shader_context *ctx, 1004 LLVMValueRef vertex_index, 1005 LLVMValueRef param_index, 1006 unsigned param_base, 1007 ubyte *name, 1008 ubyte *index, 1009 bool is_patch) 1010 { 1011 unsigned param_index_base; 1012 1013 param_index_base = is_patch ? 
1014 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) : 1015 si_shader_io_get_unique_index(name[param_base], index[param_base]); 1016 1017 if (param_index) { 1018 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1019 LLVMConstInt(ctx->i32, param_index_base, 0), 1020 ""); 1021 } else { 1022 param_index = LLVMConstInt(ctx->i32, param_index_base, 0); 1023 } 1024 1025 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), 1026 vertex_index, param_index); 1027 } 1028 1029 static LLVMValueRef get_tcs_tes_buffer_address_from_reg( 1030 struct si_shader_context *ctx, 1031 const struct tgsi_full_dst_register *dst, 1032 const struct tgsi_full_src_register *src) 1033 { 1034 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1035 ubyte *name, *index, *array_first; 1036 struct tgsi_full_src_register reg; 1037 LLVMValueRef vertex_index = NULL; 1038 LLVMValueRef param_index = NULL; 1039 unsigned param_base; 1040 1041 reg = src ? *src : tgsi_full_src_register_from_dst(dst); 1042 1043 if (reg.Register.Dimension) { 1044 1045 if (reg.Dimension.Indirect) 1046 vertex_index = si_get_indirect_index(ctx, ®.DimIndirect, 1047 1, reg.Dimension.Index); 1048 else 1049 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0); 1050 } 1051 1052 /* Get information about the register. 
*/ 1053 if (reg.Register.File == TGSI_FILE_INPUT) { 1054 name = info->input_semantic_name; 1055 index = info->input_semantic_index; 1056 array_first = info->input_array_first; 1057 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 1058 name = info->output_semantic_name; 1059 index = info->output_semantic_index; 1060 array_first = info->output_array_first; 1061 } else { 1062 assert(0); 1063 return NULL; 1064 } 1065 1066 if (reg.Register.Indirect) { 1067 if (reg.Indirect.ArrayID) 1068 param_base = array_first[reg.Indirect.ArrayID]; 1069 else 1070 param_base = reg.Register.Index; 1071 1072 param_index = si_get_indirect_index(ctx, ®.Indirect, 1073 1, reg.Register.Index - param_base); 1074 1075 } else { 1076 param_base = reg.Register.Index; 1077 } 1078 1079 return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1080 param_index, param_base, 1081 name, index, !reg.Register.Dimension); 1082 } 1083 1084 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, 1085 LLVMTypeRef type, unsigned swizzle, 1086 LLVMValueRef buffer, LLVMValueRef offset, 1087 LLVMValueRef base, bool can_speculate) 1088 { 1089 struct si_shader_context *ctx = si_shader_context(bld_base); 1090 LLVMValueRef value, value2; 1091 LLVMTypeRef vec_type = LLVMVectorType(type, 4); 1092 1093 if (swizzle == ~0) { 1094 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 1095 0, 1, 0, can_speculate, false); 1096 1097 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); 1098 } 1099 1100 if (!llvm_type_is_64bit(ctx, type)) { 1101 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 1102 0, 1, 0, can_speculate, false); 1103 1104 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); 1105 return LLVMBuildExtractElement(ctx->ac.builder, value, 1106 LLVMConstInt(ctx->i32, swizzle, 0), ""); 1107 } 1108 1109 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, 1110 swizzle * 4, 1, 0, can_speculate, false); 1111 1112 
value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, 1113 swizzle * 4 + 4, 1, 0, can_speculate, false); 1114 1115 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1116 } 1117 1118 /** 1119 * Load from LDS. 1120 * 1121 * \param type output value type 1122 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 1123 * \param dw_addr address in dwords 1124 */ 1125 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, 1126 LLVMTypeRef type, unsigned swizzle, 1127 LLVMValueRef dw_addr) 1128 { 1129 struct si_shader_context *ctx = si_shader_context(bld_base); 1130 LLVMValueRef value; 1131 1132 if (swizzle == ~0) { 1133 LLVMValueRef values[TGSI_NUM_CHANNELS]; 1134 1135 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) 1136 values[chan] = lds_load(bld_base, type, chan, dw_addr); 1137 1138 return lp_build_gather_values(&ctx->gallivm, values, 1139 TGSI_NUM_CHANNELS); 1140 } 1141 1142 /* Split 64-bit loads. */ 1143 if (llvm_type_is_64bit(ctx, type)) { 1144 LLVMValueRef lo, hi; 1145 1146 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr); 1147 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr); 1148 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi); 1149 } 1150 1151 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, 1152 LLVMConstInt(ctx->i32, swizzle, 0)); 1153 1154 value = ac_lds_load(&ctx->ac, dw_addr); 1155 1156 return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); 1157 } 1158 1159 /** 1160 * Store to LDS. 
1161 * 1162 * \param swizzle offset (typically 0..3) 1163 * \param dw_addr address in dwords 1164 * \param value value to store 1165 */ 1166 static void lds_store(struct si_shader_context *ctx, 1167 unsigned dw_offset_imm, LLVMValueRef dw_addr, 1168 LLVMValueRef value) 1169 { 1170 dw_addr = lp_build_add(&ctx->bld_base.uint_bld, dw_addr, 1171 LLVMConstInt(ctx->i32, dw_offset_imm, 0)); 1172 1173 ac_lds_store(&ctx->ac, dw_addr, value); 1174 } 1175 1176 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx, 1177 unsigned param) 1178 { 1179 LLVMBuilderRef builder = ctx->ac.builder; 1180 1181 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param); 1182 addr = LLVMBuildZExt(builder, addr, ctx->i64, ""); 1183 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), ""); 1184 1185 uint64_t desc2 = 0xffffffff; 1186 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 1187 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 1188 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 1189 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 1190 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 1191 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 1192 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0); 1193 1194 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2)); 1195 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, ""); 1196 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, ""); 1197 return LLVMBuildBitCast(builder, desc, ctx->v4i32, ""); 1198 } 1199 1200 static LLVMValueRef fetch_input_tcs( 1201 struct lp_build_tgsi_context *bld_base, 1202 const struct tgsi_full_src_register *reg, 1203 enum tgsi_opcode_type type, unsigned swizzle) 1204 { 1205 struct si_shader_context *ctx = si_shader_context(bld_base); 1206 LLVMValueRef dw_addr, stride; 1207 1208 stride = get_tcs_in_vertex_dw_stride(ctx); 1209 dw_addr = get_tcs_in_current_patch_offset(ctx); 1210 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1211 1212 return 
lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); 1213 } 1214 1215 static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, 1216 LLVMValueRef vertex_index, 1217 LLVMValueRef param_index, 1218 unsigned const_index, 1219 unsigned location, 1220 unsigned driver_location, 1221 unsigned component, 1222 unsigned num_components, 1223 bool is_patch, 1224 bool is_compact, 1225 bool load_input) 1226 { 1227 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1228 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1229 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1230 LLVMValueRef dw_addr, stride; 1231 1232 driver_location = driver_location / 4; 1233 1234 if (load_input) { 1235 stride = get_tcs_in_vertex_dw_stride(ctx); 1236 dw_addr = get_tcs_in_current_patch_offset(ctx); 1237 } else { 1238 if (is_patch) { 1239 stride = NULL; 1240 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1241 } else { 1242 stride = get_tcs_out_vertex_dw_stride(ctx); 1243 dw_addr = get_tcs_out_current_patch_offset(ctx); 1244 } 1245 } 1246 1247 if (param_index) { 1248 /* Add the constant index to the indirect index */ 1249 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1250 LLVMConstInt(ctx->i32, const_index, 0), ""); 1251 } else { 1252 param_index = LLVMConstInt(ctx->i32, const_index, 0); 1253 } 1254 1255 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, 1256 vertex_index, param_index, 1257 driver_location, 1258 info->input_semantic_name, 1259 info->input_semantic_index, 1260 is_patch); 1261 1262 LLVMValueRef value[4]; 1263 for (unsigned i = 0; i < num_components + component; i++) { 1264 value[i] = lds_load(bld_base, ctx->i32, i, dw_addr); 1265 } 1266 1267 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1268 } 1269 1270 static LLVMValueRef fetch_output_tcs( 1271 struct lp_build_tgsi_context *bld_base, 1272 const struct tgsi_full_src_register *reg, 1273 enum 
tgsi_opcode_type type, unsigned swizzle) 1274 { 1275 struct si_shader_context *ctx = si_shader_context(bld_base); 1276 LLVMValueRef dw_addr, stride; 1277 1278 if (reg->Register.Dimension) { 1279 stride = get_tcs_out_vertex_dw_stride(ctx); 1280 dw_addr = get_tcs_out_current_patch_offset(ctx); 1281 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1282 } else { 1283 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1284 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); 1285 } 1286 1287 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr); 1288 } 1289 1290 static LLVMValueRef fetch_input_tes( 1291 struct lp_build_tgsi_context *bld_base, 1292 const struct tgsi_full_src_register *reg, 1293 enum tgsi_opcode_type type, unsigned swizzle) 1294 { 1295 struct si_shader_context *ctx = si_shader_context(bld_base); 1296 LLVMValueRef buffer, base, addr; 1297 1298 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 1299 1300 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1301 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); 1302 1303 return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, 1304 buffer, base, addr, true); 1305 } 1306 1307 LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, 1308 LLVMValueRef vertex_index, 1309 LLVMValueRef param_index, 1310 unsigned const_index, 1311 unsigned location, 1312 unsigned driver_location, 1313 unsigned component, 1314 unsigned num_components, 1315 bool is_patch, 1316 bool is_compact, 1317 bool load_input) 1318 { 1319 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1320 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1321 LLVMValueRef buffer, base, addr; 1322 1323 driver_location = driver_location / 4; 1324 1325 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 1326 1327 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1328 1329 if (param_index) { 1330 
/* Add the constant index to the indirect index */ 1331 param_index = LLVMBuildAdd(ctx->ac.builder, param_index, 1332 LLVMConstInt(ctx->i32, const_index, 0), ""); 1333 } else { 1334 param_index = LLVMConstInt(ctx->i32, const_index, 0); 1335 } 1336 1337 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1338 param_index, driver_location, 1339 info->input_semantic_name, 1340 info->input_semantic_index, 1341 is_patch); 1342 1343 /* TODO: This will generate rather ordinary llvm code, although it 1344 * should be easy for the optimiser to fix up. In future we might want 1345 * to refactor buffer_load(), but for now this maximises code sharing 1346 * between the NIR and TGSI backends. 1347 */ 1348 LLVMValueRef value[4]; 1349 for (unsigned i = component; i < num_components + component; i++) { 1350 value[i] = buffer_load(&ctx->bld_base, ctx->i32, i, buffer, base, addr, true); 1351 } 1352 1353 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); 1354 } 1355 1356 static void store_output_tcs(struct lp_build_tgsi_context *bld_base, 1357 const struct tgsi_full_instruction *inst, 1358 const struct tgsi_opcode_info *info, 1359 unsigned index, 1360 LLVMValueRef dst[4]) 1361 { 1362 struct si_shader_context *ctx = si_shader_context(bld_base); 1363 const struct tgsi_full_dst_register *reg = &inst->Dst[index]; 1364 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info; 1365 unsigned chan_index; 1366 LLVMValueRef dw_addr, stride; 1367 LLVMValueRef buffer, base, buf_addr; 1368 LLVMValueRef values[4]; 1369 bool skip_lds_store; 1370 bool is_tess_factor = false, is_tess_inner = false; 1371 1372 /* Only handle per-patch and per-vertex outputs here. 1373 * Vectors will be lowered to scalars and this function will be called again. 
1374 */ 1375 if (reg->Register.File != TGSI_FILE_OUTPUT || 1376 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { 1377 si_llvm_emit_store(bld_base, inst, info, index, dst); 1378 return; 1379 } 1380 1381 if (reg->Register.Dimension) { 1382 stride = get_tcs_out_vertex_dw_stride(ctx); 1383 dw_addr = get_tcs_out_current_patch_offset(ctx); 1384 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); 1385 skip_lds_store = !sh_info->reads_pervertex_outputs; 1386 } else { 1387 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1388 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); 1389 skip_lds_store = !sh_info->reads_perpatch_outputs; 1390 1391 if (!reg->Register.Indirect) { 1392 int name = sh_info->output_semantic_name[reg->Register.Index]; 1393 1394 /* Always write tess factors into LDS for the TCS epilog. */ 1395 if (name == TGSI_SEMANTIC_TESSINNER || 1396 name == TGSI_SEMANTIC_TESSOUTER) { 1397 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ 1398 skip_lds_store = !sh_info->reads_tessfactor_outputs && 1399 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; 1400 is_tess_factor = true; 1401 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; 1402 } 1403 } 1404 } 1405 1406 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 1407 1408 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1409 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); 1410 1411 uint32_t writemask = reg->Register.WriteMask; 1412 while (writemask) { 1413 chan_index = u_bit_scan(&writemask); 1414 LLVMValueRef value = dst[chan_index]; 1415 1416 if (inst->Instruction.Saturate) 1417 value = ac_build_clamp(&ctx->ac, value); 1418 1419 /* Skip LDS stores if there is no LDS read of this output. 
*/ 1420 if (!skip_lds_store) 1421 lds_store(ctx, chan_index, dw_addr, value); 1422 1423 value = ac_to_integer(&ctx->ac, value); 1424 values[chan_index] = value; 1425 1426 if (reg->Register.WriteMask != 0xF && !is_tess_factor) { 1427 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, 1428 buf_addr, base, 1429 4 * chan_index, 1, 0, true, false); 1430 } 1431 1432 /* Write tess factors into VGPRs for the epilog. */ 1433 if (is_tess_factor && 1434 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 1435 if (!is_tess_inner) { 1436 LLVMBuildStore(ctx->ac.builder, value, /* outer */ 1437 ctx->invoc0_tess_factors[chan_index]); 1438 } else if (chan_index < 2) { 1439 LLVMBuildStore(ctx->ac.builder, value, /* inner */ 1440 ctx->invoc0_tess_factors[4 + chan_index]); 1441 } 1442 } 1443 } 1444 1445 if (reg->Register.WriteMask == 0xF && !is_tess_factor) { 1446 LLVMValueRef value = lp_build_gather_values(&ctx->gallivm, 1447 values, 4); 1448 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr, 1449 base, 0, 1, 0, true, false); 1450 } 1451 } 1452 1453 static void si_nir_store_output_tcs(struct ac_shader_abi *abi, 1454 const struct nir_variable *var, 1455 LLVMValueRef vertex_index, 1456 LLVMValueRef param_index, 1457 unsigned const_index, 1458 LLVMValueRef src, 1459 unsigned writemask) 1460 { 1461 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1462 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1463 const unsigned component = var->data.location_frac; 1464 const bool is_patch = var->data.patch; 1465 unsigned driver_location = var->data.driver_location; 1466 LLVMValueRef dw_addr, stride; 1467 LLVMValueRef buffer, base, addr; 1468 LLVMValueRef values[4]; 1469 bool skip_lds_store; 1470 bool is_tess_factor = false, is_tess_inner = false; 1471 1472 driver_location = driver_location / 4; 1473 1474 if (param_index) { 1475 /* Add the constant index to the indirect index */ 1476 param_index = LLVMBuildAdd(ctx->ac.builder, 
param_index, 1477 LLVMConstInt(ctx->i32, const_index, 0), ""); 1478 } else { 1479 if (const_index != 0) 1480 param_index = LLVMConstInt(ctx->i32, const_index, 0); 1481 } 1482 1483 if (!is_patch) { 1484 stride = get_tcs_out_vertex_dw_stride(ctx); 1485 dw_addr = get_tcs_out_current_patch_offset(ctx); 1486 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, 1487 vertex_index, param_index, 1488 driver_location, 1489 info->output_semantic_name, 1490 info->output_semantic_index, 1491 is_patch); 1492 1493 skip_lds_store = !info->reads_pervertex_outputs; 1494 } else { 1495 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1496 dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, 1497 vertex_index, param_index, 1498 driver_location, 1499 info->output_semantic_name, 1500 info->output_semantic_index, 1501 is_patch); 1502 1503 skip_lds_store = !info->reads_perpatch_outputs; 1504 1505 if (!param_index) { 1506 int name = info->output_semantic_name[driver_location]; 1507 1508 /* Always write tess factors into LDS for the TCS epilog. */ 1509 if (name == TGSI_SEMANTIC_TESSINNER || 1510 name == TGSI_SEMANTIC_TESSOUTER) { 1511 /* The epilog doesn't read LDS if invocation 0 defines tess factors. 
*/ 1512 skip_lds_store = !info->reads_tessfactor_outputs && 1513 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; 1514 is_tess_factor = true; 1515 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; 1516 } 1517 } 1518 } 1519 1520 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 1521 1522 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1523 1524 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, 1525 param_index, driver_location, 1526 info->output_semantic_name, 1527 info->output_semantic_index, 1528 is_patch); 1529 1530 for (unsigned chan = 0; chan < 4; chan++) { 1531 if (!(writemask & (1 << chan))) 1532 continue; 1533 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); 1534 1535 /* Skip LDS stores if there is no LDS read of this output. */ 1536 if (!skip_lds_store) 1537 ac_lds_store(&ctx->ac, dw_addr, value); 1538 1539 value = ac_to_integer(&ctx->ac, value); 1540 values[chan] = value; 1541 1542 if (writemask != 0xF && !is_tess_factor) { 1543 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, 1544 addr, base, 1545 4 * chan, 1, 0, true, false); 1546 } 1547 1548 /* Write tess factors into VGPRs for the epilog. 
*/ 1549 if (is_tess_factor && 1550 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 1551 if (!is_tess_inner) { 1552 LLVMBuildStore(ctx->ac.builder, value, /* outer */ 1553 ctx->invoc0_tess_factors[chan]); 1554 } else if (chan < 2) { 1555 LLVMBuildStore(ctx->ac.builder, value, /* inner */ 1556 ctx->invoc0_tess_factors[4 + chan]); 1557 } 1558 } 1559 } 1560 1561 if (writemask == 0xF && !is_tess_factor) { 1562 LLVMValueRef value = lp_build_gather_values(&ctx->gallivm, 1563 values, 4); 1564 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, 1565 base, 0, 1, 0, true, false); 1566 } 1567 } 1568 1569 LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, 1570 unsigned input_index, 1571 unsigned vtx_offset_param, 1572 LLVMTypeRef type, 1573 unsigned swizzle) 1574 { 1575 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1576 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1577 struct si_shader *shader = ctx->shader; 1578 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 1579 LLVMValueRef vtx_offset, soffset; 1580 struct tgsi_shader_info *info = &shader->selector->info; 1581 unsigned semantic_name = info->input_semantic_name[input_index]; 1582 unsigned semantic_index = info->input_semantic_index[input_index]; 1583 unsigned param; 1584 LLVMValueRef value; 1585 1586 param = si_shader_io_get_unique_index(semantic_name, semantic_index); 1587 1588 /* GFX9 has the ESGS ring in LDS. */ 1589 if (ctx->screen->info.chip_class >= GFX9) { 1590 unsigned index = vtx_offset_param; 1591 1592 switch (index / 2) { 1593 case 0: 1594 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset, 1595 index % 2 ? 16 : 0, 16); 1596 break; 1597 case 1: 1598 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset, 1599 index % 2 ? 16 : 0, 16); 1600 break; 1601 case 2: 1602 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset, 1603 index % 2 ? 
16 : 0, 16); 1604 break; 1605 default: 1606 assert(0); 1607 return NULL; 1608 } 1609 1610 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, 1611 LLVMConstInt(ctx->i32, param * 4, 0), ""); 1612 return lds_load(bld_base, type, swizzle, vtx_offset); 1613 } 1614 1615 /* GFX6: input load from the ESGS ring in memory. */ 1616 if (swizzle == ~0) { 1617 LLVMValueRef values[TGSI_NUM_CHANNELS]; 1618 unsigned chan; 1619 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1620 values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, 1621 type, chan); 1622 } 1623 return lp_build_gather_values(&ctx->gallivm, values, 1624 TGSI_NUM_CHANNELS); 1625 } 1626 1627 /* Get the vertex offset parameter on GFX6. */ 1628 LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param]; 1629 1630 vtx_offset = lp_build_mul_imm(uint, gs_vtx_offset, 4); 1631 1632 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); 1633 1634 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, 1635 vtx_offset, soffset, 0, 1, 0, true, false); 1636 if (llvm_type_is_64bit(ctx, type)) { 1637 LLVMValueRef value2; 1638 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); 1639 1640 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, 1641 ctx->i32_0, vtx_offset, soffset, 1642 0, 1, 0, true, false); 1643 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1644 } 1645 return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); 1646 } 1647 1648 static LLVMValueRef fetch_input_gs( 1649 struct lp_build_tgsi_context *bld_base, 1650 const struct tgsi_full_src_register *reg, 1651 enum tgsi_opcode_type type, 1652 unsigned swizzle) 1653 { 1654 struct si_shader_context *ctx = si_shader_context(bld_base); 1655 struct tgsi_shader_info *info = &ctx->shader->selector->info; 1656 1657 unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; 1658 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) 1659 return 
get_primitive_id(ctx, swizzle); 1660 1661 if (!reg->Register.Dimension) 1662 return NULL; 1663 1664 return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index, 1665 reg->Dimension.Index, 1666 tgsi2llvmtype(bld_base, type), 1667 swizzle); 1668 } 1669 1670 static int lookup_interp_param_index(unsigned interpolate, unsigned location) 1671 { 1672 switch (interpolate) { 1673 case TGSI_INTERPOLATE_CONSTANT: 1674 return 0; 1675 1676 case TGSI_INTERPOLATE_LINEAR: 1677 if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1678 return SI_PARAM_LINEAR_SAMPLE; 1679 else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1680 return SI_PARAM_LINEAR_CENTROID; 1681 else 1682 return SI_PARAM_LINEAR_CENTER; 1683 break; 1684 case TGSI_INTERPOLATE_COLOR: 1685 case TGSI_INTERPOLATE_PERSPECTIVE: 1686 if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1687 return SI_PARAM_PERSP_SAMPLE; 1688 else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1689 return SI_PARAM_PERSP_CENTROID; 1690 else 1691 return SI_PARAM_PERSP_CENTER; 1692 break; 1693 default: 1694 fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); 1695 return -1; 1696 } 1697 } 1698 1699 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, 1700 unsigned attr_index, unsigned chan, 1701 LLVMValueRef prim_mask, 1702 LLVMValueRef i, LLVMValueRef j) 1703 { 1704 if (i || j) { 1705 return ac_build_fs_interp(&ctx->ac, 1706 LLVMConstInt(ctx->i32, chan, 0), 1707 LLVMConstInt(ctx->i32, attr_index, 0), 1708 prim_mask, i, j); 1709 } 1710 return ac_build_fs_interp_mov(&ctx->ac, 1711 LLVMConstInt(ctx->i32, 2, 0), /* P0 */ 1712 LLVMConstInt(ctx->i32, chan, 0), 1713 LLVMConstInt(ctx->i32, attr_index, 0), 1714 prim_mask); 1715 } 1716 1717 /** 1718 * Interpolate a fragment shader input. 
1719 * 1720 * @param ctx context 1721 * @param input_index index of the input in hardware 1722 * @param semantic_name TGSI_SEMANTIC_* 1723 * @param semantic_index semantic index 1724 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) 1725 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) 1726 * @param interp_param interpolation weights (i,j) 1727 * @param prim_mask SI_PARAM_PRIM_MASK 1728 * @param face SI_PARAM_FRONT_FACE 1729 * @param result the return value (4 components) 1730 */ 1731 static void interp_fs_input(struct si_shader_context *ctx, 1732 unsigned input_index, 1733 unsigned semantic_name, 1734 unsigned semantic_index, 1735 unsigned num_interp_inputs, 1736 unsigned colors_read_mask, 1737 LLVMValueRef interp_param, 1738 LLVMValueRef prim_mask, 1739 LLVMValueRef face, 1740 LLVMValueRef result[4]) 1741 { 1742 LLVMValueRef i = NULL, j = NULL; 1743 unsigned chan; 1744 1745 /* fs.constant returns the param from the middle vertex, so it's not 1746 * really useful for flat shading. It's meant to be used for custom 1747 * interpolation (but the intrinsic can't fetch from the other two 1748 * vertices). 1749 * 1750 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state 1751 * to do the right thing. The only reason we use fs.constant is that 1752 * fs.interp cannot be used on integers, because they can be equal 1753 * to NaN. 1754 * 1755 * When interp is false we will use fs.constant or for newer llvm, 1756 * amdgcn.interp.mov. 
1757 */ 1758 bool interp = interp_param != NULL; 1759 1760 if (interp) { 1761 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, 1762 LLVMVectorType(ctx->f32, 2), ""); 1763 1764 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, 1765 ctx->i32_0, ""); 1766 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, 1767 ctx->i32_1, ""); 1768 } 1769 1770 if (semantic_name == TGSI_SEMANTIC_COLOR && 1771 ctx->shader->key.part.ps.prolog.color_two_side) { 1772 LLVMValueRef is_face_positive; 1773 1774 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", 1775 * otherwise it's at offset "num_inputs". 1776 */ 1777 unsigned back_attr_offset = num_interp_inputs; 1778 if (semantic_index == 1 && colors_read_mask & 0xf) 1779 back_attr_offset += 1; 1780 1781 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, 1782 face, ctx->i32_0, ""); 1783 1784 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1785 LLVMValueRef front, back; 1786 1787 front = si_build_fs_interp(ctx, 1788 input_index, chan, 1789 prim_mask, i, j); 1790 back = si_build_fs_interp(ctx, 1791 back_attr_offset, chan, 1792 prim_mask, i, j); 1793 1794 result[chan] = LLVMBuildSelect(ctx->ac.builder, 1795 is_face_positive, 1796 front, 1797 back, 1798 ""); 1799 } 1800 } else if (semantic_name == TGSI_SEMANTIC_FOG) { 1801 result[0] = si_build_fs_interp(ctx, input_index, 1802 0, prim_mask, i, j); 1803 result[1] = 1804 result[2] = LLVMConstReal(ctx->f32, 0.0f); 1805 result[3] = LLVMConstReal(ctx->f32, 1.0f); 1806 } else { 1807 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1808 result[chan] = si_build_fs_interp(ctx, 1809 input_index, chan, 1810 prim_mask, i, j); 1811 } 1812 } 1813 } 1814 1815 void si_llvm_load_input_fs( 1816 struct si_shader_context *ctx, 1817 unsigned input_index, 1818 LLVMValueRef out[4]) 1819 { 1820 struct lp_build_context *base = &ctx->bld_base.base; 1821 struct si_shader *shader = ctx->shader; 1822 struct tgsi_shader_info *info = &shader->selector->info; 1823 
LLVMValueRef main_fn = ctx->main_fn; 1824 LLVMValueRef interp_param = NULL; 1825 int interp_param_idx; 1826 enum tgsi_semantic semantic_name = info->input_semantic_name[input_index]; 1827 unsigned semantic_index = info->input_semantic_index[input_index]; 1828 enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index]; 1829 enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index]; 1830 1831 /* Get colors from input VGPRs (set by the prolog). */ 1832 if (semantic_name == TGSI_SEMANTIC_COLOR) { 1833 unsigned colors_read = shader->selector->info.colors_read; 1834 unsigned mask = colors_read >> (semantic_index * 4); 1835 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + 1836 (semantic_index ? util_bitcount(colors_read & 0xf) : 0); 1837 1838 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; 1839 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; 1840 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; 1841 out[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : base->undef; 1842 return; 1843 } 1844 1845 interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc); 1846 if (interp_param_idx == -1) 1847 return; 1848 else if (interp_param_idx) { 1849 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 1850 } 1851 1852 interp_fs_input(ctx, input_index, semantic_name, 1853 semantic_index, 0, /* this param is unused */ 1854 shader->selector->info.colors_read, interp_param, 1855 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK), 1856 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), 1857 &out[0]); 1858 } 1859 1860 static void declare_input_fs( 1861 struct si_shader_context *ctx, 1862 unsigned input_index, 1863 const struct tgsi_full_declaration *decl, 1864 LLVMValueRef out[4]) 1865 { 1866 si_llvm_load_input_fs(ctx, input_index, out); 1867 } 1868 1869 static LLVMValueRef get_sample_id(struct si_shader_context *ctx) 1870 { 1871 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4); 1872 } 1873 1874 1875 /** 1876 * Load a dword from a constant buffer. 
1877 */ 1878 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx, 1879 LLVMValueRef resource, 1880 LLVMValueRef offset) 1881 { 1882 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 1883 0, 0, 0, true, true); 1884 } 1885 1886 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id) 1887 { 1888 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld; 1889 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 1890 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); 1891 LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); 1892 1893 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ 1894 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); 1895 LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), ""); 1896 1897 LLVMValueRef pos[4] = { 1898 buffer_load_const(ctx, resource, offset0), 1899 buffer_load_const(ctx, resource, offset1), 1900 LLVMConstReal(ctx->f32, 0), 1901 LLVMConstReal(ctx->f32, 0) 1902 }; 1903 1904 return lp_build_gather_values(&ctx->gallivm, pos, 4); 1905 } 1906 1907 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi, 1908 LLVMTypeRef type, 1909 unsigned num_components) 1910 { 1911 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1912 struct lp_build_context *bld = &ctx->bld_base.base; 1913 1914 LLVMValueRef coord[4] = { 1915 LLVMGetParam(ctx->main_fn, ctx->param_tes_u), 1916 LLVMGetParam(ctx->main_fn, ctx->param_tes_v), 1917 ctx->ac.f32_0, 1918 ctx->ac.f32_0 1919 }; 1920 1921 /* For triangles, the vector should be (u, v, 1-u-v). 
*/ 1922 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == 1923 PIPE_PRIM_TRIANGLES) 1924 coord[2] = lp_build_sub(bld, ctx->ac.f32_1, 1925 lp_build_add(bld, coord[0], coord[1])); 1926 1927 return lp_build_gather_values(&ctx->gallivm, coord, 4); 1928 } 1929 1930 static LLVMValueRef load_tess_level(struct si_shader_context *ctx, 1931 unsigned semantic_name) 1932 { 1933 LLVMValueRef buffer, base, addr; 1934 1935 int param = si_shader_io_get_unique_index_patch(semantic_name, 0); 1936 1937 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 1938 1939 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 1940 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, 1941 LLVMConstInt(ctx->i32, param, 0)); 1942 1943 return buffer_load(&ctx->bld_base, ctx->f32, 1944 ~0, buffer, base, addr, true); 1945 1946 } 1947 1948 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, 1949 unsigned varying_id) 1950 { 1951 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1952 unsigned semantic_name; 1953 1954 switch (varying_id) { 1955 case VARYING_SLOT_TESS_LEVEL_INNER: 1956 semantic_name = TGSI_SEMANTIC_TESSINNER; 1957 break; 1958 case VARYING_SLOT_TESS_LEVEL_OUTER: 1959 semantic_name = TGSI_SEMANTIC_TESSOUTER; 1960 break; 1961 default: 1962 unreachable("unknown tess level"); 1963 } 1964 1965 return load_tess_level(ctx, semantic_name); 1966 1967 } 1968 1969 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) 1970 { 1971 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 1972 if (ctx->type == PIPE_SHADER_TESS_CTRL) 1973 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6); 1974 else if (ctx->type == PIPE_SHADER_TESS_EVAL) 1975 return get_num_tcs_out_vertices(ctx); 1976 else 1977 unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); 1978 } 1979 1980 void si_load_system_value(struct si_shader_context *ctx, 1981 unsigned index, 1982 const 
struct tgsi_full_declaration *decl) 1983 { 1984 LLVMValueRef value = 0; 1985 1986 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES); 1987 1988 switch (decl->Semantic.Name) { 1989 case TGSI_SEMANTIC_INSTANCEID: 1990 value = ctx->abi.instance_id; 1991 break; 1992 1993 case TGSI_SEMANTIC_VERTEXID: 1994 value = LLVMBuildAdd(ctx->ac.builder, 1995 ctx->abi.vertex_id, 1996 ctx->abi.base_vertex, ""); 1997 break; 1998 1999 case TGSI_SEMANTIC_VERTEXID_NOBASE: 2000 /* Unused. Clarify the meaning in indexed vs. non-indexed 2001 * draws if this is ever used again. */ 2002 assert(false); 2003 break; 2004 2005 case TGSI_SEMANTIC_BASEVERTEX: 2006 { 2007 /* For non-indexed draws, the base vertex set by the driver 2008 * (for direct draws) or the CP (for indirect draws) is the 2009 * first vertex ID, but GLSL expects 0 to be returned. 2010 */ 2011 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits); 2012 LLVMValueRef indexed; 2013 2014 indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, ""); 2015 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, ""); 2016 2017 value = LLVMBuildSelect(ctx->ac.builder, indexed, 2018 ctx->abi.base_vertex, ctx->i32_0, ""); 2019 break; 2020 } 2021 2022 case TGSI_SEMANTIC_BASEINSTANCE: 2023 value = ctx->abi.start_instance; 2024 break; 2025 2026 case TGSI_SEMANTIC_DRAWID: 2027 value = ctx->abi.draw_id; 2028 break; 2029 2030 case TGSI_SEMANTIC_INVOCATIONID: 2031 if (ctx->type == PIPE_SHADER_TESS_CTRL) 2032 value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 2033 else if (ctx->type == PIPE_SHADER_GEOMETRY) 2034 value = ctx->abi.gs_invocation_id; 2035 else 2036 assert(!"INVOCATIONID not implemented"); 2037 break; 2038 2039 case TGSI_SEMANTIC_POSITION: 2040 { 2041 LLVMValueRef pos[4] = { 2042 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), 2043 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), 2044 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT), 2045 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP, 
2046 LLVMGetParam(ctx->main_fn, 2047 SI_PARAM_POS_W_FLOAT)), 2048 }; 2049 value = lp_build_gather_values(&ctx->gallivm, pos, 4); 2050 break; 2051 } 2052 2053 case TGSI_SEMANTIC_FACE: 2054 value = ctx->abi.front_face; 2055 break; 2056 2057 case TGSI_SEMANTIC_SAMPLEID: 2058 value = get_sample_id(ctx); 2059 break; 2060 2061 case TGSI_SEMANTIC_SAMPLEPOS: { 2062 LLVMValueRef pos[4] = { 2063 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT), 2064 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT), 2065 LLVMConstReal(ctx->f32, 0), 2066 LLVMConstReal(ctx->f32, 0) 2067 }; 2068 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base, 2069 TGSI_OPCODE_FRC, pos[0]); 2070 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base, 2071 TGSI_OPCODE_FRC, pos[1]); 2072 value = lp_build_gather_values(&ctx->gallivm, pos, 4); 2073 break; 2074 } 2075 2076 case TGSI_SEMANTIC_SAMPLEMASK: 2077 /* This can only occur with the OpenGL Core profile, which 2078 * doesn't support smoothing. 2079 */ 2080 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE); 2081 break; 2082 2083 case TGSI_SEMANTIC_TESSCOORD: 2084 value = si_load_tess_coord(&ctx->abi, NULL, 4); 2085 break; 2086 2087 case TGSI_SEMANTIC_VERTICESIN: 2088 value = si_load_patch_vertices_in(&ctx->abi); 2089 break; 2090 2091 case TGSI_SEMANTIC_TESSINNER: 2092 case TGSI_SEMANTIC_TESSOUTER: 2093 value = load_tess_level(ctx, decl->Semantic.Name); 2094 break; 2095 2096 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI: 2097 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI: 2098 { 2099 LLVMValueRef buf, slot, val[4]; 2100 int i, offset; 2101 2102 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); 2103 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 2104 buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); 2105 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 
4 : 0; 2106 2107 for (i = 0; i < 4; i++) 2108 val[i] = buffer_load_const(ctx, buf, 2109 LLVMConstInt(ctx->i32, (offset + i) * 4, 0)); 2110 value = lp_build_gather_values(&ctx->gallivm, val, 4); 2111 break; 2112 } 2113 2114 case TGSI_SEMANTIC_PRIMID: 2115 value = get_primitive_id(ctx, 0); 2116 break; 2117 2118 case TGSI_SEMANTIC_GRID_SIZE: 2119 value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size); 2120 break; 2121 2122 case TGSI_SEMANTIC_BLOCK_SIZE: 2123 { 2124 LLVMValueRef values[3]; 2125 unsigned i; 2126 unsigned *properties = ctx->shader->selector->info.properties; 2127 2128 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { 2129 unsigned sizes[3] = { 2130 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], 2131 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], 2132 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] 2133 }; 2134 2135 for (i = 0; i < 3; ++i) 2136 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0); 2137 2138 value = lp_build_gather_values(&ctx->gallivm, values, 3); 2139 } else { 2140 value = LLVMGetParam(ctx->main_fn, ctx->param_block_size); 2141 } 2142 break; 2143 } 2144 2145 case TGSI_SEMANTIC_BLOCK_ID: 2146 { 2147 LLVMValueRef values[3]; 2148 2149 for (int i = 0; i < 3; i++) { 2150 values[i] = ctx->i32_0; 2151 if (ctx->param_block_id[i] >= 0) { 2152 values[i] = LLVMGetParam(ctx->main_fn, 2153 ctx->param_block_id[i]); 2154 } 2155 } 2156 value = lp_build_gather_values(&ctx->gallivm, values, 3); 2157 break; 2158 } 2159 2160 case TGSI_SEMANTIC_THREAD_ID: 2161 value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id); 2162 break; 2163 2164 case TGSI_SEMANTIC_HELPER_INVOCATION: 2165 value = lp_build_intrinsic(ctx->ac.builder, 2166 "llvm.amdgcn.ps.live", 2167 ctx->i1, NULL, 0, 2168 LP_FUNC_ATTR_READNONE); 2169 value = LLVMBuildNot(ctx->ac.builder, value, ""); 2170 value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, ""); 2171 break; 2172 2173 case TGSI_SEMANTIC_SUBGROUP_SIZE: 2174 value = LLVMConstInt(ctx->i32, 64, 0); 2175 break; 2176 2177 case 
TGSI_SEMANTIC_SUBGROUP_INVOCATION: 2178 value = ac_get_thread_id(&ctx->ac); 2179 break; 2180 2181 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK: 2182 { 2183 LLVMValueRef id = ac_get_thread_id(&ctx->ac); 2184 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); 2185 value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, ""); 2186 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); 2187 break; 2188 } 2189 2190 case TGSI_SEMANTIC_SUBGROUP_GE_MASK: 2191 case TGSI_SEMANTIC_SUBGROUP_GT_MASK: 2192 case TGSI_SEMANTIC_SUBGROUP_LE_MASK: 2193 case TGSI_SEMANTIC_SUBGROUP_LT_MASK: 2194 { 2195 LLVMValueRef id = ac_get_thread_id(&ctx->ac); 2196 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK || 2197 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) { 2198 /* All bits set except LSB */ 2199 value = LLVMConstInt(ctx->i64, -2, 0); 2200 } else { 2201 /* All bits set */ 2202 value = LLVMConstInt(ctx->i64, -1, 0); 2203 } 2204 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, ""); 2205 value = LLVMBuildShl(ctx->ac.builder, value, id, ""); 2206 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK || 2207 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK) 2208 value = LLVMBuildNot(ctx->ac.builder, value, ""); 2209 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, ""); 2210 break; 2211 } 2212 2213 default: 2214 assert(!"unknown system value"); 2215 return; 2216 } 2217 2218 ctx->system_values[index] = value; 2219 } 2220 2221 void si_declare_compute_memory(struct si_shader_context *ctx, 2222 const struct tgsi_full_declaration *decl) 2223 { 2224 struct si_shader_selector *sel = ctx->shader->selector; 2225 2226 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE); 2227 LLVMValueRef var; 2228 2229 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED); 2230 assert(decl->Range.First == decl->Range.Last); 2231 assert(!ctx->ac.lds); 2232 2233 var = LLVMAddGlobalInAddressSpace(ctx->ac.module, 2234 LLVMArrayType(ctx->i8, 
sel->local_size), 2235 "compute_lds", 2236 LOCAL_ADDR_SPACE); 2237 LLVMSetAlignment(var, 4); 2238 2239 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); 2240 } 2241 2242 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) 2243 { 2244 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, 2245 ctx->param_const_and_shader_buffers); 2246 2247 return ac_build_load_to_sgpr(&ctx->ac, list_ptr, 2248 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0)); 2249 } 2250 2251 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) 2252 { 2253 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2254 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2255 2256 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); 2257 index = LLVMBuildAdd(ctx->ac.builder, index, 2258 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); 2259 2260 return ac_build_load_to_sgpr(&ctx->ac, ptr, index); 2261 } 2262 2263 static LLVMValueRef 2264 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) 2265 { 2266 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 2267 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, 2268 ctx->param_const_and_shader_buffers); 2269 2270 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); 2271 index = LLVMBuildSub(ctx->ac.builder, 2272 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0), 2273 index, ""); 2274 2275 return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); 2276 } 2277 2278 static LLVMValueRef fetch_constant( 2279 struct lp_build_tgsi_context *bld_base, 2280 const struct tgsi_full_src_register *reg, 2281 enum tgsi_opcode_type type, 2282 unsigned swizzle) 2283 { 2284 struct si_shader_context *ctx = si_shader_context(bld_base); 2285 struct si_shader_selector *sel = ctx->shader->selector; 2286 const struct tgsi_ind_register *ireg = ®->Indirect; 2287 unsigned buf, idx; 2288 2289 LLVMValueRef addr, bufp; 2290 
2291 if (swizzle == LP_CHAN_ALL) { 2292 unsigned chan; 2293 LLVMValueRef values[4]; 2294 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) 2295 values[chan] = fetch_constant(bld_base, reg, type, chan); 2296 2297 return lp_build_gather_values(&ctx->gallivm, values, 4); 2298 } 2299 2300 /* Split 64-bit loads. */ 2301 if (tgsi_type_is_64bit(type)) { 2302 LLVMValueRef lo, hi; 2303 2304 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle); 2305 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1); 2306 return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type), 2307 lo, hi); 2308 } 2309 2310 idx = reg->Register.Index * 4 + swizzle; 2311 if (reg->Register.Indirect) { 2312 addr = si_get_indirect_index(ctx, ireg, 16, idx * 4); 2313 } else { 2314 addr = LLVMConstInt(ctx->i32, idx * 4, 0); 2315 } 2316 2317 /* Fast path when user data SGPRs point to constant buffer 0 directly. */ 2318 if (sel->info.const_buffers_declared == 1 && 2319 sel->info.shader_buffers_declared == 0) { 2320 LLVMValueRef ptr = 2321 LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2322 2323 /* This enables use of s_load_dword and flat_load_dword for const buffer 0 2324 * loads, and up to x4 load opcode merging. However, it leads to horrible 2325 * code reducing SIMD wave occupancy from 8 to 2 in many cases. 2326 * 2327 * Using s_buffer_load_dword (x1) seems to be the best option right now. 2328 * 2329 * LLVM 5.0 on SI doesn't insert a required s_nop between SALU setting 2330 * a descriptor and s_buffer_load_dword using it, so we can't expand 2331 * the pointer into a full descriptor like below. We have to use 2332 * s_load_dword instead. The only case when LLVM 5.0 would select 2333 * s_buffer_load_dword (that we have to prevent) is when we use use 2334 * a literal offset where we don't need bounds checking. 
2335 */ 2336 if (ctx->screen->info.chip_class == SI && 2337 HAVE_LLVM < 0x0600 && 2338 !reg->Register.Indirect) { 2339 addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), ""); 2340 LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr); 2341 return bitcast(bld_base, type, result); 2342 } 2343 2344 /* Do the bounds checking with a descriptor, because 2345 * doing computation and manual bounds checking of 64-bit 2346 * addresses generates horrible VALU code with very high 2347 * VGPR usage and very low SIMD occupancy. 2348 */ 2349 ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, ""); 2350 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, ""); 2351 2352 LLVMValueRef desc_elems[] = { 2353 LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""), 2354 LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""), 2355 LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0), 2356 LLVMConstInt(ctx->i32, 2357 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 2358 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 2359 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 2360 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 2361 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 2362 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0) 2363 }; 2364 LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4); 2365 LLVMValueRef result = buffer_load_const(ctx, desc, addr); 2366 return bitcast(bld_base, type, result); 2367 } 2368 2369 assert(reg->Register.Dimension); 2370 buf = reg->Dimension.Index; 2371 2372 if (reg->Dimension.Indirect) { 2373 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers); 2374 LLVMValueRef index; 2375 index = si_get_bounded_indirect_index(ctx, ®->DimIndirect, 2376 reg->Dimension.Index, 2377 ctx->num_const_buffers); 2378 index = LLVMBuildAdd(ctx->ac.builder, index, 2379 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), ""); 2380 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index); 2381 } else 
2382 bufp = load_const_buffer_desc(ctx, buf); 2383 2384 return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr)); 2385 } 2386 2387 /* Upper 16 bits must be zero. */ 2388 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx, 2389 LLVMValueRef val[2]) 2390 { 2391 return LLVMBuildOr(ctx->ac.builder, val[0], 2392 LLVMBuildShl(ctx->ac.builder, val[1], 2393 LLVMConstInt(ctx->i32, 16, 0), 2394 ""), ""); 2395 } 2396 2397 /* Upper 16 bits are ignored and will be dropped. */ 2398 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx, 2399 LLVMValueRef val[2]) 2400 { 2401 LLVMValueRef v[2] = { 2402 LLVMBuildAnd(ctx->ac.builder, val[0], 2403 LLVMConstInt(ctx->i32, 0xffff, 0), ""), 2404 val[1], 2405 }; 2406 return si_llvm_pack_two_int16(ctx, v); 2407 } 2408 2409 /* Initialize arguments for the shader export intrinsic */ 2410 static void si_llvm_init_export_args(struct si_shader_context *ctx, 2411 LLVMValueRef *values, 2412 unsigned target, 2413 struct ac_export_args *args) 2414 { 2415 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); 2416 LLVMBuilderRef builder = ctx->ac.builder; 2417 LLVMValueRef val[4]; 2418 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; 2419 unsigned chan; 2420 bool is_int8, is_int10; 2421 2422 /* Default is 0xf. Adjusted below depending on the format. 
*/ 2423 args->enabled_channels = 0xf; /* writemask */ 2424 2425 /* Specify whether the EXEC mask represents the valid mask */ 2426 args->valid_mask = 0; 2427 2428 /* Specify whether this is the last export */ 2429 args->done = 0; 2430 2431 /* Specify the target we are exporting */ 2432 args->target = target; 2433 2434 if (ctx->type == PIPE_SHADER_FRAGMENT) { 2435 const struct si_shader_key *key = &ctx->shader->key; 2436 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; 2437 int cbuf = target - V_008DFC_SQ_EXP_MRT; 2438 2439 assert(cbuf >= 0 && cbuf < 8); 2440 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; 2441 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; 2442 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; 2443 } 2444 2445 args->compr = false; 2446 args->out[0] = f32undef; 2447 args->out[1] = f32undef; 2448 args->out[2] = f32undef; 2449 args->out[3] = f32undef; 2450 2451 switch (spi_shader_col_format) { 2452 case V_028714_SPI_SHADER_ZERO: 2453 args->enabled_channels = 0; /* writemask */ 2454 args->target = V_008DFC_SQ_EXP_NULL; 2455 break; 2456 2457 case V_028714_SPI_SHADER_32_R: 2458 args->enabled_channels = 1; /* writemask */ 2459 args->out[0] = values[0]; 2460 break; 2461 2462 case V_028714_SPI_SHADER_32_GR: 2463 args->enabled_channels = 0x3; /* writemask */ 2464 args->out[0] = values[0]; 2465 args->out[1] = values[1]; 2466 break; 2467 2468 case V_028714_SPI_SHADER_32_AR: 2469 args->enabled_channels = 0x9; /* writemask */ 2470 args->out[0] = values[0]; 2471 args->out[3] = values[3]; 2472 break; 2473 2474 case V_028714_SPI_SHADER_FP16_ABGR: 2475 args->compr = 1; /* COMPR flag */ 2476 2477 for (chan = 0; chan < 2; chan++) { 2478 LLVMValueRef pack_args[2] = { 2479 values[2 * chan], 2480 values[2 * chan + 1] 2481 }; 2482 LLVMValueRef packed; 2483 2484 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args); 2485 args->out[chan] = ac_to_float(&ctx->ac, packed); 2486 } 2487 break; 2488 2489 case 
V_028714_SPI_SHADER_UNORM16_ABGR: 2490 for (chan = 0; chan < 4; chan++) { 2491 val[chan] = ac_build_clamp(&ctx->ac, values[chan]); 2492 val[chan] = LLVMBuildFMul(builder, val[chan], 2493 LLVMConstReal(ctx->f32, 65535), ""); 2494 val[chan] = LLVMBuildFAdd(builder, val[chan], 2495 LLVMConstReal(ctx->f32, 0.5), ""); 2496 val[chan] = LLVMBuildFPToUI(builder, val[chan], 2497 ctx->i32, ""); 2498 } 2499 2500 args->compr = 1; /* COMPR flag */ 2501 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val)); 2502 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2)); 2503 break; 2504 2505 case V_028714_SPI_SHADER_SNORM16_ABGR: 2506 for (chan = 0; chan < 4; chan++) { 2507 /* Clamp between [-1, 1]. */ 2508 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MIN, 2509 values[chan], 2510 LLVMConstReal(ctx->f32, 1)); 2511 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MAX, 2512 val[chan], 2513 LLVMConstReal(ctx->f32, -1)); 2514 /* Convert to a signed integer in [-32767, 32767]. */ 2515 val[chan] = LLVMBuildFMul(builder, val[chan], 2516 LLVMConstReal(ctx->f32, 32767), ""); 2517 /* If positive, add 0.5, else add -0.5. */ 2518 val[chan] = LLVMBuildFAdd(builder, val[chan], 2519 LLVMBuildSelect(builder, 2520 LLVMBuildFCmp(builder, LLVMRealOGE, 2521 val[chan], ctx->ac.f32_0, ""), 2522 LLVMConstReal(ctx->f32, 0.5), 2523 LLVMConstReal(ctx->f32, -0.5), ""), ""); 2524 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, ""); 2525 } 2526 2527 args->compr = 1; /* COMPR flag */ 2528 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val)); 2529 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2)); 2530 break; 2531 2532 case V_028714_SPI_SHADER_UINT16_ABGR: { 2533 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, 2534 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0); 2535 LLVMValueRef max_alpha = 2536 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0); 2537 2538 /* Clamp. 
*/ 2539 for (chan = 0; chan < 4; chan++) { 2540 val[chan] = ac_to_integer(&ctx->ac, values[chan]); 2541 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_UMIN, 2542 val[chan], 2543 chan == 3 ? max_alpha : max_rgb); 2544 } 2545 2546 args->compr = 1; /* COMPR flag */ 2547 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val)); 2548 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2)); 2549 break; 2550 } 2551 2552 case V_028714_SPI_SHADER_SINT16_ABGR: { 2553 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, 2554 is_int8 ? 127 : is_int10 ? 511 : 32767, 0); 2555 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, 2556 is_int8 ? -128 : is_int10 ? -512 : -32768, 0); 2557 LLVMValueRef max_alpha = 2558 !is_int10 ? max_rgb : ctx->i32_1; 2559 LLVMValueRef min_alpha = 2560 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0); 2561 2562 /* Clamp. */ 2563 for (chan = 0; chan < 4; chan++) { 2564 val[chan] = ac_to_integer(&ctx->ac, values[chan]); 2565 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, 2566 TGSI_OPCODE_IMIN, 2567 val[chan], chan == 3 ? max_alpha : max_rgb); 2568 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, 2569 TGSI_OPCODE_IMAX, 2570 val[chan], chan == 3 ? 
min_alpha : min_rgb); 2571 } 2572 2573 args->compr = 1; /* COMPR flag */ 2574 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val)); 2575 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2)); 2576 break; 2577 } 2578 2579 case V_028714_SPI_SHADER_32_ABGR: 2580 memcpy(&args->out[0], values, sizeof(values[0]) * 4); 2581 break; 2582 } 2583 } 2584 2585 static void si_alpha_test(struct lp_build_tgsi_context *bld_base, 2586 LLVMValueRef alpha) 2587 { 2588 struct si_shader_context *ctx = si_shader_context(bld_base); 2589 2590 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { 2591 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { 2592 [PIPE_FUNC_LESS] = LLVMRealOLT, 2593 [PIPE_FUNC_EQUAL] = LLVMRealOEQ, 2594 [PIPE_FUNC_LEQUAL] = LLVMRealOLE, 2595 [PIPE_FUNC_GREATER] = LLVMRealOGT, 2596 [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, 2597 [PIPE_FUNC_GEQUAL] = LLVMRealOGE, 2598 }; 2599 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; 2600 assert(cond); 2601 2602 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, 2603 SI_PARAM_ALPHA_REF); 2604 LLVMValueRef alpha_pass = 2605 LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); 2606 ac_build_kill_if_false(&ctx->ac, alpha_pass); 2607 } else { 2608 ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0)); 2609 } 2610 } 2611 2612 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, 2613 LLVMValueRef alpha, 2614 unsigned samplemask_param) 2615 { 2616 struct si_shader_context *ctx = si_shader_context(bld_base); 2617 LLVMValueRef coverage; 2618 2619 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ 2620 coverage = LLVMGetParam(ctx->main_fn, 2621 samplemask_param); 2622 coverage = ac_to_integer(&ctx->ac, coverage); 2623 2624 coverage = lp_build_intrinsic(ctx->ac.builder, "llvm.ctpop.i32", 2625 ctx->i32, 2626 &coverage, 1, LP_FUNC_ATTR_READNONE); 2627 2628 coverage = 
LLVMBuildUIToFP(ctx->ac.builder, coverage, 2629 ctx->f32, ""); 2630 2631 coverage = LLVMBuildFMul(ctx->ac.builder, coverage, 2632 LLVMConstReal(ctx->f32, 2633 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); 2634 2635 return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); 2636 } 2637 2638 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, 2639 struct ac_export_args *pos, LLVMValueRef *out_elts) 2640 { 2641 unsigned reg_index; 2642 unsigned chan; 2643 unsigned const_chan; 2644 LLVMValueRef base_elt; 2645 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); 2646 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32, 2647 SI_VS_CONST_CLIP_PLANES, 0); 2648 LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); 2649 2650 for (reg_index = 0; reg_index < 2; reg_index ++) { 2651 struct ac_export_args *args = &pos[2 + reg_index]; 2652 2653 args->out[0] = 2654 args->out[1] = 2655 args->out[2] = 2656 args->out[3] = LLVMConstReal(ctx->f32, 0.0f); 2657 2658 /* Compute dot products of position and user clip plane vectors */ 2659 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 2660 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) { 2661 LLVMValueRef addr = 2662 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 + 2663 const_chan) * 4, 0); 2664 base_elt = buffer_load_const(ctx, const_resource, 2665 addr); 2666 args->out[chan] = 2667 lp_build_add(&ctx->bld_base.base, args->out[chan], 2668 lp_build_mul(&ctx->bld_base.base, base_elt, 2669 out_elts[const_chan])); 2670 } 2671 } 2672 2673 args->enabled_channels = 0xf; 2674 args->valid_mask = 0; 2675 args->done = 0; 2676 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; 2677 args->compr = 0; 2678 } 2679 } 2680 2681 static void si_dump_streamout(struct pipe_stream_output_info *so) 2682 { 2683 unsigned i; 2684 2685 if (so->num_outputs) 2686 fprintf(stderr, "STREAMOUT\n"); 2687 2688 for (i = 0; i < so->num_outputs; i++) { 2689 unsigned mask = ((1 << 
so->output[i].num_components) - 1) << 2690 so->output[i].start_component; 2691 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", 2692 i, so->output[i].output_buffer, 2693 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 2694 so->output[i].register_index, 2695 mask & 1 ? "x" : "", 2696 mask & 2 ? "y" : "", 2697 mask & 4 ? "z" : "", 2698 mask & 8 ? "w" : ""); 2699 } 2700 } 2701 2702 static void emit_streamout_output(struct si_shader_context *ctx, 2703 LLVMValueRef const *so_buffers, 2704 LLVMValueRef const *so_write_offsets, 2705 struct pipe_stream_output *stream_out, 2706 struct si_shader_output_values *shader_out) 2707 { 2708 unsigned buf_idx = stream_out->output_buffer; 2709 unsigned start = stream_out->start_component; 2710 unsigned num_comps = stream_out->num_components; 2711 LLVMValueRef out[4]; 2712 2713 assert(num_comps && num_comps <= 4); 2714 if (!num_comps || num_comps > 4) 2715 return; 2716 2717 /* Load the output as int. */ 2718 for (int j = 0; j < num_comps; j++) { 2719 assert(stream_out->stream == shader_out->vertex_stream[start + j]); 2720 2721 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); 2722 } 2723 2724 /* Pack the output. 
*/ 2725 LLVMValueRef vdata = NULL; 2726 2727 switch (num_comps) { 2728 case 1: /* as i32 */ 2729 vdata = out[0]; 2730 break; 2731 case 2: /* as v2i32 */ 2732 case 3: /* as v4i32 (aligned to 4) */ 2733 case 4: /* as v4i32 */ 2734 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); 2735 for (int j = 0; j < num_comps; j++) { 2736 vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j], 2737 LLVMConstInt(ctx->i32, j, 0), ""); 2738 } 2739 break; 2740 } 2741 2742 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], 2743 vdata, num_comps, 2744 so_write_offsets[buf_idx], 2745 ctx->i32_0, 2746 stream_out->dst_offset * 4, 1, 1, true, false); 2747 } 2748 2749 /** 2750 * Write streamout data to buffers for vertex stream @p stream (different 2751 * vertex streams can occur for GS copy shaders). 2752 */ 2753 static void si_llvm_emit_streamout(struct si_shader_context *ctx, 2754 struct si_shader_output_values *outputs, 2755 unsigned noutput, unsigned stream) 2756 { 2757 struct si_shader_selector *sel = ctx->shader->selector; 2758 struct pipe_stream_output_info *so = &sel->so; 2759 LLVMBuilderRef builder = ctx->ac.builder; 2760 int i; 2761 struct lp_build_if_state if_ctx; 2762 2763 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ 2764 LLVMValueRef so_vtx_count = 2765 unpack_param(ctx, ctx->param_streamout_config, 16, 7); 2766 2767 LLVMValueRef tid = ac_get_thread_id(&ctx->ac); 2768 2769 /* can_emit = tid < so_vtx_count; */ 2770 LLVMValueRef can_emit = 2771 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); 2772 2773 /* Emit the streamout code conditionally. This actually avoids 2774 * out-of-bounds buffer access. The hw tells us via the SGPR 2775 * (so_vtx_count) which threads are allowed to emit streamout data. 
*/ 2776 lp_build_if(&if_ctx, &ctx->gallivm, can_emit); 2777 { 2778 /* The buffer offset is computed as follows: 2779 * ByteOffset = streamout_offset[buffer_id]*4 + 2780 * (streamout_write_index + thread_id)*stride[buffer_id] + 2781 * attrib_offset 2782 */ 2783 2784 LLVMValueRef so_write_index = 2785 LLVMGetParam(ctx->main_fn, 2786 ctx->param_streamout_write_index); 2787 2788 /* Compute (streamout_write_index + thread_id). */ 2789 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); 2790 2791 /* Load the descriptor and compute the write offset for each 2792 * enabled buffer. */ 2793 LLVMValueRef so_write_offset[4] = {}; 2794 LLVMValueRef so_buffers[4]; 2795 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 2796 ctx->param_rw_buffers); 2797 2798 for (i = 0; i < 4; i++) { 2799 if (!so->stride[i]) 2800 continue; 2801 2802 LLVMValueRef offset = LLVMConstInt(ctx->i32, 2803 SI_VS_STREAMOUT_BUF0 + i, 0); 2804 2805 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 2806 2807 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, 2808 ctx->param_streamout_offset[i]); 2809 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); 2810 2811 so_write_offset[i] = LLVMBuildMul(builder, so_write_index, 2812 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), ""); 2813 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, ""); 2814 } 2815 2816 /* Write streamout data. 
*/ 2817 for (i = 0; i < so->num_outputs; i++) { 2818 unsigned reg = so->output[i].register_index; 2819 2820 if (reg >= noutput) 2821 continue; 2822 2823 if (stream != so->output[i].stream) 2824 continue; 2825 2826 emit_streamout_output(ctx, so_buffers, so_write_offset, 2827 &so->output[i], &outputs[reg]); 2828 } 2829 } 2830 lp_build_endif(&if_ctx); 2831 } 2832 2833 static void si_export_param(struct si_shader_context *ctx, unsigned index, 2834 LLVMValueRef *values) 2835 { 2836 struct ac_export_args args; 2837 2838 si_llvm_init_export_args(ctx, values, 2839 V_008DFC_SQ_EXP_PARAM + index, &args); 2840 ac_build_export(&ctx->ac, &args); 2841 } 2842 2843 static void si_build_param_exports(struct si_shader_context *ctx, 2844 struct si_shader_output_values *outputs, 2845 unsigned noutput) 2846 { 2847 struct si_shader *shader = ctx->shader; 2848 unsigned param_count = 0; 2849 2850 for (unsigned i = 0; i < noutput; i++) { 2851 unsigned semantic_name = outputs[i].semantic_name; 2852 unsigned semantic_index = outputs[i].semantic_index; 2853 2854 if (outputs[i].vertex_stream[0] != 0 && 2855 outputs[i].vertex_stream[1] != 0 && 2856 outputs[i].vertex_stream[2] != 0 && 2857 outputs[i].vertex_stream[3] != 0) 2858 continue; 2859 2860 switch (semantic_name) { 2861 case TGSI_SEMANTIC_LAYER: 2862 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2863 case TGSI_SEMANTIC_CLIPDIST: 2864 case TGSI_SEMANTIC_COLOR: 2865 case TGSI_SEMANTIC_BCOLOR: 2866 case TGSI_SEMANTIC_PRIMID: 2867 case TGSI_SEMANTIC_FOG: 2868 case TGSI_SEMANTIC_TEXCOORD: 2869 case TGSI_SEMANTIC_GENERIC: 2870 break; 2871 default: 2872 continue; 2873 } 2874 2875 if ((semantic_name != TGSI_SEMANTIC_GENERIC || 2876 semantic_index < SI_MAX_IO_GENERIC) && 2877 shader->key.opt.kill_outputs & 2878 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index))) 2879 continue; 2880 2881 si_export_param(ctx, param_count, outputs[i].values); 2882 2883 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); 2884 
shader->info.vs_output_param_offset[i] = param_count++; 2885 } 2886 2887 shader->info.nr_param_exports = param_count; 2888 } 2889 2890 /* Generate export instructions for hardware VS shader stage */ 2891 static void si_llvm_export_vs(struct si_shader_context *ctx, 2892 struct si_shader_output_values *outputs, 2893 unsigned noutput) 2894 { 2895 struct si_shader *shader = ctx->shader; 2896 struct ac_export_args pos_args[4] = {}; 2897 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; 2898 unsigned pos_idx; 2899 int i; 2900 2901 /* Build position exports. */ 2902 for (i = 0; i < noutput; i++) { 2903 switch (outputs[i].semantic_name) { 2904 case TGSI_SEMANTIC_POSITION: 2905 si_llvm_init_export_args(ctx, outputs[i].values, 2906 V_008DFC_SQ_EXP_POS, &pos_args[0]); 2907 break; 2908 case TGSI_SEMANTIC_PSIZE: 2909 psize_value = outputs[i].values[0]; 2910 break; 2911 case TGSI_SEMANTIC_LAYER: 2912 layer_value = outputs[i].values[0]; 2913 break; 2914 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2915 viewport_index_value = outputs[i].values[0]; 2916 break; 2917 case TGSI_SEMANTIC_EDGEFLAG: 2918 edgeflag_value = outputs[i].values[0]; 2919 break; 2920 case TGSI_SEMANTIC_CLIPDIST: 2921 if (!shader->key.opt.clip_disable) { 2922 unsigned index = 2 + outputs[i].semantic_index; 2923 si_llvm_init_export_args(ctx, outputs[i].values, 2924 V_008DFC_SQ_EXP_POS + index, 2925 &pos_args[index]); 2926 } 2927 break; 2928 case TGSI_SEMANTIC_CLIPVERTEX: 2929 if (!shader->key.opt.clip_disable) { 2930 si_llvm_emit_clipvertex(ctx, pos_args, 2931 outputs[i].values); 2932 } 2933 break; 2934 } 2935 } 2936 2937 /* We need to add the position output manually if it's missing. */ 2938 if (!pos_args[0].out[0]) { 2939 pos_args[0].enabled_channels = 0xf; /* writemask */ 2940 pos_args[0].valid_mask = 0; /* EXEC mask */ 2941 pos_args[0].done = 0; /* last export? 
*/ 2942 pos_args[0].target = V_008DFC_SQ_EXP_POS; 2943 pos_args[0].compr = 0; /* COMPR flag */ 2944 pos_args[0].out[0] = ctx->ac.f32_0; /* X */ 2945 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ 2946 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ 2947 pos_args[0].out[3] = ctx->ac.f32_1; /* W */ 2948 } 2949 2950 /* Write the misc vector (point size, edgeflag, layer, viewport). */ 2951 if (shader->selector->info.writes_psize || 2952 shader->selector->info.writes_edgeflag || 2953 shader->selector->info.writes_viewport_index || 2954 shader->selector->info.writes_layer) { 2955 pos_args[1].enabled_channels = shader->selector->info.writes_psize | 2956 (shader->selector->info.writes_edgeflag << 1) | 2957 (shader->selector->info.writes_layer << 2); 2958 2959 pos_args[1].valid_mask = 0; /* EXEC mask */ 2960 pos_args[1].done = 0; /* last export? */ 2961 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; 2962 pos_args[1].compr = 0; /* COMPR flag */ 2963 pos_args[1].out[0] = ctx->ac.f32_0; /* X */ 2964 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ 2965 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ 2966 pos_args[1].out[3] = ctx->ac.f32_0; /* W */ 2967 2968 if (shader->selector->info.writes_psize) 2969 pos_args[1].out[0] = psize_value; 2970 2971 if (shader->selector->info.writes_edgeflag) { 2972 /* The output is a float, but the hw expects an integer 2973 * with the first bit containing the edge flag. */ 2974 edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, 2975 edgeflag_value, 2976 ctx->i32, ""); 2977 edgeflag_value = ac_build_umin(&ctx->ac, 2978 edgeflag_value, 2979 ctx->i32_1); 2980 2981 /* The LLVM intrinsic expects a float. */ 2982 pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); 2983 } 2984 2985 if (ctx->screen->info.chip_class >= GFX9) { 2986 /* GFX9 has the layer in out.z[10:0] and the viewport 2987 * index in out.z[19:16]. 
2988 */ 2989 if (shader->selector->info.writes_layer) 2990 pos_args[1].out[2] = layer_value; 2991 2992 if (shader->selector->info.writes_viewport_index) { 2993 LLVMValueRef v = viewport_index_value; 2994 2995 v = ac_to_integer(&ctx->ac, v); 2996 v = LLVMBuildShl(ctx->ac.builder, v, 2997 LLVMConstInt(ctx->i32, 16, 0), ""); 2998 v = LLVMBuildOr(ctx->ac.builder, v, 2999 ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); 3000 pos_args[1].out[2] = ac_to_float(&ctx->ac, v); 3001 pos_args[1].enabled_channels |= 1 << 2; 3002 } 3003 } else { 3004 if (shader->selector->info.writes_layer) 3005 pos_args[1].out[2] = layer_value; 3006 3007 if (shader->selector->info.writes_viewport_index) { 3008 pos_args[1].out[3] = viewport_index_value; 3009 pos_args[1].enabled_channels |= 1 << 3; 3010 } 3011 } 3012 } 3013 3014 for (i = 0; i < 4; i++) 3015 if (pos_args[i].out[0]) 3016 shader->info.nr_pos_exports++; 3017 3018 pos_idx = 0; 3019 for (i = 0; i < 4; i++) { 3020 if (!pos_args[i].out[0]) 3021 continue; 3022 3023 /* Specify the target we are exporting */ 3024 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; 3025 3026 if (pos_idx == shader->info.nr_pos_exports) 3027 /* Specify that this is the last export */ 3028 pos_args[i].done = 1; 3029 3030 ac_build_export(&ctx->ac, &pos_args[i]); 3031 } 3032 3033 /* Build parameter exports. */ 3034 si_build_param_exports(ctx, outputs, noutput); 3035 } 3036 3037 /** 3038 * Forward all outputs from the vertex shader to the TES. This is only used 3039 * for the fixed function TCS. 
3040 */ 3041 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) 3042 { 3043 struct si_shader_context *ctx = si_shader_context(bld_base); 3044 LLVMValueRef invocation_id, buffer, buffer_offset; 3045 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base; 3046 uint64_t inputs; 3047 3048 invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 3049 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 3050 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 3051 3052 lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); 3053 lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id, 3054 lds_vertex_stride, ""); 3055 lds_base = get_tcs_in_current_patch_offset(ctx); 3056 lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, ""); 3057 3058 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; 3059 while (inputs) { 3060 unsigned i = u_bit_scan64(&inputs); 3061 3062 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, 3063 LLVMConstInt(ctx->i32, 4 * i, 0), 3064 ""); 3065 3066 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, 3067 get_rel_patch_id(ctx), 3068 invocation_id, 3069 LLVMConstInt(ctx->i32, i, 0)); 3070 3071 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0, 3072 lds_ptr); 3073 3074 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, 3075 buffer_offset, 0, 1, 0, true, false); 3076 } 3077 } 3078 3079 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, 3080 LLVMValueRef rel_patch_id, 3081 LLVMValueRef invocation_id, 3082 LLVMValueRef tcs_out_current_patch_data_offset, 3083 LLVMValueRef invoc0_tf_outer[4], 3084 LLVMValueRef invoc0_tf_inner[2]) 3085 { 3086 struct si_shader_context *ctx = si_shader_context(bld_base); 3087 struct si_shader *shader = ctx->shader; 3088 unsigned tess_inner_index, tess_outer_index; 3089 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; 3090 LLVMValueRef out[6], 
vec0, vec1, tf_base, inner[4], outer[4]; 3091 unsigned stride, outer_comps, inner_comps, i, offset; 3092 struct lp_build_if_state if_ctx, inner_if_ctx; 3093 3094 /* Add a barrier before loading tess factors from LDS. */ 3095 if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) 3096 si_llvm_emit_barrier(NULL, bld_base, NULL); 3097 3098 /* Do this only for invocation 0, because the tess levels are per-patch, 3099 * not per-vertex. 3100 * 3101 * This can't jump, because invocation 0 executes this. It should 3102 * at least mask out the loads and stores for other invocations. 3103 */ 3104 lp_build_if(&if_ctx, &ctx->gallivm, 3105 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, 3106 invocation_id, ctx->i32_0, "")); 3107 3108 /* Determine the layout of one tess factor element in the buffer. */ 3109 switch (shader->key.part.tcs.epilog.prim_mode) { 3110 case PIPE_PRIM_LINES: 3111 stride = 2; /* 2 dwords, 1 vec2 store */ 3112 outer_comps = 2; 3113 inner_comps = 0; 3114 break; 3115 case PIPE_PRIM_TRIANGLES: 3116 stride = 4; /* 4 dwords, 1 vec4 store */ 3117 outer_comps = 3; 3118 inner_comps = 1; 3119 break; 3120 case PIPE_PRIM_QUADS: 3121 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ 3122 outer_comps = 4; 3123 inner_comps = 2; 3124 break; 3125 default: 3126 assert(0); 3127 return; 3128 } 3129 3130 for (i = 0; i < 4; i++) { 3131 inner[i] = LLVMGetUndef(ctx->i32); 3132 outer[i] = LLVMGetUndef(ctx->i32); 3133 } 3134 3135 if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { 3136 /* Tess factors are in VGPRs. */ 3137 for (i = 0; i < outer_comps; i++) 3138 outer[i] = out[i] = invoc0_tf_outer[i]; 3139 for (i = 0; i < inner_comps; i++) 3140 inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; 3141 } else { 3142 /* Load tess_inner and tess_outer from LDS. 3143 * Any invocation can write them, so we can't get them from a temporary. 
3144 */ 3145 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); 3146 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); 3147 3148 lds_base = tcs_out_current_patch_data_offset; 3149 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, 3150 LLVMConstInt(ctx->i32, 3151 tess_inner_index * 4, 0), ""); 3152 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, 3153 LLVMConstInt(ctx->i32, 3154 tess_outer_index * 4, 0), ""); 3155 3156 for (i = 0; i < outer_comps; i++) { 3157 outer[i] = out[i] = 3158 lds_load(bld_base, ctx->ac.i32, i, lds_outer); 3159 } 3160 for (i = 0; i < inner_comps; i++) { 3161 inner[i] = out[outer_comps+i] = 3162 lds_load(bld_base, ctx->ac.i32, i, lds_inner); 3163 } 3164 } 3165 3166 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { 3167 /* For isolines, the hardware expects tess factors in the 3168 * reverse order from what GLSL / TGSI specify. 3169 */ 3170 LLVMValueRef tmp = out[0]; 3171 out[0] = out[1]; 3172 out[1] = tmp; 3173 } 3174 3175 /* Convert the outputs to vectors for stores. */ 3176 vec0 = lp_build_gather_values(&ctx->gallivm, out, MIN2(stride, 4)); 3177 vec1 = NULL; 3178 3179 if (stride > 4) 3180 vec1 = lp_build_gather_values(&ctx->gallivm, out+4, stride - 4); 3181 3182 /* Get the buffer. */ 3183 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k); 3184 3185 /* Get the offset. */ 3186 tf_base = LLVMGetParam(ctx->main_fn, 3187 ctx->param_tcs_factor_offset); 3188 byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, 3189 LLVMConstInt(ctx->i32, 4 * stride, 0), ""); 3190 3191 lp_build_if(&inner_if_ctx, &ctx->gallivm, 3192 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, 3193 rel_patch_id, ctx->i32_0, "")); 3194 3195 /* Store the dynamic HS control word. 
*/ 3196 offset = 0; 3197 if (ctx->screen->info.chip_class <= VI) { 3198 ac_build_buffer_store_dword(&ctx->ac, buffer, 3199 LLVMConstInt(ctx->i32, 0x80000000, 0), 3200 1, ctx->i32_0, tf_base, 3201 offset, 1, 0, true, false); 3202 offset += 4; 3203 } 3204 3205 lp_build_endif(&inner_if_ctx); 3206 3207 /* Store the tessellation factors. */ 3208 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, 3209 MIN2(stride, 4), byteoffset, tf_base, 3210 offset, 1, 0, true, false); 3211 offset += 16; 3212 if (vec1) 3213 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, 3214 stride - 4, byteoffset, tf_base, 3215 offset, 1, 0, true, false); 3216 3217 /* Store the tess factors into the offchip buffer if TES reads them. */ 3218 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { 3219 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; 3220 LLVMValueRef tf_inner_offset; 3221 unsigned param_outer, param_inner; 3222 3223 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k); 3224 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset); 3225 3226 param_outer = si_shader_io_get_unique_index_patch( 3227 TGSI_SEMANTIC_TESSOUTER, 0); 3228 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, 3229 LLVMConstInt(ctx->i32, param_outer, 0)); 3230 3231 outer_vec = lp_build_gather_values(&ctx->gallivm, outer, 3232 util_next_power_of_two(outer_comps)); 3233 3234 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, 3235 outer_comps, tf_outer_offset, 3236 base, 0, 1, 0, true, false); 3237 if (inner_comps) { 3238 param_inner = si_shader_io_get_unique_index_patch( 3239 TGSI_SEMANTIC_TESSINNER, 0); 3240 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, 3241 LLVMConstInt(ctx->i32, param_inner, 0)); 3242 3243 inner_vec = inner_comps == 1 ? 
inner[0] : 3244 lp_build_gather_values(&ctx->gallivm, inner, inner_comps); 3245 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, 3246 inner_comps, tf_inner_offset, 3247 base, 0, 1, 0, true, false); 3248 } 3249 } 3250 3251 lp_build_endif(&if_ctx); 3252 } 3253 3254 static LLVMValueRef 3255 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, 3256 unsigned param, unsigned return_index) 3257 { 3258 return LLVMBuildInsertValue(ctx->ac.builder, ret, 3259 LLVMGetParam(ctx->main_fn, param), 3260 return_index, ""); 3261 } 3262 3263 static LLVMValueRef 3264 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, 3265 unsigned param, unsigned return_index) 3266 { 3267 LLVMBuilderRef builder = ctx->ac.builder; 3268 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param); 3269 3270 return LLVMBuildInsertValue(builder, ret, 3271 ac_to_float(&ctx->ac, p), 3272 return_index, ""); 3273 } 3274 3275 static LLVMValueRef 3276 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret, 3277 unsigned param, unsigned return_index) 3278 { 3279 LLVMBuilderRef builder = ctx->ac.builder; 3280 LLVMValueRef ptr, lo, hi; 3281 3282 ptr = LLVMGetParam(ctx->main_fn, param); 3283 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, ""); 3284 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, ""); 3285 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, ""); 3286 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, ""); 3287 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, ""); 3288 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, ""); 3289 } 3290 3291 /* This only writes the tessellation factor levels. 
*/ 3292 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, 3293 unsigned max_outputs, 3294 LLVMValueRef *addrs) 3295 { 3296 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3297 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 3298 LLVMBuilderRef builder = ctx->ac.builder; 3299 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; 3300 3301 si_copy_tcs_inputs(bld_base); 3302 3303 rel_patch_id = get_rel_patch_id(ctx); 3304 invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5); 3305 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); 3306 3307 if (ctx->screen->info.chip_class >= GFX9) { 3308 LLVMBasicBlockRef blocks[2] = { 3309 LLVMGetInsertBlock(builder), 3310 ctx->merged_wrap_if_state.entry_block 3311 }; 3312 LLVMValueRef values[2]; 3313 3314 lp_build_endif(&ctx->merged_wrap_if_state); 3315 3316 values[0] = rel_patch_id; 3317 values[1] = LLVMGetUndef(ctx->i32); 3318 rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); 3319 3320 values[0] = tf_lds_offset; 3321 values[1] = LLVMGetUndef(ctx->i32); 3322 tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); 3323 3324 values[0] = invocation_id; 3325 values[1] = ctx->i32_1; /* cause the epilog to skip threads */ 3326 invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks); 3327 } 3328 3329 /* Return epilog parameters from this function. */ 3330 LLVMValueRef ret = ctx->return_value; 3331 unsigned vgpr; 3332 3333 if (ctx->screen->info.chip_class >= GFX9) { 3334 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 3335 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); 3336 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k, 3337 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K); 3338 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k, 3339 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K); 3340 /* Tess offchip and tess factor offsets are at the beginning. 
*/ 3341 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); 3342 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); 3343 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1; 3344 } else { 3345 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 3346 GFX6_SGPR_TCS_OFFCHIP_LAYOUT); 3347 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k, 3348 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K); 3349 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k, 3350 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K); 3351 /* Tess offchip and tess factor offsets are after user SGPRs. */ 3352 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 3353 GFX6_TCS_NUM_USER_SGPR); 3354 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 3355 GFX6_TCS_NUM_USER_SGPR + 1); 3356 vgpr = GFX6_TCS_NUM_USER_SGPR + 2; 3357 } 3358 3359 /* VGPRs */ 3360 rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); 3361 invocation_id = ac_to_float(&ctx->ac, invocation_id); 3362 tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); 3363 3364 /* Leave a hole corresponding to the two input VGPRs. This ensures that 3365 * the invocation_id output does not alias the tcs_rel_ids input, 3366 * which saves a V_MOV on gfx9. 
3367 */ 3368 vgpr += 2; 3369 3370 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); 3371 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); 3372 3373 if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { 3374 vgpr++; /* skip the tess factor LDS offset */ 3375 for (unsigned i = 0; i < 6; i++) { 3376 LLVMValueRef value = 3377 LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); 3378 value = ac_to_float(&ctx->ac, value); 3379 ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); 3380 } 3381 } else { 3382 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); 3383 } 3384 ctx->return_value = ret; 3385 } 3386 3387 /* Pass TCS inputs from LS to TCS on GFX9. */ 3388 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) 3389 { 3390 LLVMValueRef ret = ctx->return_value; 3391 3392 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2); 3393 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); 3394 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4); 3395 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); 3396 3397 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 3398 8 + SI_SGPR_RW_BUFFERS); 3399 ret = si_insert_input_ptr_as_2xi32(ctx, ret, 3400 ctx->param_bindless_samplers_and_images, 3401 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); 3402 3403 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits, 3404 8 + SI_SGPR_VS_STATE_BITS); 3405 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout, 3406 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); 3407 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets, 3408 8 + GFX9_SGPR_TCS_OUT_OFFSETS); 3409 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout, 3410 8 + GFX9_SGPR_TCS_OUT_LAYOUT); 3411 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k, 3412 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K); 3413 ret 
= si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k, 3414 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K); 3415 3416 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2; 3417 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param, 3418 8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS); 3419 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1, 3420 8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES); 3421 3422 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; 3423 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 3424 ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id), 3425 vgpr++, ""); 3426 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 3427 ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids), 3428 vgpr++, ""); 3429 ctx->return_value = ret; 3430 } 3431 3432 /* Pass GS inputs from ES to GS on GFX9. */ 3433 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) 3434 { 3435 LLVMValueRef ret = ctx->return_value; 3436 3437 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2); 3438 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3); 3439 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5); 3440 3441 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 3442 8 + SI_SGPR_RW_BUFFERS); 3443 ret = si_insert_input_ptr_as_2xi32(ctx, ret, 3444 ctx->param_bindless_samplers_and_images, 3445 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); 3446 3447 unsigned desc_param = ctx->param_vs_state_bits + 1; 3448 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param, 3449 8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS); 3450 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1, 3451 8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES); 3452 3453 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR; 3454 for (unsigned i = 0; i < 5; i++) { 3455 unsigned param = ctx->param_gs_vtx01_offset + i; 3456 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++); 3457 } 3458 ctx->return_value = ret; 3459 } 3460 3461 static void si_llvm_emit_ls_epilogue(struct 
ac_shader_abi *abi, 3462 unsigned max_outputs, 3463 LLVMValueRef *addrs) 3464 { 3465 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3466 struct si_shader *shader = ctx->shader; 3467 struct tgsi_shader_info *info = &shader->selector->info; 3468 unsigned i, chan; 3469 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn, 3470 ctx->param_rel_auto_id); 3471 LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); 3472 LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, 3473 vertex_dw_stride, ""); 3474 3475 /* Write outputs to LDS. The next shader (TCS aka HS) will read 3476 * its inputs from it. */ 3477 for (i = 0; i < info->num_outputs; i++) { 3478 unsigned name = info->output_semantic_name[i]; 3479 unsigned index = info->output_semantic_index[i]; 3480 3481 /* The ARB_shader_viewport_layer_array spec contains the 3482 * following issue: 3483 * 3484 * 2) What happens if gl_ViewportIndex or gl_Layer is 3485 * written in the vertex shader and a geometry shader is 3486 * present? 3487 * 3488 * RESOLVED: The value written by the last vertex processing 3489 * stage is used. If the last vertex processing stage 3490 * (vertex, tessellation evaluation or geometry) does not 3491 * statically assign to gl_ViewportIndex or gl_Layer, index 3492 * or layer zero is assumed. 3493 * 3494 * So writes to those outputs in VS-as-LS are simply ignored. 
3495 */ 3496 if (name == TGSI_SEMANTIC_LAYER || 3497 name == TGSI_SEMANTIC_VIEWPORT_INDEX) 3498 continue; 3499 3500 int param = si_shader_io_get_unique_index(name, index); 3501 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, 3502 LLVMConstInt(ctx->i32, param * 4, 0), ""); 3503 3504 for (chan = 0; chan < 4; chan++) { 3505 if (!(info->output_usagemask[i] & (1 << chan))) 3506 continue; 3507 3508 lds_store(ctx, chan, dw_addr, 3509 LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); 3510 } 3511 } 3512 3513 if (ctx->screen->info.chip_class >= GFX9) 3514 si_set_ls_return_value_for_tcs(ctx); 3515 } 3516 3517 static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, 3518 unsigned max_outputs, 3519 LLVMValueRef *addrs) 3520 { 3521 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3522 struct si_shader *es = ctx->shader; 3523 struct tgsi_shader_info *info = &es->selector->info; 3524 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 3525 ctx->param_es2gs_offset); 3526 LLVMValueRef lds_base = NULL; 3527 unsigned chan; 3528 int i; 3529 3530 if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { 3531 unsigned itemsize_dw = es->selector->esgs_itemsize / 4; 3532 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); 3533 LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4); 3534 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, 3535 LLVMBuildMul(ctx->ac.builder, wave_idx, 3536 LLVMConstInt(ctx->i32, 64, false), ""), ""); 3537 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, 3538 LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); 3539 } 3540 3541 for (i = 0; i < info->num_outputs; i++) { 3542 int param; 3543 3544 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || 3545 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) 3546 continue; 3547 3548 param = si_shader_io_get_unique_index(info->output_semantic_name[i], 3549 info->output_semantic_index[i]); 3550 3551 for (chan = 0; chan < 4; 
chan++) { 3552 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); 3553 out_val = ac_to_integer(&ctx->ac, out_val); 3554 3555 /* GFX9 has the ESGS ring in LDS. */ 3556 if (ctx->screen->info.chip_class >= GFX9) { 3557 lds_store(ctx, param * 4 + chan, lds_base, out_val); 3558 continue; 3559 } 3560 3561 ac_build_buffer_store_dword(&ctx->ac, 3562 ctx->esgs_ring, 3563 out_val, 1, NULL, soffset, 3564 (4 * param + chan) * 4, 3565 1, 1, true, true); 3566 } 3567 } 3568 3569 if (ctx->screen->info.chip_class >= GFX9) 3570 si_set_es_return_value_for_gs(ctx); 3571 } 3572 3573 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) 3574 { 3575 if (ctx->screen->info.chip_class >= GFX9) 3576 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8); 3577 else 3578 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id); 3579 } 3580 3581 static void emit_gs_epilogue(struct si_shader_context *ctx) 3582 { 3583 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, 3584 si_get_gs_wave_id(ctx)); 3585 3586 if (ctx->screen->info.chip_class >= GFX9) 3587 lp_build_endif(&ctx->merged_wrap_if_state); 3588 } 3589 3590 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, 3591 unsigned max_outputs, 3592 LLVMValueRef *addrs) 3593 { 3594 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3595 struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info; 3596 3597 assert(info->num_outputs <= max_outputs); 3598 3599 emit_gs_epilogue(ctx); 3600 } 3601 3602 static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) 3603 { 3604 struct si_shader_context *ctx = si_shader_context(bld_base); 3605 emit_gs_epilogue(ctx); 3606 } 3607 3608 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, 3609 unsigned max_outputs, 3610 LLVMValueRef *addrs) 3611 { 3612 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3613 struct tgsi_shader_info *info = &ctx->shader->selector->info; 3614 
struct si_shader_output_values *outputs = NULL; 3615 int i,j; 3616 3617 assert(!ctx->shader->is_gs_copy_shader); 3618 assert(info->num_outputs <= max_outputs); 3619 3620 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); 3621 3622 /* Vertex color clamping. 3623 * 3624 * This uses a state constant loaded in a user data SGPR and 3625 * an IF statement is added that clamps all colors if the constant 3626 * is true. 3627 */ 3628 if (ctx->type == PIPE_SHADER_VERTEX) { 3629 struct lp_build_if_state if_ctx; 3630 LLVMValueRef cond = NULL; 3631 LLVMValueRef addr, val; 3632 3633 for (i = 0; i < info->num_outputs; i++) { 3634 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && 3635 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) 3636 continue; 3637 3638 /* We've found a color. */ 3639 if (!cond) { 3640 /* The state is in the first bit of the user SGPR. */ 3641 cond = LLVMGetParam(ctx->main_fn, 3642 ctx->param_vs_state_bits); 3643 cond = LLVMBuildTrunc(ctx->ac.builder, cond, 3644 ctx->i1, ""); 3645 lp_build_if(&if_ctx, &ctx->gallivm, cond); 3646 } 3647 3648 for (j = 0; j < 4; j++) { 3649 addr = addrs[4 * i + j]; 3650 val = LLVMBuildLoad(ctx->ac.builder, addr, ""); 3651 val = ac_build_clamp(&ctx->ac, val); 3652 LLVMBuildStore(ctx->ac.builder, val, addr); 3653 } 3654 } 3655 3656 if (cond) 3657 lp_build_endif(&if_ctx); 3658 } 3659 3660 for (i = 0; i < info->num_outputs; i++) { 3661 outputs[i].semantic_name = info->output_semantic_name[i]; 3662 outputs[i].semantic_index = info->output_semantic_index[i]; 3663 3664 for (j = 0; j < 4; j++) { 3665 outputs[i].values[j] = 3666 LLVMBuildLoad(ctx->ac.builder, 3667 addrs[4 * i + j], 3668 ""); 3669 outputs[i].vertex_stream[j] = 3670 (info->output_streams[i] >> (2 * j)) & 3; 3671 } 3672 } 3673 3674 if (ctx->shader->selector->so.num_outputs) 3675 si_llvm_emit_streamout(ctx, outputs, i, 0); 3676 3677 /* Export PrimitiveID. 
*/ 3678 if (ctx->shader->key.mono.u.vs_export_prim_id) { 3679 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; 3680 outputs[i].semantic_index = 0; 3681 outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0)); 3682 for (j = 1; j < 4; j++) 3683 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0); 3684 3685 memset(outputs[i].vertex_stream, 0, 3686 sizeof(outputs[i].vertex_stream)); 3687 i++; 3688 } 3689 3690 si_llvm_export_vs(ctx, outputs, i); 3691 FREE(outputs); 3692 } 3693 3694 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) 3695 { 3696 struct si_shader_context *ctx = si_shader_context(bld_base); 3697 3698 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS, 3699 &ctx->outputs[0][0]); 3700 } 3701 3702 struct si_ps_exports { 3703 unsigned num; 3704 struct ac_export_args args[10]; 3705 }; 3706 3707 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, 3708 LLVMValueRef depth, LLVMValueRef stencil, 3709 LLVMValueRef samplemask, struct si_ps_exports *exp) 3710 { 3711 struct si_shader_context *ctx = si_shader_context(bld_base); 3712 struct ac_export_args args; 3713 3714 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); 3715 3716 memcpy(&exp->args[exp->num++], &args, sizeof(args)); 3717 } 3718 3719 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, 3720 LLVMValueRef *color, unsigned index, 3721 unsigned samplemask_param, 3722 bool is_last, struct si_ps_exports *exp) 3723 { 3724 struct si_shader_context *ctx = si_shader_context(bld_base); 3725 int i; 3726 3727 /* Clamp color */ 3728 if (ctx->shader->key.part.ps.epilog.clamp_color) 3729 for (i = 0; i < 4; i++) 3730 color[i] = ac_build_clamp(&ctx->ac, color[i]); 3731 3732 /* Alpha to one */ 3733 if (ctx->shader->key.part.ps.epilog.alpha_to_one) 3734 color[3] = ctx->ac.f32_1; 3735 3736 /* Alpha test */ 3737 if (index == 0 && 3738 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) 3739 si_alpha_test(bld_base, color[3]); 3740 
3741 /* Line & polygon smoothing */ 3742 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) 3743 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], 3744 samplemask_param); 3745 3746 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 3747 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { 3748 struct ac_export_args args[8]; 3749 int c, last = -1; 3750 3751 /* Get the export arguments, also find out what the last one is. */ 3752 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3753 si_llvm_init_export_args(ctx, color, 3754 V_008DFC_SQ_EXP_MRT + c, &args[c]); 3755 if (args[c].enabled_channels) 3756 last = c; 3757 } 3758 3759 /* Emit all exports. */ 3760 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3761 if (is_last && last == c) { 3762 args[c].valid_mask = 1; /* whether the EXEC mask is valid */ 3763 args[c].done = 1; /* DONE bit */ 3764 } else if (!args[c].enabled_channels) 3765 continue; /* unnecessary NULL export */ 3766 3767 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); 3768 } 3769 } else { 3770 struct ac_export_args args; 3771 3772 /* Export */ 3773 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, 3774 &args); 3775 if (is_last) { 3776 args.valid_mask = 1; /* whether the EXEC mask is valid */ 3777 args.done = 1; /* DONE bit */ 3778 } else if (!args.enabled_channels) 3779 return; /* unnecessary NULL export */ 3780 3781 memcpy(&exp->args[exp->num++], &args, sizeof(args)); 3782 } 3783 } 3784 3785 static void si_emit_ps_exports(struct si_shader_context *ctx, 3786 struct si_ps_exports *exp) 3787 { 3788 for (unsigned i = 0; i < exp->num; i++) 3789 ac_build_export(&ctx->ac, &exp->args[i]); 3790 } 3791 3792 static void si_export_null(struct lp_build_tgsi_context *bld_base) 3793 { 3794 struct si_shader_context *ctx = si_shader_context(bld_base); 3795 struct lp_build_context *base = &bld_base->base; 3796 struct ac_export_args args; 3797 3798 args.enabled_channels = 0x0; /* 
enabled channels */ 3799 args.valid_mask = 1; /* whether the EXEC mask is valid */ 3800 args.done = 1; /* DONE bit */ 3801 args.target = V_008DFC_SQ_EXP_NULL; 3802 args.compr = 0; /* COMPR flag (0 = 32-bit export) */ 3803 args.out[0] = base->undef; /* R */ 3804 args.out[1] = base->undef; /* G */ 3805 args.out[2] = base->undef; /* B */ 3806 args.out[3] = base->undef; /* A */ 3807 3808 ac_build_export(&ctx->ac, &args); 3809 } 3810 3811 /** 3812 * Return PS outputs in this order: 3813 * 3814 * v[0:3] = color0.xyzw 3815 * v[4:7] = color1.xyzw 3816 * ... 3817 * vN+0 = Depth 3818 * vN+1 = Stencil 3819 * vN+2 = SampleMask 3820 * vN+3 = SampleMaskIn (used for OpenGL smoothing) 3821 * 3822 * The alpha-ref SGPR is returned via its original location. 3823 */ 3824 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, 3825 unsigned max_outputs, 3826 LLVMValueRef *addrs) 3827 { 3828 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 3829 struct si_shader *shader = ctx->shader; 3830 struct tgsi_shader_info *info = &shader->selector->info; 3831 LLVMBuilderRef builder = ctx->ac.builder; 3832 unsigned i, j, first_vgpr, vgpr; 3833 3834 LLVMValueRef color[8][4] = {}; 3835 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 3836 LLVMValueRef ret; 3837 3838 if (ctx->postponed_kill) 3839 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); 3840 3841 /* Read the output values. 
*/ 3842 for (i = 0; i < info->num_outputs; i++) { 3843 unsigned semantic_name = info->output_semantic_name[i]; 3844 unsigned semantic_index = info->output_semantic_index[i]; 3845 3846 switch (semantic_name) { 3847 case TGSI_SEMANTIC_COLOR: 3848 assert(semantic_index < 8); 3849 for (j = 0; j < 4; j++) { 3850 LLVMValueRef ptr = addrs[4 * i + j]; 3851 LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); 3852 color[semantic_index][j] = result; 3853 } 3854 break; 3855 case TGSI_SEMANTIC_POSITION: 3856 depth = LLVMBuildLoad(builder, 3857 addrs[4 * i + 2], ""); 3858 break; 3859 case TGSI_SEMANTIC_STENCIL: 3860 stencil = LLVMBuildLoad(builder, 3861 addrs[4 * i + 1], ""); 3862 break; 3863 case TGSI_SEMANTIC_SAMPLEMASK: 3864 samplemask = LLVMBuildLoad(builder, 3865 addrs[4 * i + 0], ""); 3866 break; 3867 default: 3868 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", 3869 semantic_name); 3870 } 3871 } 3872 3873 /* Fill the return structure. */ 3874 ret = ctx->return_value; 3875 3876 /* Set SGPRs. */ 3877 ret = LLVMBuildInsertValue(builder, ret, 3878 ac_to_integer(&ctx->ac, 3879 LLVMGetParam(ctx->main_fn, 3880 SI_PARAM_ALPHA_REF)), 3881 SI_SGPR_ALPHA_REF, ""); 3882 3883 /* Set VGPRs */ 3884 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; 3885 for (i = 0; i < ARRAY_SIZE(color); i++) { 3886 if (!color[i][0]) 3887 continue; 3888 3889 for (j = 0; j < 4; j++) 3890 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); 3891 } 3892 if (depth) 3893 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); 3894 if (stencil) 3895 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); 3896 if (samplemask) 3897 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); 3898 3899 /* Add the input sample mask for smoothing at the end. 
*/ 3900 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) 3901 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; 3902 ret = LLVMBuildInsertValue(builder, ret, 3903 LLVMGetParam(ctx->main_fn, 3904 SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); 3905 3906 ctx->return_value = ret; 3907 } 3908 3909 static void membar_emit( 3910 const struct lp_build_tgsi_action *action, 3911 struct lp_build_tgsi_context *bld_base, 3912 struct lp_build_emit_data *emit_data) 3913 { 3914 struct si_shader_context *ctx = si_shader_context(bld_base); 3915 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); 3916 unsigned flags = LLVMConstIntGetZExtValue(src0); 3917 unsigned waitcnt = NOOP_WAITCNT; 3918 3919 if (flags & TGSI_MEMBAR_THREAD_GROUP) 3920 waitcnt &= VM_CNT & LGKM_CNT; 3921 3922 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | 3923 TGSI_MEMBAR_SHADER_BUFFER | 3924 TGSI_MEMBAR_SHADER_IMAGE)) 3925 waitcnt &= VM_CNT; 3926 3927 if (flags & TGSI_MEMBAR_SHARED) 3928 waitcnt &= LGKM_CNT; 3929 3930 if (waitcnt != NOOP_WAITCNT) 3931 ac_build_waitcnt(&ctx->ac, waitcnt); 3932 } 3933 3934 static void clock_emit( 3935 const struct lp_build_tgsi_action *action, 3936 struct lp_build_tgsi_context *bld_base, 3937 struct lp_build_emit_data *emit_data) 3938 { 3939 struct si_shader_context *ctx = si_shader_context(bld_base); 3940 LLVMValueRef tmp; 3941 3942 tmp = lp_build_intrinsic(ctx->ac.builder, "llvm.readcyclecounter", 3943 ctx->i64, NULL, 0, 0); 3944 tmp = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->v2i32, ""); 3945 3946 emit_data->output[0] = 3947 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, ""); 3948 emit_data->output[1] = 3949 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, ""); 3950 } 3951 3952 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements) 3953 { 3954 return LLVMPointerType(LLVMArrayType(elem_type, num_elements), 3955 CONST_ADDR_SPACE); 3956 } 3957 3958 static void si_llvm_emit_ddxy( 3959 const struct lp_build_tgsi_action *action, 3960 
struct lp_build_tgsi_context *bld_base, 3961 struct lp_build_emit_data *emit_data) 3962 { 3963 struct si_shader_context *ctx = si_shader_context(bld_base); 3964 unsigned opcode = emit_data->info->opcode; 3965 LLVMValueRef val; 3966 int idx; 3967 unsigned mask; 3968 3969 if (opcode == TGSI_OPCODE_DDX_FINE) 3970 mask = AC_TID_MASK_LEFT; 3971 else if (opcode == TGSI_OPCODE_DDY_FINE) 3972 mask = AC_TID_MASK_TOP; 3973 else 3974 mask = AC_TID_MASK_TOP_LEFT; 3975 3976 /* for DDX we want to next X pixel, DDY next Y pixel. */ 3977 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2; 3978 3979 val = ac_to_integer(&ctx->ac, emit_data->args[0]); 3980 val = ac_build_ddxy(&ctx->ac, mask, idx, val); 3981 emit_data->output[emit_data->chan] = val; 3982 } 3983 3984 /* 3985 * this takes an I,J coordinate pair, 3986 * and works out the X and Y derivatives. 3987 * it returns DDX(I), DDX(J), DDY(I), DDY(J). 3988 */ 3989 static LLVMValueRef si_llvm_emit_ddxy_interp( 3990 struct lp_build_tgsi_context *bld_base, 3991 LLVMValueRef interp_ij) 3992 { 3993 struct si_shader_context *ctx = si_shader_context(bld_base); 3994 LLVMValueRef result[4], a; 3995 unsigned i; 3996 3997 for (i = 0; i < 2; i++) { 3998 a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij, 3999 LLVMConstInt(ctx->i32, i, 0), ""); 4000 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a); 4001 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a); 4002 } 4003 4004 return lp_build_gather_values(&ctx->gallivm, result, 4); 4005 } 4006 4007 static void interp_fetch_args( 4008 struct lp_build_tgsi_context *bld_base, 4009 struct lp_build_emit_data *emit_data) 4010 { 4011 struct si_shader_context *ctx = si_shader_context(bld_base); 4012 const struct tgsi_full_instruction *inst = emit_data->inst; 4013 4014 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { 4015 /* offset is in second src, first two channels */ 4016 emit_data->args[0] = lp_build_emit_fetch(bld_base, 4017 
emit_data->inst, 1, 4018 TGSI_CHAN_X); 4019 emit_data->args[1] = lp_build_emit_fetch(bld_base, 4020 emit_data->inst, 1, 4021 TGSI_CHAN_Y); 4022 emit_data->arg_count = 2; 4023 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 4024 LLVMValueRef sample_position; 4025 LLVMValueRef sample_id; 4026 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f); 4027 4028 /* fetch sample ID, then fetch its sample position, 4029 * and place into first two channels. 4030 */ 4031 sample_id = lp_build_emit_fetch(bld_base, 4032 emit_data->inst, 1, TGSI_CHAN_X); 4033 sample_id = ac_to_integer(&ctx->ac, sample_id); 4034 4035 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading 4036 * Language 4.50 spec says about interpolateAtSample: 4037 * 4038 * "Returns the value of the input interpolant variable at 4039 * the location of sample number sample. If multisample 4040 * buffers are not available, the input variable will be 4041 * evaluated at the center of the pixel. If sample sample 4042 * does not exist, the position used to interpolate the 4043 * input variable is undefined." 4044 * 4045 * This means that sample_id values outside of the valid are 4046 * in fact valid input, and the usual mechanism for loading the 4047 * sample position doesn't work. 
4048 */ 4049 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) { 4050 LLVMValueRef center[4] = { 4051 LLVMConstReal(ctx->f32, 0.5), 4052 LLVMConstReal(ctx->f32, 0.5), 4053 ctx->ac.f32_0, 4054 ctx->ac.f32_0, 4055 }; 4056 4057 sample_position = lp_build_gather_values(&ctx->gallivm, center, 4); 4058 } else { 4059 sample_position = load_sample_position(ctx, sample_id); 4060 } 4061 4062 emit_data->args[0] = LLVMBuildExtractElement(ctx->ac.builder, 4063 sample_position, 4064 ctx->i32_0, ""); 4065 4066 emit_data->args[0] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[0], halfval, ""); 4067 emit_data->args[1] = LLVMBuildExtractElement(ctx->ac.builder, 4068 sample_position, 4069 ctx->i32_1, ""); 4070 emit_data->args[1] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[1], halfval, ""); 4071 emit_data->arg_count = 2; 4072 } 4073 } 4074 4075 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, 4076 struct lp_build_tgsi_context *bld_base, 4077 struct lp_build_emit_data *emit_data) 4078 { 4079 struct si_shader_context *ctx = si_shader_context(bld_base); 4080 struct si_shader *shader = ctx->shader; 4081 const struct tgsi_shader_info *info = &shader->selector->info; 4082 LLVMValueRef interp_param; 4083 const struct tgsi_full_instruction *inst = emit_data->inst; 4084 const struct tgsi_full_src_register *input = &inst->Src[0]; 4085 int input_base, input_array_size; 4086 int chan; 4087 int i; 4088 LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK); 4089 LLVMValueRef array_idx; 4090 int interp_param_idx; 4091 unsigned interp; 4092 unsigned location; 4093 4094 assert(input->Register.File == TGSI_FILE_INPUT); 4095 4096 if (input->Register.Indirect) { 4097 unsigned array_id = input->Indirect.ArrayID; 4098 4099 if (array_id) { 4100 input_base = info->input_array_first[array_id]; 4101 input_array_size = info->input_array_last[array_id] - input_base + 1; 4102 } else { 4103 input_base = inst->Src[0].Register.Index; 4104 
input_array_size = info->num_inputs - input_base; 4105 } 4106 4107 array_idx = si_get_indirect_index(ctx, &input->Indirect, 4108 1, input->Register.Index - input_base); 4109 } else { 4110 input_base = inst->Src[0].Register.Index; 4111 input_array_size = 1; 4112 array_idx = ctx->i32_0; 4113 } 4114 4115 interp = shader->selector->info.input_interpolate[input_base]; 4116 4117 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 4118 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) 4119 location = TGSI_INTERPOLATE_LOC_CENTER; 4120 else 4121 location = TGSI_INTERPOLATE_LOC_CENTROID; 4122 4123 interp_param_idx = lookup_interp_param_index(interp, location); 4124 if (interp_param_idx == -1) 4125 return; 4126 else if (interp_param_idx) 4127 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 4128 else 4129 interp_param = NULL; 4130 4131 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 4132 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 4133 LLVMValueRef ij_out[2]; 4134 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param); 4135 4136 /* 4137 * take the I then J parameters, and the DDX/Y for it, and 4138 * calculate the IJ inputs for the interpolator. 
4139 * temp1 = ddx * offset/sample.x + I; 4140 * interp_param.I = ddy * offset/sample.y + temp1; 4141 * temp1 = ddx * offset/sample.x + J; 4142 * interp_param.J = ddy * offset/sample.y + temp1; 4143 */ 4144 for (i = 0; i < 2; i++) { 4145 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0); 4146 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0); 4147 LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, 4148 ddxy_out, ix_ll, ""); 4149 LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, 4150 ddxy_out, iy_ll, ""); 4151 LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, 4152 interp_param, ix_ll, ""); 4153 LLVMValueRef temp1, temp2; 4154 4155 interp_el = ac_to_float(&ctx->ac, interp_el); 4156 4157 temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, emit_data->args[0], ""); 4158 4159 temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, ""); 4160 4161 temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, emit_data->args[1], ""); 4162 4163 ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, ""); 4164 } 4165 interp_param = lp_build_gather_values(&ctx->gallivm, ij_out, 2); 4166 } 4167 4168 if (interp_param) 4169 interp_param = ac_to_float(&ctx->ac, interp_param); 4170 4171 for (chan = 0; chan < 4; chan++) { 4172 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size)); 4173 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); 4174 4175 for (unsigned idx = 0; idx < input_array_size; ++idx) { 4176 LLVMValueRef v, i = NULL, j = NULL; 4177 4178 if (interp_param) { 4179 i = LLVMBuildExtractElement( 4180 ctx->ac.builder, interp_param, ctx->i32_0, ""); 4181 j = LLVMBuildExtractElement( 4182 ctx->ac.builder, interp_param, ctx->i32_1, ""); 4183 } 4184 v = si_build_fs_interp(ctx, input_base + idx, schan, 4185 prim_mask, i, j); 4186 4187 gather = LLVMBuildInsertElement(ctx->ac.builder, 4188 gather, v, LLVMConstInt(ctx->i32, idx, false), ""); 4189 } 4190 4191 emit_data->output[chan] = 
LLVMBuildExtractElement( 4192 ctx->ac.builder, gather, array_idx, ""); 4193 } 4194 } 4195 4196 static void vote_all_emit( 4197 const struct lp_build_tgsi_action *action, 4198 struct lp_build_tgsi_context *bld_base, 4199 struct lp_build_emit_data *emit_data) 4200 { 4201 struct si_shader_context *ctx = si_shader_context(bld_base); 4202 4203 LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]); 4204 emit_data->output[emit_data->chan] = 4205 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4206 } 4207 4208 static void vote_any_emit( 4209 const struct lp_build_tgsi_action *action, 4210 struct lp_build_tgsi_context *bld_base, 4211 struct lp_build_emit_data *emit_data) 4212 { 4213 struct si_shader_context *ctx = si_shader_context(bld_base); 4214 4215 LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]); 4216 emit_data->output[emit_data->chan] = 4217 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4218 } 4219 4220 static void vote_eq_emit( 4221 const struct lp_build_tgsi_action *action, 4222 struct lp_build_tgsi_context *bld_base, 4223 struct lp_build_emit_data *emit_data) 4224 { 4225 struct si_shader_context *ctx = si_shader_context(bld_base); 4226 4227 LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]); 4228 emit_data->output[emit_data->chan] = 4229 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, ""); 4230 } 4231 4232 static void ballot_emit( 4233 const struct lp_build_tgsi_action *action, 4234 struct lp_build_tgsi_context *bld_base, 4235 struct lp_build_emit_data *emit_data) 4236 { 4237 struct si_shader_context *ctx = si_shader_context(bld_base); 4238 LLVMBuilderRef builder = ctx->ac.builder; 4239 LLVMValueRef tmp; 4240 4241 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); 4242 tmp = ac_build_ballot(&ctx->ac, tmp); 4243 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, ""); 4244 4245 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, ""); 4246 emit_data->output[1] = 
LLVMBuildExtractElement(builder, tmp, ctx->i32_1, ""); 4247 } 4248 4249 static void read_invoc_fetch_args( 4250 struct lp_build_tgsi_context *bld_base, 4251 struct lp_build_emit_data *emit_data) 4252 { 4253 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, 4254 0, emit_data->src_chan); 4255 4256 /* Always read the source invocation (= lane) from the X channel. */ 4257 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, 4258 1, TGSI_CHAN_X); 4259 emit_data->arg_count = 2; 4260 } 4261 4262 static void read_lane_emit( 4263 const struct lp_build_tgsi_action *action, 4264 struct lp_build_tgsi_context *bld_base, 4265 struct lp_build_emit_data *emit_data) 4266 { 4267 struct si_shader_context *ctx = si_shader_context(bld_base); 4268 4269 /* We currently have no other way to prevent LLVM from lifting the icmp 4270 * calls to a dominating basic block. 4271 */ 4272 ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]); 4273 4274 for (unsigned i = 0; i < emit_data->arg_count; ++i) 4275 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]); 4276 4277 emit_data->output[emit_data->chan] = 4278 ac_build_intrinsic(&ctx->ac, action->intr_name, 4279 ctx->i32, emit_data->args, emit_data->arg_count, 4280 AC_FUNC_ATTR_READNONE | 4281 AC_FUNC_ATTR_CONVERGENT); 4282 } 4283 4284 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, 4285 struct lp_build_emit_data *emit_data) 4286 { 4287 struct si_shader_context *ctx = si_shader_context(bld_base); 4288 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; 4289 LLVMValueRef imm; 4290 unsigned stream; 4291 4292 assert(src0.File == TGSI_FILE_IMMEDIATE); 4293 4294 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX]; 4295 stream = LLVMConstIntGetZExtValue(imm) & 0x3; 4296 return stream; 4297 } 4298 4299 /* Emit one vertex from the geometry shader */ 4300 static void si_llvm_emit_vertex(struct ac_shader_abi *abi, 4301 unsigned stream, 4302 LLVMValueRef 
*addrs) 4303 { 4304 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 4305 struct tgsi_shader_info *info = &ctx->shader->selector->info; 4306 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 4307 struct si_shader *shader = ctx->shader; 4308 struct lp_build_if_state if_state; 4309 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 4310 ctx->param_gs2vs_offset); 4311 LLVMValueRef gs_next_vertex; 4312 LLVMValueRef can_emit; 4313 unsigned chan, offset; 4314 int i; 4315 4316 /* Write vertex attribute values to GSVS ring */ 4317 gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, 4318 ctx->gs_next_vertex[stream], 4319 ""); 4320 4321 /* If this thread has already emitted the declared maximum number of 4322 * vertices, skip the write: excessive vertex emissions are not 4323 * supposed to have any effect. 4324 * 4325 * If the shader has no writes to memory, kill it instead. This skips 4326 * further memory loads and may allow LLVM to skip to the end 4327 * altogether. 4328 */ 4329 can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, 4330 LLVMConstInt(ctx->i32, 4331 shader->selector->gs_max_out_vertices, 0), ""); 4332 4333 bool use_kill = !info->writes_memory; 4334 if (use_kill) { 4335 ac_build_kill_if_false(&ctx->ac, can_emit); 4336 } else { 4337 lp_build_if(&if_state, &ctx->gallivm, can_emit); 4338 } 4339 4340 offset = 0; 4341 for (i = 0; i < info->num_outputs; i++) { 4342 for (chan = 0; chan < 4; chan++) { 4343 if (!(info->output_usagemask[i] & (1 << chan)) || 4344 ((info->output_streams[i] >> (2 * chan)) & 3) != stream) 4345 continue; 4346 4347 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); 4348 LLVMValueRef voffset = 4349 LLVMConstInt(ctx->i32, offset * 4350 shader->selector->gs_max_out_vertices, 0); 4351 offset++; 4352 4353 voffset = lp_build_add(uint, voffset, gs_next_vertex); 4354 voffset = lp_build_mul_imm(uint, voffset, 4); 4355 4356 out_val = ac_to_integer(&ctx->ac, out_val); 4357 4358 
ac_build_buffer_store_dword(&ctx->ac, 4359 ctx->gsvs_ring[stream], 4360 out_val, 1, 4361 voffset, soffset, 0, 4362 1, 1, true, true); 4363 } 4364 } 4365 4366 gs_next_vertex = lp_build_add(uint, gs_next_vertex, 4367 ctx->i32_1); 4368 4369 LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); 4370 4371 /* Signal vertex emission */ 4372 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), 4373 si_get_gs_wave_id(ctx)); 4374 if (!use_kill) 4375 lp_build_endif(&if_state); 4376 } 4377 4378 /* Emit one vertex from the geometry shader */ 4379 static void si_tgsi_emit_vertex( 4380 const struct lp_build_tgsi_action *action, 4381 struct lp_build_tgsi_context *bld_base, 4382 struct lp_build_emit_data *emit_data) 4383 { 4384 struct si_shader_context *ctx = si_shader_context(bld_base); 4385 unsigned stream = si_llvm_get_stream(bld_base, emit_data); 4386 4387 si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]); 4388 } 4389 4390 /* Cut one primitive from the geometry shader */ 4391 static void si_llvm_emit_primitive(struct ac_shader_abi *abi, 4392 unsigned stream) 4393 { 4394 struct si_shader_context *ctx = si_shader_context_from_abi(abi); 4395 4396 /* Signal primitive cut */ 4397 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), 4398 si_get_gs_wave_id(ctx)); 4399 } 4400 4401 /* Cut one primitive from the geometry shader */ 4402 static void si_tgsi_emit_primitive( 4403 const struct lp_build_tgsi_action *action, 4404 struct lp_build_tgsi_context *bld_base, 4405 struct lp_build_emit_data *emit_data) 4406 { 4407 struct si_shader_context *ctx = si_shader_context(bld_base); 4408 4409 si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data)); 4410 } 4411 4412 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 4413 struct lp_build_tgsi_context *bld_base, 4414 struct lp_build_emit_data *emit_data) 4415 { 4416 struct si_shader_context *ctx = 
si_shader_context(bld_base); 4417 4418 /* SI only (thanks to a hw bug workaround): 4419 * The real barrier instruction isnt needed, because an entire patch 4420 * always fits into a single wave. 4421 */ 4422 if (ctx->screen->info.chip_class == SI && 4423 ctx->type == PIPE_SHADER_TESS_CTRL) { 4424 ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT); 4425 return; 4426 } 4427 4428 lp_build_intrinsic(ctx->ac.builder, 4429 "llvm.amdgcn.s.barrier", 4430 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT); 4431 } 4432 4433 static const struct lp_build_tgsi_action interp_action = { 4434 .fetch_args = interp_fetch_args, 4435 .emit = build_interp_intrinsic, 4436 }; 4437 4438 static void si_create_function(struct si_shader_context *ctx, 4439 const char *name, 4440 LLVMTypeRef *returns, unsigned num_returns, 4441 struct si_function_info *fninfo, 4442 unsigned max_workgroup_size) 4443 { 4444 int i; 4445 4446 si_llvm_create_func(ctx, name, returns, num_returns, 4447 fninfo->types, fninfo->num_params); 4448 ctx->return_value = LLVMGetUndef(ctx->return_type); 4449 4450 for (i = 0; i < fninfo->num_sgpr_params; ++i) { 4451 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); 4452 4453 /* The combination of: 4454 * - ByVal 4455 * - dereferenceable 4456 * - invariant.load 4457 * allows the optimization passes to move loads and reduces 4458 * SGPR spilling significantly. 
4459 */ 4460 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { 4461 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL); 4462 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS); 4463 ac_add_attr_dereferenceable(P, UINT64_MAX); 4464 } else 4465 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG); 4466 } 4467 4468 for (i = 0; i < fninfo->num_params; ++i) { 4469 if (fninfo->assign[i]) 4470 *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i); 4471 } 4472 4473 if (max_workgroup_size) { 4474 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size", 4475 max_workgroup_size); 4476 } 4477 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4478 "no-signed-zeros-fp-math", 4479 "true"); 4480 4481 if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) { 4482 /* These were copied from some LLVM test. */ 4483 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4484 "less-precise-fpmad", 4485 "true"); 4486 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4487 "no-infs-fp-math", 4488 "true"); 4489 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4490 "no-nans-fp-math", 4491 "true"); 4492 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 4493 "unsafe-fp-math", 4494 "true"); 4495 } 4496 } 4497 4498 static void declare_streamout_params(struct si_shader_context *ctx, 4499 struct pipe_stream_output_info *so, 4500 struct si_function_info *fninfo) 4501 { 4502 int i; 4503 4504 /* Streamout SGPRs. */ 4505 if (so->num_outputs) { 4506 if (ctx->type != PIPE_SHADER_TESS_EVAL) 4507 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4508 else 4509 ctx->param_streamout_config = fninfo->num_params - 1; 4510 4511 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4512 } 4513 /* A streamout buffer offset is loaded if the stride is non-zero. 
*/ 4514 for (i = 0; i < 4; i++) { 4515 if (!so->stride[i]) 4516 continue; 4517 4518 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32); 4519 } 4520 } 4521 4522 static unsigned si_get_max_workgroup_size(const struct si_shader *shader) 4523 { 4524 switch (shader->selector->type) { 4525 case PIPE_SHADER_TESS_CTRL: 4526 /* Return this so that LLVM doesn't remove s_barrier 4527 * instructions on chips where we use s_barrier. */ 4528 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64; 4529 4530 case PIPE_SHADER_GEOMETRY: 4531 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64; 4532 4533 case PIPE_SHADER_COMPUTE: 4534 break; /* see below */ 4535 4536 default: 4537 return 0; 4538 } 4539 4540 const unsigned *properties = shader->selector->info.properties; 4541 unsigned max_work_group_size = 4542 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * 4543 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * 4544 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; 4545 4546 if (!max_work_group_size) { 4547 /* This is a variable group size compute shader, 4548 * compile it for the maximum possible group size. 
4549 */ 4550 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; 4551 } 4552 return max_work_group_size; 4553 } 4554 4555 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, 4556 struct si_function_info *fninfo, 4557 bool assign_params) 4558 { 4559 LLVMTypeRef const_shader_buf_type; 4560 4561 if (ctx->shader->selector->info.const_buffers_declared == 1 && 4562 ctx->shader->selector->info.shader_buffers_declared == 0) 4563 const_shader_buf_type = ctx->f32; 4564 else 4565 const_shader_buf_type = ctx->v4i32; 4566 4567 unsigned const_and_shader_buffers = 4568 add_arg(fninfo, ARG_SGPR, 4569 si_const_array(const_shader_buf_type, 0)); 4570 4571 unsigned samplers_and_images = 4572 add_arg(fninfo, ARG_SGPR, 4573 si_const_array(ctx->v8i32, 4574 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2)); 4575 4576 if (assign_params) { 4577 ctx->param_const_and_shader_buffers = const_and_shader_buffers; 4578 ctx->param_samplers_and_images = samplers_and_images; 4579 } 4580 } 4581 4582 static void declare_global_desc_pointers(struct si_shader_context *ctx, 4583 struct si_function_info *fninfo) 4584 { 4585 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR, 4586 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS)); 4587 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR, 4588 si_const_array(ctx->v8i32, 0)); 4589 } 4590 4591 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx, 4592 struct si_function_info *fninfo) 4593 { 4594 ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR, 4595 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS)); 4596 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex); 4597 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance); 4598 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id); 4599 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32); 4600 } 4601 4602 static void declare_vs_input_vgprs(struct si_shader_context *ctx, 4603 struct si_function_info *fninfo, 4604 
unsigned *num_prolog_vgprs) 4605 { 4606 struct si_shader *shader = ctx->shader; 4607 4608 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id); 4609 if (shader->key.as_ls) { 4610 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4611 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); 4612 } else { 4613 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id); 4614 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4615 } 4616 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */ 4617 4618 if (!shader->is_gs_copy_shader) { 4619 /* Vertex load indices. */ 4620 ctx->param_vertex_index0 = fninfo->num_params; 4621 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++) 4622 add_arg(fninfo, ARG_VGPR, ctx->i32); 4623 *num_prolog_vgprs += shader->selector->info.num_inputs; 4624 } 4625 } 4626 4627 static void declare_tes_input_vgprs(struct si_shader_context *ctx, 4628 struct si_function_info *fninfo) 4629 { 4630 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32); 4631 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32); 4632 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32); 4633 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id); 4634 } 4635 4636 enum { 4637 /* Convenient merged shader definitions. */ 4638 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, 4639 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, 4640 }; 4641 4642 static void create_function(struct si_shader_context *ctx) 4643 { 4644 struct si_shader *shader = ctx->shader; 4645 struct si_function_info fninfo; 4646 LLVMTypeRef returns[16+32*4]; 4647 unsigned i, num_return_sgprs; 4648 unsigned num_returns = 0; 4649 unsigned num_prolog_vgprs = 0; 4650 unsigned type = ctx->type; 4651 unsigned vs_blit_property = 4652 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; 4653 4654 si_init_function_info(&fninfo); 4655 4656 /* Set MERGED shaders. 
*/ 4657 if (ctx->screen->info.chip_class >= GFX9) { 4658 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) 4659 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ 4660 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY) 4661 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; 4662 } 4663 4664 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3); 4665 4666 switch (type) { 4667 case PIPE_SHADER_VERTEX: 4668 declare_global_desc_pointers(ctx, &fninfo); 4669 4670 if (vs_blit_property) { 4671 ctx->param_vs_blit_inputs = fninfo.num_params; 4672 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */ 4673 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */ 4674 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */ 4675 4676 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 4677 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */ 4678 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */ 4679 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */ 4680 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */ 4681 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { 4682 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */ 4683 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */ 4684 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */ 4685 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */ 4686 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */ 4687 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */ 4688 } 4689 4690 /* VGPRs */ 4691 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); 4692 break; 4693 } 4694 4695 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4696 declare_vs_specific_input_sgprs(ctx, &fninfo); 4697 4698 if (shader->key.as_es) { 4699 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4700 } else if (shader->key.as_ls) { 4701 /* no extra parameters */ 4702 } else { 4703 if (shader->is_gs_copy_shader) { 4704 fninfo.num_params = ctx->param_rw_buffers + 1; 4705 fninfo.num_sgpr_params = fninfo.num_params; 
4706 } 4707 4708 /* The locations of the other parameters are assigned dynamically. */ 4709 declare_streamout_params(ctx, &shader->selector->so, 4710 &fninfo); 4711 } 4712 4713 /* VGPRs */ 4714 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); 4715 break; 4716 4717 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */ 4718 declare_global_desc_pointers(ctx, &fninfo); 4719 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4720 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4721 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4722 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4723 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4724 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4725 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4726 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4727 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4728 4729 /* VGPRs */ 4730 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); 4731 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); 4732 4733 /* param_tcs_offchip_offset and param_tcs_factor_offset are 4734 * placed after the user SGPRs. 4735 */ 4736 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) 4737 returns[num_returns++] = ctx->i32; /* SGPRs */ 4738 for (i = 0; i < 11; i++) 4739 returns[num_returns++] = ctx->f32; /* VGPRs */ 4740 break; 4741 4742 case SI_SHADER_MERGED_VERTEX_TESSCTRL: 4743 /* Merged stages have 8 system SGPRs at the beginning. 
*/ 4744 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_LO_HS */ 4745 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_HI_HS */ 4746 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4747 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4748 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4749 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4750 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4751 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4752 4753 declare_global_desc_pointers(ctx, &fninfo); 4754 declare_per_stage_desc_pointers(ctx, &fninfo, 4755 ctx->type == PIPE_SHADER_VERTEX); 4756 declare_vs_specific_input_sgprs(ctx, &fninfo); 4757 4758 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4759 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4760 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4761 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4762 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4763 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4764 4765 declare_per_stage_desc_pointers(ctx, &fninfo, 4766 ctx->type == PIPE_SHADER_TESS_CTRL); 4767 4768 /* VGPRs (first TCS, then VS) */ 4769 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id); 4770 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids); 4771 4772 if (ctx->type == PIPE_SHADER_VERTEX) { 4773 declare_vs_input_vgprs(ctx, &fninfo, 4774 &num_prolog_vgprs); 4775 4776 /* LS return values are inputs to the TCS main shader part. */ 4777 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) 4778 returns[num_returns++] = ctx->i32; /* SGPRs */ 4779 for (i = 0; i < 2; i++) 4780 returns[num_returns++] = ctx->f32; /* VGPRs */ 4781 } else { 4782 /* TCS return values are inputs to the TCS epilog. 
4783 * 4784 * param_tcs_offchip_offset, param_tcs_factor_offset, 4785 * param_tcs_offchip_layout, and param_rw_buffers 4786 * should be passed to the epilog. 4787 */ 4788 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++) 4789 returns[num_returns++] = ctx->i32; /* SGPRs */ 4790 for (i = 0; i < 11; i++) 4791 returns[num_returns++] = ctx->f32; /* VGPRs */ 4792 } 4793 break; 4794 4795 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: 4796 /* Merged stages have 8 system SGPRs at the beginning. */ 4797 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_LO_GS) */ 4798 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_HI_GS) */ 4799 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4800 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4801 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4802 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4803 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */ 4804 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ 4805 4806 declare_global_desc_pointers(ctx, &fninfo); 4807 declare_per_stage_desc_pointers(ctx, &fninfo, 4808 (ctx->type == PIPE_SHADER_VERTEX || 4809 ctx->type == PIPE_SHADER_TESS_EVAL)); 4810 if (ctx->type == PIPE_SHADER_VERTEX) { 4811 declare_vs_specific_input_sgprs(ctx, &fninfo); 4812 } else { 4813 /* TESS_EVAL (and also GEOMETRY): 4814 * Declare as many input SGPRs as the VS has. 
*/ 4815 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4816 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4817 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4818 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4819 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4820 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */ 4821 } 4822 4823 declare_per_stage_desc_pointers(ctx, &fninfo, 4824 ctx->type == PIPE_SHADER_GEOMETRY); 4825 4826 /* VGPRs (first GS, then VS/TES) */ 4827 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4828 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4829 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); 4830 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); 4831 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32); 4832 4833 if (ctx->type == PIPE_SHADER_VERTEX) { 4834 declare_vs_input_vgprs(ctx, &fninfo, 4835 &num_prolog_vgprs); 4836 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { 4837 declare_tes_input_vgprs(ctx, &fninfo); 4838 } 4839 4840 if (ctx->type == PIPE_SHADER_VERTEX || 4841 ctx->type == PIPE_SHADER_TESS_EVAL) { 4842 /* ES return values are inputs to GS. 
*/ 4843 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++) 4844 returns[num_returns++] = ctx->i32; /* SGPRs */ 4845 for (i = 0; i < 5; i++) 4846 returns[num_returns++] = ctx->f32; /* VGPRs */ 4847 } 4848 break; 4849 4850 case PIPE_SHADER_TESS_EVAL: 4851 declare_global_desc_pointers(ctx, &fninfo); 4852 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4853 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4854 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4855 4856 if (shader->key.as_es) { 4857 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4858 add_arg(&fninfo, ARG_SGPR, ctx->i32); 4859 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4860 } else { 4861 add_arg(&fninfo, ARG_SGPR, ctx->i32); 4862 declare_streamout_params(ctx, &shader->selector->so, 4863 &fninfo); 4864 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4865 } 4866 4867 /* VGPRs */ 4868 declare_tes_input_vgprs(ctx, &fninfo); 4869 break; 4870 4871 case PIPE_SHADER_GEOMETRY: 4872 declare_global_desc_pointers(ctx, &fninfo); 4873 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4874 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4875 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4876 4877 /* VGPRs */ 4878 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]); 4879 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]); 4880 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id); 4881 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]); 4882 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]); 4883 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]); 4884 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]); 4885 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id); 4886 break; 4887 4888 case PIPE_SHADER_FRAGMENT: 4889 
declare_global_desc_pointers(ctx, &fninfo); 4890 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4891 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); 4892 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK); 4893 4894 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE); 4895 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER); 4896 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID); 4897 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL); 4898 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE); 4899 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER); 4900 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID); 4901 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX); 4902 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4903 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT); 4904 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4905 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT); 4906 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4907 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT); 4908 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4909 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT); 4910 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, 4911 &ctx->abi.front_face, SI_PARAM_FRONT_FACE); 4912 shader->info.face_vgpr_index = 20; 4913 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32, 4914 &ctx->abi.ancillary, SI_PARAM_ANCILLARY); 4915 shader->info.ancillary_vgpr_index = 21; 4916 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32, 4917 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); 4918 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT); 4919 4920 /* Color inputs from the prolog. 
*/ 4921 if (shader->selector->info.colors_read) { 4922 unsigned num_color_elements = 4923 util_bitcount(shader->selector->info.colors_read); 4924 4925 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types)); 4926 for (i = 0; i < num_color_elements; i++) 4927 add_arg(&fninfo, ARG_VGPR, ctx->f32); 4928 4929 num_prolog_vgprs += num_color_elements; 4930 } 4931 4932 /* Outputs for the epilog. */ 4933 num_return_sgprs = SI_SGPR_ALPHA_REF + 1; 4934 num_returns = 4935 num_return_sgprs + 4936 util_bitcount(shader->selector->info.colors_written) * 4 + 4937 shader->selector->info.writes_z + 4938 shader->selector->info.writes_stencil + 4939 shader->selector->info.writes_samplemask + 4940 1 /* SampleMaskIn */; 4941 4942 num_returns = MAX2(num_returns, 4943 num_return_sgprs + 4944 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 4945 4946 for (i = 0; i < num_return_sgprs; i++) 4947 returns[i] = ctx->i32; 4948 for (; i < num_returns; i++) 4949 returns[i] = ctx->f32; 4950 break; 4951 4952 case PIPE_SHADER_COMPUTE: 4953 declare_global_desc_pointers(ctx, &fninfo); 4954 declare_per_stage_desc_pointers(ctx, &fninfo, true); 4955 if (shader->selector->info.uses_grid_size) 4956 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32); 4957 if (shader->selector->info.uses_block_size) 4958 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32); 4959 4960 for (i = 0; i < 3; i++) { 4961 ctx->param_block_id[i] = -1; 4962 if (shader->selector->info.uses_block_id[i]) 4963 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32); 4964 } 4965 4966 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32); 4967 break; 4968 default: 4969 assert(0 && "unimplemented shader"); 4970 return; 4971 } 4972 4973 si_create_function(ctx, "main", returns, num_returns, &fninfo, 4974 si_get_max_workgroup_size(shader)); 4975 4976 /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ 4977 if (ctx->type == PIPE_SHADER_FRAGMENT && 4978 ctx->separate_prolog) { 4979 si_llvm_add_attribute(ctx->main_fn, 4980 "InitialPSInputAddr", 4981 S_0286D0_PERSP_SAMPLE_ENA(1) | 4982 S_0286D0_PERSP_CENTER_ENA(1) | 4983 S_0286D0_PERSP_CENTROID_ENA(1) | 4984 S_0286D0_LINEAR_SAMPLE_ENA(1) | 4985 S_0286D0_LINEAR_CENTER_ENA(1) | 4986 S_0286D0_LINEAR_CENTROID_ENA(1) | 4987 S_0286D0_FRONT_FACE_ENA(1) | 4988 S_0286D0_ANCILLARY_ENA(1) | 4989 S_0286D0_POS_FIXED_PT_ENA(1)); 4990 } 4991 4992 shader->info.num_input_sgprs = 0; 4993 shader->info.num_input_vgprs = 0; 4994 4995 for (i = 0; i < fninfo.num_sgpr_params; ++i) 4996 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4; 4997 4998 for (; i < fninfo.num_params; ++i) 4999 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4; 5000 5001 assert(shader->info.num_input_vgprs >= num_prolog_vgprs); 5002 shader->info.num_input_vgprs -= num_prolog_vgprs; 5003 5004 if (shader->key.as_ls || 5005 ctx->type == PIPE_SHADER_TESS_CTRL || 5006 /* GFX9 has the ESGS ring buffer in LDS. */ 5007 type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY) 5008 ac_declare_lds_as_pointer(&ctx->ac); 5009 } 5010 5011 /** 5012 * Load ESGS and GSVS ring buffer resource descriptors and save the variables 5013 * for later use. 5014 */ 5015 static void preload_ring_buffers(struct si_shader_context *ctx) 5016 { 5017 LLVMBuilderRef builder = ctx->ac.builder; 5018 5019 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 5020 ctx->param_rw_buffers); 5021 5022 if (ctx->screen->info.chip_class <= VI && 5023 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) { 5024 unsigned ring = 5025 ctx->type == PIPE_SHADER_GEOMETRY ? 
SI_GS_RING_ESGS 5026 : SI_ES_RING_ESGS; 5027 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); 5028 5029 ctx->esgs_ring = 5030 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 5031 } 5032 5033 if (ctx->shader->is_gs_copy_shader) { 5034 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); 5035 5036 ctx->gsvs_ring[0] = 5037 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 5038 } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 5039 const struct si_shader_selector *sel = ctx->shader->selector; 5040 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); 5041 LLVMValueRef base_ring; 5042 5043 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); 5044 5045 /* The conceptual layout of the GSVS ring is 5046 * v0c0 .. vLv0 v0c1 .. vLc1 .. 5047 * but the real memory layout is swizzled across 5048 * threads: 5049 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL 5050 * t16v0c0 .. 5051 * Override the buffer descriptor accordingly. 5052 */ 5053 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); 5054 uint64_t stream_offset = 0; 5055 5056 for (unsigned stream = 0; stream < 4; ++stream) { 5057 unsigned num_components; 5058 unsigned stride; 5059 unsigned num_records; 5060 LLVMValueRef ring, tmp; 5061 5062 num_components = sel->info.num_stream_output_components[stream]; 5063 if (!num_components) 5064 continue; 5065 5066 stride = 4 * num_components * sel->gs_max_out_vertices; 5067 5068 /* Limit on the stride field for <= CIK. 
*/ 5069 assert(stride < (1 << 14)); 5070 5071 num_records = 64; 5072 5073 ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); 5074 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); 5075 tmp = LLVMBuildAdd(builder, tmp, 5076 LLVMConstInt(ctx->i64, 5077 stream_offset, 0), ""); 5078 stream_offset += stride * 64; 5079 5080 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); 5081 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); 5082 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); 5083 tmp = LLVMBuildOr(builder, tmp, 5084 LLVMConstInt(ctx->i32, 5085 S_008F04_STRIDE(stride) | 5086 S_008F04_SWIZZLE_ENABLE(1), 0), ""); 5087 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); 5088 ring = LLVMBuildInsertElement(builder, ring, 5089 LLVMConstInt(ctx->i32, num_records, 0), 5090 LLVMConstInt(ctx->i32, 2, 0), ""); 5091 ring = LLVMBuildInsertElement(builder, ring, 5092 LLVMConstInt(ctx->i32, 5093 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 5094 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5095 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 5096 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 5097 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5098 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | 5099 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */ 5100 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ 5101 S_008F0C_ADD_TID_ENABLE(1), 5102 0), 5103 LLVMConstInt(ctx->i32, 3, 0), ""); 5104 5105 ctx->gsvs_ring[stream] = ring; 5106 } 5107 } 5108 } 5109 5110 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, 5111 LLVMValueRef param_rw_buffers, 5112 unsigned param_pos_fixed_pt) 5113 { 5114 LLVMBuilderRef builder = ctx->ac.builder; 5115 LLVMValueRef slot, desc, offset, row, bit, address[2]; 5116 5117 /* Use the fixed-point gl_FragCoord input. 5118 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits 5119 * per coordinate to get the repeating effect. 
5120 */ 5121 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); 5122 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); 5123 5124 /* Load the buffer descriptor. */ 5125 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0); 5126 desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); 5127 5128 /* The stipple pattern is 32x32, each row has 32 bits. */ 5129 offset = LLVMBuildMul(builder, address[1], 5130 LLVMConstInt(ctx->i32, 4, 0), ""); 5131 row = buffer_load_const(ctx, desc, offset); 5132 row = ac_to_integer(&ctx->ac, row); 5133 bit = LLVMBuildLShr(builder, row, address[0], ""); 5134 bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); 5135 ac_build_kill_if_false(&ctx->ac, bit); 5136 } 5137 5138 void si_shader_binary_read_config(struct ac_shader_binary *binary, 5139 struct si_shader_config *conf, 5140 unsigned symbol_offset) 5141 { 5142 unsigned i; 5143 const unsigned char *config = 5144 ac_shader_binary_config_start(binary, symbol_offset); 5145 bool really_needs_scratch = false; 5146 5147 /* LLVM adds SGPR spills to the scratch size. 5148 * Find out if we really need the scratch buffer. 5149 */ 5150 for (i = 0; i < binary->reloc_count; i++) { 5151 const struct ac_shader_reloc *reloc = &binary->relocs[i]; 5152 5153 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || 5154 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 5155 really_needs_scratch = true; 5156 break; 5157 } 5158 } 5159 5160 /* XXX: We may be able to emit some of these values directly rather than 5161 * extracting fields to be emitted later. 
5162 */ 5163 5164 for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 5165 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); 5166 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 5167 switch (reg) { 5168 case R_00B028_SPI_SHADER_PGM_RSRC1_PS: 5169 case R_00B128_SPI_SHADER_PGM_RSRC1_VS: 5170 case R_00B228_SPI_SHADER_PGM_RSRC1_GS: 5171 case R_00B428_SPI_SHADER_PGM_RSRC1_HS: 5172 case R_00B848_COMPUTE_PGM_RSRC1: 5173 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); 5174 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); 5175 conf->float_mode = G_00B028_FLOAT_MODE(value); 5176 conf->rsrc1 = value; 5177 break; 5178 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: 5179 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); 5180 break; 5181 case R_00B84C_COMPUTE_PGM_RSRC2: 5182 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); 5183 conf->rsrc2 = value; 5184 break; 5185 case R_0286CC_SPI_PS_INPUT_ENA: 5186 conf->spi_ps_input_ena = value; 5187 break; 5188 case R_0286D0_SPI_PS_INPUT_ADDR: 5189 conf->spi_ps_input_addr = value; 5190 break; 5191 case R_0286E8_SPI_TMPRING_SIZE: 5192 case R_00B860_COMPUTE_TMPRING_SIZE: 5193 /* WAVESIZE is in units of 256 dwords. 
*/ 5194 if (really_needs_scratch) 5195 conf->scratch_bytes_per_wave = 5196 G_00B860_WAVESIZE(value) * 256 * 4; 5197 break; 5198 case 0x4: /* SPILLED_SGPRS */ 5199 conf->spilled_sgprs = value; 5200 break; 5201 case 0x8: /* SPILLED_VGPRS */ 5202 conf->spilled_vgprs = value; 5203 break; 5204 default: 5205 { 5206 static bool printed; 5207 5208 if (!printed) { 5209 fprintf(stderr, "Warning: LLVM emitted unknown " 5210 "config register: 0x%x\n", reg); 5211 printed = true; 5212 } 5213 } 5214 break; 5215 } 5216 } 5217 5218 if (!conf->spi_ps_input_addr) 5219 conf->spi_ps_input_addr = conf->spi_ps_input_ena; 5220 } 5221 5222 void si_shader_apply_scratch_relocs(struct si_shader *shader, 5223 uint64_t scratch_va) 5224 { 5225 unsigned i; 5226 uint32_t scratch_rsrc_dword0 = scratch_va; 5227 uint32_t scratch_rsrc_dword1 = 5228 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); 5229 5230 /* Enable scratch coalescing. */ 5231 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); 5232 5233 for (i = 0 ; i < shader->binary.reloc_count; i++) { 5234 const struct ac_shader_reloc *reloc = 5235 &shader->binary.relocs[i]; 5236 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { 5237 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 5238 &scratch_rsrc_dword0, 4); 5239 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 5240 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 5241 &scratch_rsrc_dword1, 4); 5242 } 5243 } 5244 } 5245 5246 static unsigned si_get_shader_binary_size(const struct si_shader *shader) 5247 { 5248 unsigned size = shader->binary.code_size; 5249 5250 if (shader->prolog) 5251 size += shader->prolog->binary.code_size; 5252 if (shader->previous_stage) 5253 size += shader->previous_stage->binary.code_size; 5254 if (shader->prolog2) 5255 size += shader->prolog2->binary.code_size; 5256 if (shader->epilog) 5257 size += shader->epilog->binary.code_size; 5258 return size; 5259 } 5260 5261 int si_shader_binary_upload(struct si_screen *sscreen, struct 
si_shader *shader) 5262 { 5263 const struct ac_shader_binary *prolog = 5264 shader->prolog ? &shader->prolog->binary : NULL; 5265 const struct ac_shader_binary *previous_stage = 5266 shader->previous_stage ? &shader->previous_stage->binary : NULL; 5267 const struct ac_shader_binary *prolog2 = 5268 shader->prolog2 ? &shader->prolog2->binary : NULL; 5269 const struct ac_shader_binary *epilog = 5270 shader->epilog ? &shader->epilog->binary : NULL; 5271 const struct ac_shader_binary *mainb = &shader->binary; 5272 unsigned bo_size = si_get_shader_binary_size(shader) + 5273 (!epilog ? mainb->rodata_size : 0); 5274 unsigned char *ptr; 5275 5276 assert(!prolog || !prolog->rodata_size); 5277 assert(!previous_stage || !previous_stage->rodata_size); 5278 assert(!prolog2 || !prolog2->rodata_size); 5279 assert((!prolog && !previous_stage && !prolog2 && !epilog) || 5280 !mainb->rodata_size); 5281 assert(!epilog || !epilog->rodata_size); 5282 5283 r600_resource_reference(&shader->bo, NULL); 5284 shader->bo = (struct r600_resource*) 5285 si_aligned_buffer_create(&sscreen->b, 5286 sscreen->cpdma_prefetch_writes_memory ? 5287 0 : R600_RESOURCE_FLAG_READ_ONLY, 5288 PIPE_USAGE_IMMUTABLE, 5289 align(bo_size, SI_CPDMA_ALIGNMENT), 5290 256); 5291 if (!shader->bo) 5292 return -ENOMEM; 5293 5294 /* Upload. */ 5295 ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, 5296 PIPE_TRANSFER_READ_WRITE | 5297 PIPE_TRANSFER_UNSYNCHRONIZED); 5298 5299 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are 5300 * endian-independent. 
*/ 5301 if (prolog) { 5302 memcpy(ptr, prolog->code, prolog->code_size); 5303 ptr += prolog->code_size; 5304 } 5305 if (previous_stage) { 5306 memcpy(ptr, previous_stage->code, previous_stage->code_size); 5307 ptr += previous_stage->code_size; 5308 } 5309 if (prolog2) { 5310 memcpy(ptr, prolog2->code, prolog2->code_size); 5311 ptr += prolog2->code_size; 5312 } 5313 5314 memcpy(ptr, mainb->code, mainb->code_size); 5315 ptr += mainb->code_size; 5316 5317 if (epilog) 5318 memcpy(ptr, epilog->code, epilog->code_size); 5319 else if (mainb->rodata_size > 0) 5320 memcpy(ptr, mainb->rodata, mainb->rodata_size); 5321 5322 sscreen->ws->buffer_unmap(shader->bo->buf); 5323 return 0; 5324 } 5325 5326 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary, 5327 struct pipe_debug_callback *debug, 5328 const char *name, FILE *file) 5329 { 5330 char *line, *p; 5331 unsigned i, count; 5332 5333 if (binary->disasm_string) { 5334 fprintf(file, "Shader %s disassembly:\n", name); 5335 fprintf(file, "%s", binary->disasm_string); 5336 5337 if (debug && debug->debug_message) { 5338 /* Very long debug messages are cut off, so send the 5339 * disassembly one line at a time. This causes more 5340 * overhead, but on the plus side it simplifies 5341 * parsing of resulting logs. 
5342 */ 5343 pipe_debug_message(debug, SHADER_INFO, 5344 "Shader Disassembly Begin"); 5345 5346 line = binary->disasm_string; 5347 while (*line) { 5348 p = util_strchrnul(line, '\n'); 5349 count = p - line; 5350 5351 if (count) { 5352 pipe_debug_message(debug, SHADER_INFO, 5353 "%.*s", count, line); 5354 } 5355 5356 if (!*p) 5357 break; 5358 line = p + 1; 5359 } 5360 5361 pipe_debug_message(debug, SHADER_INFO, 5362 "Shader Disassembly End"); 5363 } 5364 } else { 5365 fprintf(file, "Shader %s binary:\n", name); 5366 for (i = 0; i < binary->code_size; i += 4) { 5367 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, 5368 binary->code[i + 3], binary->code[i + 2], 5369 binary->code[i + 1], binary->code[i]); 5370 } 5371 } 5372 } 5373 5374 static void si_shader_dump_stats(struct si_screen *sscreen, 5375 const struct si_shader *shader, 5376 struct pipe_debug_callback *debug, 5377 unsigned processor, 5378 FILE *file, 5379 bool check_debug_option) 5380 { 5381 const struct si_shader_config *conf = &shader->config; 5382 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0; 5383 unsigned code_size = si_get_shader_binary_size(shader); 5384 unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256; 5385 unsigned lds_per_wave = 0; 5386 unsigned max_simd_waves; 5387 5388 switch (sscreen->info.family) { 5389 /* These always have 8 waves: */ 5390 case CHIP_POLARIS10: 5391 case CHIP_POLARIS11: 5392 case CHIP_POLARIS12: 5393 max_simd_waves = 8; 5394 break; 5395 default: 5396 max_simd_waves = 10; 5397 } 5398 5399 /* Compute LDS usage for PS. */ 5400 switch (processor) { 5401 case PIPE_SHADER_FRAGMENT: 5402 /* The minimum usage per wave is (num_inputs * 48). The maximum 5403 * usage is (num_inputs * 48 * 16). 5404 * We can get anything in between and it varies between waves. 5405 * 5406 * The 48 bytes per input for a single primitive is equal to 5407 * 4 bytes/component * 4 components/input * 3 points. 
5408 * 5409 * Other stages don't know the size at compile time or don't 5410 * allocate LDS per wave, but instead they do it per thread group. 5411 */ 5412 lds_per_wave = conf->lds_size * lds_increment + 5413 align(num_inputs * 48, lds_increment); 5414 break; 5415 case PIPE_SHADER_COMPUTE: 5416 if (shader->selector) { 5417 unsigned max_workgroup_size = 5418 si_get_max_workgroup_size(shader); 5419 lds_per_wave = (conf->lds_size * lds_increment) / 5420 DIV_ROUND_UP(max_workgroup_size, 64); 5421 } 5422 break; 5423 } 5424 5425 /* Compute the per-SIMD wave counts. */ 5426 if (conf->num_sgprs) { 5427 if (sscreen->info.chip_class >= VI) 5428 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); 5429 else 5430 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); 5431 } 5432 5433 if (conf->num_vgprs) 5434 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); 5435 5436 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above 5437 * 16KB makes some SIMDs unoccupied). 
*/ 5438 if (lds_per_wave) 5439 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); 5440 5441 if (!check_debug_option || 5442 si_can_dump_shader(sscreen, processor)) { 5443 if (processor == PIPE_SHADER_FRAGMENT) { 5444 fprintf(file, "*** SHADER CONFIG ***\n" 5445 "SPI_PS_INPUT_ADDR = 0x%04x\n" 5446 "SPI_PS_INPUT_ENA = 0x%04x\n", 5447 conf->spi_ps_input_addr, conf->spi_ps_input_ena); 5448 } 5449 5450 fprintf(file, "*** SHADER STATS ***\n" 5451 "SGPRS: %d\n" 5452 "VGPRS: %d\n" 5453 "Spilled SGPRs: %d\n" 5454 "Spilled VGPRs: %d\n" 5455 "Private memory VGPRs: %d\n" 5456 "Code Size: %d bytes\n" 5457 "LDS: %d blocks\n" 5458 "Scratch: %d bytes per wave\n" 5459 "Max Waves: %d\n" 5460 "********************\n\n\n", 5461 conf->num_sgprs, conf->num_vgprs, 5462 conf->spilled_sgprs, conf->spilled_vgprs, 5463 conf->private_mem_vgprs, code_size, 5464 conf->lds_size, conf->scratch_bytes_per_wave, 5465 max_simd_waves); 5466 } 5467 5468 pipe_debug_message(debug, SHADER_INFO, 5469 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " 5470 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " 5471 "Spilled VGPRs: %d PrivMem VGPRs: %d", 5472 conf->num_sgprs, conf->num_vgprs, code_size, 5473 conf->lds_size, conf->scratch_bytes_per_wave, 5474 max_simd_waves, conf->spilled_sgprs, 5475 conf->spilled_vgprs, conf->private_mem_vgprs); 5476 } 5477 5478 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor) 5479 { 5480 switch (processor) { 5481 case PIPE_SHADER_VERTEX: 5482 if (shader->key.as_es) 5483 return "Vertex Shader as ES"; 5484 else if (shader->key.as_ls) 5485 return "Vertex Shader as LS"; 5486 else 5487 return "Vertex Shader as VS"; 5488 case PIPE_SHADER_TESS_CTRL: 5489 return "Tessellation Control Shader"; 5490 case PIPE_SHADER_TESS_EVAL: 5491 if (shader->key.as_es) 5492 return "Tessellation Evaluation Shader as ES"; 5493 else 5494 return "Tessellation Evaluation Shader as VS"; 5495 case PIPE_SHADER_GEOMETRY: 5496 if (shader->is_gs_copy_shader) 5497 
return "GS Copy Shader as VS"; 5498 else 5499 return "Geometry Shader"; 5500 case PIPE_SHADER_FRAGMENT: 5501 return "Pixel Shader"; 5502 case PIPE_SHADER_COMPUTE: 5503 return "Compute Shader"; 5504 default: 5505 return "Unknown Shader"; 5506 } 5507 } 5508 5509 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, 5510 struct pipe_debug_callback *debug, unsigned processor, 5511 FILE *file, bool check_debug_option) 5512 { 5513 if (!check_debug_option || 5514 si_can_dump_shader(sscreen, processor)) 5515 si_dump_shader_key(processor, shader, file); 5516 5517 if (!check_debug_option && shader->binary.llvm_ir_string) { 5518 if (shader->previous_stage && 5519 shader->previous_stage->binary.llvm_ir_string) { 5520 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", 5521 si_get_shader_name(shader, processor)); 5522 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); 5523 } 5524 5525 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", 5526 si_get_shader_name(shader, processor)); 5527 fprintf(file, "%s\n", shader->binary.llvm_ir_string); 5528 } 5529 5530 if (!check_debug_option || 5531 (si_can_dump_shader(sscreen, processor) && 5532 !(sscreen->debug_flags & DBG(NO_ASM)))) { 5533 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); 5534 5535 if (shader->prolog) 5536 si_shader_dump_disassembly(&shader->prolog->binary, 5537 debug, "prolog", file); 5538 if (shader->previous_stage) 5539 si_shader_dump_disassembly(&shader->previous_stage->binary, 5540 debug, "previous stage", file); 5541 if (shader->prolog2) 5542 si_shader_dump_disassembly(&shader->prolog2->binary, 5543 debug, "prolog2", file); 5544 5545 si_shader_dump_disassembly(&shader->binary, debug, "main", file); 5546 5547 if (shader->epilog) 5548 si_shader_dump_disassembly(&shader->epilog->binary, 5549 debug, "epilog", file); 5550 fprintf(file, "\n"); 5551 } 5552 5553 si_shader_dump_stats(sscreen, shader, debug, processor, file, 5554 check_debug_option); 5555 } 5556 
5557 static int si_compile_llvm(struct si_screen *sscreen, 5558 struct ac_shader_binary *binary, 5559 struct si_shader_config *conf, 5560 LLVMTargetMachineRef tm, 5561 LLVMModuleRef mod, 5562 struct pipe_debug_callback *debug, 5563 unsigned processor, 5564 const char *name) 5565 { 5566 int r = 0; 5567 unsigned count = p_atomic_inc_return(&sscreen->num_compilations); 5568 5569 if (si_can_dump_shader(sscreen, processor)) { 5570 fprintf(stderr, "radeonsi: Compiling shader %d\n", count); 5571 5572 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { 5573 fprintf(stderr, "%s LLVM IR:\n\n", name); 5574 ac_dump_module(mod); 5575 fprintf(stderr, "\n"); 5576 } 5577 } 5578 5579 if (sscreen->record_llvm_ir) { 5580 char *ir = LLVMPrintModuleToString(mod); 5581 binary->llvm_ir_string = strdup(ir); 5582 LLVMDisposeMessage(ir); 5583 } 5584 5585 if (!si_replace_shader(count, binary)) { 5586 r = si_llvm_compile(mod, binary, tm, debug); 5587 if (r) 5588 return r; 5589 } 5590 5591 si_shader_binary_read_config(binary, conf, 0); 5592 5593 /* Enable 64-bit and 16-bit denormals, because there is no performance 5594 * cost. 5595 * 5596 * If denormals are enabled, all floating-point output modifiers are 5597 * ignored. 5598 * 5599 * Don't enable denormals for 32-bit floats, because: 5600 * - Floating-point output modifiers would be ignored by the hw. 5601 * - Some opcodes don't support denormals, such as v_mad_f32. We would 5602 * have to stop using those. 5603 * - SI & CI would be very slow. 5604 */ 5605 conf->float_mode |= V_00B028_FP_64_DENORMS; 5606 5607 FREE(binary->config); 5608 FREE(binary->global_symbol_offsets); 5609 binary->config = NULL; 5610 binary->global_symbol_offsets = NULL; 5611 5612 /* Some shaders can't have rodata because their binaries can be 5613 * concatenated. 
5614 */ 5615 if (binary->rodata_size && 5616 (processor == PIPE_SHADER_VERTEX || 5617 processor == PIPE_SHADER_TESS_CTRL || 5618 processor == PIPE_SHADER_TESS_EVAL || 5619 processor == PIPE_SHADER_FRAGMENT)) { 5620 fprintf(stderr, "radeonsi: The shader can't have rodata."); 5621 return -EINVAL; 5622 } 5623 5624 return r; 5625 } 5626 5627 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) 5628 { 5629 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) 5630 LLVMBuildRetVoid(ctx->ac.builder); 5631 else 5632 LLVMBuildRet(ctx->ac.builder, ret); 5633 } 5634 5635 /* Generate code for the hardware VS shader stage to go with a geometry shader */ 5636 struct si_shader * 5637 si_generate_gs_copy_shader(struct si_screen *sscreen, 5638 LLVMTargetMachineRef tm, 5639 struct si_shader_selector *gs_selector, 5640 struct pipe_debug_callback *debug) 5641 { 5642 struct si_shader_context ctx; 5643 struct si_shader *shader; 5644 LLVMBuilderRef builder; 5645 struct lp_build_tgsi_context *bld_base = &ctx.bld_base; 5646 struct lp_build_context *uint = &bld_base->uint_bld; 5647 struct si_shader_output_values *outputs; 5648 struct tgsi_shader_info *gsinfo = &gs_selector->info; 5649 int i, r; 5650 5651 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); 5652 5653 if (!outputs) 5654 return NULL; 5655 5656 shader = CALLOC_STRUCT(si_shader); 5657 if (!shader) { 5658 FREE(outputs); 5659 return NULL; 5660 } 5661 5662 /* We can leave the fence as permanently signaled because the GS copy 5663 * shader only becomes visible globally after it has been compiled. 
*/ 5664 util_queue_fence_init(&shader->ready); 5665 5666 shader->selector = gs_selector; 5667 shader->is_gs_copy_shader = true; 5668 5669 si_init_shader_ctx(&ctx, sscreen, tm); 5670 ctx.shader = shader; 5671 ctx.type = PIPE_SHADER_VERTEX; 5672 5673 builder = ctx.ac.builder; 5674 5675 create_function(&ctx); 5676 preload_ring_buffers(&ctx); 5677 5678 LLVMValueRef voffset = 5679 lp_build_mul_imm(uint, ctx.abi.vertex_id, 4); 5680 5681 /* Fetch the vertex stream ID.*/ 5682 LLVMValueRef stream_id; 5683 5684 if (gs_selector->so.num_outputs) 5685 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2); 5686 else 5687 stream_id = ctx.i32_0; 5688 5689 /* Fill in output information. */ 5690 for (i = 0; i < gsinfo->num_outputs; ++i) { 5691 outputs[i].semantic_name = gsinfo->output_semantic_name[i]; 5692 outputs[i].semantic_index = gsinfo->output_semantic_index[i]; 5693 5694 for (int chan = 0; chan < 4; chan++) { 5695 outputs[i].vertex_stream[chan] = 5696 (gsinfo->output_streams[i] >> (2 * chan)) & 3; 5697 } 5698 } 5699 5700 LLVMBasicBlockRef end_bb; 5701 LLVMValueRef switch_inst; 5702 5703 end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); 5704 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); 5705 5706 for (int stream = 0; stream < 4; stream++) { 5707 LLVMBasicBlockRef bb; 5708 unsigned offset; 5709 5710 if (!gsinfo->num_stream_output_components[stream]) 5711 continue; 5712 5713 if (stream > 0 && !gs_selector->so.num_outputs) 5714 continue; 5715 5716 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); 5717 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); 5718 LLVMPositionBuilderAtEnd(builder, bb); 5719 5720 /* Fetch vertex data from GSVS ring */ 5721 offset = 0; 5722 for (i = 0; i < gsinfo->num_outputs; ++i) { 5723 for (unsigned chan = 0; chan < 4; chan++) { 5724 if (!(gsinfo->output_usagemask[i] & (1 << chan)) || 5725 outputs[i].vertex_stream[chan] != stream) { 5726 outputs[i].values[chan] = 
ctx.bld_base.base.undef; 5727 continue; 5728 } 5729 5730 LLVMValueRef soffset = LLVMConstInt(ctx.i32, 5731 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); 5732 offset++; 5733 5734 outputs[i].values[chan] = 5735 ac_build_buffer_load(&ctx.ac, 5736 ctx.gsvs_ring[0], 1, 5737 ctx.i32_0, voffset, 5738 soffset, 0, 1, 1, 5739 true, false); 5740 } 5741 } 5742 5743 /* Streamout and exports. */ 5744 if (gs_selector->so.num_outputs) { 5745 si_llvm_emit_streamout(&ctx, outputs, 5746 gsinfo->num_outputs, 5747 stream); 5748 } 5749 5750 if (stream == 0) 5751 si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); 5752 5753 LLVMBuildBr(builder, end_bb); 5754 } 5755 5756 LLVMPositionBuilderAtEnd(builder, end_bb); 5757 5758 LLVMBuildRetVoid(ctx.ac.builder); 5759 5760 ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ 5761 si_llvm_optimize_module(&ctx); 5762 5763 r = si_compile_llvm(sscreen, &ctx.shader->binary, 5764 &ctx.shader->config, ctx.tm, 5765 ctx.gallivm.module, 5766 debug, PIPE_SHADER_GEOMETRY, 5767 "GS Copy Shader"); 5768 if (!r) { 5769 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) 5770 fprintf(stderr, "GS Copy Shader:\n"); 5771 si_shader_dump(sscreen, ctx.shader, debug, 5772 PIPE_SHADER_GEOMETRY, stderr, true); 5773 r = si_shader_binary_upload(sscreen, ctx.shader); 5774 } 5775 5776 si_llvm_dispose(&ctx); 5777 5778 FREE(outputs); 5779 5780 if (r != 0) { 5781 FREE(shader); 5782 shader = NULL; 5783 } 5784 return shader; 5785 } 5786 5787 static void si_dump_shader_key_vs(const struct si_shader_key *key, 5788 const struct si_vs_prolog_bits *prolog, 5789 const char *prefix, FILE *f) 5790 { 5791 fprintf(f, " %s.instance_divisor_is_one = %u\n", 5792 prefix, prolog->instance_divisor_is_one); 5793 fprintf(f, " %s.instance_divisor_is_fetched = %u\n", 5794 prefix, prolog->instance_divisor_is_fetched); 5795 fprintf(f, " %s.ls_vgpr_fix = %u\n", 5796 prefix, prolog->ls_vgpr_fix); 5797 5798 fprintf(f, " mono.vs.fix_fetch = {"); 5799 for (int i = 0; i < 
SI_MAX_ATTRIBS; i++) 5800 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]); 5801 fprintf(f, "}\n"); 5802 } 5803 5804 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader, 5805 FILE *f) 5806 { 5807 const struct si_shader_key *key = &shader->key; 5808 5809 fprintf(f, "SHADER KEY\n"); 5810 5811 switch (processor) { 5812 case PIPE_SHADER_VERTEX: 5813 si_dump_shader_key_vs(key, &key->part.vs.prolog, 5814 "part.vs.prolog", f); 5815 fprintf(f, " as_es = %u\n", key->as_es); 5816 fprintf(f, " as_ls = %u\n", key->as_ls); 5817 fprintf(f, " mono.u.vs_export_prim_id = %u\n", 5818 key->mono.u.vs_export_prim_id); 5819 break; 5820 5821 case PIPE_SHADER_TESS_CTRL: 5822 if (shader->selector->screen->info.chip_class >= GFX9) { 5823 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, 5824 "part.tcs.ls_prolog", f); 5825 } 5826 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); 5827 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); 5828 break; 5829 5830 case PIPE_SHADER_TESS_EVAL: 5831 fprintf(f, " as_es = %u\n", key->as_es); 5832 fprintf(f, " mono.u.vs_export_prim_id = %u\n", 5833 key->mono.u.vs_export_prim_id); 5834 break; 5835 5836 case PIPE_SHADER_GEOMETRY: 5837 if (shader->is_gs_copy_shader) 5838 break; 5839 5840 if (shader->selector->screen->info.chip_class >= GFX9 && 5841 key->part.gs.es->type == PIPE_SHADER_VERTEX) { 5842 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, 5843 "part.gs.vs_prolog", f); 5844 } 5845 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); 5846 break; 5847 5848 case PIPE_SHADER_COMPUTE: 5849 break; 5850 5851 case PIPE_SHADER_FRAGMENT: 5852 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); 5853 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); 5854 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", 
key->part.ps.prolog.poly_stipple); 5855 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); 5856 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); 5857 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); 5858 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); 5859 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); 5860 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); 5861 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); 5862 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); 5863 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); 5864 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); 5865 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); 5866 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); 5867 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); 5868 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); 5869 break; 5870 5871 default: 5872 assert(0); 5873 } 5874 5875 if ((processor == PIPE_SHADER_GEOMETRY || 5876 processor == PIPE_SHADER_TESS_EVAL || 5877 processor == PIPE_SHADER_VERTEX) && 5878 !key->as_es && !key->as_ls) { 5879 fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); 5880 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); 5881 } 5882 } 5883 5884 static void si_init_shader_ctx(struct si_shader_context *ctx, 5885 struct si_screen *sscreen, 5886 LLVMTargetMachineRef tm) 5887 { 
5888 struct lp_build_tgsi_context *bld_base; 5889 5890 si_llvm_context_init(ctx, sscreen, tm); 5891 5892 bld_base = &ctx->bld_base; 5893 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; 5894 5895 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action; 5896 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action; 5897 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action; 5898 5899 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; 5900 5901 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit; 5902 5903 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; 5904 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; 5905 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; 5906 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy; 5907 5908 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit; 5909 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit; 5910 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit; 5911 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit; 5912 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane"; 5913 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit; 5914 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane"; 5915 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args; 5916 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit; 5917 5918 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex; 5919 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive; 5920 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier; 5921 } 5922 5923 static void si_optimize_vs_outputs(struct si_shader_context *ctx) 5924 { 5925 struct si_shader *shader = ctx->shader; 5926 struct tgsi_shader_info *info = &shader->selector->info; 5927 5928 if ((ctx->type != 
PIPE_SHADER_VERTEX && 5929 ctx->type != PIPE_SHADER_TESS_EVAL) || 5930 shader->key.as_ls || 5931 shader->key.as_es) 5932 return; 5933 5934 ac_optimize_vs_outputs(&ctx->ac, 5935 ctx->main_fn, 5936 shader->info.vs_output_param_offset, 5937 info->num_outputs, 5938 &shader->info.nr_param_exports); 5939 } 5940 5941 static void si_count_scratch_private_memory(struct si_shader_context *ctx) 5942 { 5943 ctx->shader->config.private_mem_vgprs = 0; 5944 5945 /* Process all LLVM instructions. */ 5946 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn); 5947 while (bb) { 5948 LLVMValueRef next = LLVMGetFirstInstruction(bb); 5949 5950 while (next) { 5951 LLVMValueRef inst = next; 5952 next = LLVMGetNextInstruction(next); 5953 5954 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca) 5955 continue; 5956 5957 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); 5958 /* No idea why LLVM aligns allocas to 4 elements. */ 5959 unsigned alignment = LLVMGetAlignment(inst); 5960 unsigned dw_size = align(ac_get_type_size(type) / 4, alignment); 5961 ctx->shader->config.private_mem_vgprs += dw_size; 5962 } 5963 bb = LLVMGetNextBasicBlock(bb); 5964 } 5965 } 5966 5967 static void si_init_exec_from_input(struct si_shader_context *ctx, 5968 unsigned param, unsigned bitoffset) 5969 { 5970 LLVMValueRef args[] = { 5971 LLVMGetParam(ctx->main_fn, param), 5972 LLVMConstInt(ctx->i32, bitoffset, 0), 5973 }; 5974 lp_build_intrinsic(ctx->ac.builder, 5975 "llvm.amdgcn.init.exec.from.input", 5976 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); 5977 } 5978 5979 static bool si_vs_needs_prolog(const struct si_shader_selector *sel, 5980 const struct si_vs_prolog_bits *key) 5981 { 5982 /* VGPR initialization fixup for Vega10 and Raven is always done in the 5983 * VS prolog. 
*/ 5984 return sel->vs_needs_prolog || key->ls_vgpr_fix; 5985 } 5986 5987 static bool si_compile_tgsi_main(struct si_shader_context *ctx, 5988 bool is_monolithic) 5989 { 5990 struct si_shader *shader = ctx->shader; 5991 struct si_shader_selector *sel = shader->selector; 5992 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 5993 5994 // TODO clean all this up! 5995 switch (ctx->type) { 5996 case PIPE_SHADER_VERTEX: 5997 ctx->load_input = declare_input_vs; 5998 if (shader->key.as_ls) 5999 ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; 6000 else if (shader->key.as_es) 6001 ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; 6002 else 6003 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; 6004 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 6005 break; 6006 case PIPE_SHADER_TESS_CTRL: 6007 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs; 6008 ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; 6009 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs; 6010 bld_base->emit_store = store_output_tcs; 6011 ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; 6012 ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; 6013 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; 6014 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 6015 break; 6016 case PIPE_SHADER_TESS_EVAL: 6017 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes; 6018 ctx->abi.load_tess_varyings = si_nir_load_input_tes; 6019 ctx->abi.load_tess_coord = si_load_tess_coord; 6020 ctx->abi.load_tess_level = si_load_tess_level; 6021 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; 6022 if (shader->key.as_es) 6023 ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; 6024 else 6025 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; 6026 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 6027 break; 6028 case PIPE_SHADER_GEOMETRY: 6029 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs; 6030 ctx->abi.load_inputs = si_nir_load_input_gs; 6031 
ctx->abi.emit_vertex = si_llvm_emit_vertex; 6032 ctx->abi.emit_primitive = si_llvm_emit_primitive; 6033 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; 6034 bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue; 6035 break; 6036 case PIPE_SHADER_FRAGMENT: 6037 ctx->load_input = declare_input_fs; 6038 ctx->abi.emit_outputs = si_llvm_return_fs_outputs; 6039 bld_base->emit_epilogue = si_tgsi_emit_epilogue; 6040 break; 6041 case PIPE_SHADER_COMPUTE: 6042 break; 6043 default: 6044 assert(!"Unsupported shader type"); 6045 return false; 6046 } 6047 6048 ctx->abi.load_ubo = load_ubo; 6049 ctx->abi.load_ssbo = load_ssbo; 6050 6051 create_function(ctx); 6052 preload_ring_buffers(ctx); 6053 6054 /* For GFX9 merged shaders: 6055 * - Set EXEC for the first shader. If the prolog is present, set 6056 * EXEC there instead. 6057 * - Add a barrier before the second shader. 6058 * - In the second shader, reset EXEC to ~0 and wrap the main part in 6059 * an if-statement. This is required for correctness in geometry 6060 * shaders, to ensure that empty GS waves do not send GS_EMIT and 6061 * GS_CUT messages. 6062 * 6063 * For monolithic merged shaders, the first shader is wrapped in an 6064 * if-block together with its prolog in si_build_wrapper_function. 6065 */ 6066 if (ctx->screen->info.chip_class >= GFX9) { 6067 if (!is_monolithic && 6068 sel->info.num_instructions > 1 && /* not empty shader */ 6069 (shader->key.as_es || shader->key.as_ls) && 6070 (ctx->type == PIPE_SHADER_TESS_EVAL || 6071 (ctx->type == PIPE_SHADER_VERTEX && 6072 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) { 6073 si_init_exec_from_input(ctx, 6074 ctx->param_merged_wave_info, 0); 6075 } else if (ctx->type == PIPE_SHADER_TESS_CTRL || 6076 ctx->type == PIPE_SHADER_GEOMETRY) { 6077 if (!is_monolithic) 6078 ac_init_exec_full_mask(&ctx->ac); 6079 6080 /* The barrier must execute for all shaders in a 6081 * threadgroup. 
6082 */ 6083 si_llvm_emit_barrier(NULL, bld_base, NULL); 6084 6085 LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); 6086 LLVMValueRef ena = 6087 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, 6088 ac_get_thread_id(&ctx->ac), num_threads, ""); 6089 lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena); 6090 } 6091 } 6092 6093 if (ctx->type == PIPE_SHADER_TESS_CTRL && 6094 sel->tcs_info.tessfactors_are_def_in_all_invocs) { 6095 for (unsigned i = 0; i < 6; i++) { 6096 ctx->invoc0_tess_factors[i] = 6097 lp_build_alloca_undef(&ctx->gallivm, ctx->i32, ""); 6098 } 6099 } 6100 6101 if (ctx->type == PIPE_SHADER_GEOMETRY) { 6102 int i; 6103 for (i = 0; i < 4; i++) { 6104 ctx->gs_next_vertex[i] = 6105 lp_build_alloca(&ctx->gallivm, 6106 ctx->i32, ""); 6107 } 6108 } 6109 6110 if (sel->force_correct_derivs_after_kill) { 6111 ctx->postponed_kill = lp_build_alloca_undef(&ctx->gallivm, ctx->i1, ""); 6112 /* true = don't kill. */ 6113 LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0), 6114 ctx->postponed_kill); 6115 } 6116 6117 if (sel->tokens) { 6118 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) { 6119 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n"); 6120 return false; 6121 } 6122 } else { 6123 if (!si_nir_build_llvm(ctx, sel->nir)) { 6124 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); 6125 return false; 6126 } 6127 } 6128 6129 si_llvm_build_ret(ctx, ctx->return_value); 6130 return true; 6131 } 6132 6133 /** 6134 * Compute the VS prolog key, which contains all the information needed to 6135 * build the VS prolog function, and set shader->info bits where needed. 6136 * 6137 * \param info Shader info of the vertex shader. 6138 * \param num_input_sgprs Number of input SGPRs for the vertex shader. 6139 * \param prolog_key Key of the VS prolog 6140 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. 6141 * \param key Output shader part key. 
6142 */ 6143 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info, 6144 unsigned num_input_sgprs, 6145 const struct si_vs_prolog_bits *prolog_key, 6146 struct si_shader *shader_out, 6147 union si_shader_part_key *key) 6148 { 6149 memset(key, 0, sizeof(*key)); 6150 key->vs_prolog.states = *prolog_key; 6151 key->vs_prolog.num_input_sgprs = num_input_sgprs; 6152 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; 6153 key->vs_prolog.as_ls = shader_out->key.as_ls; 6154 key->vs_prolog.as_es = shader_out->key.as_es; 6155 6156 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { 6157 key->vs_prolog.as_ls = 1; 6158 key->vs_prolog.num_merged_next_stage_vgprs = 2; 6159 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { 6160 key->vs_prolog.as_es = 1; 6161 key->vs_prolog.num_merged_next_stage_vgprs = 5; 6162 } 6163 6164 /* Enable loading the InstanceID VGPR. */ 6165 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); 6166 6167 if ((key->vs_prolog.states.instance_divisor_is_one | 6168 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) 6169 shader_out->info.uses_instanceid = true; 6170 } 6171 6172 /** 6173 * Compute the PS prolog key, which contains all the information needed to 6174 * build the PS prolog function, and set related bits in shader->config. 
6175 */ 6176 static void si_get_ps_prolog_key(struct si_shader *shader, 6177 union si_shader_part_key *key, 6178 bool separate_prolog) 6179 { 6180 struct tgsi_shader_info *info = &shader->selector->info; 6181 6182 memset(key, 0, sizeof(*key)); 6183 key->ps_prolog.states = shader->key.part.ps.prolog; 6184 key->ps_prolog.colors_read = info->colors_read; 6185 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; 6186 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; 6187 key->ps_prolog.wqm = info->uses_derivatives && 6188 (key->ps_prolog.colors_read || 6189 key->ps_prolog.states.force_persp_sample_interp || 6190 key->ps_prolog.states.force_linear_sample_interp || 6191 key->ps_prolog.states.force_persp_center_interp || 6192 key->ps_prolog.states.force_linear_center_interp || 6193 key->ps_prolog.states.bc_optimize_for_persp || 6194 key->ps_prolog.states.bc_optimize_for_linear); 6195 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; 6196 6197 if (info->colors_read) { 6198 unsigned *color = shader->selector->color_attr_index; 6199 6200 if (shader->key.part.ps.prolog.color_two_side) { 6201 /* BCOLORs are stored after the last input. 
*/ 6202 key->ps_prolog.num_interp_inputs = info->num_inputs; 6203 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; 6204 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); 6205 } 6206 6207 for (unsigned i = 0; i < 2; i++) { 6208 unsigned interp = info->input_interpolate[color[i]]; 6209 unsigned location = info->input_interpolate_loc[color[i]]; 6210 6211 if (!(info->colors_read & (0xf << i*4))) 6212 continue; 6213 6214 key->ps_prolog.color_attr_index[i] = color[i]; 6215 6216 if (shader->key.part.ps.prolog.flatshade_colors && 6217 interp == TGSI_INTERPOLATE_COLOR) 6218 interp = TGSI_INTERPOLATE_CONSTANT; 6219 6220 switch (interp) { 6221 case TGSI_INTERPOLATE_CONSTANT: 6222 key->ps_prolog.color_interp_vgpr_index[i] = -1; 6223 break; 6224 case TGSI_INTERPOLATE_PERSPECTIVE: 6225 case TGSI_INTERPOLATE_COLOR: 6226 /* Force the interpolation location for colors here. */ 6227 if (shader->key.part.ps.prolog.force_persp_sample_interp) 6228 location = TGSI_INTERPOLATE_LOC_SAMPLE; 6229 if (shader->key.part.ps.prolog.force_persp_center_interp) 6230 location = TGSI_INTERPOLATE_LOC_CENTER; 6231 6232 switch (location) { 6233 case TGSI_INTERPOLATE_LOC_SAMPLE: 6234 key->ps_prolog.color_interp_vgpr_index[i] = 0; 6235 shader->config.spi_ps_input_ena |= 6236 S_0286CC_PERSP_SAMPLE_ENA(1); 6237 break; 6238 case TGSI_INTERPOLATE_LOC_CENTER: 6239 key->ps_prolog.color_interp_vgpr_index[i] = 2; 6240 shader->config.spi_ps_input_ena |= 6241 S_0286CC_PERSP_CENTER_ENA(1); 6242 break; 6243 case TGSI_INTERPOLATE_LOC_CENTROID: 6244 key->ps_prolog.color_interp_vgpr_index[i] = 4; 6245 shader->config.spi_ps_input_ena |= 6246 S_0286CC_PERSP_CENTROID_ENA(1); 6247 break; 6248 default: 6249 assert(0); 6250 } 6251 break; 6252 case TGSI_INTERPOLATE_LINEAR: 6253 /* Force the interpolation location for colors here. 
*/ 6254 if (shader->key.part.ps.prolog.force_linear_sample_interp) 6255 location = TGSI_INTERPOLATE_LOC_SAMPLE; 6256 if (shader->key.part.ps.prolog.force_linear_center_interp) 6257 location = TGSI_INTERPOLATE_LOC_CENTER; 6258 6259 /* The VGPR assignment for non-monolithic shaders 6260 * works because InitialPSInputAddr is set on the 6261 * main shader and PERSP_PULL_MODEL is never used. 6262 */ 6263 switch (location) { 6264 case TGSI_INTERPOLATE_LOC_SAMPLE: 6265 key->ps_prolog.color_interp_vgpr_index[i] = 6266 separate_prolog ? 6 : 9; 6267 shader->config.spi_ps_input_ena |= 6268 S_0286CC_LINEAR_SAMPLE_ENA(1); 6269 break; 6270 case TGSI_INTERPOLATE_LOC_CENTER: 6271 key->ps_prolog.color_interp_vgpr_index[i] = 6272 separate_prolog ? 8 : 11; 6273 shader->config.spi_ps_input_ena |= 6274 S_0286CC_LINEAR_CENTER_ENA(1); 6275 break; 6276 case TGSI_INTERPOLATE_LOC_CENTROID: 6277 key->ps_prolog.color_interp_vgpr_index[i] = 6278 separate_prolog ? 10 : 13; 6279 shader->config.spi_ps_input_ena |= 6280 S_0286CC_LINEAR_CENTROID_ENA(1); 6281 break; 6282 default: 6283 assert(0); 6284 } 6285 break; 6286 default: 6287 assert(0); 6288 } 6289 } 6290 } 6291 } 6292 6293 /** 6294 * Check whether a PS prolog is required based on the key. 6295 */ 6296 static bool si_need_ps_prolog(const union si_shader_part_key *key) 6297 { 6298 return key->ps_prolog.colors_read || 6299 key->ps_prolog.states.force_persp_sample_interp || 6300 key->ps_prolog.states.force_linear_sample_interp || 6301 key->ps_prolog.states.force_persp_center_interp || 6302 key->ps_prolog.states.force_linear_center_interp || 6303 key->ps_prolog.states.bc_optimize_for_persp || 6304 key->ps_prolog.states.bc_optimize_for_linear || 6305 key->ps_prolog.states.poly_stipple || 6306 key->ps_prolog.states.samplemask_log_ps_iter; 6307 } 6308 6309 /** 6310 * Compute the PS epilog key, which contains all the information needed to 6311 * build the PS epilog function. 
6312 */ 6313 static void si_get_ps_epilog_key(struct si_shader *shader, 6314 union si_shader_part_key *key) 6315 { 6316 struct tgsi_shader_info *info = &shader->selector->info; 6317 memset(key, 0, sizeof(*key)); 6318 key->ps_epilog.colors_written = info->colors_written; 6319 key->ps_epilog.writes_z = info->writes_z; 6320 key->ps_epilog.writes_stencil = info->writes_stencil; 6321 key->ps_epilog.writes_samplemask = info->writes_samplemask; 6322 key->ps_epilog.states = shader->key.part.ps.epilog; 6323 } 6324 6325 /** 6326 * Build the GS prolog function. Rotate the input vertices for triangle strips 6327 * with adjacency. 6328 */ 6329 static void si_build_gs_prolog_function(struct si_shader_context *ctx, 6330 union si_shader_part_key *key) 6331 { 6332 unsigned num_sgprs, num_vgprs; 6333 struct si_function_info fninfo; 6334 LLVMBuilderRef builder = ctx->ac.builder; 6335 LLVMTypeRef returns[48]; 6336 LLVMValueRef func, ret; 6337 6338 si_init_function_info(&fninfo); 6339 6340 if (ctx->screen->info.chip_class >= GFX9) { 6341 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; 6342 num_vgprs = 5; /* ES inputs are not needed by GS */ 6343 } else { 6344 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; 6345 num_vgprs = 8; 6346 } 6347 6348 for (unsigned i = 0; i < num_sgprs; ++i) { 6349 add_arg(&fninfo, ARG_SGPR, ctx->i32); 6350 returns[i] = ctx->i32; 6351 } 6352 6353 for (unsigned i = 0; i < num_vgprs; ++i) { 6354 add_arg(&fninfo, ARG_VGPR, ctx->i32); 6355 returns[num_sgprs + i] = ctx->f32; 6356 } 6357 6358 /* Create the function. */ 6359 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 6360 &fninfo, 0); 6361 func = ctx->main_fn; 6362 6363 /* Set the full EXEC mask for the prolog, because we are only fiddling 6364 * with registers here. The main shader part will set the correct EXEC 6365 * mask. 6366 */ 6367 if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) 6368 ac_init_exec_full_mask(&ctx->ac); 6369 6370 /* Copy inputs to outputs. 
This should be no-op, as the registers match, 6371 * but it will prevent the compiler from overwriting them unintentionally. 6372 */ 6373 ret = ctx->return_value; 6374 for (unsigned i = 0; i < num_sgprs; i++) { 6375 LLVMValueRef p = LLVMGetParam(func, i); 6376 ret = LLVMBuildInsertValue(builder, ret, p, i, ""); 6377 } 6378 for (unsigned i = 0; i < num_vgprs; i++) { 6379 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); 6380 p = ac_to_float(&ctx->ac, p); 6381 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); 6382 } 6383 6384 if (key->gs_prolog.states.tri_strip_adj_fix) { 6385 /* Remap the input vertices for every other primitive. */ 6386 const unsigned gfx6_vtx_params[6] = { 6387 num_sgprs, 6388 num_sgprs + 1, 6389 num_sgprs + 3, 6390 num_sgprs + 4, 6391 num_sgprs + 5, 6392 num_sgprs + 6 6393 }; 6394 const unsigned gfx9_vtx_params[3] = { 6395 num_sgprs, 6396 num_sgprs + 1, 6397 num_sgprs + 4, 6398 }; 6399 LLVMValueRef vtx_in[6], vtx_out[6]; 6400 LLVMValueRef prim_id, rotate; 6401 6402 if (ctx->screen->info.chip_class >= GFX9) { 6403 for (unsigned i = 0; i < 3; i++) { 6404 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16); 6405 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16); 6406 } 6407 } else { 6408 for (unsigned i = 0; i < 6; i++) 6409 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]); 6410 } 6411 6412 prim_id = LLVMGetParam(func, num_sgprs + 2); 6413 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); 6414 6415 for (unsigned i = 0; i < 6; ++i) { 6416 LLVMValueRef base, rotated; 6417 base = vtx_in[i]; 6418 rotated = vtx_in[(i + 4) % 6]; 6419 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); 6420 } 6421 6422 if (ctx->screen->info.chip_class >= GFX9) { 6423 for (unsigned i = 0; i < 3; i++) { 6424 LLVMValueRef hi, out; 6425 6426 hi = LLVMBuildShl(builder, vtx_out[i*2+1], 6427 LLVMConstInt(ctx->i32, 16, 0), ""); 6428 out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); 6429 out = ac_to_float(&ctx->ac, out); 6430 
ret = LLVMBuildInsertValue(builder, ret, out, 6431 gfx9_vtx_params[i], ""); 6432 } 6433 } else { 6434 for (unsigned i = 0; i < 6; i++) { 6435 LLVMValueRef out; 6436 6437 out = ac_to_float(&ctx->ac, vtx_out[i]); 6438 ret = LLVMBuildInsertValue(builder, ret, out, 6439 gfx6_vtx_params[i], ""); 6440 } 6441 } 6442 } 6443 6444 LLVMBuildRet(builder, ret); 6445 } 6446 6447 /** 6448 * Given a list of shader part functions, build a wrapper function that 6449 * runs them in sequence to form a monolithic shader. 6450 */ 6451 static void si_build_wrapper_function(struct si_shader_context *ctx, 6452 LLVMValueRef *parts, 6453 unsigned num_parts, 6454 unsigned main_part, 6455 unsigned next_shader_first_part) 6456 { 6457 LLVMBuilderRef builder = ctx->ac.builder; 6458 /* PS epilog has one arg per color component; gfx9 merged shader 6459 * prologs need to forward 32 user SGPRs. 6460 */ 6461 struct si_function_info fninfo; 6462 LLVMValueRef initial[64], out[64]; 6463 LLVMTypeRef function_type; 6464 unsigned num_first_params; 6465 unsigned num_out, initial_num_out; 6466 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */ 6467 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */ 6468 unsigned num_sgprs, num_vgprs; 6469 unsigned gprs; 6470 struct lp_build_if_state if_state; 6471 6472 si_init_function_info(&fninfo); 6473 6474 for (unsigned i = 0; i < num_parts; ++i) { 6475 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE); 6476 LLVMSetLinkage(parts[i], LLVMPrivateLinkage); 6477 } 6478 6479 /* The parameters of the wrapper function correspond to those of the 6480 * first part in terms of SGPRs and VGPRs, but we use the types of the 6481 * main part to get the right types. This is relevant for the 6482 * dereferenceable attribute on descriptor table pointers. 
6483 */ 6484 num_sgprs = 0; 6485 num_vgprs = 0; 6486 6487 function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); 6488 num_first_params = LLVMCountParamTypes(function_type); 6489 6490 for (unsigned i = 0; i < num_first_params; ++i) { 6491 LLVMValueRef param = LLVMGetParam(parts[0], i); 6492 6493 if (ac_is_sgpr_param(param)) { 6494 assert(num_vgprs == 0); 6495 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 6496 } else { 6497 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; 6498 } 6499 } 6500 6501 gprs = 0; 6502 while (gprs < num_sgprs + num_vgprs) { 6503 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params); 6504 LLVMTypeRef type = LLVMTypeOf(param); 6505 unsigned size = ac_get_type_size(type) / 4; 6506 6507 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type); 6508 6509 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); 6510 assert(gprs + size <= num_sgprs + num_vgprs && 6511 (gprs >= num_sgprs || gprs + size <= num_sgprs)); 6512 6513 gprs += size; 6514 } 6515 6516 si_create_function(ctx, "wrapper", NULL, 0, &fninfo, 6517 si_get_max_workgroup_size(ctx->shader)); 6518 6519 if (is_merged_shader(ctx->shader)) 6520 ac_init_exec_full_mask(&ctx->ac); 6521 6522 /* Record the arguments of the function as if they were an output of 6523 * a previous part. 6524 */ 6525 num_out = 0; 6526 num_out_sgpr = 0; 6527 6528 for (unsigned i = 0; i < fninfo.num_params; ++i) { 6529 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); 6530 LLVMTypeRef param_type = LLVMTypeOf(param); 6531 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? 
ctx->i32 : ctx->f32; 6532 unsigned size = ac_get_type_size(param_type) / 4; 6533 6534 if (size == 1) { 6535 if (param_type != out_type) 6536 param = LLVMBuildBitCast(builder, param, out_type, ""); 6537 out[num_out++] = param; 6538 } else { 6539 LLVMTypeRef vector_type = LLVMVectorType(out_type, size); 6540 6541 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6542 param = LLVMBuildPtrToInt(builder, param, ctx->i64, ""); 6543 param_type = ctx->i64; 6544 } 6545 6546 if (param_type != vector_type) 6547 param = LLVMBuildBitCast(builder, param, vector_type, ""); 6548 6549 for (unsigned j = 0; j < size; ++j) 6550 out[num_out++] = LLVMBuildExtractElement( 6551 builder, param, LLVMConstInt(ctx->i32, j, 0), ""); 6552 } 6553 6554 if (i < fninfo.num_sgpr_params) 6555 num_out_sgpr = num_out; 6556 } 6557 6558 memcpy(initial, out, sizeof(out)); 6559 initial_num_out = num_out; 6560 initial_num_out_sgpr = num_out_sgpr; 6561 6562 /* Now chain the parts. */ 6563 for (unsigned part = 0; part < num_parts; ++part) { 6564 LLVMValueRef in[48]; 6565 LLVMValueRef ret; 6566 LLVMTypeRef ret_type; 6567 unsigned out_idx = 0; 6568 unsigned num_params = LLVMCountParams(parts[part]); 6569 6570 /* Merged shaders are executed conditionally depending 6571 * on the number of enabled threads passed in the input SGPRs. */ 6572 if (is_merged_shader(ctx->shader) && part == 0) { 6573 LLVMValueRef ena, count = initial[3]; 6574 6575 count = LLVMBuildAnd(builder, count, 6576 LLVMConstInt(ctx->i32, 0x7f, 0), ""); 6577 ena = LLVMBuildICmp(builder, LLVMIntULT, 6578 ac_get_thread_id(&ctx->ac), count, ""); 6579 lp_build_if(&if_state, &ctx->gallivm, ena); 6580 } 6581 6582 /* Derive arguments for the next part from outputs of the 6583 * previous one. 
6584 */ 6585 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { 6586 LLVMValueRef param; 6587 LLVMTypeRef param_type; 6588 bool is_sgpr; 6589 unsigned param_size; 6590 LLVMValueRef arg = NULL; 6591 6592 param = LLVMGetParam(parts[part], param_idx); 6593 param_type = LLVMTypeOf(param); 6594 param_size = ac_get_type_size(param_type) / 4; 6595 is_sgpr = ac_is_sgpr_param(param); 6596 6597 if (is_sgpr) { 6598 #if HAVE_LLVM < 0x0400 6599 LLVMRemoveAttribute(param, LLVMByValAttribute); 6600 #else 6601 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5); 6602 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id); 6603 #endif 6604 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG); 6605 } 6606 6607 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); 6608 assert(is_sgpr || out_idx >= num_out_sgpr); 6609 6610 if (param_size == 1) 6611 arg = out[out_idx]; 6612 else 6613 arg = lp_build_gather_values(&ctx->gallivm, &out[out_idx], param_size); 6614 6615 if (LLVMTypeOf(arg) != param_type) { 6616 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { 6617 arg = LLVMBuildBitCast(builder, arg, ctx->i64, ""); 6618 arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); 6619 } else { 6620 arg = LLVMBuildBitCast(builder, arg, param_type, ""); 6621 } 6622 } 6623 6624 in[param_idx] = arg; 6625 out_idx += param_size; 6626 } 6627 6628 ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); 6629 6630 if (is_merged_shader(ctx->shader) && 6631 part + 1 == next_shader_first_part) { 6632 lp_build_endif(&if_state); 6633 6634 /* The second half of the merged shader should use 6635 * the inputs from the toplevel (wrapper) function, 6636 * not the return value from the last call. 6637 * 6638 * That's because the last call was executed condi- 6639 * tionally, so we can't consume it in the main 6640 * block. 
6641 */ 6642 memcpy(out, initial, sizeof(initial)); 6643 num_out = initial_num_out; 6644 num_out_sgpr = initial_num_out_sgpr; 6645 continue; 6646 } 6647 6648 /* Extract the returned GPRs. */ 6649 ret_type = LLVMTypeOf(ret); 6650 num_out = 0; 6651 num_out_sgpr = 0; 6652 6653 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { 6654 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); 6655 6656 unsigned ret_size = LLVMCountStructElementTypes(ret_type); 6657 6658 for (unsigned i = 0; i < ret_size; ++i) { 6659 LLVMValueRef val = 6660 LLVMBuildExtractValue(builder, ret, i, ""); 6661 6662 assert(num_out < ARRAY_SIZE(out)); 6663 out[num_out++] = val; 6664 6665 if (LLVMTypeOf(val) == ctx->i32) { 6666 assert(num_out_sgpr + 1 == num_out); 6667 num_out_sgpr = num_out; 6668 } 6669 } 6670 } 6671 } 6672 6673 LLVMBuildRetVoid(builder); 6674 } 6675 6676 int si_compile_tgsi_shader(struct si_screen *sscreen, 6677 LLVMTargetMachineRef tm, 6678 struct si_shader *shader, 6679 bool is_monolithic, 6680 struct pipe_debug_callback *debug) 6681 { 6682 struct si_shader_selector *sel = shader->selector; 6683 struct si_shader_context ctx; 6684 int r = -1; 6685 6686 /* Dump TGSI code before doing TGSI->LLVM conversion in case the 6687 * conversion fails. 
*/ 6688 if (si_can_dump_shader(sscreen, sel->info.processor) && 6689 !(sscreen->debug_flags & DBG(NO_TGSI))) { 6690 if (sel->tokens) 6691 tgsi_dump(sel->tokens, 0); 6692 else 6693 nir_print_shader(sel->nir, stderr); 6694 si_dump_streamout(&sel->so); 6695 } 6696 6697 si_init_shader_ctx(&ctx, sscreen, tm); 6698 si_llvm_context_set_tgsi(&ctx, shader); 6699 ctx.separate_prolog = !is_monolithic; 6700 6701 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, 6702 sizeof(shader->info.vs_output_param_offset)); 6703 6704 shader->info.uses_instanceid = sel->info.uses_instanceid; 6705 6706 if (!si_compile_tgsi_main(&ctx, is_monolithic)) { 6707 si_llvm_dispose(&ctx); 6708 return -1; 6709 } 6710 6711 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { 6712 LLVMValueRef parts[2]; 6713 bool need_prolog = sel->vs_needs_prolog; 6714 6715 parts[1] = ctx.main_fn; 6716 6717 if (need_prolog) { 6718 union si_shader_part_key prolog_key; 6719 si_get_vs_prolog_key(&sel->info, 6720 shader->info.num_input_sgprs, 6721 &shader->key.part.vs.prolog, 6722 shader, &prolog_key); 6723 si_build_vs_prolog_function(&ctx, &prolog_key); 6724 parts[0] = ctx.main_fn; 6725 } 6726 6727 si_build_wrapper_function(&ctx, parts + !need_prolog, 6728 1 + need_prolog, need_prolog, 0); 6729 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { 6730 if (sscreen->info.chip_class >= GFX9) { 6731 struct si_shader_selector *ls = shader->key.part.tcs.ls; 6732 LLVMValueRef parts[4]; 6733 bool vs_needs_prolog = 6734 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog); 6735 6736 /* TCS main part */ 6737 parts[2] = ctx.main_fn; 6738 6739 /* TCS epilog */ 6740 union si_shader_part_key tcs_epilog_key; 6741 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); 6742 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 6743 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key); 6744 parts[3] = ctx.main_fn; 6745 6746 /* VS prolog */ 6747 if (vs_needs_prolog) { 6748 union si_shader_part_key 
vs_prolog_key; 6749 si_get_vs_prolog_key(&ls->info, 6750 shader->info.num_input_sgprs, 6751 &shader->key.part.tcs.ls_prolog, 6752 shader, &vs_prolog_key); 6753 vs_prolog_key.vs_prolog.is_monolithic = true; 6754 si_build_vs_prolog_function(&ctx, &vs_prolog_key); 6755 parts[0] = ctx.main_fn; 6756 } 6757 6758 /* VS as LS main part */ 6759 struct si_shader shader_ls = {}; 6760 shader_ls.selector = ls; 6761 shader_ls.key.as_ls = 1; 6762 shader_ls.key.mono = shader->key.mono; 6763 shader_ls.key.opt = shader->key.opt; 6764 si_llvm_context_set_tgsi(&ctx, &shader_ls); 6765 6766 if (!si_compile_tgsi_main(&ctx, true)) { 6767 si_llvm_dispose(&ctx); 6768 return -1; 6769 } 6770 shader->info.uses_instanceid |= ls->info.uses_instanceid; 6771 parts[1] = ctx.main_fn; 6772 6773 /* Reset the shader context. */ 6774 ctx.shader = shader; 6775 ctx.type = PIPE_SHADER_TESS_CTRL; 6776 6777 si_build_wrapper_function(&ctx, 6778 parts + !vs_needs_prolog, 6779 4 - !vs_needs_prolog, 0, 6780 vs_needs_prolog ? 2 : 1); 6781 } else { 6782 LLVMValueRef parts[2]; 6783 union si_shader_part_key epilog_key; 6784 6785 parts[0] = ctx.main_fn; 6786 6787 memset(&epilog_key, 0, sizeof(epilog_key)); 6788 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 6789 si_build_tcs_epilog_function(&ctx, &epilog_key); 6790 parts[1] = ctx.main_fn; 6791 6792 si_build_wrapper_function(&ctx, parts, 2, 0, 0); 6793 } 6794 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { 6795 if (ctx.screen->info.chip_class >= GFX9) { 6796 struct si_shader_selector *es = shader->key.part.gs.es; 6797 LLVMValueRef es_prolog = NULL; 6798 LLVMValueRef es_main = NULL; 6799 LLVMValueRef gs_prolog = NULL; 6800 LLVMValueRef gs_main = ctx.main_fn; 6801 6802 /* GS prolog */ 6803 union si_shader_part_key gs_prolog_key; 6804 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); 6805 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 6806 gs_prolog_key.gs_prolog.is_monolithic = true; 6807 si_build_gs_prolog_function(&ctx, 
&gs_prolog_key); 6808 gs_prolog = ctx.main_fn; 6809 6810 /* ES prolog */ 6811 if (es->vs_needs_prolog) { 6812 union si_shader_part_key vs_prolog_key; 6813 si_get_vs_prolog_key(&es->info, 6814 shader->info.num_input_sgprs, 6815 &shader->key.part.gs.vs_prolog, 6816 shader, &vs_prolog_key); 6817 vs_prolog_key.vs_prolog.is_monolithic = true; 6818 si_build_vs_prolog_function(&ctx, &vs_prolog_key); 6819 es_prolog = ctx.main_fn; 6820 } 6821 6822 /* ES main part */ 6823 struct si_shader shader_es = {}; 6824 shader_es.selector = es; 6825 shader_es.key.as_es = 1; 6826 shader_es.key.mono = shader->key.mono; 6827 shader_es.key.opt = shader->key.opt; 6828 si_llvm_context_set_tgsi(&ctx, &shader_es); 6829 6830 if (!si_compile_tgsi_main(&ctx, true)) { 6831 si_llvm_dispose(&ctx); 6832 return -1; 6833 } 6834 shader->info.uses_instanceid |= es->info.uses_instanceid; 6835 es_main = ctx.main_fn; 6836 6837 /* Reset the shader context. */ 6838 ctx.shader = shader; 6839 ctx.type = PIPE_SHADER_GEOMETRY; 6840 6841 /* Prepare the array of shader parts. 
*/ 6842 LLVMValueRef parts[4]; 6843 unsigned num_parts = 0, main_part, next_first_part; 6844 6845 if (es_prolog) 6846 parts[num_parts++] = es_prolog; 6847 6848 parts[main_part = num_parts++] = es_main; 6849 parts[next_first_part = num_parts++] = gs_prolog; 6850 parts[num_parts++] = gs_main; 6851 6852 si_build_wrapper_function(&ctx, parts, num_parts, 6853 main_part, next_first_part); 6854 } else { 6855 LLVMValueRef parts[2]; 6856 union si_shader_part_key prolog_key; 6857 6858 parts[1] = ctx.main_fn; 6859 6860 memset(&prolog_key, 0, sizeof(prolog_key)); 6861 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 6862 si_build_gs_prolog_function(&ctx, &prolog_key); 6863 parts[0] = ctx.main_fn; 6864 6865 si_build_wrapper_function(&ctx, parts, 2, 1, 0); 6866 } 6867 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { 6868 LLVMValueRef parts[3]; 6869 union si_shader_part_key prolog_key; 6870 union si_shader_part_key epilog_key; 6871 bool need_prolog; 6872 6873 si_get_ps_prolog_key(shader, &prolog_key, false); 6874 need_prolog = si_need_ps_prolog(&prolog_key); 6875 6876 parts[need_prolog ? 1 : 0] = ctx.main_fn; 6877 6878 if (need_prolog) { 6879 si_build_ps_prolog_function(&ctx, &prolog_key); 6880 parts[0] = ctx.main_fn; 6881 } 6882 6883 si_get_ps_epilog_key(shader, &epilog_key); 6884 si_build_ps_epilog_function(&ctx, &epilog_key); 6885 parts[need_prolog ? 2 : 1] = ctx.main_fn; 6886 6887 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, 6888 need_prolog ? 1 : 0, 0); 6889 } 6890 6891 si_llvm_optimize_module(&ctx); 6892 6893 /* Post-optimization transformations and analysis. */ 6894 si_optimize_vs_outputs(&ctx); 6895 6896 if ((debug && debug->debug_message) || 6897 si_can_dump_shader(sscreen, ctx.type)) 6898 si_count_scratch_private_memory(&ctx); 6899 6900 /* Compile to bytecode. 
*/ 6901 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, 6902 ctx.gallivm.module, debug, ctx.type, "TGSI shader"); 6903 si_llvm_dispose(&ctx); 6904 if (r) { 6905 fprintf(stderr, "LLVM failed to compile shader\n"); 6906 return r; 6907 } 6908 6909 /* Validate SGPR and VGPR usage for compute to detect compiler bugs. 6910 * LLVM 3.9svn has this bug. 6911 */ 6912 if (sel->type == PIPE_SHADER_COMPUTE) { 6913 unsigned wave_size = 64; 6914 unsigned max_vgprs = 256; 6915 unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512; 6916 unsigned max_sgprs_per_wave = 128; 6917 unsigned max_block_threads = si_get_max_workgroup_size(shader); 6918 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); 6919 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); 6920 6921 max_vgprs = max_vgprs / min_waves_per_simd; 6922 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave); 6923 6924 if (shader->config.num_sgprs > max_sgprs || 6925 shader->config.num_vgprs > max_vgprs) { 6926 fprintf(stderr, "LLVM failed to compile a shader correctly: " 6927 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", 6928 shader->config.num_sgprs, shader->config.num_vgprs, 6929 max_sgprs, max_vgprs); 6930 6931 /* Just terminate the process, because dependent 6932 * shaders can hang due to bad input data, but use 6933 * the env var to allow shader-db to work. 6934 */ 6935 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) 6936 abort(); 6937 } 6938 } 6939 6940 /* Add the scratch offset to input SGPRs. */ 6941 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader)) 6942 shader->info.num_input_sgprs += 1; /* scratch byte offset */ 6943 6944 /* Calculate the number of fragment input VGPRs. 
*/ 6945 if (ctx.type == PIPE_SHADER_FRAGMENT) { 6946 shader->info.num_input_vgprs = 0; 6947 shader->info.face_vgpr_index = -1; 6948 shader->info.ancillary_vgpr_index = -1; 6949 6950 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 6951 shader->info.num_input_vgprs += 2; 6952 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) 6953 shader->info.num_input_vgprs += 2; 6954 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) 6955 shader->info.num_input_vgprs += 2; 6956 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) 6957 shader->info.num_input_vgprs += 3; 6958 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 6959 shader->info.num_input_vgprs += 2; 6960 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) 6961 shader->info.num_input_vgprs += 2; 6962 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) 6963 shader->info.num_input_vgprs += 2; 6964 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) 6965 shader->info.num_input_vgprs += 1; 6966 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6967 shader->info.num_input_vgprs += 1; 6968 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6969 shader->info.num_input_vgprs += 1; 6970 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6971 shader->info.num_input_vgprs += 1; 6972 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) 6973 shader->info.num_input_vgprs += 1; 6974 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { 6975 shader->info.face_vgpr_index = shader->info.num_input_vgprs; 6976 shader->info.num_input_vgprs += 1; 6977 } 6978 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) { 6979 shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs; 6980 shader->info.num_input_vgprs += 1; 6981 } 6982 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) 6983 shader->info.num_input_vgprs += 1; 6984 if 
(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) 6985 shader->info.num_input_vgprs += 1; 6986 } 6987 6988 return 0; 6989 } 6990 6991 /** 6992 * Create, compile and return a shader part (prolog or epilog). 6993 * 6994 * \param sscreen screen 6995 * \param list list of shader parts of the same category 6996 * \param type shader type 6997 * \param key shader part key 6998 * \param prolog whether the part being requested is a prolog 6999 * \param tm LLVM target machine 7000 * \param debug debug callback 7001 * \param build the callback responsible for building the main function 7002 * \return non-NULL on success 7003 */ 7004 static struct si_shader_part * 7005 si_get_shader_part(struct si_screen *sscreen, 7006 struct si_shader_part **list, 7007 enum pipe_shader_type type, 7008 bool prolog, 7009 union si_shader_part_key *key, 7010 LLVMTargetMachineRef tm, 7011 struct pipe_debug_callback *debug, 7012 void (*build)(struct si_shader_context *, 7013 union si_shader_part_key *), 7014 const char *name) 7015 { 7016 struct si_shader_part *result; 7017 7018 mtx_lock(&sscreen->shader_parts_mutex); 7019 7020 /* Find existing. */ 7021 for (result = *list; result; result = result->next) { 7022 if (memcmp(&result->key, key, sizeof(*key)) == 0) { 7023 mtx_unlock(&sscreen->shader_parts_mutex); 7024 return result; 7025 } 7026 } 7027 7028 /* Compile a new one. 
*/ 7029 result = CALLOC_STRUCT(si_shader_part); 7030 result->key = *key; 7031 7032 struct si_shader shader = {}; 7033 struct si_shader_context ctx; 7034 7035 si_init_shader_ctx(&ctx, sscreen, tm); 7036 ctx.shader = &shader; 7037 ctx.type = type; 7038 7039 switch (type) { 7040 case PIPE_SHADER_VERTEX: 7041 shader.key.as_ls = key->vs_prolog.as_ls; 7042 shader.key.as_es = key->vs_prolog.as_es; 7043 break; 7044 case PIPE_SHADER_TESS_CTRL: 7045 assert(!prolog); 7046 shader.key.part.tcs.epilog = key->tcs_epilog.states; 7047 break; 7048 case PIPE_SHADER_GEOMETRY: 7049 assert(prolog); 7050 break; 7051 case PIPE_SHADER_FRAGMENT: 7052 if (prolog) 7053 shader.key.part.ps.prolog = key->ps_prolog.states; 7054 else 7055 shader.key.part.ps.epilog = key->ps_epilog.states; 7056 break; 7057 default: 7058 unreachable("bad shader part"); 7059 } 7060 7061 build(&ctx, key); 7062 7063 /* Compile. */ 7064 si_llvm_optimize_module(&ctx); 7065 7066 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm, 7067 ctx.ac.module, debug, ctx.type, name)) { 7068 FREE(result); 7069 result = NULL; 7070 goto out; 7071 } 7072 7073 result->next = *list; 7074 *list = result; 7075 7076 out: 7077 si_llvm_dispose(&ctx); 7078 mtx_unlock(&sscreen->shader_parts_mutex); 7079 return result; 7080 } 7081 7082 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) 7083 { 7084 LLVMValueRef ptr[2], list; 7085 bool is_merged_shader = 7086 ctx->screen->info.chip_class >= GFX9 && 7087 (ctx->type == PIPE_SHADER_TESS_CTRL || 7088 ctx->type == PIPE_SHADER_GEOMETRY || 7089 ctx->shader->key.as_ls || ctx->shader->key.as_es); 7090 7091 /* Get the pointer to rw buffers. */ 7092 ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); 7093 ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 
8 : 0) + SI_SGPR_RW_BUFFERS_HI); 7094 list = lp_build_gather_values(&ctx->gallivm, ptr, 2); 7095 list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, ""); 7096 list = LLVMBuildIntToPtr(ctx->ac.builder, list, 7097 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), ""); 7098 return list; 7099 } 7100 7101 /** 7102 * Build the vertex shader prolog function. 7103 * 7104 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). 7105 * All inputs are returned unmodified. The vertex load indices are 7106 * stored after them, which will be used by the API VS for fetching inputs. 7107 * 7108 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: 7109 * input_v0, 7110 * input_v1, 7111 * input_v2, 7112 * input_v3, 7113 * (VertexID + BaseVertex), 7114 * (InstanceID + StartInstance), 7115 * (InstanceID / 2 + StartInstance) 7116 */ 7117 static void si_build_vs_prolog_function(struct si_shader_context *ctx, 7118 union si_shader_part_key *key) 7119 { 7120 struct si_function_info fninfo; 7121 LLVMTypeRef *returns; 7122 LLVMValueRef ret, func; 7123 int num_returns, i; 7124 unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; 7125 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4; 7126 LLVMValueRef input_vgprs[9]; 7127 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + 7128 num_input_vgprs; 7129 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; 7130 7131 si_init_function_info(&fninfo); 7132 7133 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ 7134 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) * 7135 sizeof(LLVMTypeRef)); 7136 num_returns = 0; 7137 7138 /* Declare input and output SGPRs. 
*/ 7139 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7140 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7141 returns[num_returns++] = ctx->i32; 7142 } 7143 7144 /* Preloaded VGPRs (outputs must be floats) */ 7145 for (i = 0; i < num_input_vgprs; i++) { 7146 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]); 7147 returns[num_returns++] = ctx->f32; 7148 } 7149 7150 /* Vertex load indices. */ 7151 for (i = 0; i <= key->vs_prolog.last_input; i++) 7152 returns[num_returns++] = ctx->f32; 7153 7154 /* Create the function. */ 7155 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0); 7156 func = ctx->main_fn; 7157 7158 if (key->vs_prolog.num_merged_next_stage_vgprs) { 7159 if (!key->vs_prolog.is_monolithic) 7160 si_init_exec_from_input(ctx, 3, 0); 7161 7162 if (key->vs_prolog.as_ls && 7163 ctx->screen->has_ls_vgpr_init_bug) { 7164 /* If there are no HS threads, SPI loads the LS VGPRs 7165 * starting at VGPR 0. Shift them back to where they 7166 * belong. 7167 */ 7168 LLVMValueRef has_hs_threads = 7169 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, 7170 unpack_param(ctx, 3, 8, 8), 7171 ctx->i32_0, ""); 7172 7173 for (i = 4; i > 0; --i) { 7174 input_vgprs[i + 1] = 7175 LLVMBuildSelect(ctx->ac.builder, has_hs_threads, 7176 input_vgprs[i + 1], 7177 input_vgprs[i - 1], ""); 7178 } 7179 } 7180 } 7181 7182 ctx->abi.vertex_id = input_vgprs[first_vs_vgpr]; 7183 ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)]; 7184 7185 /* Copy inputs to outputs. This should be no-op, as the registers match, 7186 * but it will prevent the compiler from overwriting them unintentionally. 
7187 */ 7188 ret = ctx->return_value; 7189 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7190 LLVMValueRef p = LLVMGetParam(func, i); 7191 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); 7192 } 7193 for (i = 0; i < num_input_vgprs; i++) { 7194 LLVMValueRef p = input_vgprs[i]; 7195 p = ac_to_float(&ctx->ac, p); 7196 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, 7197 key->vs_prolog.num_input_sgprs + i, ""); 7198 } 7199 7200 /* Compute vertex load indices from instance divisors. */ 7201 LLVMValueRef instance_divisor_constbuf = NULL; 7202 7203 if (key->vs_prolog.states.instance_divisor_is_fetched) { 7204 LLVMValueRef list = si_prolog_get_rw_buffers(ctx); 7205 LLVMValueRef buf_index = 7206 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); 7207 instance_divisor_constbuf = 7208 ac_build_load_to_sgpr(&ctx->ac, list, buf_index); 7209 } 7210 7211 for (i = 0; i <= key->vs_prolog.last_input; i++) { 7212 bool divisor_is_one = 7213 key->vs_prolog.states.instance_divisor_is_one & (1u << i); 7214 bool divisor_is_fetched = 7215 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); 7216 LLVMValueRef index; 7217 7218 if (divisor_is_one || divisor_is_fetched) { 7219 LLVMValueRef divisor = ctx->i32_1; 7220 7221 if (divisor_is_fetched) { 7222 divisor = buffer_load_const(ctx, instance_divisor_constbuf, 7223 LLVMConstInt(ctx->i32, i * 4, 0)); 7224 divisor = ac_to_integer(&ctx->ac, divisor); 7225 } 7226 7227 /* InstanceID / Divisor + StartInstance */ 7228 index = get_instance_index_for_fetch(ctx, 7229 user_sgpr_base + 7230 SI_SGPR_START_INSTANCE, 7231 divisor); 7232 } else { 7233 /* VertexID + BaseVertex */ 7234 index = LLVMBuildAdd(ctx->ac.builder, 7235 ctx->abi.vertex_id, 7236 LLVMGetParam(func, user_sgpr_base + 7237 SI_SGPR_BASE_VERTEX), ""); 7238 } 7239 7240 index = ac_to_float(&ctx->ac, index); 7241 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, 7242 fninfo.num_params + i, ""); 7243 } 7244 7245 si_llvm_build_ret(ctx, ret); 7246 } 
7247 7248 static bool si_get_vs_prolog(struct si_screen *sscreen, 7249 LLVMTargetMachineRef tm, 7250 struct si_shader *shader, 7251 struct pipe_debug_callback *debug, 7252 struct si_shader *main_part, 7253 const struct si_vs_prolog_bits *key) 7254 { 7255 struct si_shader_selector *vs = main_part->selector; 7256 7257 if (!si_vs_needs_prolog(vs, key)) 7258 return true; 7259 7260 /* Get the prolog. */ 7261 union si_shader_part_key prolog_key; 7262 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, 7263 key, shader, &prolog_key); 7264 7265 shader->prolog = 7266 si_get_shader_part(sscreen, &sscreen->vs_prologs, 7267 PIPE_SHADER_VERTEX, true, &prolog_key, tm, 7268 debug, si_build_vs_prolog_function, 7269 "Vertex Shader Prolog"); 7270 return shader->prolog != NULL; 7271 } 7272 7273 /** 7274 * Select and compile (or reuse) vertex shader parts (prolog & epilog). 7275 */ 7276 static bool si_shader_select_vs_parts(struct si_screen *sscreen, 7277 LLVMTargetMachineRef tm, 7278 struct si_shader *shader, 7279 struct pipe_debug_callback *debug) 7280 { 7281 return si_get_vs_prolog(sscreen, tm, shader, debug, shader, 7282 &shader->key.part.vs.prolog); 7283 } 7284 7285 /** 7286 * Compile the TCS epilog function. This writes tesselation factors to memory 7287 * based on the output primitive type of the tesselator (determined by TES). 
7288 */ 7289 static void si_build_tcs_epilog_function(struct si_shader_context *ctx, 7290 union si_shader_part_key *key) 7291 { 7292 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7293 struct si_function_info fninfo; 7294 LLVMValueRef func; 7295 7296 si_init_function_info(&fninfo); 7297 7298 if (ctx->screen->info.chip_class >= GFX9) { 7299 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7300 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7301 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */ 7302 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7303 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7304 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7305 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7306 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7307 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7308 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7309 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7310 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7311 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7312 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7313 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7314 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7315 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7316 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7317 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7318 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7319 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7320 } else { 7321 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7322 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7323 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7324 add_arg(&fninfo, ARG_SGPR, ctx->i64); 7325 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7326 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7327 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7328 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7329 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7330 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7331 
ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7332 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32); 7333 } 7334 7335 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ 7336 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */ 7337 unsigned tess_factors_idx = 7338 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */ 7339 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */ 7340 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */ 7341 7342 for (unsigned i = 0; i < 6; i++) 7343 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */ 7344 7345 /* Create the function. */ 7346 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo, 7347 ctx->screen->info.chip_class >= CIK ? 128 : 64); 7348 ac_declare_lds_as_pointer(&ctx->ac); 7349 func = ctx->main_fn; 7350 7351 LLVMValueRef invoc0_tess_factors[6]; 7352 for (unsigned i = 0; i < 6; i++) 7353 invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i); 7354 7355 si_write_tess_factors(bld_base, 7356 LLVMGetParam(func, tess_factors_idx), 7357 LLVMGetParam(func, tess_factors_idx + 1), 7358 LLVMGetParam(func, tess_factors_idx + 2), 7359 invoc0_tess_factors, invoc0_tess_factors + 4); 7360 7361 LLVMBuildRetVoid(ctx->ac.builder); 7362 } 7363 7364 /** 7365 * Select and compile (or reuse) TCS parts (epilog). 7366 */ 7367 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, 7368 LLVMTargetMachineRef tm, 7369 struct si_shader *shader, 7370 struct pipe_debug_callback *debug) 7371 { 7372 if (sscreen->info.chip_class >= GFX9) { 7373 struct si_shader *ls_main_part = 7374 shader->key.part.tcs.ls->main_shader_part_ls; 7375 7376 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part, 7377 &shader->key.part.tcs.ls_prolog)) 7378 return false; 7379 7380 shader->previous_stage = ls_main_part; 7381 } 7382 7383 /* Get the epilog. 
*/ 7384 union si_shader_part_key epilog_key; 7385 memset(&epilog_key, 0, sizeof(epilog_key)); 7386 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 7387 7388 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, 7389 PIPE_SHADER_TESS_CTRL, false, 7390 &epilog_key, tm, debug, 7391 si_build_tcs_epilog_function, 7392 "Tessellation Control Shader Epilog"); 7393 return shader->epilog != NULL; 7394 } 7395 7396 /** 7397 * Select and compile (or reuse) GS parts (prolog). 7398 */ 7399 static bool si_shader_select_gs_parts(struct si_screen *sscreen, 7400 LLVMTargetMachineRef tm, 7401 struct si_shader *shader, 7402 struct pipe_debug_callback *debug) 7403 { 7404 if (sscreen->info.chip_class >= GFX9) { 7405 struct si_shader *es_main_part = 7406 shader->key.part.gs.es->main_shader_part_es; 7407 7408 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX && 7409 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part, 7410 &shader->key.part.gs.vs_prolog)) 7411 return false; 7412 7413 shader->previous_stage = es_main_part; 7414 } 7415 7416 if (!shader->key.part.gs.prolog.tri_strip_adj_fix) 7417 return true; 7418 7419 union si_shader_part_key prolog_key; 7420 memset(&prolog_key, 0, sizeof(prolog_key)); 7421 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 7422 7423 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, 7424 PIPE_SHADER_GEOMETRY, true, 7425 &prolog_key, tm, debug, 7426 si_build_gs_prolog_function, 7427 "Geometry Shader Prolog"); 7428 return shader->prolog2 != NULL; 7429 } 7430 7431 /** 7432 * Build the pixel shader prolog function. This handles: 7433 * - two-side color selection and interpolation 7434 * - overriding interpolation parameters for the API PS 7435 * - polygon stippling 7436 * 7437 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are 7438 * overriden by other states. (e.g. per-sample interpolation) 7439 * Interpolated colors are stored after the preloaded VGPRs. 
7440 */ 7441 static void si_build_ps_prolog_function(struct si_shader_context *ctx, 7442 union si_shader_part_key *key) 7443 { 7444 struct si_function_info fninfo; 7445 LLVMValueRef ret, func; 7446 int num_returns, i, num_color_channels; 7447 7448 assert(si_need_ps_prolog(key)); 7449 7450 si_init_function_info(&fninfo); 7451 7452 /* Declare inputs. */ 7453 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) 7454 add_arg(&fninfo, ARG_SGPR, ctx->i32); 7455 7456 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) 7457 add_arg(&fninfo, ARG_VGPR, ctx->f32); 7458 7459 /* Declare outputs (same as inputs + add colors if needed) */ 7460 num_returns = fninfo.num_params; 7461 num_color_channels = util_bitcount(key->ps_prolog.colors_read); 7462 for (i = 0; i < num_color_channels; i++) 7463 fninfo.types[num_returns++] = ctx->f32; 7464 7465 /* Create the function. */ 7466 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns, 7467 &fninfo, 0); 7468 func = ctx->main_fn; 7469 7470 /* Copy inputs to outputs. This should be no-op, as the registers match, 7471 * but it will prevent the compiler from overwriting them unintentionally. 7472 */ 7473 ret = ctx->return_value; 7474 for (i = 0; i < fninfo.num_params; i++) { 7475 LLVMValueRef p = LLVMGetParam(func, i); 7476 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); 7477 } 7478 7479 /* Polygon stippling. */ 7480 if (key->ps_prolog.states.poly_stipple) { 7481 /* POS_FIXED_PT is always last. 
*/ 7482 unsigned pos = key->ps_prolog.num_input_sgprs + 7483 key->ps_prolog.num_input_vgprs - 1; 7484 LLVMValueRef list = si_prolog_get_rw_buffers(ctx); 7485 7486 si_llvm_emit_polygon_stipple(ctx, list, pos); 7487 } 7488 7489 if (key->ps_prolog.states.bc_optimize_for_persp || 7490 key->ps_prolog.states.bc_optimize_for_linear) { 7491 unsigned i, base = key->ps_prolog.num_input_sgprs; 7492 LLVMValueRef center[2], centroid[2], tmp, bc_optimize; 7493 7494 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; 7495 * The hw doesn't compute CENTROID if the whole wave only 7496 * contains fully-covered quads. 7497 * 7498 * PRIM_MASK is after user SGPRs. 7499 */ 7500 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 7501 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, 7502 LLVMConstInt(ctx->i32, 31, 0), ""); 7503 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, 7504 ctx->i1, ""); 7505 7506 if (key->ps_prolog.states.bc_optimize_for_persp) { 7507 /* Read PERSP_CENTER. */ 7508 for (i = 0; i < 2; i++) 7509 center[i] = LLVMGetParam(func, base + 2 + i); 7510 /* Read PERSP_CENTROID. */ 7511 for (i = 0; i < 2; i++) 7512 centroid[i] = LLVMGetParam(func, base + 4 + i); 7513 /* Select PERSP_CENTROID. */ 7514 for (i = 0; i < 2; i++) { 7515 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, 7516 center[i], centroid[i], ""); 7517 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7518 tmp, base + 4 + i, ""); 7519 } 7520 } 7521 if (key->ps_prolog.states.bc_optimize_for_linear) { 7522 /* Read LINEAR_CENTER. */ 7523 for (i = 0; i < 2; i++) 7524 center[i] = LLVMGetParam(func, base + 8 + i); 7525 /* Read LINEAR_CENTROID. */ 7526 for (i = 0; i < 2; i++) 7527 centroid[i] = LLVMGetParam(func, base + 10 + i); 7528 /* Select LINEAR_CENTROID. 
*/ 7529 for (i = 0; i < 2; i++) { 7530 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, 7531 center[i], centroid[i], ""); 7532 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7533 tmp, base + 10 + i, ""); 7534 } 7535 } 7536 } 7537 7538 /* Force per-sample interpolation. */ 7539 if (key->ps_prolog.states.force_persp_sample_interp) { 7540 unsigned i, base = key->ps_prolog.num_input_sgprs; 7541 LLVMValueRef persp_sample[2]; 7542 7543 /* Read PERSP_SAMPLE. */ 7544 for (i = 0; i < 2; i++) 7545 persp_sample[i] = LLVMGetParam(func, base + i); 7546 /* Overwrite PERSP_CENTER. */ 7547 for (i = 0; i < 2; i++) 7548 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7549 persp_sample[i], base + 2 + i, ""); 7550 /* Overwrite PERSP_CENTROID. */ 7551 for (i = 0; i < 2; i++) 7552 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7553 persp_sample[i], base + 4 + i, ""); 7554 } 7555 if (key->ps_prolog.states.force_linear_sample_interp) { 7556 unsigned i, base = key->ps_prolog.num_input_sgprs; 7557 LLVMValueRef linear_sample[2]; 7558 7559 /* Read LINEAR_SAMPLE. */ 7560 for (i = 0; i < 2; i++) 7561 linear_sample[i] = LLVMGetParam(func, base + 6 + i); 7562 /* Overwrite LINEAR_CENTER. */ 7563 for (i = 0; i < 2; i++) 7564 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7565 linear_sample[i], base + 8 + i, ""); 7566 /* Overwrite LINEAR_CENTROID. */ 7567 for (i = 0; i < 2; i++) 7568 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7569 linear_sample[i], base + 10 + i, ""); 7570 } 7571 7572 /* Force center interpolation. */ 7573 if (key->ps_prolog.states.force_persp_center_interp) { 7574 unsigned i, base = key->ps_prolog.num_input_sgprs; 7575 LLVMValueRef persp_center[2]; 7576 7577 /* Read PERSP_CENTER. */ 7578 for (i = 0; i < 2; i++) 7579 persp_center[i] = LLVMGetParam(func, base + 2 + i); 7580 /* Overwrite PERSP_SAMPLE. */ 7581 for (i = 0; i < 2; i++) 7582 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7583 persp_center[i], base + i, ""); 7584 /* Overwrite PERSP_CENTROID. 
*/ 7585 for (i = 0; i < 2; i++) 7586 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7587 persp_center[i], base + 4 + i, ""); 7588 } 7589 if (key->ps_prolog.states.force_linear_center_interp) { 7590 unsigned i, base = key->ps_prolog.num_input_sgprs; 7591 LLVMValueRef linear_center[2]; 7592 7593 /* Read LINEAR_CENTER. */ 7594 for (i = 0; i < 2; i++) 7595 linear_center[i] = LLVMGetParam(func, base + 8 + i); 7596 /* Overwrite LINEAR_SAMPLE. */ 7597 for (i = 0; i < 2; i++) 7598 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7599 linear_center[i], base + 6 + i, ""); 7600 /* Overwrite LINEAR_CENTROID. */ 7601 for (i = 0; i < 2; i++) 7602 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, 7603 linear_center[i], base + 10 + i, ""); 7604 } 7605 7606 /* Interpolate colors. */ 7607 unsigned color_out_idx = 0; 7608 for (i = 0; i < 2; i++) { 7609 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; 7610 unsigned face_vgpr = key->ps_prolog.num_input_sgprs + 7611 key->ps_prolog.face_vgpr_index; 7612 LLVMValueRef interp[2], color[4]; 7613 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; 7614 7615 if (!writemask) 7616 continue; 7617 7618 /* If the interpolation qualifier is not CONSTANT (-1). */ 7619 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { 7620 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + 7621 key->ps_prolog.color_interp_vgpr_index[i]; 7622 7623 /* Get the (i,j) updated by bc_optimize handling. */ 7624 interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, 7625 interp_vgpr, ""); 7626 interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, 7627 interp_vgpr + 1, ""); 7628 interp_ij = lp_build_gather_values(&ctx->gallivm, interp, 2); 7629 } 7630 7631 /* Use the absolute location of the input. 
*/ 7632 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 7633 7634 if (key->ps_prolog.states.color_two_side) { 7635 face = LLVMGetParam(func, face_vgpr); 7636 face = ac_to_integer(&ctx->ac, face); 7637 } 7638 7639 interp_fs_input(ctx, 7640 key->ps_prolog.color_attr_index[i], 7641 TGSI_SEMANTIC_COLOR, i, 7642 key->ps_prolog.num_interp_inputs, 7643 key->ps_prolog.colors_read, interp_ij, 7644 prim_mask, face, color); 7645 7646 while (writemask) { 7647 unsigned chan = u_bit_scan(&writemask); 7648 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], 7649 fninfo.num_params + color_out_idx++, ""); 7650 } 7651 } 7652 7653 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec 7654 * says: 7655 * 7656 * "When per-sample shading is active due to the use of a fragment 7657 * input qualified by sample or due to the use of the gl_SampleID 7658 * or gl_SamplePosition variables, only the bit for the current 7659 * sample is set in gl_SampleMaskIn. When state specifies multiple 7660 * fragment shader invocations for a given fragment, the sample 7661 * mask for any single fragment shader invocation may specify a 7662 * subset of the covered samples for the fragment. In this case, 7663 * the bit corresponding to each covered sample will be set in 7664 * exactly one fragment shader invocation." 7665 * 7666 * The samplemask loaded by hardware is always the coverage of the 7667 * entire pixel/fragment, so mask bits out based on the sample ID. 7668 */ 7669 if (key->ps_prolog.states.samplemask_log_ps_iter) { 7670 /* The bit pattern matches that used by fixed function fragment 7671 * processing. 
*/ 7672 static const uint16_t ps_iter_masks[] = { 7673 0xffff, /* not used */ 7674 0x5555, 7675 0x1111, 7676 0x0101, 7677 0x0001, 7678 }; 7679 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); 7680 7681 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; 7682 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs + 7683 key->ps_prolog.ancillary_vgpr_index; 7684 LLVMValueRef sampleid = unpack_param(ctx, ancillary_vgpr, 8, 4); 7685 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1); 7686 7687 samplemask = ac_to_integer(&ctx->ac, samplemask); 7688 samplemask = LLVMBuildAnd( 7689 ctx->ac.builder, 7690 samplemask, 7691 LLVMBuildShl(ctx->ac.builder, 7692 LLVMConstInt(ctx->i32, ps_iter_mask, false), 7693 sampleid, ""), 7694 ""); 7695 samplemask = ac_to_float(&ctx->ac, samplemask); 7696 7697 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, 7698 ancillary_vgpr + 1, ""); 7699 } 7700 7701 /* Tell LLVM to insert WQM instruction sequence when needed. */ 7702 if (key->ps_prolog.wqm) { 7703 LLVMAddTargetDependentFunctionAttr(func, 7704 "amdgpu-ps-wqm-outputs", ""); 7705 } 7706 7707 si_llvm_build_ret(ctx, ret); 7708 } 7709 7710 /** 7711 * Build the pixel shader epilog function. This handles everything that must be 7712 * emulated for pixel shader exports. (alpha-test, format conversions, etc) 7713 */ 7714 static void si_build_ps_epilog_function(struct si_shader_context *ctx, 7715 union si_shader_part_key *key) 7716 { 7717 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7718 struct si_function_info fninfo; 7719 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 7720 int i; 7721 struct si_ps_exports exp = {}; 7722 7723 si_init_function_info(&fninfo); 7724 7725 /* Declare input SGPRs. 
*/ 7726 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64); 7727 ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64); 7728 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64); 7729 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64); 7730 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF); 7731 7732 /* Declare input VGPRs. */ 7733 unsigned required_num_params = 7734 fninfo.num_sgpr_params + 7735 util_bitcount(key->ps_epilog.colors_written) * 4 + 7736 key->ps_epilog.writes_z + 7737 key->ps_epilog.writes_stencil + 7738 key->ps_epilog.writes_samplemask; 7739 7740 required_num_params = MAX2(required_num_params, 7741 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 7742 7743 while (fninfo.num_params < required_num_params) 7744 add_arg(&fninfo, ARG_VGPR, ctx->f32); 7745 7746 /* Create the function. */ 7747 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0); 7748 /* Disable elimination of unused inputs. */ 7749 si_llvm_add_attribute(ctx->main_fn, 7750 "InitialPSInputAddr", 0xffffff); 7751 7752 /* Process colors. */ 7753 unsigned vgpr = fninfo.num_sgpr_params; 7754 unsigned colors_written = key->ps_epilog.colors_written; 7755 int last_color_export = -1; 7756 7757 /* Find the last color export. */ 7758 if (!key->ps_epilog.writes_z && 7759 !key->ps_epilog.writes_stencil && 7760 !key->ps_epilog.writes_samplemask) { 7761 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; 7762 7763 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 7764 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { 7765 /* Just set this if any of the colorbuffers are enabled. 
*/ 7766 if (spi_format & 7767 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) 7768 last_color_export = 0; 7769 } else { 7770 for (i = 0; i < 8; i++) 7771 if (colors_written & (1 << i) && 7772 (spi_format >> (i * 4)) & 0xf) 7773 last_color_export = i; 7774 } 7775 } 7776 7777 while (colors_written) { 7778 LLVMValueRef color[4]; 7779 int mrt = u_bit_scan(&colors_written); 7780 7781 for (i = 0; i < 4; i++) 7782 color[i] = LLVMGetParam(ctx->main_fn, vgpr++); 7783 7784 si_export_mrt_color(bld_base, color, mrt, 7785 fninfo.num_params - 1, 7786 mrt == last_color_export, &exp); 7787 } 7788 7789 /* Process depth, stencil, samplemask. */ 7790 if (key->ps_epilog.writes_z) 7791 depth = LLVMGetParam(ctx->main_fn, vgpr++); 7792 if (key->ps_epilog.writes_stencil) 7793 stencil = LLVMGetParam(ctx->main_fn, vgpr++); 7794 if (key->ps_epilog.writes_samplemask) 7795 samplemask = LLVMGetParam(ctx->main_fn, vgpr++); 7796 7797 if (depth || stencil || samplemask) 7798 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); 7799 else if (last_color_export == -1) 7800 si_export_null(bld_base); 7801 7802 if (exp.num) 7803 si_emit_ps_exports(ctx, &exp); 7804 7805 /* Compile. */ 7806 LLVMBuildRetVoid(ctx->ac.builder); 7807 } 7808 7809 /** 7810 * Select and compile (or reuse) pixel shader parts (prolog & epilog). 7811 */ 7812 static bool si_shader_select_ps_parts(struct si_screen *sscreen, 7813 LLVMTargetMachineRef tm, 7814 struct si_shader *shader, 7815 struct pipe_debug_callback *debug) 7816 { 7817 union si_shader_part_key prolog_key; 7818 union si_shader_part_key epilog_key; 7819 7820 /* Get the prolog. */ 7821 si_get_ps_prolog_key(shader, &prolog_key, true); 7822 7823 /* The prolog is a no-op if these aren't set. 
*/ 7824 if (si_need_ps_prolog(&prolog_key)) { 7825 shader->prolog = 7826 si_get_shader_part(sscreen, &sscreen->ps_prologs, 7827 PIPE_SHADER_FRAGMENT, true, 7828 &prolog_key, tm, debug, 7829 si_build_ps_prolog_function, 7830 "Fragment Shader Prolog"); 7831 if (!shader->prolog) 7832 return false; 7833 } 7834 7835 /* Get the epilog. */ 7836 si_get_ps_epilog_key(shader, &epilog_key); 7837 7838 shader->epilog = 7839 si_get_shader_part(sscreen, &sscreen->ps_epilogs, 7840 PIPE_SHADER_FRAGMENT, false, 7841 &epilog_key, tm, debug, 7842 si_build_ps_epilog_function, 7843 "Fragment Shader Epilog"); 7844 if (!shader->epilog) 7845 return false; 7846 7847 /* Enable POS_FIXED_PT if polygon stippling is enabled. */ 7848 if (shader->key.part.ps.prolog.poly_stipple) { 7849 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); 7850 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); 7851 } 7852 7853 /* Set up the enable bits for per-sample shading if needed. */ 7854 if (shader->key.part.ps.prolog.force_persp_sample_interp && 7855 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || 7856 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7857 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; 7858 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 7859 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 7860 } 7861 if (shader->key.part.ps.prolog.force_linear_sample_interp && 7862 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || 7863 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7864 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; 7865 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 7866 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 7867 } 7868 if (shader->key.part.ps.prolog.force_persp_center_interp && 7869 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 7870 
G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7871 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; 7872 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 7873 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 7874 } 7875 if (shader->key.part.ps.prolog.force_linear_center_interp && 7876 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 7877 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 7878 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; 7879 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 7880 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 7881 } 7882 7883 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ 7884 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && 7885 !(shader->config.spi_ps_input_ena & 0xf)) { 7886 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 7887 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); 7888 } 7889 7890 /* At least one pair of interpolation weights must be enabled. */ 7891 if (!(shader->config.spi_ps_input_ena & 0x7f)) { 7892 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 7893 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); 7894 } 7895 7896 /* Samplemask fixup requires the sample ID. */ 7897 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { 7898 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); 7899 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); 7900 } 7901 7902 /* The sample mask input is always enabled, because the API shader always 7903 * passes it through to the epilog. Disable it here if it's unused. 
7904 */ 7905 if (!shader->key.part.ps.epilog.poly_line_smoothing && 7906 !shader->selector->info.reads_samplemask) 7907 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; 7908 7909 return true; 7910 } 7911 7912 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, 7913 unsigned *lds_size) 7914 { 7915 /* SPI barrier management bug: 7916 * Make sure we have at least 4k of LDS in use to avoid the bug. 7917 * It applies to workgroup sizes of more than one wavefront. 7918 */ 7919 if (sscreen->info.family == CHIP_BONAIRE || 7920 sscreen->info.family == CHIP_KABINI || 7921 sscreen->info.family == CHIP_MULLINS) 7922 *lds_size = MAX2(*lds_size, 8); 7923 } 7924 7925 static void si_fix_resource_usage(struct si_screen *sscreen, 7926 struct si_shader *shader) 7927 { 7928 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ 7929 7930 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); 7931 7932 if (shader->selector->type == PIPE_SHADER_COMPUTE && 7933 si_get_max_workgroup_size(shader) > 64) { 7934 si_multiwave_lds_size_workaround(sscreen, 7935 &shader->config.lds_size); 7936 } 7937 } 7938 7939 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, 7940 struct si_shader *shader, 7941 struct pipe_debug_callback *debug) 7942 { 7943 struct si_shader_selector *sel = shader->selector; 7944 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); 7945 int r; 7946 7947 /* LS, ES, VS are compiled on demand if the main part hasn't been 7948 * compiled for that stage. 7949 * 7950 * Vertex shaders are compiled on demand when a vertex fetch 7951 * workaround must be applied. 7952 */ 7953 if (shader->is_monolithic) { 7954 /* Monolithic shader (compiled as a whole, has many variants, 7955 * may take a long time to compile). 
7956 */ 7957 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug); 7958 if (r) 7959 return r; 7960 } else { 7961 /* The shader consists of several parts: 7962 * 7963 * - the middle part is the user shader, it has 1 variant only 7964 * and it was compiled during the creation of the shader 7965 * selector 7966 * - the prolog part is inserted at the beginning 7967 * - the epilog part is inserted at the end 7968 * 7969 * The prolog and epilog have many (but simple) variants. 7970 * 7971 * Starting with gfx9, geometry and tessellation control 7972 * shaders also contain the prolog and user shader parts of 7973 * the previous shader stage. 7974 */ 7975 7976 if (!mainp) 7977 return -1; 7978 7979 /* Copy the compiled TGSI shader data over. */ 7980 shader->is_binary_shared = true; 7981 shader->binary = mainp->binary; 7982 shader->config = mainp->config; 7983 shader->info.num_input_sgprs = mainp->info.num_input_sgprs; 7984 shader->info.num_input_vgprs = mainp->info.num_input_vgprs; 7985 shader->info.face_vgpr_index = mainp->info.face_vgpr_index; 7986 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; 7987 memcpy(shader->info.vs_output_param_offset, 7988 mainp->info.vs_output_param_offset, 7989 sizeof(mainp->info.vs_output_param_offset)); 7990 shader->info.uses_instanceid = mainp->info.uses_instanceid; 7991 shader->info.nr_pos_exports = mainp->info.nr_pos_exports; 7992 shader->info.nr_param_exports = mainp->info.nr_param_exports; 7993 7994 /* Select prologs and/or epilogs. 
*/ 7995 switch (sel->type) { 7996 case PIPE_SHADER_VERTEX: 7997 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug)) 7998 return -1; 7999 break; 8000 case PIPE_SHADER_TESS_CTRL: 8001 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug)) 8002 return -1; 8003 break; 8004 case PIPE_SHADER_TESS_EVAL: 8005 break; 8006 case PIPE_SHADER_GEOMETRY: 8007 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug)) 8008 return -1; 8009 break; 8010 case PIPE_SHADER_FRAGMENT: 8011 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) 8012 return -1; 8013 8014 /* Make sure we have at least as many VGPRs as there 8015 * are allocated inputs. 8016 */ 8017 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8018 shader->info.num_input_vgprs); 8019 break; 8020 } 8021 8022 /* Update SGPR and VGPR counts. */ 8023 if (shader->prolog) { 8024 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8025 shader->prolog->config.num_sgprs); 8026 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8027 shader->prolog->config.num_vgprs); 8028 } 8029 if (shader->previous_stage) { 8030 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8031 shader->previous_stage->config.num_sgprs); 8032 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8033 shader->previous_stage->config.num_vgprs); 8034 shader->config.spilled_sgprs = 8035 MAX2(shader->config.spilled_sgprs, 8036 shader->previous_stage->config.spilled_sgprs); 8037 shader->config.spilled_vgprs = 8038 MAX2(shader->config.spilled_vgprs, 8039 shader->previous_stage->config.spilled_vgprs); 8040 shader->config.private_mem_vgprs = 8041 MAX2(shader->config.private_mem_vgprs, 8042 shader->previous_stage->config.private_mem_vgprs); 8043 shader->config.scratch_bytes_per_wave = 8044 MAX2(shader->config.scratch_bytes_per_wave, 8045 shader->previous_stage->config.scratch_bytes_per_wave); 8046 shader->info.uses_instanceid |= 8047 shader->previous_stage->info.uses_instanceid; 8048 } 8049 if (shader->prolog2) { 
8050 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8051 shader->prolog2->config.num_sgprs); 8052 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8053 shader->prolog2->config.num_vgprs); 8054 } 8055 if (shader->epilog) { 8056 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8057 shader->epilog->config.num_sgprs); 8058 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8059 shader->epilog->config.num_vgprs); 8060 } 8061 } 8062 8063 si_fix_resource_usage(sscreen, shader); 8064 si_shader_dump(sscreen, shader, debug, sel->info.processor, 8065 stderr, true); 8066 8067 /* Upload. */ 8068 r = si_shader_binary_upload(sscreen, shader); 8069 if (r) { 8070 fprintf(stderr, "LLVM failed to upload shader\n"); 8071 return r; 8072 } 8073 8074 return 0; 8075 } 8076 8077 void si_shader_destroy(struct si_shader *shader) 8078 { 8079 if (shader->scratch_bo) 8080 r600_resource_reference(&shader->scratch_bo, NULL); 8081 8082 r600_resource_reference(&shader->bo, NULL); 8083 8084 if (!shader->is_binary_shared) 8085 ac_shader_binary_clean(&shader->binary); 8086 8087 free(shader->shader_log); 8088 } 8089