1 /* 2 * Copyright 2012 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 * 23 * Authors: 24 * Tom Stellard <thomas.stellard (at) amd.com> 25 * Michel Dnzer <michel.daenzer (at) amd.com> 26 * Christian Knig <christian.koenig (at) amd.com> 27 */ 28 29 #include "gallivm/lp_bld_const.h" 30 #include "gallivm/lp_bld_gather.h" 31 #include "gallivm/lp_bld_intr.h" 32 #include "gallivm/lp_bld_logic.h" 33 #include "gallivm/lp_bld_arit.h" 34 #include "gallivm/lp_bld_flow.h" 35 #include "gallivm/lp_bld_misc.h" 36 #include "radeon/radeon_elf_util.h" 37 #include "util/u_memory.h" 38 #include "util/u_string.h" 39 #include "tgsi/tgsi_build.h" 40 #include "tgsi/tgsi_util.h" 41 #include "tgsi/tgsi_dump.h" 42 43 #include "ac_llvm_util.h" 44 #include "si_shader_internal.h" 45 #include "si_pipe.h" 46 #include "sid.h" 47 48 49 static const char *scratch_rsrc_dword0_symbol = 50 "SCRATCH_RSRC_DWORD0"; 51 52 static const char *scratch_rsrc_dword1_symbol = 53 "SCRATCH_RSRC_DWORD1"; 54 55 struct si_shader_output_values 56 { 57 LLVMValueRef values[4]; 58 unsigned semantic_name; 59 unsigned semantic_index; 60 ubyte vertex_stream[4]; 61 }; 62 63 static void si_init_shader_ctx(struct si_shader_context *ctx, 64 struct si_screen *sscreen, 65 struct si_shader *shader, 66 LLVMTargetMachineRef tm); 67 68 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 69 struct lp_build_tgsi_context *bld_base, 70 struct lp_build_emit_data *emit_data); 71 72 static void si_dump_shader_key(unsigned shader, struct si_shader_key *key, 73 FILE *f); 74 75 static void si_build_vs_prolog_function(struct si_shader_context *ctx, 76 union si_shader_part_key *key); 77 static void si_build_vs_epilog_function(struct si_shader_context *ctx, 78 union si_shader_part_key *key); 79 static void si_build_tcs_epilog_function(struct si_shader_context *ctx, 80 union si_shader_part_key *key); 81 static void si_build_ps_prolog_function(struct si_shader_context *ctx, 82 union si_shader_part_key *key); 83 static void si_build_ps_epilog_function(struct si_shader_context *ctx, 84 union 
si_shader_part_key *key); 85 86 /* Ideally pass the sample mask input to the PS epilog as v13, which 87 * is its usual location, so that the shader doesn't have to add v_mov. 88 */ 89 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13 90 91 /* The VS location of the PrimitiveID input is the same in the epilog, 92 * so that the main shader part doesn't have to move it. 93 */ 94 #define VS_EPILOG_PRIMID_LOC 2 95 96 enum { 97 CONST_ADDR_SPACE = 2, 98 LOCAL_ADDR_SPACE = 3, 99 }; 100 101 #define SENDMSG_GS 2 102 #define SENDMSG_GS_DONE 3 103 104 #define SENDMSG_GS_OP_NOP (0 << 4) 105 #define SENDMSG_GS_OP_CUT (1 << 4) 106 #define SENDMSG_GS_OP_EMIT (2 << 4) 107 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4) 108 109 /** 110 * Returns a unique index for a semantic name and index. The index must be 111 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be 112 * calculated. 113 */ 114 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) 115 { 116 switch (semantic_name) { 117 case TGSI_SEMANTIC_POSITION: 118 return 0; 119 case TGSI_SEMANTIC_PSIZE: 120 return 1; 121 case TGSI_SEMANTIC_CLIPDIST: 122 assert(index <= 1); 123 return 2 + index; 124 case TGSI_SEMANTIC_GENERIC: 125 if (index <= 63-4) 126 return 4 + index; 127 128 assert(!"invalid generic index"); 129 return 0; 130 131 /* patch indices are completely separate and thus start from 0 */ 132 case TGSI_SEMANTIC_TESSOUTER: 133 return 0; 134 case TGSI_SEMANTIC_TESSINNER: 135 return 1; 136 case TGSI_SEMANTIC_PATCH: 137 return 2 + index; 138 139 default: 140 assert(!"invalid semantic name"); 141 return 0; 142 } 143 } 144 145 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index) 146 { 147 switch (name) { 148 case TGSI_SEMANTIC_FOG: 149 return 0; 150 case TGSI_SEMANTIC_LAYER: 151 return 1; 152 case TGSI_SEMANTIC_VIEWPORT_INDEX: 153 return 2; 154 case TGSI_SEMANTIC_PRIMID: 155 return 3; 156 case TGSI_SEMANTIC_COLOR: /* these alias */ 157 case TGSI_SEMANTIC_BCOLOR: 158 return 4 + 
index; 159 case TGSI_SEMANTIC_TEXCOORD: 160 return 6 + index; 161 default: 162 assert(!"invalid semantic name"); 163 return 0; 164 } 165 } 166 167 /** 168 * Get the value of a shader input parameter and extract a bitfield. 169 */ 170 static LLVMValueRef unpack_param(struct si_shader_context *ctx, 171 unsigned param, unsigned rshift, 172 unsigned bitwidth) 173 { 174 struct gallivm_state *gallivm = &ctx->gallivm; 175 LLVMValueRef value = LLVMGetParam(ctx->main_fn, 176 param); 177 178 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) 179 value = bitcast(&ctx->bld_base, 180 TGSI_TYPE_UNSIGNED, value); 181 182 if (rshift) 183 value = LLVMBuildLShr(gallivm->builder, value, 184 lp_build_const_int32(gallivm, rshift), ""); 185 186 if (rshift + bitwidth < 32) { 187 unsigned mask = (1 << bitwidth) - 1; 188 value = LLVMBuildAnd(gallivm->builder, value, 189 lp_build_const_int32(gallivm, mask), ""); 190 } 191 192 return value; 193 } 194 195 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) 196 { 197 switch (ctx->type) { 198 case PIPE_SHADER_TESS_CTRL: 199 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8); 200 201 case PIPE_SHADER_TESS_EVAL: 202 return LLVMGetParam(ctx->main_fn, 203 ctx->param_tes_rel_patch_id); 204 205 default: 206 assert(0); 207 return NULL; 208 } 209 } 210 211 /* Tessellation shaders pass outputs to the next shader using LDS. 212 * 213 * LS outputs = TCS inputs 214 * TCS outputs = TES inputs 215 * 216 * The LDS layout is: 217 * - TCS inputs for patch 0 218 * - TCS inputs for patch 1 219 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) 220 * - ... 
221 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset 222 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset 223 * - TCS outputs for patch 1 224 * - Per-patch TCS outputs for patch 1 225 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) 226 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) 227 * - ... 228 * 229 * All three shaders VS(LS), TCS, TES share the same LDS space. 230 */ 231 232 static LLVMValueRef 233 get_tcs_in_patch_stride(struct si_shader_context *ctx) 234 { 235 if (ctx->type == PIPE_SHADER_VERTEX) 236 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13); 237 else if (ctx->type == PIPE_SHADER_TESS_CTRL) 238 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); 239 else { 240 assert(0); 241 return NULL; 242 } 243 } 244 245 static LLVMValueRef 246 get_tcs_out_patch_stride(struct si_shader_context *ctx) 247 { 248 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); 249 } 250 251 static LLVMValueRef 252 get_tcs_out_patch0_offset(struct si_shader_context *ctx) 253 { 254 return lp_build_mul_imm(&ctx->bld_base.uint_bld, 255 unpack_param(ctx, 256 SI_PARAM_TCS_OUT_OFFSETS, 257 0, 16), 258 4); 259 } 260 261 static LLVMValueRef 262 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) 263 { 264 return lp_build_mul_imm(&ctx->bld_base.uint_bld, 265 unpack_param(ctx, 266 SI_PARAM_TCS_OUT_OFFSETS, 267 16, 16), 268 4); 269 } 270 271 static LLVMValueRef 272 get_tcs_in_current_patch_offset(struct si_shader_context *ctx) 273 { 274 struct gallivm_state *gallivm = &ctx->gallivm; 275 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); 276 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 277 278 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, ""); 279 } 280 281 static LLVMValueRef 282 get_tcs_out_current_patch_offset(struct si_shader_context *ctx) 283 { 284 struct gallivm_state *gallivm = &ctx->gallivm; 285 
LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); 286 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); 287 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 288 289 return LLVMBuildAdd(gallivm->builder, patch0_offset, 290 LLVMBuildMul(gallivm->builder, patch_stride, 291 rel_patch_id, ""), 292 ""); 293 } 294 295 static LLVMValueRef 296 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) 297 { 298 struct gallivm_state *gallivm = &ctx->gallivm; 299 LLVMValueRef patch0_patch_data_offset = 300 get_tcs_out_patch0_patch_data_offset(ctx); 301 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); 302 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); 303 304 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, 305 LLVMBuildMul(gallivm->builder, patch_stride, 306 rel_patch_id, ""), 307 ""); 308 } 309 310 static LLVMValueRef build_gep0(struct si_shader_context *ctx, 311 LLVMValueRef base_ptr, LLVMValueRef index) 312 { 313 LLVMValueRef indices[2] = { 314 LLVMConstInt(ctx->i32, 0, 0), 315 index, 316 }; 317 return LLVMBuildGEP(ctx->gallivm.builder, base_ptr, 318 indices, 2, ""); 319 } 320 321 static void build_indexed_store(struct si_shader_context *ctx, 322 LLVMValueRef base_ptr, LLVMValueRef index, 323 LLVMValueRef value) 324 { 325 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 326 struct gallivm_state *gallivm = bld_base->base.gallivm; 327 328 LLVMBuildStore(gallivm->builder, value, 329 build_gep0(ctx, base_ptr, index)); 330 } 331 332 /** 333 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. 334 * It's equivalent to doing a load from &base_ptr[index]. 335 * 336 * \param base_ptr Where the array starts. 337 * \param index The element index into the array. 
338 * \param uniform Whether the base_ptr and index can be assumed to be 339 * dynamically uniform 340 */ 341 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx, 342 LLVMValueRef base_ptr, LLVMValueRef index, 343 bool uniform) 344 { 345 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 346 struct gallivm_state *gallivm = bld_base->base.gallivm; 347 LLVMValueRef pointer; 348 349 pointer = build_gep0(ctx, base_ptr, index); 350 if (uniform) 351 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); 352 return LLVMBuildLoad(gallivm->builder, pointer, ""); 353 } 354 355 /** 356 * Do a load from &base_ptr[index], but also add a flag that it's loading 357 * a constant from a dynamically uniform index. 358 */ 359 static LLVMValueRef build_indexed_load_const( 360 struct si_shader_context *ctx, 361 LLVMValueRef base_ptr, LLVMValueRef index) 362 { 363 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true); 364 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); 365 return result; 366 } 367 368 static LLVMValueRef get_instance_index_for_fetch( 369 struct si_shader_context *radeon_bld, 370 unsigned param_start_instance, unsigned divisor) 371 { 372 struct si_shader_context *ctx = 373 si_shader_context(&radeon_bld->bld_base); 374 struct gallivm_state *gallivm = radeon_bld->bld_base.base.gallivm; 375 376 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn, 377 ctx->param_instance_id); 378 379 /* The division must be done before START_INSTANCE is added. 
*/ 380 if (divisor > 1) 381 result = LLVMBuildUDiv(gallivm->builder, result, 382 lp_build_const_int32(gallivm, divisor), ""); 383 384 return LLVMBuildAdd(gallivm->builder, result, 385 LLVMGetParam(radeon_bld->main_fn, param_start_instance), ""); 386 } 387 388 static void declare_input_vs( 389 struct si_shader_context *ctx, 390 unsigned input_index, 391 const struct tgsi_full_declaration *decl, 392 LLVMValueRef out[4]) 393 { 394 struct lp_build_context *base = &ctx->bld_base.base; 395 struct gallivm_state *gallivm = base->gallivm; 396 397 unsigned chan; 398 unsigned fix_fetch; 399 400 LLVMValueRef t_list_ptr; 401 LLVMValueRef t_offset; 402 LLVMValueRef t_list; 403 LLVMValueRef attribute_offset; 404 LLVMValueRef buffer_index; 405 LLVMValueRef args[3]; 406 LLVMValueRef input; 407 408 /* Load the T list */ 409 t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS); 410 411 t_offset = lp_build_const_int32(gallivm, input_index); 412 413 t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset); 414 415 /* Build the attribute offset */ 416 attribute_offset = lp_build_const_int32(gallivm, 0); 417 418 buffer_index = LLVMGetParam(ctx->main_fn, 419 ctx->param_vertex_index0 + 420 input_index); 421 422 args[0] = t_list; 423 args[1] = attribute_offset; 424 args[2] = buffer_index; 425 input = lp_build_intrinsic(gallivm->builder, 426 "llvm.SI.vs.load.input", ctx->v4f32, args, 3, 427 LP_FUNC_ATTR_READNONE); 428 429 /* Break up the vec4 into individual components */ 430 for (chan = 0; chan < 4; chan++) { 431 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); 432 out[chan] = LLVMBuildExtractElement(gallivm->builder, 433 input, llvm_chan, ""); 434 } 435 436 fix_fetch = (ctx->shader->key.mono.vs.fix_fetch >> (4 * input_index)) & 0xf; 437 438 switch (fix_fetch) { 439 case SI_FIX_FETCH_A2_SNORM: 440 case SI_FIX_FETCH_A2_SSCALED: 441 case SI_FIX_FETCH_A2_SINT: { 442 /* The hardware returns an unsigned value; convert it to a 443 * signed one. 
444 */ 445 LLVMValueRef tmp = out[3]; 446 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); 447 448 /* First, recover the sign-extended signed integer value. */ 449 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) 450 tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, ""); 451 else 452 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, ""); 453 454 /* For the integer-like cases, do a natural sign extension. 455 * 456 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 457 * and happen to contain 0, 1, 2, 3 as the two LSBs of the 458 * exponent. 459 */ 460 tmp = LLVMBuildShl(gallivm->builder, tmp, 461 fix_fetch == SI_FIX_FETCH_A2_SNORM ? 462 LLVMConstInt(ctx->i32, 7, 0) : c30, ""); 463 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, ""); 464 465 /* Convert back to the right type. */ 466 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { 467 LLVMValueRef clamp; 468 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); 469 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, ""); 470 clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, ""); 471 tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, ""); 472 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { 473 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, ""); 474 } 475 476 out[3] = tmp; 477 break; 478 } 479 case SI_FIX_FETCH_RGBA_32_UNORM: 480 case SI_FIX_FETCH_RGBX_32_UNORM: 481 for (chan = 0; chan < 4; chan++) { 482 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], 483 ctx->i32, ""); 484 out[chan] = LLVMBuildUIToFP(gallivm->builder, 485 out[chan], ctx->f32, ""); 486 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan], 487 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); 488 } 489 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. 
*/ 490 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM) 491 out[3] = LLVMConstReal(ctx->f32, 1); 492 break; 493 case SI_FIX_FETCH_RGBA_32_SNORM: 494 case SI_FIX_FETCH_RGBX_32_SNORM: 495 case SI_FIX_FETCH_RGBA_32_FIXED: 496 case SI_FIX_FETCH_RGBX_32_FIXED: { 497 double scale; 498 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED) 499 scale = 1.0 / 0x10000; 500 else 501 scale = 1.0 / INT_MAX; 502 503 for (chan = 0; chan < 4; chan++) { 504 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], 505 ctx->i32, ""); 506 out[chan] = LLVMBuildSIToFP(gallivm->builder, 507 out[chan], ctx->f32, ""); 508 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan], 509 LLVMConstReal(ctx->f32, scale), ""); 510 } 511 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ 512 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM || 513 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED) 514 out[3] = LLVMConstReal(ctx->f32, 1); 515 break; 516 } 517 case SI_FIX_FETCH_RGBA_32_USCALED: 518 for (chan = 0; chan < 4; chan++) { 519 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], 520 ctx->i32, ""); 521 out[chan] = LLVMBuildUIToFP(gallivm->builder, 522 out[chan], ctx->f32, ""); 523 } 524 break; 525 case SI_FIX_FETCH_RGBA_32_SSCALED: 526 for (chan = 0; chan < 4; chan++) { 527 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan], 528 ctx->i32, ""); 529 out[chan] = LLVMBuildSIToFP(gallivm->builder, 530 out[chan], ctx->f32, ""); 531 } 532 break; 533 } 534 } 535 536 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, 537 unsigned swizzle) 538 { 539 struct si_shader_context *ctx = si_shader_context(bld_base); 540 541 if (swizzle > 0) 542 return bld_base->uint_bld.zero; 543 544 switch (ctx->type) { 545 case PIPE_SHADER_VERTEX: 546 return LLVMGetParam(ctx->main_fn, 547 ctx->param_vs_prim_id); 548 case PIPE_SHADER_TESS_CTRL: 549 return LLVMGetParam(ctx->main_fn, 550 SI_PARAM_PATCH_ID); 551 case PIPE_SHADER_TESS_EVAL: 552 return LLVMGetParam(ctx->main_fn, 553 
ctx->param_tes_patch_id); 554 case PIPE_SHADER_GEOMETRY: 555 return LLVMGetParam(ctx->main_fn, 556 SI_PARAM_PRIMITIVE_ID); 557 default: 558 assert(0); 559 return bld_base->uint_bld.zero; 560 } 561 } 562 563 /** 564 * Return the value of tgsi_ind_register for indexing. 565 * This is the indirect index with the constant offset added to it. 566 */ 567 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx, 568 const struct tgsi_ind_register *ind, 569 int rel_index) 570 { 571 struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; 572 LLVMValueRef result; 573 574 result = ctx->addrs[ind->Index][ind->Swizzle]; 575 result = LLVMBuildLoad(gallivm->builder, result, ""); 576 result = LLVMBuildAdd(gallivm->builder, result, 577 lp_build_const_int32(gallivm, rel_index), ""); 578 return result; 579 } 580 581 /** 582 * Like get_indirect_index, but restricts the return value to a (possibly 583 * undefined) value inside [0..num). 584 */ 585 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx, 586 const struct tgsi_ind_register *ind, 587 int rel_index, unsigned num) 588 { 589 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index); 590 591 /* LLVM 3.8: If indirect resource indexing is used: 592 * - SI & CIK hang 593 * - VI crashes 594 */ 595 if (HAVE_LLVM <= 0x0308) 596 return LLVMGetUndef(ctx->i32); 597 598 return si_llvm_bound_index(ctx, result, num); 599 } 600 601 602 /** 603 * Calculate a dword address given an input or output register and a stride. 
604 */ 605 static LLVMValueRef get_dw_address(struct si_shader_context *ctx, 606 const struct tgsi_full_dst_register *dst, 607 const struct tgsi_full_src_register *src, 608 LLVMValueRef vertex_dw_stride, 609 LLVMValueRef base_addr) 610 { 611 struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; 612 struct tgsi_shader_info *info = &ctx->shader->selector->info; 613 ubyte *name, *index, *array_first; 614 int first, param; 615 struct tgsi_full_dst_register reg; 616 617 /* Set the register description. The address computation is the same 618 * for sources and destinations. */ 619 if (src) { 620 reg.Register.File = src->Register.File; 621 reg.Register.Index = src->Register.Index; 622 reg.Register.Indirect = src->Register.Indirect; 623 reg.Register.Dimension = src->Register.Dimension; 624 reg.Indirect = src->Indirect; 625 reg.Dimension = src->Dimension; 626 reg.DimIndirect = src->DimIndirect; 627 } else 628 reg = *dst; 629 630 /* If the register is 2-dimensional (e.g. an array of vertices 631 * in a primitive), calculate the base address of the vertex. */ 632 if (reg.Register.Dimension) { 633 LLVMValueRef index; 634 635 if (reg.Dimension.Indirect) 636 index = get_indirect_index(ctx, ®.DimIndirect, 637 reg.Dimension.Index); 638 else 639 index = lp_build_const_int32(gallivm, reg.Dimension.Index); 640 641 base_addr = LLVMBuildAdd(gallivm->builder, base_addr, 642 LLVMBuildMul(gallivm->builder, index, 643 vertex_dw_stride, ""), ""); 644 } 645 646 /* Get information about the register. */ 647 if (reg.Register.File == TGSI_FILE_INPUT) { 648 name = info->input_semantic_name; 649 index = info->input_semantic_index; 650 array_first = info->input_array_first; 651 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 652 name = info->output_semantic_name; 653 index = info->output_semantic_index; 654 array_first = info->output_array_first; 655 } else { 656 assert(0); 657 return NULL; 658 } 659 660 if (reg.Register.Indirect) { 661 /* Add the relative address of the element. 
*/ 662 LLVMValueRef ind_index; 663 664 if (reg.Indirect.ArrayID) 665 first = array_first[reg.Indirect.ArrayID]; 666 else 667 first = reg.Register.Index; 668 669 ind_index = get_indirect_index(ctx, ®.Indirect, 670 reg.Register.Index - first); 671 672 base_addr = LLVMBuildAdd(gallivm->builder, base_addr, 673 LLVMBuildMul(gallivm->builder, ind_index, 674 lp_build_const_int32(gallivm, 4), ""), ""); 675 676 param = si_shader_io_get_unique_index(name[first], index[first]); 677 } else { 678 param = si_shader_io_get_unique_index(name[reg.Register.Index], 679 index[reg.Register.Index]); 680 } 681 682 /* Add the base address of the element. */ 683 return LLVMBuildAdd(gallivm->builder, base_addr, 684 lp_build_const_int32(gallivm, param * 4), ""); 685 } 686 687 /* The offchip buffer layout for TCS->TES is 688 * 689 * - attribute 0 of patch 0 vertex 0 690 * - attribute 0 of patch 0 vertex 1 691 * - attribute 0 of patch 0 vertex 2 692 * ... 693 * - attribute 0 of patch 1 vertex 0 694 * - attribute 0 of patch 1 vertex 1 695 * ... 696 * - attribute 1 of patch 0 vertex 0 697 * - attribute 1 of patch 0 vertex 1 698 * ... 699 * - per patch attribute 0 of patch 0 700 * - per patch attribute 0 of patch 1 701 * ... 702 * 703 * Note that every attribute has 4 components. 
704 */ 705 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, 706 LLVMValueRef vertex_index, 707 LLVMValueRef param_index) 708 { 709 struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; 710 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; 711 LLVMValueRef param_stride, constant16; 712 713 vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6); 714 num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9); 715 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch, 716 num_patches, ""); 717 718 constant16 = lp_build_const_int32(gallivm, 16); 719 if (vertex_index) { 720 base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx), 721 vertices_per_patch, ""); 722 723 base_addr = LLVMBuildAdd(gallivm->builder, base_addr, 724 vertex_index, ""); 725 726 param_stride = total_vertices; 727 } else { 728 base_addr = get_rel_patch_id(ctx); 729 param_stride = num_patches; 730 } 731 732 base_addr = LLVMBuildAdd(gallivm->builder, base_addr, 733 LLVMBuildMul(gallivm->builder, param_index, 734 param_stride, ""), ""); 735 736 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, ""); 737 738 if (!vertex_index) { 739 LLVMValueRef patch_data_offset = 740 unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16); 741 742 base_addr = LLVMBuildAdd(gallivm->builder, base_addr, 743 patch_data_offset, ""); 744 } 745 return base_addr; 746 } 747 748 static LLVMValueRef get_tcs_tes_buffer_address_from_reg( 749 struct si_shader_context *ctx, 750 const struct tgsi_full_dst_register *dst, 751 const struct tgsi_full_src_register *src) 752 { 753 struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; 754 struct tgsi_shader_info *info = &ctx->shader->selector->info; 755 ubyte *name, *index, *array_first; 756 struct tgsi_full_src_register reg; 757 LLVMValueRef vertex_index = NULL; 758 LLVMValueRef param_index = NULL; 759 unsigned param_index_base, param_base; 760 761 reg = src ? 
*src : tgsi_full_src_register_from_dst(dst); 762 763 if (reg.Register.Dimension) { 764 765 if (reg.Dimension.Indirect) 766 vertex_index = get_indirect_index(ctx, ®.DimIndirect, 767 reg.Dimension.Index); 768 else 769 vertex_index = lp_build_const_int32(gallivm, 770 reg.Dimension.Index); 771 } 772 773 /* Get information about the register. */ 774 if (reg.Register.File == TGSI_FILE_INPUT) { 775 name = info->input_semantic_name; 776 index = info->input_semantic_index; 777 array_first = info->input_array_first; 778 } else if (reg.Register.File == TGSI_FILE_OUTPUT) { 779 name = info->output_semantic_name; 780 index = info->output_semantic_index; 781 array_first = info->output_array_first; 782 } else { 783 assert(0); 784 return NULL; 785 } 786 787 if (reg.Register.Indirect) { 788 if (reg.Indirect.ArrayID) 789 param_base = array_first[reg.Indirect.ArrayID]; 790 else 791 param_base = reg.Register.Index; 792 793 param_index = get_indirect_index(ctx, ®.Indirect, 794 reg.Register.Index - param_base); 795 796 } else { 797 param_base = reg.Register.Index; 798 param_index = lp_build_const_int32(gallivm, 0); 799 } 800 801 param_index_base = si_shader_io_get_unique_index(name[param_base], 802 index[param_base]); 803 804 param_index = LLVMBuildAdd(gallivm->builder, param_index, 805 lp_build_const_int32(gallivm, param_index_base), 806 ""); 807 808 return get_tcs_tes_buffer_address(ctx, vertex_index, param_index); 809 } 810 811 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. 812 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), 813 * or v4i32 (num_channels=3,4). 
*/ 814 static void build_tbuffer_store(struct si_shader_context *ctx, 815 LLVMValueRef rsrc, 816 LLVMValueRef vdata, 817 unsigned num_channels, 818 LLVMValueRef vaddr, 819 LLVMValueRef soffset, 820 unsigned inst_offset, 821 unsigned dfmt, 822 unsigned nfmt, 823 unsigned offen, 824 unsigned idxen, 825 unsigned glc, 826 unsigned slc, 827 unsigned tfe) 828 { 829 struct gallivm_state *gallivm = &ctx->gallivm; 830 LLVMValueRef args[] = { 831 rsrc, 832 vdata, 833 LLVMConstInt(ctx->i32, num_channels, 0), 834 vaddr, 835 soffset, 836 LLVMConstInt(ctx->i32, inst_offset, 0), 837 LLVMConstInt(ctx->i32, dfmt, 0), 838 LLVMConstInt(ctx->i32, nfmt, 0), 839 LLVMConstInt(ctx->i32, offen, 0), 840 LLVMConstInt(ctx->i32, idxen, 0), 841 LLVMConstInt(ctx->i32, glc, 0), 842 LLVMConstInt(ctx->i32, slc, 0), 843 LLVMConstInt(ctx->i32, tfe, 0) 844 }; 845 846 /* The instruction offset field has 12 bits */ 847 assert(offen || inst_offset < (1 << 12)); 848 849 /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. 
*/ 850 unsigned func = CLAMP(num_channels, 1, 3) - 1; 851 const char *types[] = {"i32", "v2i32", "v4i32"}; 852 char name[256]; 853 snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]); 854 855 lp_build_intrinsic(gallivm->builder, name, ctx->voidt, 856 args, ARRAY_SIZE(args), 0); 857 } 858 859 static void build_tbuffer_store_dwords(struct si_shader_context *ctx, 860 LLVMValueRef rsrc, 861 LLVMValueRef vdata, 862 unsigned num_channels, 863 LLVMValueRef vaddr, 864 LLVMValueRef soffset, 865 unsigned inst_offset) 866 { 867 static unsigned dfmt[] = { 868 V_008F0C_BUF_DATA_FORMAT_32, 869 V_008F0C_BUF_DATA_FORMAT_32_32, 870 V_008F0C_BUF_DATA_FORMAT_32_32_32, 871 V_008F0C_BUF_DATA_FORMAT_32_32_32_32 872 }; 873 assert(num_channels >= 1 && num_channels <= 4); 874 875 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset, 876 inst_offset, dfmt[num_channels-1], 877 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0); 878 } 879 880 static LLVMValueRef build_buffer_load(struct si_shader_context *ctx, 881 LLVMValueRef rsrc, 882 int num_channels, 883 LLVMValueRef vindex, 884 LLVMValueRef voffset, 885 LLVMValueRef soffset, 886 unsigned inst_offset, 887 unsigned glc, 888 unsigned slc) 889 { 890 struct gallivm_state *gallivm = &ctx->gallivm; 891 unsigned func = CLAMP(num_channels, 1, 3) - 1; 892 893 if (HAVE_LLVM >= 0x309) { 894 LLVMValueRef args[] = { 895 LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""), 896 vindex ? 
vindex : LLVMConstInt(ctx->i32, 0, 0), 897 LLVMConstInt(ctx->i32, inst_offset, 0), 898 LLVMConstInt(ctx->i1, glc, 0), 899 LLVMConstInt(ctx->i1, slc, 0) 900 }; 901 902 LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2), 903 ctx->v4f32}; 904 const char *type_names[] = {"f32", "v2f32", "v4f32"}; 905 char name[256]; 906 907 if (voffset) { 908 args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset, 909 ""); 910 } 911 912 if (soffset) { 913 args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset, 914 ""); 915 } 916 917 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s", 918 type_names[func]); 919 920 return lp_build_intrinsic(gallivm->builder, name, types[func], args, 921 ARRAY_SIZE(args), LP_FUNC_ATTR_READONLY); 922 } else { 923 LLVMValueRef args[] = { 924 LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""), 925 voffset ? voffset : vindex, 926 soffset, 927 LLVMConstInt(ctx->i32, inst_offset, 0), 928 LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen 929 LLVMConstInt(ctx->i32, vindex ? 
1 : 0, 0), //idxen 930 LLVMConstInt(ctx->i32, glc, 0), 931 LLVMConstInt(ctx->i32, slc, 0), 932 LLVMConstInt(ctx->i32, 0, 0), // TFE 933 }; 934 935 LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2), 936 ctx->v4i32}; 937 const char *type_names[] = {"i32", "v2i32", "v4i32"}; 938 const char *arg_type = "i32"; 939 char name[256]; 940 941 if (voffset && vindex) { 942 LLVMValueRef vaddr[] = {vindex, voffset}; 943 944 arg_type = "v2i32"; 945 args[1] = lp_build_gather_values(gallivm, vaddr, 2); 946 } 947 948 snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s", 949 type_names[func], arg_type); 950 951 return lp_build_intrinsic(gallivm->builder, name, types[func], args, 952 ARRAY_SIZE(args), LP_FUNC_ATTR_READONLY); 953 } 954 } 955 956 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, 957 enum tgsi_opcode_type type, unsigned swizzle, 958 LLVMValueRef buffer, LLVMValueRef offset, 959 LLVMValueRef base) 960 { 961 struct si_shader_context *ctx = si_shader_context(bld_base); 962 struct gallivm_state *gallivm = bld_base->base.gallivm; 963 LLVMValueRef value, value2; 964 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type); 965 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4); 966 967 if (swizzle == ~0) { 968 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset, 969 0, 1, 0); 970 971 return LLVMBuildBitCast(gallivm->builder, value, vec_type, ""); 972 } 973 974 if (!tgsi_type_is_64bit(type)) { 975 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset, 976 0, 1, 0); 977 978 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, ""); 979 return LLVMBuildExtractElement(gallivm->builder, value, 980 lp_build_const_int32(gallivm, swizzle), ""); 981 } 982 983 value = build_buffer_load(ctx, buffer, 1, NULL, base, offset, 984 swizzle * 4, 1, 0); 985 986 value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset, 987 swizzle * 4 + 4, 1, 0); 988 989 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 990 } 991 
992 /** 993 * Load from LDS. 994 * 995 * \param type output value type 996 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 997 * \param dw_addr address in dwords 998 */ 999 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, 1000 enum tgsi_opcode_type type, unsigned swizzle, 1001 LLVMValueRef dw_addr) 1002 { 1003 struct si_shader_context *ctx = si_shader_context(bld_base); 1004 struct gallivm_state *gallivm = bld_base->base.gallivm; 1005 LLVMValueRef value; 1006 1007 if (swizzle == ~0) { 1008 LLVMValueRef values[TGSI_NUM_CHANNELS]; 1009 1010 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) 1011 values[chan] = lds_load(bld_base, type, chan, dw_addr); 1012 1013 return lp_build_gather_values(bld_base->base.gallivm, values, 1014 TGSI_NUM_CHANNELS); 1015 } 1016 1017 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, 1018 lp_build_const_int32(gallivm, swizzle)); 1019 1020 value = build_indexed_load(ctx, ctx->lds, dw_addr, false); 1021 if (tgsi_type_is_64bit(type)) { 1022 LLVMValueRef value2; 1023 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, 1024 lp_build_const_int32(gallivm, 1)); 1025 value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false); 1026 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2); 1027 } 1028 1029 return LLVMBuildBitCast(gallivm->builder, value, 1030 tgsi2llvmtype(bld_base, type), ""); 1031 } 1032 1033 /** 1034 * Store to LDS. 
1035 * 1036 * \param swizzle offset (typically 0..3) 1037 * \param dw_addr address in dwords 1038 * \param value value to store 1039 */ 1040 static void lds_store(struct lp_build_tgsi_context *bld_base, 1041 unsigned swizzle, LLVMValueRef dw_addr, 1042 LLVMValueRef value) 1043 { 1044 struct si_shader_context *ctx = si_shader_context(bld_base); 1045 struct gallivm_state *gallivm = bld_base->base.gallivm; 1046 1047 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, 1048 lp_build_const_int32(gallivm, swizzle)); 1049 1050 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); 1051 build_indexed_store(ctx, ctx->lds, 1052 dw_addr, value); 1053 } 1054 1055 static LLVMValueRef fetch_input_tcs( 1056 struct lp_build_tgsi_context *bld_base, 1057 const struct tgsi_full_src_register *reg, 1058 enum tgsi_opcode_type type, unsigned swizzle) 1059 { 1060 struct si_shader_context *ctx = si_shader_context(bld_base); 1061 LLVMValueRef dw_addr, stride; 1062 1063 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); 1064 dw_addr = get_tcs_in_current_patch_offset(ctx); 1065 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1066 1067 return lds_load(bld_base, type, swizzle, dw_addr); 1068 } 1069 1070 static LLVMValueRef fetch_output_tcs( 1071 struct lp_build_tgsi_context *bld_base, 1072 const struct tgsi_full_src_register *reg, 1073 enum tgsi_opcode_type type, unsigned swizzle) 1074 { 1075 struct si_shader_context *ctx = si_shader_context(bld_base); 1076 LLVMValueRef dw_addr, stride; 1077 1078 if (reg->Register.Dimension) { 1079 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); 1080 dw_addr = get_tcs_out_current_patch_offset(ctx); 1081 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr); 1082 } else { 1083 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1084 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr); 1085 } 1086 1087 return lds_load(bld_base, type, swizzle, dw_addr); 1088 } 1089 1090 static LLVMValueRef fetch_input_tes( 1091 
struct lp_build_tgsi_context *bld_base, 1092 const struct tgsi_full_src_register *reg, 1093 enum tgsi_opcode_type type, unsigned swizzle) 1094 { 1095 struct si_shader_context *ctx = si_shader_context(bld_base); 1096 struct gallivm_state *gallivm = bld_base->base.gallivm; 1097 LLVMValueRef rw_buffers, buffer, base, addr; 1098 1099 rw_buffers = LLVMGetParam(ctx->main_fn, 1100 SI_PARAM_RW_BUFFERS); 1101 buffer = build_indexed_load_const(ctx, rw_buffers, 1102 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); 1103 1104 base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); 1105 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg); 1106 1107 return buffer_load(bld_base, type, swizzle, buffer, base, addr); 1108 } 1109 1110 static void store_output_tcs(struct lp_build_tgsi_context *bld_base, 1111 const struct tgsi_full_instruction *inst, 1112 const struct tgsi_opcode_info *info, 1113 LLVMValueRef dst[4]) 1114 { 1115 struct si_shader_context *ctx = si_shader_context(bld_base); 1116 struct gallivm_state *gallivm = bld_base->base.gallivm; 1117 const struct tgsi_full_dst_register *reg = &inst->Dst[0]; 1118 unsigned chan_index; 1119 LLVMValueRef dw_addr, stride; 1120 LLVMValueRef rw_buffers, buffer, base, buf_addr; 1121 LLVMValueRef values[4]; 1122 1123 /* Only handle per-patch and per-vertex outputs here. 1124 * Vectors will be lowered to scalars and this function will be called again. 
1125 */ 1126 if (reg->Register.File != TGSI_FILE_OUTPUT || 1127 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { 1128 si_llvm_emit_store(bld_base, inst, info, dst); 1129 return; 1130 } 1131 1132 if (reg->Register.Dimension) { 1133 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); 1134 dw_addr = get_tcs_out_current_patch_offset(ctx); 1135 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr); 1136 } else { 1137 dw_addr = get_tcs_out_current_patch_data_offset(ctx); 1138 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); 1139 } 1140 1141 rw_buffers = LLVMGetParam(ctx->main_fn, 1142 SI_PARAM_RW_BUFFERS); 1143 buffer = build_indexed_load_const(ctx, rw_buffers, 1144 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); 1145 1146 base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); 1147 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); 1148 1149 1150 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) { 1151 LLVMValueRef value = dst[chan_index]; 1152 1153 if (inst->Instruction.Saturate) 1154 value = si_llvm_saturate(bld_base, value); 1155 1156 lds_store(bld_base, chan_index, dw_addr, value); 1157 1158 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); 1159 values[chan_index] = value; 1160 1161 if (inst->Dst[0].Register.WriteMask != 0xF) { 1162 build_tbuffer_store_dwords(ctx, buffer, value, 1, 1163 buf_addr, base, 1164 4 * chan_index); 1165 } 1166 } 1167 1168 if (inst->Dst[0].Register.WriteMask == 0xF) { 1169 LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm, 1170 values, 4); 1171 build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr, 1172 base, 0); 1173 } 1174 } 1175 1176 static LLVMValueRef fetch_input_gs( 1177 struct lp_build_tgsi_context *bld_base, 1178 const struct tgsi_full_src_register *reg, 1179 enum tgsi_opcode_type type, 1180 unsigned swizzle) 1181 { 1182 struct lp_build_context *base = &bld_base->base; 1183 struct si_shader_context *ctx = 
si_shader_context(bld_base); 1184 struct si_shader *shader = ctx->shader; 1185 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 1186 struct gallivm_state *gallivm = base->gallivm; 1187 LLVMValueRef vtx_offset; 1188 LLVMValueRef args[9]; 1189 unsigned vtx_offset_param; 1190 struct tgsi_shader_info *info = &shader->selector->info; 1191 unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; 1192 unsigned semantic_index = info->input_semantic_index[reg->Register.Index]; 1193 unsigned param; 1194 LLVMValueRef value; 1195 1196 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) 1197 return get_primitive_id(bld_base, swizzle); 1198 1199 if (!reg->Register.Dimension) 1200 return NULL; 1201 1202 if (swizzle == ~0) { 1203 LLVMValueRef values[TGSI_NUM_CHANNELS]; 1204 unsigned chan; 1205 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1206 values[chan] = fetch_input_gs(bld_base, reg, type, chan); 1207 } 1208 return lp_build_gather_values(bld_base->base.gallivm, values, 1209 TGSI_NUM_CHANNELS); 1210 } 1211 1212 /* Get the vertex offset parameter */ 1213 vtx_offset_param = reg->Dimension.Index; 1214 if (vtx_offset_param < 2) { 1215 vtx_offset_param += SI_PARAM_VTX0_OFFSET; 1216 } else { 1217 assert(vtx_offset_param < 6); 1218 vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2; 1219 } 1220 vtx_offset = lp_build_mul_imm(uint, 1221 LLVMGetParam(ctx->main_fn, 1222 vtx_offset_param), 1223 4); 1224 1225 param = si_shader_io_get_unique_index(semantic_name, semantic_index); 1226 args[0] = ctx->esgs_ring; 1227 args[1] = vtx_offset; 1228 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256); 1229 args[3] = uint->zero; 1230 args[4] = uint->one; /* OFFEN */ 1231 args[5] = uint->zero; /* IDXEN */ 1232 args[6] = uint->one; /* GLC */ 1233 args[7] = uint->zero; /* SLC */ 1234 args[8] = uint->zero; /* TFE */ 1235 1236 value = lp_build_intrinsic(gallivm->builder, 1237 "llvm.SI.buffer.load.dword.i32.i32", 1238 ctx->i32, args, 9, 1239 
LP_FUNC_ATTR_READONLY); 1240 if (tgsi_type_is_64bit(type)) { 1241 LLVMValueRef value2; 1242 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256); 1243 value2 = lp_build_intrinsic(gallivm->builder, 1244 "llvm.SI.buffer.load.dword.i32.i32", 1245 ctx->i32, args, 9, 1246 LP_FUNC_ATTR_READONLY); 1247 return si_llvm_emit_fetch_64bit(bld_base, type, 1248 value, value2); 1249 } 1250 return LLVMBuildBitCast(gallivm->builder, 1251 value, 1252 tgsi2llvmtype(bld_base, type), ""); 1253 } 1254 1255 static int lookup_interp_param_index(unsigned interpolate, unsigned location) 1256 { 1257 switch (interpolate) { 1258 case TGSI_INTERPOLATE_CONSTANT: 1259 return 0; 1260 1261 case TGSI_INTERPOLATE_LINEAR: 1262 if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1263 return SI_PARAM_LINEAR_SAMPLE; 1264 else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1265 return SI_PARAM_LINEAR_CENTROID; 1266 else 1267 return SI_PARAM_LINEAR_CENTER; 1268 break; 1269 case TGSI_INTERPOLATE_COLOR: 1270 case TGSI_INTERPOLATE_PERSPECTIVE: 1271 if (location == TGSI_INTERPOLATE_LOC_SAMPLE) 1272 return SI_PARAM_PERSP_SAMPLE; 1273 else if (location == TGSI_INTERPOLATE_LOC_CENTROID) 1274 return SI_PARAM_PERSP_CENTROID; 1275 else 1276 return SI_PARAM_PERSP_CENTER; 1277 break; 1278 default: 1279 fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); 1280 return -1; 1281 } 1282 } 1283 1284 static LLVMValueRef build_fs_interp( 1285 struct lp_build_tgsi_context *bld_base, 1286 LLVMValueRef llvm_chan, 1287 LLVMValueRef attr_number, 1288 LLVMValueRef params, 1289 LLVMValueRef i, 1290 LLVMValueRef j) { 1291 1292 struct si_shader_context *ctx = si_shader_context(bld_base); 1293 struct gallivm_state *gallivm = bld_base->base.gallivm; 1294 LLVMValueRef args[5]; 1295 LLVMValueRef p1; 1296 if (HAVE_LLVM < 0x0400) { 1297 LLVMValueRef ij[2]; 1298 ij[0] = LLVMBuildBitCast(gallivm->builder, i, ctx->i32, ""); 1299 ij[1] = LLVMBuildBitCast(gallivm->builder, j, ctx->i32, ""); 1300 1301 args[0] = llvm_chan; 
1302 args[1] = attr_number; 1303 args[2] = params; 1304 args[3] = lp_build_gather_values(gallivm, ij, 2); 1305 return lp_build_intrinsic(gallivm->builder, "llvm.SI.fs.interp", 1306 ctx->f32, args, 4, 1307 LP_FUNC_ATTR_READNONE); 1308 } 1309 1310 args[0] = i; 1311 args[1] = llvm_chan; 1312 args[2] = attr_number; 1313 args[3] = params; 1314 1315 p1 = lp_build_intrinsic(gallivm->builder, "llvm.amdgcn.interp.p1", 1316 ctx->f32, args, 4, LP_FUNC_ATTR_READNONE); 1317 1318 args[0] = p1; 1319 args[1] = j; 1320 args[2] = llvm_chan; 1321 args[3] = attr_number; 1322 args[4] = params; 1323 1324 return lp_build_intrinsic(gallivm->builder, "llvm.amdgcn.interp.p2", 1325 ctx->f32, args, 5, LP_FUNC_ATTR_READNONE); 1326 } 1327 1328 static LLVMValueRef build_fs_interp_mov( 1329 struct lp_build_tgsi_context *bld_base, 1330 LLVMValueRef parameter, 1331 LLVMValueRef llvm_chan, 1332 LLVMValueRef attr_number, 1333 LLVMValueRef params) { 1334 1335 struct si_shader_context *ctx = si_shader_context(bld_base); 1336 struct gallivm_state *gallivm = bld_base->base.gallivm; 1337 LLVMValueRef args[4]; 1338 if (HAVE_LLVM < 0x0400) { 1339 args[0] = llvm_chan; 1340 args[1] = attr_number; 1341 args[2] = params; 1342 1343 return lp_build_intrinsic(gallivm->builder, 1344 "llvm.SI.fs.constant", 1345 ctx->f32, args, 3, 1346 LP_FUNC_ATTR_READNONE); 1347 } 1348 1349 args[0] = parameter; 1350 args[1] = llvm_chan; 1351 args[2] = attr_number; 1352 args[3] = params; 1353 1354 return lp_build_intrinsic(gallivm->builder, "llvm.amdgcn.interp.mov", 1355 ctx->f32, args, 4, LP_FUNC_ATTR_READNONE); 1356 } 1357 1358 /** 1359 * Interpolate a fragment shader input. 
1360 * 1361 * @param ctx context 1362 * @param input_index index of the input in hardware 1363 * @param semantic_name TGSI_SEMANTIC_* 1364 * @param semantic_index semantic index 1365 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset) 1366 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total) 1367 * @param interp_param interpolation weights (i,j) 1368 * @param prim_mask SI_PARAM_PRIM_MASK 1369 * @param face SI_PARAM_FRONT_FACE 1370 * @param result the return value (4 components) 1371 */ 1372 static void interp_fs_input(struct si_shader_context *ctx, 1373 unsigned input_index, 1374 unsigned semantic_name, 1375 unsigned semantic_index, 1376 unsigned num_interp_inputs, 1377 unsigned colors_read_mask, 1378 LLVMValueRef interp_param, 1379 LLVMValueRef prim_mask, 1380 LLVMValueRef face, 1381 LLVMValueRef result[4]) 1382 { 1383 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 1384 struct lp_build_context *base = &bld_base->base; 1385 struct lp_build_context *uint = &bld_base->uint_bld; 1386 struct gallivm_state *gallivm = base->gallivm; 1387 LLVMValueRef attr_number; 1388 LLVMValueRef i, j; 1389 1390 unsigned chan; 1391 1392 /* fs.constant returns the param from the middle vertex, so it's not 1393 * really useful for flat shading. It's meant to be used for custom 1394 * interpolation (but the intrinsic can't fetch from the other two 1395 * vertices). 1396 * 1397 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state 1398 * to do the right thing. The only reason we use fs.constant is that 1399 * fs.interp cannot be used on integers, because they can be equal 1400 * to NaN. 1401 * 1402 * When interp is false we will use fs.constant or for newer llvm, 1403 * amdgcn.interp.mov. 
1404 */ 1405 bool interp = interp_param != NULL; 1406 1407 attr_number = lp_build_const_int32(gallivm, input_index); 1408 1409 if (interp) { 1410 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param, 1411 LLVMVectorType(ctx->f32, 2), ""); 1412 1413 i = LLVMBuildExtractElement(gallivm->builder, interp_param, 1414 uint->zero, ""); 1415 j = LLVMBuildExtractElement(gallivm->builder, interp_param, 1416 uint->one, ""); 1417 } 1418 1419 if (semantic_name == TGSI_SEMANTIC_COLOR && 1420 ctx->shader->key.part.ps.prolog.color_two_side) { 1421 LLVMValueRef is_face_positive; 1422 LLVMValueRef back_attr_number; 1423 1424 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", 1425 * otherwise it's at offset "num_inputs". 1426 */ 1427 unsigned back_attr_offset = num_interp_inputs; 1428 if (semantic_index == 1 && colors_read_mask & 0xf) 1429 back_attr_offset += 1; 1430 1431 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset); 1432 1433 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE, 1434 face, uint->zero, ""); 1435 1436 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1437 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); 1438 LLVMValueRef front, back; 1439 1440 if (interp) { 1441 front = build_fs_interp(bld_base, llvm_chan, 1442 attr_number, prim_mask, 1443 i, j); 1444 back = build_fs_interp(bld_base, llvm_chan, 1445 back_attr_number, prim_mask, 1446 i, j); 1447 } else { 1448 front = build_fs_interp_mov(bld_base, 1449 lp_build_const_int32(gallivm, 2), /* P0 */ 1450 llvm_chan, attr_number, prim_mask); 1451 back = build_fs_interp_mov(bld_base, 1452 lp_build_const_int32(gallivm, 2), /* P0 */ 1453 llvm_chan, back_attr_number, prim_mask); 1454 } 1455 1456 result[chan] = LLVMBuildSelect(gallivm->builder, 1457 is_face_positive, 1458 front, 1459 back, 1460 ""); 1461 } 1462 } else if (semantic_name == TGSI_SEMANTIC_FOG) { 1463 if (interp) { 1464 result[0] = build_fs_interp(bld_base, uint->zero, 1465 attr_number, prim_mask, i, j); 
1466 } else { 1467 result[0] = build_fs_interp_mov(bld_base, uint->zero, 1468 lp_build_const_int32(gallivm, 2), /* P0 */ 1469 attr_number, prim_mask); 1470 } 1471 result[1] = 1472 result[2] = lp_build_const_float(gallivm, 0.0f); 1473 result[3] = lp_build_const_float(gallivm, 1.0f); 1474 } else { 1475 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 1476 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); 1477 1478 if (interp) { 1479 result[chan] = build_fs_interp(bld_base, 1480 llvm_chan, attr_number, prim_mask, i, j); 1481 } else { 1482 result[chan] = build_fs_interp_mov(bld_base, 1483 lp_build_const_int32(gallivm, 2), /* P0 */ 1484 llvm_chan, attr_number, prim_mask); 1485 } 1486 } 1487 } 1488 } 1489 1490 static void declare_input_fs( 1491 struct si_shader_context *radeon_bld, 1492 unsigned input_index, 1493 const struct tgsi_full_declaration *decl, 1494 LLVMValueRef out[4]) 1495 { 1496 struct lp_build_context *base = &radeon_bld->bld_base.base; 1497 struct si_shader_context *ctx = 1498 si_shader_context(&radeon_bld->bld_base); 1499 struct si_shader *shader = ctx->shader; 1500 LLVMValueRef main_fn = radeon_bld->main_fn; 1501 LLVMValueRef interp_param = NULL; 1502 int interp_param_idx; 1503 1504 /* Get colors from input VGPRs (set by the prolog). */ 1505 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { 1506 unsigned i = decl->Semantic.Index; 1507 unsigned colors_read = shader->selector->info.colors_read; 1508 unsigned mask = colors_read >> (i * 4); 1509 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + 1510 (i ? util_bitcount(colors_read & 0xf) : 0); 1511 1512 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; 1513 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; 1514 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; 1515 out[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : base->undef; 1516 return; 1517 } 1518 1519 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, 1520 decl->Interp.Location); 1521 if (interp_param_idx == -1) 1522 return; 1523 else if (interp_param_idx) { 1524 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 1525 } 1526 1527 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR && 1528 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR && 1529 ctx->shader->key.part.ps.prolog.flatshade_colors) 1530 interp_param = NULL; /* load the constant color */ 1531 1532 interp_fs_input(ctx, input_index, decl->Semantic.Name, 1533 decl->Semantic.Index, shader->selector->info.num_inputs, 1534 shader->selector->info.colors_read, interp_param, 1535 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK), 1536 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), 1537 &out[0]); 1538 } 1539 1540 static LLVMValueRef get_sample_id(struct si_shader_context *radeon_bld) 1541 { 1542 return unpack_param(si_shader_context(&radeon_bld->bld_base), 1543 SI_PARAM_ANCILLARY, 8, 4); 1544 } 1545 1546 /** 1547 * Set range metadata on an instruction. This can only be used on load and 1548 * call instructions. If you know an instruction can only produce the values 1549 * 0, 1, 2, you would do set_range_metadata(value, 0, 3); 1550 * \p lo is the minimum value inclusive. 1551 * \p hi is the maximum value exclusive. 
1552 */ 1553 static void set_range_metadata(struct si_shader_context *ctx, 1554 LLVMValueRef value, unsigned lo, unsigned hi) 1555 { 1556 LLVMValueRef range_md, md_args[2]; 1557 LLVMTypeRef type = LLVMTypeOf(value); 1558 LLVMContextRef context = LLVMGetTypeContext(type); 1559 1560 md_args[0] = LLVMConstInt(type, lo, false); 1561 md_args[1] = LLVMConstInt(type, hi, false); 1562 range_md = LLVMMDNodeInContext(context, md_args, 2); 1563 LLVMSetMetadata(value, ctx->range_md_kind, range_md); 1564 } 1565 1566 static LLVMValueRef get_thread_id(struct si_shader_context *ctx) 1567 { 1568 struct gallivm_state *gallivm = &ctx->gallivm; 1569 LLVMValueRef tid; 1570 1571 if (HAVE_LLVM < 0x0308) { 1572 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", 1573 ctx->i32, NULL, 0, LP_FUNC_ATTR_READNONE); 1574 } else { 1575 LLVMValueRef tid_args[2]; 1576 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff); 1577 tid_args[1] = lp_build_const_int32(gallivm, 0); 1578 tid_args[1] = lp_build_intrinsic(gallivm->builder, 1579 "llvm.amdgcn.mbcnt.lo", ctx->i32, 1580 tid_args, 2, LP_FUNC_ATTR_READNONE); 1581 1582 tid = lp_build_intrinsic(gallivm->builder, 1583 "llvm.amdgcn.mbcnt.hi", ctx->i32, 1584 tid_args, 2, LP_FUNC_ATTR_READNONE); 1585 } 1586 set_range_metadata(ctx, tid, 0, 64); 1587 return tid; 1588 } 1589 1590 /** 1591 * Load a dword from a constant buffer. 
1592 */ 1593 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx, 1594 LLVMValueRef resource, 1595 LLVMValueRef offset) 1596 { 1597 LLVMBuilderRef builder = ctx->gallivm.builder; 1598 LLVMValueRef args[2] = {resource, offset}; 1599 1600 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2, 1601 LP_FUNC_ATTR_READNONE); 1602 } 1603 1604 static LLVMValueRef load_sample_position(struct si_shader_context *radeon_bld, LLVMValueRef sample_id) 1605 { 1606 struct si_shader_context *ctx = 1607 si_shader_context(&radeon_bld->bld_base); 1608 struct lp_build_context *uint_bld = &radeon_bld->bld_base.uint_bld; 1609 struct gallivm_state *gallivm = &radeon_bld->gallivm; 1610 LLVMBuilderRef builder = gallivm->builder; 1611 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); 1612 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS); 1613 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index); 1614 1615 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ 1616 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8); 1617 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); 1618 1619 LLVMValueRef pos[4] = { 1620 buffer_load_const(ctx, resource, offset0), 1621 buffer_load_const(ctx, resource, offset1), 1622 lp_build_const_float(gallivm, 0), 1623 lp_build_const_float(gallivm, 0) 1624 }; 1625 1626 return lp_build_gather_values(gallivm, pos, 4); 1627 } 1628 1629 static void declare_system_value( 1630 struct si_shader_context *radeon_bld, 1631 unsigned index, 1632 const struct tgsi_full_declaration *decl) 1633 { 1634 struct si_shader_context *ctx = 1635 si_shader_context(&radeon_bld->bld_base); 1636 struct lp_build_context *bld = &radeon_bld->bld_base.base; 1637 struct gallivm_state *gallivm = &radeon_bld->gallivm; 1638 LLVMValueRef value = 0; 1639 1640 switch (decl->Semantic.Name) { 1641 case TGSI_SEMANTIC_INSTANCEID: 1642 
value = LLVMGetParam(radeon_bld->main_fn, 1643 ctx->param_instance_id); 1644 break; 1645 1646 case TGSI_SEMANTIC_VERTEXID: 1647 value = LLVMBuildAdd(gallivm->builder, 1648 LLVMGetParam(radeon_bld->main_fn, 1649 ctx->param_vertex_id), 1650 LLVMGetParam(radeon_bld->main_fn, 1651 SI_PARAM_BASE_VERTEX), ""); 1652 break; 1653 1654 case TGSI_SEMANTIC_VERTEXID_NOBASE: 1655 value = LLVMGetParam(radeon_bld->main_fn, 1656 ctx->param_vertex_id); 1657 break; 1658 1659 case TGSI_SEMANTIC_BASEVERTEX: 1660 value = LLVMGetParam(radeon_bld->main_fn, 1661 SI_PARAM_BASE_VERTEX); 1662 break; 1663 1664 case TGSI_SEMANTIC_BASEINSTANCE: 1665 value = LLVMGetParam(radeon_bld->main_fn, 1666 SI_PARAM_START_INSTANCE); 1667 break; 1668 1669 case TGSI_SEMANTIC_DRAWID: 1670 value = LLVMGetParam(radeon_bld->main_fn, 1671 SI_PARAM_DRAWID); 1672 break; 1673 1674 case TGSI_SEMANTIC_INVOCATIONID: 1675 if (ctx->type == PIPE_SHADER_TESS_CTRL) 1676 value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); 1677 else if (ctx->type == PIPE_SHADER_GEOMETRY) 1678 value = LLVMGetParam(radeon_bld->main_fn, 1679 SI_PARAM_GS_INSTANCE_ID); 1680 else 1681 assert(!"INVOCATIONID not implemented"); 1682 break; 1683 1684 case TGSI_SEMANTIC_POSITION: 1685 { 1686 LLVMValueRef pos[4] = { 1687 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT), 1688 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT), 1689 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT), 1690 lp_build_emit_llvm_unary(&radeon_bld->bld_base, TGSI_OPCODE_RCP, 1691 LLVMGetParam(radeon_bld->main_fn, 1692 SI_PARAM_POS_W_FLOAT)), 1693 }; 1694 value = lp_build_gather_values(gallivm, pos, 4); 1695 break; 1696 } 1697 1698 case TGSI_SEMANTIC_FACE: 1699 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE); 1700 break; 1701 1702 case TGSI_SEMANTIC_SAMPLEID: 1703 value = get_sample_id(radeon_bld); 1704 break; 1705 1706 case TGSI_SEMANTIC_SAMPLEPOS: { 1707 LLVMValueRef pos[4] = { 1708 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT), 1709 
LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT), 1710 lp_build_const_float(gallivm, 0), 1711 lp_build_const_float(gallivm, 0) 1712 }; 1713 pos[0] = lp_build_emit_llvm_unary(&radeon_bld->bld_base, 1714 TGSI_OPCODE_FRC, pos[0]); 1715 pos[1] = lp_build_emit_llvm_unary(&radeon_bld->bld_base, 1716 TGSI_OPCODE_FRC, pos[1]); 1717 value = lp_build_gather_values(gallivm, pos, 4); 1718 break; 1719 } 1720 1721 case TGSI_SEMANTIC_SAMPLEMASK: 1722 /* This can only occur with the OpenGL Core profile, which 1723 * doesn't support smoothing. 1724 */ 1725 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE); 1726 break; 1727 1728 case TGSI_SEMANTIC_TESSCOORD: 1729 { 1730 LLVMValueRef coord[4] = { 1731 LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u), 1732 LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v), 1733 bld->zero, 1734 bld->zero 1735 }; 1736 1737 /* For triangles, the vector should be (u, v, 1-u-v). */ 1738 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == 1739 PIPE_PRIM_TRIANGLES) 1740 coord[2] = lp_build_sub(bld, bld->one, 1741 lp_build_add(bld, coord[0], coord[1])); 1742 1743 value = lp_build_gather_values(gallivm, coord, 4); 1744 break; 1745 } 1746 1747 case TGSI_SEMANTIC_VERTICESIN: 1748 if (ctx->type == PIPE_SHADER_TESS_CTRL) 1749 value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6); 1750 else if (ctx->type == PIPE_SHADER_TESS_EVAL) 1751 value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7); 1752 else 1753 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); 1754 break; 1755 1756 case TGSI_SEMANTIC_TESSINNER: 1757 case TGSI_SEMANTIC_TESSOUTER: 1758 { 1759 LLVMValueRef rw_buffers, buffer, base, addr; 1760 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0); 1761 1762 rw_buffers = LLVMGetParam(ctx->main_fn, 1763 SI_PARAM_RW_BUFFERS); 1764 buffer = build_indexed_load_const(ctx, rw_buffers, 1765 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); 1766 1767 base = 
LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); 1768 addr = get_tcs_tes_buffer_address(ctx, NULL, 1769 lp_build_const_int32(gallivm, param)); 1770 1771 value = buffer_load(&radeon_bld->bld_base, TGSI_TYPE_FLOAT, 1772 ~0, buffer, base, addr); 1773 1774 break; 1775 } 1776 1777 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI: 1778 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI: 1779 { 1780 LLVMValueRef buf, slot, val[4]; 1781 int i, offset; 1782 1783 slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS); 1784 buf = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); 1785 buf = build_indexed_load_const(ctx, buf, slot); 1786 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0; 1787 1788 for (i = 0; i < 4; i++) 1789 val[i] = buffer_load_const(ctx, buf, 1790 lp_build_const_int32(gallivm, (offset + i) * 4)); 1791 value = lp_build_gather_values(gallivm, val, 4); 1792 break; 1793 } 1794 1795 case TGSI_SEMANTIC_PRIMID: 1796 value = get_primitive_id(&radeon_bld->bld_base, 0); 1797 break; 1798 1799 case TGSI_SEMANTIC_GRID_SIZE: 1800 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE); 1801 break; 1802 1803 case TGSI_SEMANTIC_BLOCK_SIZE: 1804 { 1805 LLVMValueRef values[3]; 1806 unsigned i; 1807 unsigned *properties = ctx->shader->selector->info.properties; 1808 1809 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { 1810 unsigned sizes[3] = { 1811 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], 1812 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], 1813 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] 1814 }; 1815 1816 for (i = 0; i < 3; ++i) 1817 values[i] = lp_build_const_int32(gallivm, sizes[i]); 1818 1819 value = lp_build_gather_values(gallivm, values, 3); 1820 } else { 1821 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE); 1822 } 1823 break; 1824 } 1825 1826 case TGSI_SEMANTIC_BLOCK_ID: 1827 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID); 1828 break; 1829 1830 case TGSI_SEMANTIC_THREAD_ID: 1831 value = 
LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID); 1832 break; 1833 1834 case TGSI_SEMANTIC_HELPER_INVOCATION: 1835 if (HAVE_LLVM >= 0x0309) { 1836 value = lp_build_intrinsic(gallivm->builder, 1837 "llvm.amdgcn.ps.live", 1838 ctx->i1, NULL, 0, 1839 LP_FUNC_ATTR_READNONE); 1840 value = LLVMBuildNot(gallivm->builder, value, ""); 1841 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, ""); 1842 } else { 1843 assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported"); 1844 return; 1845 } 1846 break; 1847 1848 default: 1849 assert(!"unknown system value"); 1850 return; 1851 } 1852 1853 radeon_bld->system_values[index] = value; 1854 } 1855 1856 static void declare_compute_memory(struct si_shader_context *radeon_bld, 1857 const struct tgsi_full_declaration *decl) 1858 { 1859 struct si_shader_context *ctx = 1860 si_shader_context(&radeon_bld->bld_base); 1861 struct si_shader_selector *sel = ctx->shader->selector; 1862 struct gallivm_state *gallivm = &radeon_bld->gallivm; 1863 1864 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE); 1865 LLVMValueRef var; 1866 1867 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED); 1868 assert(decl->Range.First == decl->Range.Last); 1869 assert(!ctx->shared_memory); 1870 1871 var = LLVMAddGlobalInAddressSpace(gallivm->module, 1872 LLVMArrayType(ctx->i8, sel->local_size), 1873 "compute_lds", 1874 LOCAL_ADDR_SPACE); 1875 LLVMSetAlignment(var, 4); 1876 1877 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, ""); 1878 } 1879 1880 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i) 1881 { 1882 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn, 1883 SI_PARAM_CONST_BUFFERS); 1884 1885 return build_indexed_load_const(ctx, list_ptr, 1886 LLVMConstInt(ctx->i32, i, 0)); 1887 } 1888 1889 static LLVMValueRef fetch_constant( 1890 struct lp_build_tgsi_context *bld_base, 1891 const struct tgsi_full_src_register *reg, 1892 enum tgsi_opcode_type type, 1893 unsigned swizzle) 1894 { 1895 
struct si_shader_context *ctx = si_shader_context(bld_base); 1896 struct lp_build_context *base = &bld_base->base; 1897 const struct tgsi_ind_register *ireg = ®->Indirect; 1898 unsigned buf, idx; 1899 1900 LLVMValueRef addr, bufp; 1901 LLVMValueRef result; 1902 1903 if (swizzle == LP_CHAN_ALL) { 1904 unsigned chan; 1905 LLVMValueRef values[4]; 1906 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) 1907 values[chan] = fetch_constant(bld_base, reg, type, chan); 1908 1909 return lp_build_gather_values(bld_base->base.gallivm, values, 4); 1910 } 1911 1912 buf = reg->Register.Dimension ? reg->Dimension.Index : 0; 1913 idx = reg->Register.Index * 4 + swizzle; 1914 1915 if (reg->Register.Dimension && reg->Dimension.Indirect) { 1916 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_CONST_BUFFERS); 1917 LLVMValueRef index; 1918 index = get_bounded_indirect_index(ctx, ®->DimIndirect, 1919 reg->Dimension.Index, 1920 SI_NUM_CONST_BUFFERS); 1921 bufp = build_indexed_load_const(ctx, ptr, index); 1922 } else 1923 bufp = load_const_buffer_desc(ctx, buf); 1924 1925 if (reg->Register.Indirect) { 1926 addr = ctx->addrs[ireg->Index][ireg->Swizzle]; 1927 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg"); 1928 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16); 1929 addr = lp_build_add(&bld_base->uint_bld, addr, 1930 lp_build_const_int32(base->gallivm, idx * 4)); 1931 } else { 1932 addr = LLVMConstInt(ctx->i32, idx * 4, 0); 1933 } 1934 1935 result = buffer_load_const(ctx, bufp, addr); 1936 1937 if (!tgsi_type_is_64bit(type)) 1938 result = bitcast(bld_base, type, result); 1939 else { 1940 LLVMValueRef addr2, result2; 1941 1942 addr2 = lp_build_add(&bld_base->uint_bld, addr, 1943 LLVMConstInt(ctx->i32, 4, 0)); 1944 result2 = buffer_load_const(ctx, bufp, addr2); 1945 1946 result = si_llvm_emit_fetch_64bit(bld_base, type, 1947 result, result2); 1948 } 1949 return result; 1950 } 1951 1952 /* Upper 16 bits must be zero. 
*/ 1953 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm, 1954 LLVMValueRef val[2]) 1955 { 1956 return LLVMBuildOr(gallivm->builder, val[0], 1957 LLVMBuildShl(gallivm->builder, val[1], 1958 lp_build_const_int32(gallivm, 16), 1959 ""), ""); 1960 } 1961 1962 /* Upper 16 bits are ignored and will be dropped. */ 1963 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm, 1964 LLVMValueRef val[2]) 1965 { 1966 LLVMValueRef v[2] = { 1967 LLVMBuildAnd(gallivm->builder, val[0], 1968 lp_build_const_int32(gallivm, 0xffff), ""), 1969 val[1], 1970 }; 1971 return si_llvm_pack_two_int16(gallivm, v); 1972 } 1973 1974 /* Initialize arguments for the shader export intrinsic */ 1975 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base, 1976 LLVMValueRef *values, 1977 unsigned target, 1978 LLVMValueRef *args) 1979 { 1980 struct si_shader_context *ctx = si_shader_context(bld_base); 1981 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 1982 struct lp_build_context *base = &bld_base->base; 1983 struct gallivm_state *gallivm = base->gallivm; 1984 LLVMBuilderRef builder = base->gallivm->builder; 1985 LLVMValueRef val[4]; 1986 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR; 1987 unsigned chan; 1988 bool is_int8, is_int10; 1989 1990 /* Default is 0xf. Adjusted below depending on the format. 
*/ 1991 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */ 1992 1993 /* Specify whether the EXEC mask represents the valid mask */ 1994 args[1] = uint->zero; 1995 1996 /* Specify whether this is the last export */ 1997 args[2] = uint->zero; 1998 1999 /* Specify the target we are exporting */ 2000 args[3] = lp_build_const_int32(base->gallivm, target); 2001 2002 if (ctx->type == PIPE_SHADER_FRAGMENT) { 2003 const struct si_shader_key *key = &ctx->shader->key; 2004 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; 2005 int cbuf = target - V_008DFC_SQ_EXP_MRT; 2006 2007 assert(cbuf >= 0 && cbuf < 8); 2008 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; 2009 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; 2010 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; 2011 } 2012 2013 args[4] = uint->zero; /* COMPR flag */ 2014 args[5] = base->undef; 2015 args[6] = base->undef; 2016 args[7] = base->undef; 2017 args[8] = base->undef; 2018 2019 switch (spi_shader_col_format) { 2020 case V_028714_SPI_SHADER_ZERO: 2021 args[0] = uint->zero; /* writemask */ 2022 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL); 2023 break; 2024 2025 case V_028714_SPI_SHADER_32_R: 2026 args[0] = uint->one; /* writemask */ 2027 args[5] = values[0]; 2028 break; 2029 2030 case V_028714_SPI_SHADER_32_GR: 2031 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */ 2032 args[5] = values[0]; 2033 args[6] = values[1]; 2034 break; 2035 2036 case V_028714_SPI_SHADER_32_AR: 2037 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */ 2038 args[5] = values[0]; 2039 args[8] = values[3]; 2040 break; 2041 2042 case V_028714_SPI_SHADER_FP16_ABGR: 2043 args[4] = uint->one; /* COMPR flag */ 2044 2045 for (chan = 0; chan < 2; chan++) { 2046 LLVMValueRef pack_args[2] = { 2047 values[2 * chan], 2048 values[2 * chan + 1] 2049 }; 2050 LLVMValueRef packed; 2051 2052 packed = 
lp_build_intrinsic(base->gallivm->builder, 2053 "llvm.SI.packf16", 2054 ctx->i32, pack_args, 2, 2055 LP_FUNC_ATTR_READNONE); 2056 args[chan + 5] = 2057 LLVMBuildBitCast(base->gallivm->builder, 2058 packed, ctx->f32, ""); 2059 } 2060 break; 2061 2062 case V_028714_SPI_SHADER_UNORM16_ABGR: 2063 for (chan = 0; chan < 4; chan++) { 2064 val[chan] = si_llvm_saturate(bld_base, values[chan]); 2065 val[chan] = LLVMBuildFMul(builder, val[chan], 2066 lp_build_const_float(gallivm, 65535), ""); 2067 val[chan] = LLVMBuildFAdd(builder, val[chan], 2068 lp_build_const_float(gallivm, 0.5), ""); 2069 val[chan] = LLVMBuildFPToUI(builder, val[chan], 2070 ctx->i32, ""); 2071 } 2072 2073 args[4] = uint->one; /* COMPR flag */ 2074 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2075 si_llvm_pack_two_int16(gallivm, val)); 2076 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2077 si_llvm_pack_two_int16(gallivm, val+2)); 2078 break; 2079 2080 case V_028714_SPI_SHADER_SNORM16_ABGR: 2081 for (chan = 0; chan < 4; chan++) { 2082 /* Clamp between [-1, 1]. */ 2083 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN, 2084 values[chan], 2085 lp_build_const_float(gallivm, 1)); 2086 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX, 2087 val[chan], 2088 lp_build_const_float(gallivm, -1)); 2089 /* Convert to a signed integer in [-32767, 32767]. */ 2090 val[chan] = LLVMBuildFMul(builder, val[chan], 2091 lp_build_const_float(gallivm, 32767), ""); 2092 /* If positive, add 0.5, else add -0.5. 
*/ 2093 val[chan] = LLVMBuildFAdd(builder, val[chan], 2094 LLVMBuildSelect(builder, 2095 LLVMBuildFCmp(builder, LLVMRealOGE, 2096 val[chan], base->zero, ""), 2097 lp_build_const_float(gallivm, 0.5), 2098 lp_build_const_float(gallivm, -0.5), ""), ""); 2099 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, ""); 2100 } 2101 2102 args[4] = uint->one; /* COMPR flag */ 2103 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2104 si_llvm_pack_two_int32_as_int16(gallivm, val)); 2105 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2106 si_llvm_pack_two_int32_as_int16(gallivm, val+2)); 2107 break; 2108 2109 case V_028714_SPI_SHADER_UINT16_ABGR: { 2110 LLVMValueRef max_rgb = lp_build_const_int32(gallivm, 2111 is_int8 ? 255 : is_int10 ? 1023 : 65535); 2112 LLVMValueRef max_alpha = 2113 !is_int10 ? max_rgb : lp_build_const_int32(gallivm, 3); 2114 2115 /* Clamp. */ 2116 for (chan = 0; chan < 4; chan++) { 2117 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); 2118 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN, 2119 val[chan], 2120 chan == 3 ? max_alpha : max_rgb); 2121 } 2122 2123 args[4] = uint->one; /* COMPR flag */ 2124 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2125 si_llvm_pack_two_int16(gallivm, val)); 2126 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2127 si_llvm_pack_two_int16(gallivm, val+2)); 2128 break; 2129 } 2130 2131 case V_028714_SPI_SHADER_SINT16_ABGR: { 2132 LLVMValueRef max_rgb = lp_build_const_int32(gallivm, 2133 is_int8 ? 127 : is_int10 ? 511 : 32767); 2134 LLVMValueRef min_rgb = lp_build_const_int32(gallivm, 2135 is_int8 ? -128 : is_int10 ? -512 : -32768); 2136 LLVMValueRef max_alpha = 2137 !is_int10 ? max_rgb : lp_build_const_int32(gallivm, 1); 2138 LLVMValueRef min_alpha = 2139 !is_int10 ? min_rgb : lp_build_const_int32(gallivm, -2); 2140 2141 /* Clamp. 
*/ 2142 for (chan = 0; chan < 4; chan++) { 2143 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]); 2144 val[chan] = lp_build_emit_llvm_binary(bld_base, 2145 TGSI_OPCODE_IMIN, 2146 val[chan], chan == 3 ? max_alpha : max_rgb); 2147 val[chan] = lp_build_emit_llvm_binary(bld_base, 2148 TGSI_OPCODE_IMAX, 2149 val[chan], chan == 3 ? min_alpha : min_rgb); 2150 } 2151 2152 args[4] = uint->one; /* COMPR flag */ 2153 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2154 si_llvm_pack_two_int32_as_int16(gallivm, val)); 2155 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT, 2156 si_llvm_pack_two_int32_as_int16(gallivm, val+2)); 2157 break; 2158 } 2159 2160 case V_028714_SPI_SHADER_32_ABGR: 2161 memcpy(&args[5], values, sizeof(values[0]) * 4); 2162 break; 2163 } 2164 } 2165 2166 static void si_alpha_test(struct lp_build_tgsi_context *bld_base, 2167 LLVMValueRef alpha) 2168 { 2169 struct si_shader_context *ctx = si_shader_context(bld_base); 2170 struct gallivm_state *gallivm = bld_base->base.gallivm; 2171 2172 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { 2173 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, 2174 SI_PARAM_ALPHA_REF); 2175 2176 LLVMValueRef alpha_pass = 2177 lp_build_cmp(&bld_base->base, 2178 ctx->shader->key.part.ps.epilog.alpha_func, 2179 alpha, alpha_ref); 2180 LLVMValueRef arg = 2181 lp_build_select(&bld_base->base, 2182 alpha_pass, 2183 lp_build_const_float(gallivm, 1.0f), 2184 lp_build_const_float(gallivm, -1.0f)); 2185 2186 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", 2187 ctx->voidt, &arg, 1, 0); 2188 } else { 2189 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp", 2190 ctx->voidt, NULL, 0, 0); 2191 } 2192 } 2193 2194 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base, 2195 LLVMValueRef alpha, 2196 unsigned samplemask_param) 2197 { 2198 struct si_shader_context *ctx = si_shader_context(bld_base); 2199 struct gallivm_state *gallivm = bld_base->base.gallivm; 2200 
LLVMValueRef coverage; 2201 2202 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ 2203 coverage = LLVMGetParam(ctx->main_fn, 2204 samplemask_param); 2205 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage); 2206 2207 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32", 2208 ctx->i32, 2209 &coverage, 1, LP_FUNC_ATTR_READNONE); 2210 2211 coverage = LLVMBuildUIToFP(gallivm->builder, coverage, 2212 ctx->f32, ""); 2213 2214 coverage = LLVMBuildFMul(gallivm->builder, coverage, 2215 lp_build_const_float(gallivm, 2216 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); 2217 2218 return LLVMBuildFMul(gallivm->builder, alpha, coverage, ""); 2219 } 2220 2221 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base, 2222 LLVMValueRef (*pos)[9], LLVMValueRef *out_elts) 2223 { 2224 struct si_shader_context *ctx = si_shader_context(bld_base); 2225 struct lp_build_context *base = &bld_base->base; 2226 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 2227 unsigned reg_index; 2228 unsigned chan; 2229 unsigned const_chan; 2230 LLVMValueRef base_elt; 2231 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); 2232 LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, 2233 SI_VS_CONST_CLIP_PLANES); 2234 LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index); 2235 2236 for (reg_index = 0; reg_index < 2; reg_index ++) { 2237 LLVMValueRef *args = pos[2 + reg_index]; 2238 2239 args[5] = 2240 args[6] = 2241 args[7] = 2242 args[8] = lp_build_const_float(base->gallivm, 0.0f); 2243 2244 /* Compute dot products of position and user clip plane vectors */ 2245 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 2246 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) { 2247 args[1] = lp_build_const_int32(base->gallivm, 2248 ((reg_index * 4 + chan) * 4 + 2249 const_chan) * 4); 2250 base_elt = buffer_load_const(ctx, const_resource, 2251 args[1]); 2252 args[5 + chan] = 2253 
lp_build_add(base, args[5 + chan], 2254 lp_build_mul(base, base_elt, 2255 out_elts[const_chan])); 2256 } 2257 } 2258 2259 args[0] = lp_build_const_int32(base->gallivm, 0xf); 2260 args[1] = uint->zero; 2261 args[2] = uint->zero; 2262 args[3] = lp_build_const_int32(base->gallivm, 2263 V_008DFC_SQ_EXP_POS + 2 + reg_index); 2264 args[4] = uint->zero; 2265 } 2266 } 2267 2268 static void si_dump_streamout(struct pipe_stream_output_info *so) 2269 { 2270 unsigned i; 2271 2272 if (so->num_outputs) 2273 fprintf(stderr, "STREAMOUT\n"); 2274 2275 for (i = 0; i < so->num_outputs; i++) { 2276 unsigned mask = ((1 << so->output[i].num_components) - 1) << 2277 so->output[i].start_component; 2278 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", 2279 i, so->output[i].output_buffer, 2280 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 2281 so->output[i].register_index, 2282 mask & 1 ? "x" : "", 2283 mask & 2 ? "y" : "", 2284 mask & 4 ? "z" : "", 2285 mask & 8 ? "w" : ""); 2286 } 2287 } 2288 2289 static void emit_streamout_output(struct si_shader_context *ctx, 2290 LLVMValueRef const *so_buffers, 2291 LLVMValueRef const *so_write_offsets, 2292 struct pipe_stream_output *stream_out, 2293 struct si_shader_output_values *shader_out) 2294 { 2295 struct gallivm_state *gallivm = &ctx->gallivm; 2296 LLVMBuilderRef builder = gallivm->builder; 2297 unsigned buf_idx = stream_out->output_buffer; 2298 unsigned start = stream_out->start_component; 2299 unsigned num_comps = stream_out->num_components; 2300 LLVMValueRef out[4]; 2301 2302 assert(num_comps && num_comps <= 4); 2303 if (!num_comps || num_comps > 4) 2304 return; 2305 2306 /* Load the output as int. */ 2307 for (int j = 0; j < num_comps; j++) { 2308 assert(stream_out->stream == shader_out->vertex_stream[start + j]); 2309 2310 out[j] = LLVMBuildBitCast(builder, 2311 shader_out->values[start + j], 2312 ctx->i32, ""); 2313 } 2314 2315 /* Pack the output. 
*/ 2316 LLVMValueRef vdata = NULL; 2317 2318 switch (num_comps) { 2319 case 1: /* as i32 */ 2320 vdata = out[0]; 2321 break; 2322 case 2: /* as v2i32 */ 2323 case 3: /* as v4i32 (aligned to 4) */ 2324 case 4: /* as v4i32 */ 2325 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps))); 2326 for (int j = 0; j < num_comps; j++) { 2327 vdata = LLVMBuildInsertElement(builder, vdata, out[j], 2328 LLVMConstInt(ctx->i32, j, 0), ""); 2329 } 2330 break; 2331 } 2332 2333 build_tbuffer_store_dwords(ctx, so_buffers[buf_idx], 2334 vdata, num_comps, 2335 so_write_offsets[buf_idx], 2336 LLVMConstInt(ctx->i32, 0, 0), 2337 stream_out->dst_offset * 4); 2338 } 2339 2340 /** 2341 * Write streamout data to buffers for vertex stream @p stream (different 2342 * vertex streams can occur for GS copy shaders). 2343 */ 2344 static void si_llvm_emit_streamout(struct si_shader_context *ctx, 2345 struct si_shader_output_values *outputs, 2346 unsigned noutput, unsigned stream) 2347 { 2348 struct si_shader_selector *sel = ctx->shader->selector; 2349 struct pipe_stream_output_info *so = &sel->so; 2350 struct gallivm_state *gallivm = &ctx->gallivm; 2351 LLVMBuilderRef builder = gallivm->builder; 2352 int i; 2353 struct lp_build_if_state if_ctx; 2354 2355 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ 2356 LLVMValueRef so_vtx_count = 2357 unpack_param(ctx, ctx->param_streamout_config, 16, 7); 2358 2359 LLVMValueRef tid = get_thread_id(ctx); 2360 2361 /* can_emit = tid < so_vtx_count; */ 2362 LLVMValueRef can_emit = 2363 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); 2364 2365 /* Emit the streamout code conditionally. This actually avoids 2366 * out-of-bounds buffer access. The hw tells us via the SGPR 2367 * (so_vtx_count) which threads are allowed to emit streamout data. 
*/ 2368 lp_build_if(&if_ctx, gallivm, can_emit); 2369 { 2370 /* The buffer offset is computed as follows: 2371 * ByteOffset = streamout_offset[buffer_id]*4 + 2372 * (streamout_write_index + thread_id)*stride[buffer_id] + 2373 * attrib_offset 2374 */ 2375 2376 LLVMValueRef so_write_index = 2377 LLVMGetParam(ctx->main_fn, 2378 ctx->param_streamout_write_index); 2379 2380 /* Compute (streamout_write_index + thread_id). */ 2381 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); 2382 2383 /* Load the descriptor and compute the write offset for each 2384 * enabled buffer. */ 2385 LLVMValueRef so_write_offset[4] = {}; 2386 LLVMValueRef so_buffers[4]; 2387 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 2388 SI_PARAM_RW_BUFFERS); 2389 2390 for (i = 0; i < 4; i++) { 2391 if (!so->stride[i]) 2392 continue; 2393 2394 LLVMValueRef offset = lp_build_const_int32(gallivm, 2395 SI_VS_STREAMOUT_BUF0 + i); 2396 2397 so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset); 2398 2399 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn, 2400 ctx->param_streamout_offset[i]); 2401 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), ""); 2402 2403 so_write_offset[i] = LLVMBuildMul(builder, so_write_index, 2404 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), ""); 2405 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, ""); 2406 } 2407 2408 /* Write streamout data. 
*/ 2409 for (i = 0; i < so->num_outputs; i++) { 2410 unsigned reg = so->output[i].register_index; 2411 2412 if (reg >= noutput) 2413 continue; 2414 2415 if (stream != so->output[i].stream) 2416 continue; 2417 2418 emit_streamout_output(ctx, so_buffers, so_write_offset, 2419 &so->output[i], &outputs[reg]); 2420 } 2421 } 2422 lp_build_endif(&if_ctx); 2423 } 2424 2425 2426 /* Generate export instructions for hardware VS shader stage */ 2427 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, 2428 struct si_shader_output_values *outputs, 2429 unsigned noutput) 2430 { 2431 struct si_shader_context *ctx = si_shader_context(bld_base); 2432 struct si_shader *shader = ctx->shader; 2433 struct lp_build_context *base = &bld_base->base; 2434 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 2435 LLVMValueRef args[9]; 2436 LLVMValueRef pos_args[4][9] = { { 0 } }; 2437 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; 2438 unsigned semantic_name, semantic_index; 2439 unsigned target; 2440 unsigned param_count = 0; 2441 unsigned pos_idx; 2442 int i; 2443 2444 for (i = 0; i < noutput; i++) { 2445 semantic_name = outputs[i].semantic_name; 2446 semantic_index = outputs[i].semantic_index; 2447 bool export_param = true; 2448 2449 switch (semantic_name) { 2450 case TGSI_SEMANTIC_POSITION: /* ignore these */ 2451 case TGSI_SEMANTIC_PSIZE: 2452 case TGSI_SEMANTIC_CLIPVERTEX: 2453 case TGSI_SEMANTIC_EDGEFLAG: 2454 break; 2455 case TGSI_SEMANTIC_GENERIC: 2456 case TGSI_SEMANTIC_CLIPDIST: 2457 if (shader->key.opt.hw_vs.kill_outputs & 2458 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index))) 2459 export_param = false; 2460 break; 2461 default: 2462 if (shader->key.opt.hw_vs.kill_outputs2 & 2463 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index))) 2464 export_param = false; 2465 break; 2466 } 2467 2468 if (outputs[i].vertex_stream[0] != 0 && 2469 outputs[i].vertex_stream[1] != 
0 && 2470 outputs[i].vertex_stream[2] != 0 && 2471 outputs[i].vertex_stream[3] != 0) 2472 export_param = false; 2473 2474 handle_semantic: 2475 /* Select the correct target */ 2476 switch(semantic_name) { 2477 case TGSI_SEMANTIC_PSIZE: 2478 psize_value = outputs[i].values[0]; 2479 continue; 2480 case TGSI_SEMANTIC_EDGEFLAG: 2481 edgeflag_value = outputs[i].values[0]; 2482 continue; 2483 case TGSI_SEMANTIC_LAYER: 2484 layer_value = outputs[i].values[0]; 2485 semantic_name = TGSI_SEMANTIC_GENERIC; 2486 goto handle_semantic; 2487 case TGSI_SEMANTIC_VIEWPORT_INDEX: 2488 viewport_index_value = outputs[i].values[0]; 2489 semantic_name = TGSI_SEMANTIC_GENERIC; 2490 goto handle_semantic; 2491 case TGSI_SEMANTIC_POSITION: 2492 target = V_008DFC_SQ_EXP_POS; 2493 break; 2494 case TGSI_SEMANTIC_CLIPDIST: 2495 if (shader->key.opt.hw_vs.clip_disable) { 2496 semantic_name = TGSI_SEMANTIC_GENERIC; 2497 goto handle_semantic; 2498 } 2499 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index; 2500 break; 2501 case TGSI_SEMANTIC_CLIPVERTEX: 2502 if (shader->key.opt.hw_vs.clip_disable) 2503 continue; 2504 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values); 2505 continue; 2506 case TGSI_SEMANTIC_COLOR: 2507 case TGSI_SEMANTIC_BCOLOR: 2508 case TGSI_SEMANTIC_PRIMID: 2509 case TGSI_SEMANTIC_FOG: 2510 case TGSI_SEMANTIC_TEXCOORD: 2511 case TGSI_SEMANTIC_GENERIC: 2512 if (!export_param) 2513 continue; 2514 target = V_008DFC_SQ_EXP_PARAM + param_count; 2515 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); 2516 shader->info.vs_output_param_offset[i] = param_count; 2517 param_count++; 2518 break; 2519 default: 2520 target = 0; 2521 fprintf(stderr, 2522 "Warning: SI unhandled vs output type:%d\n", 2523 semantic_name); 2524 } 2525 2526 si_llvm_init_export_args(bld_base, outputs[i].values, target, args); 2527 2528 if (target >= V_008DFC_SQ_EXP_POS && 2529 target <= (V_008DFC_SQ_EXP_POS + 3)) { 2530 memcpy(pos_args[target - V_008DFC_SQ_EXP_POS], 2531 args, sizeof(args)); 2532 } 
else { 2533 lp_build_intrinsic(base->gallivm->builder, 2534 "llvm.SI.export", ctx->voidt, 2535 args, 9, 0); 2536 } 2537 2538 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) { 2539 semantic_name = TGSI_SEMANTIC_GENERIC; 2540 goto handle_semantic; 2541 } 2542 } 2543 2544 shader->info.nr_param_exports = param_count; 2545 2546 /* We need to add the position output manually if it's missing. */ 2547 if (!pos_args[0][0]) { 2548 pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */ 2549 pos_args[0][1] = uint->zero; /* EXEC mask */ 2550 pos_args[0][2] = uint->zero; /* last export? */ 2551 pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS); 2552 pos_args[0][4] = uint->zero; /* COMPR flag */ 2553 pos_args[0][5] = base->zero; /* X */ 2554 pos_args[0][6] = base->zero; /* Y */ 2555 pos_args[0][7] = base->zero; /* Z */ 2556 pos_args[0][8] = base->one; /* W */ 2557 } 2558 2559 /* Write the misc vector (point size, edgeflag, layer, viewport). */ 2560 if (shader->selector->info.writes_psize || 2561 shader->selector->info.writes_edgeflag || 2562 shader->selector->info.writes_viewport_index || 2563 shader->selector->info.writes_layer) { 2564 pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */ 2565 shader->selector->info.writes_psize | 2566 (shader->selector->info.writes_edgeflag << 1) | 2567 (shader->selector->info.writes_layer << 2) | 2568 (shader->selector->info.writes_viewport_index << 3)); 2569 pos_args[1][1] = uint->zero; /* EXEC mask */ 2570 pos_args[1][2] = uint->zero; /* last export? 
*/ 2571 pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1); 2572 pos_args[1][4] = uint->zero; /* COMPR flag */ 2573 pos_args[1][5] = base->zero; /* X */ 2574 pos_args[1][6] = base->zero; /* Y */ 2575 pos_args[1][7] = base->zero; /* Z */ 2576 pos_args[1][8] = base->zero; /* W */ 2577 2578 if (shader->selector->info.writes_psize) 2579 pos_args[1][5] = psize_value; 2580 2581 if (shader->selector->info.writes_edgeflag) { 2582 /* The output is a float, but the hw expects an integer 2583 * with the first bit containing the edge flag. */ 2584 edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder, 2585 edgeflag_value, 2586 ctx->i32, ""); 2587 edgeflag_value = lp_build_min(&bld_base->int_bld, 2588 edgeflag_value, 2589 bld_base->int_bld.one); 2590 2591 /* The LLVM intrinsic expects a float. */ 2592 pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder, 2593 edgeflag_value, 2594 ctx->f32, ""); 2595 } 2596 2597 if (shader->selector->info.writes_layer) 2598 pos_args[1][7] = layer_value; 2599 2600 if (shader->selector->info.writes_viewport_index) 2601 pos_args[1][8] = viewport_index_value; 2602 } 2603 2604 for (i = 0; i < 4; i++) 2605 if (pos_args[i][0]) 2606 shader->info.nr_pos_exports++; 2607 2608 pos_idx = 0; 2609 for (i = 0; i < 4; i++) { 2610 if (!pos_args[i][0]) 2611 continue; 2612 2613 /* Specify the target we are exporting */ 2614 pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++); 2615 2616 if (pos_idx == shader->info.nr_pos_exports) 2617 /* Specify that this is the last export */ 2618 pos_args[i][2] = uint->one; 2619 2620 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", 2621 ctx->voidt, pos_args[i], 9, 0); 2622 } 2623 } 2624 2625 /** 2626 * Forward all outputs from the vertex shader to the TES. This is only used 2627 * for the fixed function TCS. 
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Bits [12:8] of SI_PARAM_REL_IDS. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex:
	 * current patch offset + invocation_id * vertex stride. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* One bit per input to copy; each input occupies 4 dwords. */
	inputs = ctx->shader->key.mono.tcs.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                            lp_build_const_int32(gallivm, 4 * i),
		                             "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					      invocation_id,
		                              lp_build_const_int32(gallivm, i));

		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
		                           buffer_offset, 0);
	}
}

/**
 * Read the tessellation levels of the current patch back from LDS and store
 * them to the SI_HS_RING_TESS_FACTOR buffer.
 *
 * @param rel_patch_id   Patch index used to compute the byte offset of this
 *                       patch's factors in the buffer.
 * @param invocation_id  Only invocation 0 performs the loads and stores.
 * @param tcs_out_current_patch_data_offset
 *                       LDS base address from which the tess levels are
 *                       loaded.
 *
 * A barrier is emitted first. In addition, the invocation with
 * rel_patch_id == 0 stores the control word 0x80000000 at offset 0; the
 * factors themselves start at byte offset 4.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
		out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
	} else {
		/* Outer levels first, inner levels directly after them. */
		for (i = 0; i < outer_comps; i++)
			out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		for (i = 0; i < inner_comps; i++)
			out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Only the quad layout (stride 6) needs a second vector. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	/* stride is in dwords, hence the factor of 4. */
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. The immediate offset 4 skips the
	 * control word; the second store follows the first vec4 (4 + 16). */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}

/**
 * End of the TCS main function.
 *
 * Calls si_copy_tcs_inputs(), then packs the values handed on through the
 * function's return value: the RW_BUFFERS pointer (as two dwords in slots
 * 0 and 1), the tess factor buffer offset, and three VGPR values
 * (rel_patch_id, invocation_id and the LDS offset of the current patch
 * data). The tess factors are not stored here.
 * NOTE(review): presumably the consumer of these values is the TCS epilog
 * part - confirm against the epilog builder.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	/* Return epilog parameters from this function. */
	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
	LLVMValueRef ret = ctx->return_value;
	LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
	unsigned vgpr;

	/* RW_BUFFERS pointer */
	rw_buffers = LLVMGetParam(ctx->main_fn,
				  SI_PARAM_RW_BUFFERS);
	/* Split the 64-bit pointer into two 32-bit halves. */
	rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
	rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
	rw0 = LLVMBuildExtractElement(builder, rw_buffers,
				      bld_base->uint_bld.zero, "");
	rw1 = LLVMBuildExtractElement(builder, rw_buffers,
				      bld_base->uint_bld.one, "");
	ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
	ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

	/* Tess factor buffer soffset is after user SGPRs. */
	tf_soffset = LLVMGetParam(ctx->main_fn,
				  SI_PARAM_TESS_FACTOR_OFFSET);
	ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
				   SI_TCS_NUM_USER_SGPR + 1, "");

	/* VGPRs */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	vgpr = SI_TCS_NUM_USER_SGPR + 2;
	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}

/**
 * End of the LS main function: store all four channels of every shader
 * output to LDS.
 *
 * The dword address of an output is
 *   vertex_id * vertex_dw_stride + unique_index(semantic) * 4,
 * where vertex_id is the ctx->param_rel_auto_id parameter and the stride is
 * unpacked from SI_PARAM_LS_OUT_LAYOUT.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS.  The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					lp_build_const_int32(gallivm, param * 4), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}
}

/**
 * End of the ES main function: store every shader output, one dword per
 * channel, to ctx->esgs_ring at byte offset (4 * param_index + chan) * 4,
 * using the ctx->param_es2gs_offset parameter as soffset.
 *
 * Viewport index and layer outputs are skipped.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param_index;

		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* The ring store takes an i32. */
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}

static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx =
si_shader_context(bld_base); 2905 struct gallivm_state *gallivm = bld_base->base.gallivm; 2906 LLVMValueRef args[2]; 2907 2908 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE); 2909 args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); 2910 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", 2911 ctx->voidt, args, 2, 0); 2912 } 2913 2914 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) 2915 { 2916 struct si_shader_context *ctx = si_shader_context(bld_base); 2917 struct gallivm_state *gallivm = bld_base->base.gallivm; 2918 struct tgsi_shader_info *info = &ctx->shader->selector->info; 2919 struct si_shader_output_values *outputs = NULL; 2920 int i,j; 2921 2922 assert(!ctx->shader->is_gs_copy_shader); 2923 2924 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); 2925 2926 /* Vertex color clamping. 2927 * 2928 * This uses a state constant loaded in a user data SGPR and 2929 * an IF statement is added that clamps all colors if the constant 2930 * is true. 2931 */ 2932 if (ctx->type == PIPE_SHADER_VERTEX) { 2933 struct lp_build_if_state if_ctx; 2934 LLVMValueRef cond = NULL; 2935 LLVMValueRef addr, val; 2936 2937 for (i = 0; i < info->num_outputs; i++) { 2938 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR && 2939 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR) 2940 continue; 2941 2942 /* We've found a color. */ 2943 if (!cond) { 2944 /* The state is in the first bit of the user SGPR. 
*/ 2945 cond = LLVMGetParam(ctx->main_fn, 2946 SI_PARAM_VS_STATE_BITS); 2947 cond = LLVMBuildTrunc(gallivm->builder, cond, 2948 ctx->i1, ""); 2949 lp_build_if(&if_ctx, gallivm, cond); 2950 } 2951 2952 for (j = 0; j < 4; j++) { 2953 addr = ctx->outputs[i][j]; 2954 val = LLVMBuildLoad(gallivm->builder, addr, ""); 2955 val = si_llvm_saturate(bld_base, val); 2956 LLVMBuildStore(gallivm->builder, val, addr); 2957 } 2958 } 2959 2960 if (cond) 2961 lp_build_endif(&if_ctx); 2962 } 2963 2964 for (i = 0; i < info->num_outputs; i++) { 2965 outputs[i].semantic_name = info->output_semantic_name[i]; 2966 outputs[i].semantic_index = info->output_semantic_index[i]; 2967 2968 for (j = 0; j < 4; j++) { 2969 outputs[i].values[j] = 2970 LLVMBuildLoad(gallivm->builder, 2971 ctx->outputs[i][j], 2972 ""); 2973 outputs[i].vertex_stream[j] = 2974 (info->output_streams[i] >> (2 * j)) & 3; 2975 } 2976 2977 } 2978 2979 /* Return the primitive ID from the LLVM function. */ 2980 ctx->return_value = 2981 LLVMBuildInsertValue(gallivm->builder, 2982 ctx->return_value, 2983 bitcast(bld_base, TGSI_TYPE_FLOAT, 2984 get_primitive_id(bld_base, 0)), 2985 VS_EPILOG_PRIMID_LOC, ""); 2986 2987 if (ctx->shader->selector->so.num_outputs) 2988 si_llvm_emit_streamout(ctx, outputs, i, 0); 2989 si_llvm_export_vs(bld_base, outputs, i); 2990 FREE(outputs); 2991 } 2992 2993 struct si_ps_exports { 2994 unsigned num; 2995 LLVMValueRef args[10][9]; 2996 }; 2997 2998 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil, 2999 bool writes_samplemask) 3000 { 3001 if (writes_z) { 3002 /* Z needs 32 bits. */ 3003 if (writes_samplemask) 3004 return V_028710_SPI_SHADER_32_ABGR; 3005 else if (writes_stencil) 3006 return V_028710_SPI_SHADER_32_GR; 3007 else 3008 return V_028710_SPI_SHADER_32_R; 3009 } else if (writes_stencil || writes_samplemask) { 3010 /* Both stencil and sample mask need only 16 bits. 
*/ 3011 return V_028710_SPI_SHADER_UINT16_ABGR; 3012 } else { 3013 return V_028710_SPI_SHADER_ZERO; 3014 } 3015 } 3016 3017 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base, 3018 LLVMValueRef depth, LLVMValueRef stencil, 3019 LLVMValueRef samplemask, struct si_ps_exports *exp) 3020 { 3021 struct si_shader_context *ctx = si_shader_context(bld_base); 3022 struct lp_build_context *base = &bld_base->base; 3023 struct lp_build_context *uint = &bld_base->uint_bld; 3024 LLVMValueRef args[9]; 3025 unsigned mask = 0; 3026 unsigned format = si_get_spi_shader_z_format(depth != NULL, 3027 stencil != NULL, 3028 samplemask != NULL); 3029 3030 assert(depth || stencil || samplemask); 3031 3032 args[1] = uint->one; /* whether the EXEC mask is valid */ 3033 args[2] = uint->one; /* DONE bit */ 3034 3035 /* Specify the target we are exporting */ 3036 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ); 3037 3038 args[4] = uint->zero; /* COMP flag */ 3039 args[5] = base->undef; /* R, depth */ 3040 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */ 3041 args[7] = base->undef; /* B, sample mask */ 3042 args[8] = base->undef; /* A, alpha to mask */ 3043 3044 if (format == V_028710_SPI_SHADER_UINT16_ABGR) { 3045 assert(!depth); 3046 args[4] = uint->one; /* COMPR flag */ 3047 3048 if (stencil) { 3049 /* Stencil should be in X[23:16]. */ 3050 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil); 3051 stencil = LLVMBuildShl(base->gallivm->builder, stencil, 3052 LLVMConstInt(ctx->i32, 16, 0), ""); 3053 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil); 3054 mask |= 0x3; 3055 } 3056 if (samplemask) { 3057 /* SampleMask should be in Y[15:0]. 
*/ 3058 args[6] = samplemask; 3059 mask |= 0xc; 3060 } 3061 } else { 3062 if (depth) { 3063 args[5] = depth; 3064 mask |= 0x1; 3065 } 3066 if (stencil) { 3067 args[6] = stencil; 3068 mask |= 0x2; 3069 } 3070 if (samplemask) { 3071 args[7] = samplemask; 3072 mask |= 0x4; 3073 } 3074 } 3075 3076 /* SI (except OLAND and HAINAN) has a bug that it only looks 3077 * at the X writemask component. */ 3078 if (ctx->screen->b.chip_class == SI && 3079 ctx->screen->b.family != CHIP_OLAND && 3080 ctx->screen->b.family != CHIP_HAINAN) 3081 mask |= 0x1; 3082 3083 /* Specify which components to enable */ 3084 args[0] = lp_build_const_int32(base->gallivm, mask); 3085 3086 memcpy(exp->args[exp->num++], args, sizeof(args)); 3087 } 3088 3089 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base, 3090 LLVMValueRef *color, unsigned index, 3091 unsigned samplemask_param, 3092 bool is_last, struct si_ps_exports *exp) 3093 { 3094 struct si_shader_context *ctx = si_shader_context(bld_base); 3095 struct lp_build_context *base = &bld_base->base; 3096 int i; 3097 3098 /* Clamp color */ 3099 if (ctx->shader->key.part.ps.epilog.clamp_color) 3100 for (i = 0; i < 4; i++) 3101 color[i] = si_llvm_saturate(bld_base, color[i]); 3102 3103 /* Alpha to one */ 3104 if (ctx->shader->key.part.ps.epilog.alpha_to_one) 3105 color[3] = base->one; 3106 3107 /* Alpha test */ 3108 if (index == 0 && 3109 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) 3110 si_alpha_test(bld_base, color[3]); 3111 3112 /* Line & polygon smoothing */ 3113 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) 3114 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3], 3115 samplemask_param); 3116 3117 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 3118 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { 3119 LLVMValueRef args[8][9]; 3120 int c, last = -1; 3121 3122 /* Get the export arguments, also find out what the last one is. 
*/ 3123 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3124 si_llvm_init_export_args(bld_base, color, 3125 V_008DFC_SQ_EXP_MRT + c, args[c]); 3126 if (args[c][0] != bld_base->uint_bld.zero) 3127 last = c; 3128 } 3129 3130 /* Emit all exports. */ 3131 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { 3132 if (is_last && last == c) { 3133 args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ 3134 args[c][2] = bld_base->uint_bld.one; /* DONE bit */ 3135 } else if (args[c][0] == bld_base->uint_bld.zero) 3136 continue; /* unnecessary NULL export */ 3137 3138 memcpy(exp->args[exp->num++], args[c], sizeof(args[c])); 3139 } 3140 } else { 3141 LLVMValueRef args[9]; 3142 3143 /* Export */ 3144 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index, 3145 args); 3146 if (is_last) { 3147 args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */ 3148 args[2] = bld_base->uint_bld.one; /* DONE bit */ 3149 } else if (args[0] == bld_base->uint_bld.zero) 3150 return; /* unnecessary NULL export */ 3151 3152 memcpy(exp->args[exp->num++], args, sizeof(args)); 3153 } 3154 } 3155 3156 static void si_emit_ps_exports(struct si_shader_context *ctx, 3157 struct si_ps_exports *exp) 3158 { 3159 for (unsigned i = 0; i < exp->num; i++) 3160 lp_build_intrinsic(ctx->gallivm.builder, 3161 "llvm.SI.export", ctx->voidt, 3162 exp->args[i], 9, 0); 3163 } 3164 3165 static void si_export_null(struct lp_build_tgsi_context *bld_base) 3166 { 3167 struct si_shader_context *ctx = si_shader_context(bld_base); 3168 struct lp_build_context *base = &bld_base->base; 3169 struct lp_build_context *uint = &bld_base->uint_bld; 3170 LLVMValueRef args[9]; 3171 3172 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */ 3173 args[1] = uint->one; /* whether the EXEC mask is valid */ 3174 args[2] = uint->one; /* DONE bit */ 3175 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL); 3176 args[4] = 
uint->zero; /* COMPR flag (0 = 32-bit export) */ 3177 args[5] = base->undef; /* R */ 3178 args[6] = base->undef; /* G */ 3179 args[7] = base->undef; /* B */ 3180 args[8] = base->undef; /* A */ 3181 3182 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", 3183 ctx->voidt, args, 9, 0); 3184 } 3185 3186 /** 3187 * Return PS outputs in this order: 3188 * 3189 * v[0:3] = color0.xyzw 3190 * v[4:7] = color1.xyzw 3191 * ... 3192 * vN+0 = Depth 3193 * vN+1 = Stencil 3194 * vN+2 = SampleMask 3195 * vN+3 = SampleMaskIn (used for OpenGL smoothing) 3196 * 3197 * The alpha-ref SGPR is returned via its original location. 3198 */ 3199 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) 3200 { 3201 struct si_shader_context *ctx = si_shader_context(bld_base); 3202 struct si_shader *shader = ctx->shader; 3203 struct lp_build_context *base = &bld_base->base; 3204 struct tgsi_shader_info *info = &shader->selector->info; 3205 LLVMBuilderRef builder = base->gallivm->builder; 3206 unsigned i, j, first_vgpr, vgpr; 3207 3208 LLVMValueRef color[8][4] = {}; 3209 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 3210 LLVMValueRef ret; 3211 3212 /* Read the output values. 
*/ 3213 for (i = 0; i < info->num_outputs; i++) { 3214 unsigned semantic_name = info->output_semantic_name[i]; 3215 unsigned semantic_index = info->output_semantic_index[i]; 3216 3217 switch (semantic_name) { 3218 case TGSI_SEMANTIC_COLOR: 3219 assert(semantic_index < 8); 3220 for (j = 0; j < 4; j++) { 3221 LLVMValueRef ptr = ctx->outputs[i][j]; 3222 LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); 3223 color[semantic_index][j] = result; 3224 } 3225 break; 3226 case TGSI_SEMANTIC_POSITION: 3227 depth = LLVMBuildLoad(builder, 3228 ctx->outputs[i][2], ""); 3229 break; 3230 case TGSI_SEMANTIC_STENCIL: 3231 stencil = LLVMBuildLoad(builder, 3232 ctx->outputs[i][1], ""); 3233 break; 3234 case TGSI_SEMANTIC_SAMPLEMASK: 3235 samplemask = LLVMBuildLoad(builder, 3236 ctx->outputs[i][0], ""); 3237 break; 3238 default: 3239 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n", 3240 semantic_name); 3241 } 3242 } 3243 3244 /* Fill the return structure. */ 3245 ret = ctx->return_value; 3246 3247 /* Set SGPRs. */ 3248 ret = LLVMBuildInsertValue(builder, ret, 3249 bitcast(bld_base, TGSI_TYPE_SIGNED, 3250 LLVMGetParam(ctx->main_fn, 3251 SI_PARAM_ALPHA_REF)), 3252 SI_SGPR_ALPHA_REF, ""); 3253 3254 /* Set VGPRs */ 3255 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; 3256 for (i = 0; i < ARRAY_SIZE(color); i++) { 3257 if (!color[i][0]) 3258 continue; 3259 3260 for (j = 0; j < 4; j++) 3261 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); 3262 } 3263 if (depth) 3264 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); 3265 if (stencil) 3266 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); 3267 if (samplemask) 3268 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); 3269 3270 /* Add the input sample mask for smoothing at the end. 
*/ 3271 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) 3272 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; 3273 ret = LLVMBuildInsertValue(builder, ret, 3274 LLVMGetParam(ctx->main_fn, 3275 SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); 3276 3277 ctx->return_value = ret; 3278 } 3279 3280 /** 3281 * Given a v8i32 resource descriptor for a buffer, extract the size of the 3282 * buffer in number of elements and return it as an i32. 3283 */ 3284 static LLVMValueRef get_buffer_size( 3285 struct lp_build_tgsi_context *bld_base, 3286 LLVMValueRef descriptor) 3287 { 3288 struct si_shader_context *ctx = si_shader_context(bld_base); 3289 struct gallivm_state *gallivm = bld_base->base.gallivm; 3290 LLVMBuilderRef builder = gallivm->builder; 3291 LLVMValueRef size = 3292 LLVMBuildExtractElement(builder, descriptor, 3293 lp_build_const_int32(gallivm, 2), ""); 3294 3295 if (ctx->screen->b.chip_class >= VI) { 3296 /* On VI, the descriptor contains the size in bytes, 3297 * but TXQ must return the size in elements. 3298 * The stride is always non-zero for resources using TXQ. 3299 */ 3300 LLVMValueRef stride = 3301 LLVMBuildExtractElement(builder, descriptor, 3302 lp_build_const_int32(gallivm, 1), ""); 3303 stride = LLVMBuildLShr(builder, stride, 3304 lp_build_const_int32(gallivm, 16), ""); 3305 stride = LLVMBuildAnd(builder, stride, 3306 lp_build_const_int32(gallivm, 0x3FFF), ""); 3307 3308 size = LLVMBuildUDiv(builder, size, stride, ""); 3309 } 3310 3311 return size; 3312 } 3313 3314 /** 3315 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with 3316 * intrinsic names). 
3317 */ 3318 static void build_type_name_for_intr( 3319 LLVMTypeRef type, 3320 char *buf, unsigned bufsize) 3321 { 3322 LLVMTypeRef elem_type = type; 3323 3324 assert(bufsize >= 8); 3325 3326 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { 3327 int ret = snprintf(buf, bufsize, "v%u", 3328 LLVMGetVectorSize(type)); 3329 if (ret < 0) { 3330 char *type_name = LLVMPrintTypeToString(type); 3331 fprintf(stderr, "Error building type name for: %s\n", 3332 type_name); 3333 return; 3334 } 3335 elem_type = LLVMGetElementType(type); 3336 buf += ret; 3337 bufsize -= ret; 3338 } 3339 switch (LLVMGetTypeKind(elem_type)) { 3340 default: break; 3341 case LLVMIntegerTypeKind: 3342 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type)); 3343 break; 3344 case LLVMFloatTypeKind: 3345 snprintf(buf, bufsize, "f32"); 3346 break; 3347 case LLVMDoubleTypeKind: 3348 snprintf(buf, bufsize, "f64"); 3349 break; 3350 } 3351 } 3352 3353 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, 3354 struct lp_build_tgsi_context *bld_base, 3355 struct lp_build_emit_data *emit_data); 3356 3357 /* Prevent optimizations (at least of memory accesses) across the current 3358 * point in the program by emitting empty inline assembly that is marked as 3359 * having side effects. 3360 */ 3361 #if 0 /* unused currently */ 3362 static void emit_optimization_barrier(struct si_shader_context *ctx) 3363 { 3364 LLVMBuilderRef builder = ctx->gallivm.builder; 3365 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); 3366 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false); 3367 LLVMBuildCall(builder, inlineasm, NULL, 0, ""); 3368 } 3369 #endif 3370 3371 /* Combine these with & instead of |. 
*/ 3372 #define NOOP_WAITCNT 0xf7f 3373 #define LGKM_CNT 0x07f 3374 #define VM_CNT 0xf70 3375 3376 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16) 3377 { 3378 struct gallivm_state *gallivm = &ctx->gallivm; 3379 LLVMBuilderRef builder = gallivm->builder; 3380 LLVMValueRef args[1] = { 3381 lp_build_const_int32(gallivm, simm16) 3382 }; 3383 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt", 3384 ctx->voidt, args, 1, 0); 3385 } 3386 3387 static void membar_emit( 3388 const struct lp_build_tgsi_action *action, 3389 struct lp_build_tgsi_context *bld_base, 3390 struct lp_build_emit_data *emit_data) 3391 { 3392 struct si_shader_context *ctx = si_shader_context(bld_base); 3393 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); 3394 unsigned flags = LLVMConstIntGetZExtValue(src0); 3395 unsigned waitcnt = NOOP_WAITCNT; 3396 3397 if (flags & TGSI_MEMBAR_THREAD_GROUP) 3398 waitcnt &= VM_CNT & LGKM_CNT; 3399 3400 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | 3401 TGSI_MEMBAR_SHADER_BUFFER | 3402 TGSI_MEMBAR_SHADER_IMAGE)) 3403 waitcnt &= VM_CNT; 3404 3405 if (flags & TGSI_MEMBAR_SHARED) 3406 waitcnt &= LGKM_CNT; 3407 3408 if (waitcnt != NOOP_WAITCNT) 3409 emit_waitcnt(ctx, waitcnt); 3410 } 3411 3412 static LLVMValueRef 3413 shader_buffer_fetch_rsrc(struct si_shader_context *ctx, 3414 const struct tgsi_full_src_register *reg) 3415 { 3416 LLVMValueRef index; 3417 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, 3418 SI_PARAM_SHADER_BUFFERS); 3419 3420 if (!reg->Register.Indirect) 3421 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0); 3422 else 3423 index = get_bounded_indirect_index(ctx, ®->Indirect, 3424 reg->Register.Index, 3425 SI_NUM_SHADER_BUFFERS); 3426 3427 return build_indexed_load_const(ctx, rsrc_ptr, index); 3428 } 3429 3430 static bool tgsi_is_array_sampler(unsigned target) 3431 { 3432 return target == TGSI_TEXTURE_1D_ARRAY || 3433 target == TGSI_TEXTURE_SHADOW1D_ARRAY || 3434 target == TGSI_TEXTURE_2D_ARRAY || 3435 
target == TGSI_TEXTURE_SHADOW2D_ARRAY || 3436 target == TGSI_TEXTURE_CUBE_ARRAY || 3437 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY || 3438 target == TGSI_TEXTURE_2D_ARRAY_MSAA; 3439 } 3440 3441 static bool tgsi_is_array_image(unsigned target) 3442 { 3443 return target == TGSI_TEXTURE_3D || 3444 target == TGSI_TEXTURE_CUBE || 3445 target == TGSI_TEXTURE_1D_ARRAY || 3446 target == TGSI_TEXTURE_2D_ARRAY || 3447 target == TGSI_TEXTURE_CUBE_ARRAY || 3448 target == TGSI_TEXTURE_2D_ARRAY_MSAA; 3449 } 3450 3451 /** 3452 * Given a 256-bit resource descriptor, force the DCC enable bit to off. 3453 * 3454 * At least on Tonga, executing image stores on images with DCC enabled and 3455 * non-trivial can eventually lead to lockups. This can occur when an 3456 * application binds an image as read-only but then uses a shader that writes 3457 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including 3458 * program termination) in this case, but it doesn't cost much to be a bit 3459 * nicer: disabling DCC in the shader still leads to undefined results but 3460 * avoids the lockup. 3461 */ 3462 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, 3463 LLVMValueRef rsrc) 3464 { 3465 if (ctx->screen->b.chip_class <= CIK) { 3466 return rsrc; 3467 } else { 3468 LLVMBuilderRef builder = ctx->gallivm.builder; 3469 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0); 3470 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0); 3471 LLVMValueRef tmp; 3472 3473 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, ""); 3474 tmp = LLVMBuildAnd(builder, tmp, i32_C, ""); 3475 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, ""); 3476 } 3477 } 3478 3479 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements) 3480 { 3481 return LLVMPointerType(LLVMArrayType(elem_type, num_elements), 3482 CONST_ADDR_SPACE); 3483 } 3484 3485 /** 3486 * Load the resource descriptor for \p image. 
3487 */ 3488 static void 3489 image_fetch_rsrc( 3490 struct lp_build_tgsi_context *bld_base, 3491 const struct tgsi_full_src_register *image, 3492 bool is_store, unsigned target, 3493 LLVMValueRef *rsrc) 3494 { 3495 struct si_shader_context *ctx = si_shader_context(bld_base); 3496 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn, 3497 SI_PARAM_IMAGES); 3498 LLVMValueRef index, tmp; 3499 bool dcc_off = target != TGSI_TEXTURE_BUFFER && is_store; 3500 3501 assert(image->Register.File == TGSI_FILE_IMAGE); 3502 3503 if (!image->Register.Indirect) { 3504 const struct tgsi_shader_info *info = bld_base->info; 3505 3506 index = LLVMConstInt(ctx->i32, image->Register.Index, 0); 3507 3508 if (info->images_writemask & (1 << image->Register.Index) && 3509 target != TGSI_TEXTURE_BUFFER) 3510 dcc_off = true; 3511 } else { 3512 /* From the GL_ARB_shader_image_load_store extension spec: 3513 * 3514 * If a shader performs an image load, store, or atomic 3515 * operation using an image variable declared as an array, 3516 * and if the index used to select an individual element is 3517 * negative or greater than or equal to the size of the 3518 * array, the results of the operation are undefined but may 3519 * not lead to termination. 
3520 */ 3521 index = get_bounded_indirect_index(ctx, &image->Indirect, 3522 image->Register.Index, 3523 SI_NUM_IMAGES); 3524 } 3525 3526 if (target == TGSI_TEXTURE_BUFFER) { 3527 LLVMBuilderRef builder = ctx->gallivm.builder; 3528 3529 rsrc_ptr = LLVMBuildPointerCast(builder, rsrc_ptr, 3530 const_array(ctx->v4i32, 0), ""); 3531 index = LLVMBuildMul(builder, index, 3532 LLVMConstInt(ctx->i32, 2, 0), ""); 3533 index = LLVMBuildAdd(builder, index, 3534 LLVMConstInt(ctx->i32, 1, 0), ""); 3535 *rsrc = build_indexed_load_const(ctx, rsrc_ptr, index); 3536 return; 3537 } 3538 3539 tmp = build_indexed_load_const(ctx, rsrc_ptr, index); 3540 if (dcc_off) 3541 tmp = force_dcc_off(ctx, tmp); 3542 *rsrc = tmp; 3543 } 3544 3545 static LLVMValueRef image_fetch_coords( 3546 struct lp_build_tgsi_context *bld_base, 3547 const struct tgsi_full_instruction *inst, 3548 unsigned src) 3549 { 3550 struct gallivm_state *gallivm = bld_base->base.gallivm; 3551 LLVMBuilderRef builder = gallivm->builder; 3552 unsigned target = inst->Memory.Texture; 3553 unsigned num_coords = tgsi_util_get_texture_coord_dim(target); 3554 LLVMValueRef coords[4]; 3555 LLVMValueRef tmp; 3556 int chan; 3557 3558 for (chan = 0; chan < num_coords; ++chan) { 3559 tmp = lp_build_emit_fetch(bld_base, inst, src, chan); 3560 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); 3561 coords[chan] = tmp; 3562 } 3563 3564 if (num_coords == 1) 3565 return coords[0]; 3566 3567 if (num_coords == 3) { 3568 /* LLVM has difficulties lowering 3-element vectors. */ 3569 coords[3] = bld_base->uint_bld.undef; 3570 num_coords = 4; 3571 } 3572 3573 return lp_build_gather_values(gallivm, coords, num_coords); 3574 } 3575 3576 /** 3577 * Append the extra mode bits that are used by image load and store. 
3578 */ 3579 static void image_append_args( 3580 struct si_shader_context *ctx, 3581 struct lp_build_emit_data * emit_data, 3582 unsigned target, 3583 bool atomic, 3584 bool force_glc) 3585 { 3586 const struct tgsi_full_instruction *inst = emit_data->inst; 3587 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); 3588 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); 3589 LLVMValueRef r128 = i1false; 3590 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false; 3591 LLVMValueRef glc = 3592 force_glc || 3593 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? 3594 i1true : i1false; 3595 LLVMValueRef slc = i1false; 3596 LLVMValueRef lwe = i1false; 3597 3598 if (atomic || (HAVE_LLVM <= 0x0309)) { 3599 emit_data->args[emit_data->arg_count++] = r128; 3600 emit_data->args[emit_data->arg_count++] = da; 3601 if (!atomic) { 3602 emit_data->args[emit_data->arg_count++] = glc; 3603 } 3604 emit_data->args[emit_data->arg_count++] = slc; 3605 return; 3606 } 3607 3608 /* HAVE_LLVM >= 0x0400 */ 3609 emit_data->args[emit_data->arg_count++] = glc; 3610 emit_data->args[emit_data->arg_count++] = slc; 3611 emit_data->args[emit_data->arg_count++] = lwe; 3612 emit_data->args[emit_data->arg_count++] = da; 3613 } 3614 3615 /** 3616 * Append the resource and indexing arguments for buffer intrinsics. 
3617 * 3618 * \param rsrc the v4i32 buffer resource 3619 * \param index index into the buffer (stride-based) 3620 * \param offset byte offset into the buffer 3621 */ 3622 static void buffer_append_args( 3623 struct si_shader_context *ctx, 3624 struct lp_build_emit_data *emit_data, 3625 LLVMValueRef rsrc, 3626 LLVMValueRef index, 3627 LLVMValueRef offset, 3628 bool atomic, 3629 bool force_glc) 3630 { 3631 const struct tgsi_full_instruction *inst = emit_data->inst; 3632 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); 3633 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); 3634 3635 emit_data->args[emit_data->arg_count++] = rsrc; 3636 emit_data->args[emit_data->arg_count++] = index; /* vindex */ 3637 emit_data->args[emit_data->arg_count++] = offset; /* voffset */ 3638 if (!atomic) { 3639 emit_data->args[emit_data->arg_count++] = 3640 force_glc || 3641 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? 3642 i1true : i1false; /* glc */ 3643 } 3644 emit_data->args[emit_data->arg_count++] = i1false; /* slc */ 3645 } 3646 3647 static void load_fetch_args( 3648 struct lp_build_tgsi_context * bld_base, 3649 struct lp_build_emit_data * emit_data) 3650 { 3651 struct si_shader_context *ctx = si_shader_context(bld_base); 3652 struct gallivm_state *gallivm = bld_base->base.gallivm; 3653 const struct tgsi_full_instruction * inst = emit_data->inst; 3654 unsigned target = inst->Memory.Texture; 3655 LLVMValueRef rsrc; 3656 3657 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); 3658 3659 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { 3660 LLVMBuilderRef builder = gallivm->builder; 3661 LLVMValueRef offset; 3662 LLVMValueRef tmp; 3663 3664 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]); 3665 3666 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); 3667 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); 3668 3669 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, 3670 offset, false, false); 3671 } 
else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { 3672 LLVMValueRef coords; 3673 3674 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc); 3675 coords = image_fetch_coords(bld_base, inst, 1); 3676 3677 if (target == TGSI_TEXTURE_BUFFER) { 3678 buffer_append_args(ctx, emit_data, rsrc, coords, 3679 bld_base->uint_bld.zero, false, false); 3680 } else { 3681 emit_data->args[0] = coords; 3682 emit_data->args[1] = rsrc; 3683 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ 3684 emit_data->arg_count = 3; 3685 3686 image_append_args(ctx, emit_data, target, false, false); 3687 } 3688 } 3689 } 3690 3691 static void load_emit_buffer(struct si_shader_context *ctx, 3692 struct lp_build_emit_data *emit_data) 3693 { 3694 const struct tgsi_full_instruction *inst = emit_data->inst; 3695 struct gallivm_state *gallivm = &ctx->gallivm; 3696 LLVMBuilderRef builder = gallivm->builder; 3697 uint writemask = inst->Dst[0].Register.WriteMask; 3698 uint count = util_last_bit(writemask); 3699 const char *intrinsic_name; 3700 LLVMTypeRef dst_type; 3701 3702 switch (count) { 3703 case 1: 3704 intrinsic_name = "llvm.amdgcn.buffer.load.f32"; 3705 dst_type = ctx->f32; 3706 break; 3707 case 2: 3708 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32"; 3709 dst_type = LLVMVectorType(ctx->f32, 2); 3710 break; 3711 default: // 3 & 4 3712 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32"; 3713 dst_type = ctx->v4f32; 3714 count = 4; 3715 } 3716 3717 emit_data->output[emit_data->chan] = lp_build_intrinsic( 3718 builder, intrinsic_name, dst_type, 3719 emit_data->args, emit_data->arg_count, 3720 LP_FUNC_ATTR_READONLY); 3721 } 3722 3723 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx, 3724 const struct tgsi_full_instruction *inst, 3725 LLVMTypeRef type, int arg) 3726 { 3727 struct gallivm_state *gallivm = &ctx->gallivm; 3728 LLVMBuilderRef builder = gallivm->builder; 3729 LLVMValueRef offset, ptr; 3730 int addr_space; 3731 3732 offset = 
lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0); 3733 offset = LLVMBuildBitCast(builder, offset, ctx->i32, ""); 3734 3735 ptr = ctx->shared_memory; 3736 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, ""); 3737 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); 3738 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), ""); 3739 3740 return ptr; 3741 } 3742 3743 static void load_emit_memory( 3744 struct si_shader_context *ctx, 3745 struct lp_build_emit_data *emit_data) 3746 { 3747 const struct tgsi_full_instruction *inst = emit_data->inst; 3748 struct lp_build_context *base = &ctx->bld_base.base; 3749 struct gallivm_state *gallivm = &ctx->gallivm; 3750 LLVMBuilderRef builder = gallivm->builder; 3751 unsigned writemask = inst->Dst[0].Register.WriteMask; 3752 LLVMValueRef channels[4], ptr, derived_ptr, index; 3753 int chan; 3754 3755 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1); 3756 3757 for (chan = 0; chan < 4; ++chan) { 3758 if (!(writemask & (1 << chan))) { 3759 channels[chan] = LLVMGetUndef(base->elem_type); 3760 continue; 3761 } 3762 3763 index = lp_build_const_int32(gallivm, chan); 3764 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); 3765 channels[chan] = LLVMBuildLoad(builder, derived_ptr, ""); 3766 } 3767 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4); 3768 } 3769 3770 static void get_image_intr_name(const char *base_name, 3771 LLVMTypeRef data_type, 3772 LLVMTypeRef coords_type, 3773 LLVMTypeRef rsrc_type, 3774 char *out_name, unsigned out_len) 3775 { 3776 char coords_type_name[8]; 3777 3778 build_type_name_for_intr(coords_type, coords_type_name, 3779 sizeof(coords_type_name)); 3780 3781 if (HAVE_LLVM <= 0x0309) { 3782 snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name); 3783 } else { 3784 char data_type_name[8]; 3785 char rsrc_type_name[8]; 3786 3787 build_type_name_for_intr(data_type, data_type_name, 3788 sizeof(data_type_name)); 3789 
build_type_name_for_intr(rsrc_type, rsrc_type_name, 3790 sizeof(rsrc_type_name)); 3791 snprintf(out_name, out_len, "%s.%s.%s.%s", base_name, 3792 data_type_name, coords_type_name, rsrc_type_name); 3793 } 3794 } 3795 3796 static void load_emit( 3797 const struct lp_build_tgsi_action *action, 3798 struct lp_build_tgsi_context *bld_base, 3799 struct lp_build_emit_data *emit_data) 3800 { 3801 struct si_shader_context *ctx = si_shader_context(bld_base); 3802 struct gallivm_state *gallivm = bld_base->base.gallivm; 3803 LLVMBuilderRef builder = gallivm->builder; 3804 const struct tgsi_full_instruction * inst = emit_data->inst; 3805 char intrinsic_name[64]; 3806 3807 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { 3808 load_emit_memory(ctx, emit_data); 3809 return; 3810 } 3811 3812 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) 3813 emit_waitcnt(ctx, VM_CNT); 3814 3815 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { 3816 load_emit_buffer(ctx, emit_data); 3817 return; 3818 } 3819 3820 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { 3821 emit_data->output[emit_data->chan] = 3822 lp_build_intrinsic( 3823 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type, 3824 emit_data->args, emit_data->arg_count, 3825 LP_FUNC_ATTR_READONLY); 3826 } else { 3827 get_image_intr_name("llvm.amdgcn.image.load", 3828 emit_data->dst_type, /* vdata */ 3829 LLVMTypeOf(emit_data->args[0]), /* coords */ 3830 LLVMTypeOf(emit_data->args[1]), /* rsrc */ 3831 intrinsic_name, sizeof(intrinsic_name)); 3832 3833 emit_data->output[emit_data->chan] = 3834 lp_build_intrinsic( 3835 builder, intrinsic_name, emit_data->dst_type, 3836 emit_data->args, emit_data->arg_count, 3837 LP_FUNC_ATTR_READONLY); 3838 } 3839 } 3840 3841 static void store_fetch_args( 3842 struct lp_build_tgsi_context * bld_base, 3843 struct lp_build_emit_data * emit_data) 3844 { 3845 struct si_shader_context *ctx = si_shader_context(bld_base); 3846 struct gallivm_state *gallivm = bld_base->base.gallivm; 3847 
LLVMBuilderRef builder = gallivm->builder; 3848 const struct tgsi_full_instruction * inst = emit_data->inst; 3849 struct tgsi_full_src_register memory; 3850 LLVMValueRef chans[4]; 3851 LLVMValueRef data; 3852 LLVMValueRef rsrc; 3853 unsigned chan; 3854 3855 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context); 3856 3857 for (chan = 0; chan < 4; ++chan) { 3858 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan); 3859 } 3860 data = lp_build_gather_values(gallivm, chans, 4); 3861 3862 emit_data->args[emit_data->arg_count++] = data; 3863 3864 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]); 3865 3866 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { 3867 LLVMValueRef offset; 3868 LLVMValueRef tmp; 3869 3870 rsrc = shader_buffer_fetch_rsrc(ctx, &memory); 3871 3872 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0); 3873 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); 3874 3875 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, 3876 offset, false, false); 3877 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) { 3878 unsigned target = inst->Memory.Texture; 3879 LLVMValueRef coords; 3880 3881 /* 8bit/16bit TC L1 write corruption bug on SI. 3882 * All store opcodes not aligned to a dword are affected. 3883 * 3884 * The only way to get unaligned stores in radeonsi is through 3885 * shader images. 
3886 */ 3887 bool force_glc = ctx->screen->b.chip_class == SI; 3888 3889 coords = image_fetch_coords(bld_base, inst, 0); 3890 3891 if (target == TGSI_TEXTURE_BUFFER) { 3892 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc); 3893 buffer_append_args(ctx, emit_data, rsrc, coords, 3894 bld_base->uint_bld.zero, false, force_glc); 3895 } else { 3896 emit_data->args[1] = coords; 3897 image_fetch_rsrc(bld_base, &memory, true, target, 3898 &emit_data->args[2]); 3899 emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */ 3900 emit_data->arg_count = 4; 3901 3902 image_append_args(ctx, emit_data, target, false, force_glc); 3903 } 3904 } 3905 } 3906 3907 static void store_emit_buffer( 3908 struct si_shader_context *ctx, 3909 struct lp_build_emit_data *emit_data) 3910 { 3911 const struct tgsi_full_instruction *inst = emit_data->inst; 3912 struct gallivm_state *gallivm = &ctx->gallivm; 3913 LLVMBuilderRef builder = gallivm->builder; 3914 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld; 3915 LLVMValueRef base_data = emit_data->args[0]; 3916 LLVMValueRef base_offset = emit_data->args[3]; 3917 unsigned writemask = inst->Dst[0].Register.WriteMask; 3918 3919 while (writemask) { 3920 int start, count; 3921 const char *intrinsic_name; 3922 LLVMValueRef data; 3923 LLVMValueRef offset; 3924 LLVMValueRef tmp; 3925 3926 u_bit_scan_consecutive_range(&writemask, &start, &count); 3927 3928 /* Due to an LLVM limitation, split 3-element writes 3929 * into a 2-element and a 1-element write. 
*/ 3930 if (count == 3) { 3931 writemask |= 1 << (start + 2); 3932 count = 2; 3933 } 3934 3935 if (count == 4) { 3936 data = base_data; 3937 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32"; 3938 } else if (count == 2) { 3939 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2); 3940 3941 tmp = LLVMBuildExtractElement( 3942 builder, base_data, 3943 lp_build_const_int32(gallivm, start), ""); 3944 data = LLVMBuildInsertElement( 3945 builder, LLVMGetUndef(v2f32), tmp, 3946 uint_bld->zero, ""); 3947 3948 tmp = LLVMBuildExtractElement( 3949 builder, base_data, 3950 lp_build_const_int32(gallivm, start + 1), ""); 3951 data = LLVMBuildInsertElement( 3952 builder, data, tmp, uint_bld->one, ""); 3953 3954 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32"; 3955 } else { 3956 assert(count == 1); 3957 data = LLVMBuildExtractElement( 3958 builder, base_data, 3959 lp_build_const_int32(gallivm, start), ""); 3960 intrinsic_name = "llvm.amdgcn.buffer.store.f32"; 3961 } 3962 3963 offset = base_offset; 3964 if (start != 0) { 3965 offset = LLVMBuildAdd( 3966 builder, offset, 3967 lp_build_const_int32(gallivm, start * 4), ""); 3968 } 3969 3970 emit_data->args[0] = data; 3971 emit_data->args[3] = offset; 3972 3973 lp_build_intrinsic( 3974 builder, intrinsic_name, emit_data->dst_type, 3975 emit_data->args, emit_data->arg_count, 0); 3976 } 3977 } 3978 3979 static void store_emit_memory( 3980 struct si_shader_context *ctx, 3981 struct lp_build_emit_data *emit_data) 3982 { 3983 const struct tgsi_full_instruction *inst = emit_data->inst; 3984 struct gallivm_state *gallivm = &ctx->gallivm; 3985 struct lp_build_context *base = &ctx->bld_base.base; 3986 LLVMBuilderRef builder = gallivm->builder; 3987 unsigned writemask = inst->Dst[0].Register.WriteMask; 3988 LLVMValueRef ptr, derived_ptr, data, index; 3989 int chan; 3990 3991 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0); 3992 3993 for (chan = 0; chan < 4; ++chan) { 3994 if (!(writemask & (1 << chan))) { 3995 continue; 3996 } 3997 data = 
lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan); 3998 index = lp_build_const_int32(gallivm, chan); 3999 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); 4000 LLVMBuildStore(builder, data, derived_ptr); 4001 } 4002 } 4003 4004 static void store_emit( 4005 const struct lp_build_tgsi_action *action, 4006 struct lp_build_tgsi_context *bld_base, 4007 struct lp_build_emit_data *emit_data) 4008 { 4009 struct si_shader_context *ctx = si_shader_context(bld_base); 4010 struct gallivm_state *gallivm = bld_base->base.gallivm; 4011 LLVMBuilderRef builder = gallivm->builder; 4012 const struct tgsi_full_instruction * inst = emit_data->inst; 4013 unsigned target = inst->Memory.Texture; 4014 char intrinsic_name[64]; 4015 4016 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) { 4017 store_emit_memory(ctx, emit_data); 4018 return; 4019 } 4020 4021 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) 4022 emit_waitcnt(ctx, VM_CNT); 4023 4024 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { 4025 store_emit_buffer(ctx, emit_data); 4026 return; 4027 } 4028 4029 if (target == TGSI_TEXTURE_BUFFER) { 4030 emit_data->output[emit_data->chan] = lp_build_intrinsic( 4031 builder, "llvm.amdgcn.buffer.store.format.v4f32", 4032 emit_data->dst_type, emit_data->args, 4033 emit_data->arg_count, 0); 4034 } else { 4035 get_image_intr_name("llvm.amdgcn.image.store", 4036 LLVMTypeOf(emit_data->args[0]), /* vdata */ 4037 LLVMTypeOf(emit_data->args[1]), /* coords */ 4038 LLVMTypeOf(emit_data->args[2]), /* rsrc */ 4039 intrinsic_name, sizeof(intrinsic_name)); 4040 4041 emit_data->output[emit_data->chan] = 4042 lp_build_intrinsic( 4043 builder, intrinsic_name, emit_data->dst_type, 4044 emit_data->args, emit_data->arg_count, 0); 4045 } 4046 } 4047 4048 static void atomic_fetch_args( 4049 struct lp_build_tgsi_context * bld_base, 4050 struct lp_build_emit_data * emit_data) 4051 { 4052 struct si_shader_context *ctx = si_shader_context(bld_base); 4053 struct gallivm_state *gallivm = 
bld_base->base.gallivm; 4054 LLVMBuilderRef builder = gallivm->builder; 4055 const struct tgsi_full_instruction * inst = emit_data->inst; 4056 LLVMValueRef data1, data2; 4057 LLVMValueRef rsrc; 4058 LLVMValueRef tmp; 4059 4060 emit_data->dst_type = bld_base->base.elem_type; 4061 4062 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0); 4063 data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); 4064 4065 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { 4066 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0); 4067 data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); 4068 } 4069 4070 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order 4071 * of arguments, which is reversed relative to TGSI (and GLSL) 4072 */ 4073 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) 4074 emit_data->args[emit_data->arg_count++] = data2; 4075 emit_data->args[emit_data->arg_count++] = data1; 4076 4077 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { 4078 LLVMValueRef offset; 4079 4080 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]); 4081 4082 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); 4083 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); 4084 4085 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, 4086 offset, true, false); 4087 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { 4088 unsigned target = inst->Memory.Texture; 4089 LLVMValueRef coords; 4090 4091 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc); 4092 coords = image_fetch_coords(bld_base, inst, 1); 4093 4094 if (target == TGSI_TEXTURE_BUFFER) { 4095 buffer_append_args(ctx, emit_data, rsrc, coords, 4096 bld_base->uint_bld.zero, true, false); 4097 } else { 4098 emit_data->args[emit_data->arg_count++] = coords; 4099 emit_data->args[emit_data->arg_count++] = rsrc; 4100 4101 image_append_args(ctx, emit_data, target, true, false); 4102 } 4103 } 4104 } 4105 4106 static void atomic_emit_memory(struct 
si_shader_context *ctx, 4107 struct lp_build_emit_data *emit_data) { 4108 struct gallivm_state *gallivm = &ctx->gallivm; 4109 LLVMBuilderRef builder = gallivm->builder; 4110 const struct tgsi_full_instruction * inst = emit_data->inst; 4111 LLVMValueRef ptr, result, arg; 4112 4113 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1); 4114 4115 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0); 4116 arg = LLVMBuildBitCast(builder, arg, ctx->i32, ""); 4117 4118 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { 4119 LLVMValueRef new_data; 4120 new_data = lp_build_emit_fetch(&ctx->bld_base, 4121 inst, 3, 0); 4122 4123 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, ""); 4124 4125 #if HAVE_LLVM >= 0x309 4126 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data, 4127 LLVMAtomicOrderingSequentiallyConsistent, 4128 LLVMAtomicOrderingSequentiallyConsistent, 4129 false); 4130 #endif 4131 4132 result = LLVMBuildExtractValue(builder, result, 0, ""); 4133 } else { 4134 LLVMAtomicRMWBinOp op; 4135 4136 switch(inst->Instruction.Opcode) { 4137 case TGSI_OPCODE_ATOMUADD: 4138 op = LLVMAtomicRMWBinOpAdd; 4139 break; 4140 case TGSI_OPCODE_ATOMXCHG: 4141 op = LLVMAtomicRMWBinOpXchg; 4142 break; 4143 case TGSI_OPCODE_ATOMAND: 4144 op = LLVMAtomicRMWBinOpAnd; 4145 break; 4146 case TGSI_OPCODE_ATOMOR: 4147 op = LLVMAtomicRMWBinOpOr; 4148 break; 4149 case TGSI_OPCODE_ATOMXOR: 4150 op = LLVMAtomicRMWBinOpXor; 4151 break; 4152 case TGSI_OPCODE_ATOMUMIN: 4153 op = LLVMAtomicRMWBinOpUMin; 4154 break; 4155 case TGSI_OPCODE_ATOMUMAX: 4156 op = LLVMAtomicRMWBinOpUMax; 4157 break; 4158 case TGSI_OPCODE_ATOMIMIN: 4159 op = LLVMAtomicRMWBinOpMin; 4160 break; 4161 case TGSI_OPCODE_ATOMIMAX: 4162 op = LLVMAtomicRMWBinOpMax; 4163 break; 4164 default: 4165 unreachable("unknown atomic opcode"); 4166 } 4167 4168 result = LLVMBuildAtomicRMW(builder, op, ptr, arg, 4169 LLVMAtomicOrderingSequentiallyConsistent, 4170 false); 4171 } 4172 emit_data->output[emit_data->chan] = 
LLVMBuildBitCast(builder, result, emit_data->dst_type, ""); 4173 } 4174 4175 static void atomic_emit( 4176 const struct lp_build_tgsi_action *action, 4177 struct lp_build_tgsi_context *bld_base, 4178 struct lp_build_emit_data *emit_data) 4179 { 4180 struct si_shader_context *ctx = si_shader_context(bld_base); 4181 struct gallivm_state *gallivm = bld_base->base.gallivm; 4182 LLVMBuilderRef builder = gallivm->builder; 4183 const struct tgsi_full_instruction * inst = emit_data->inst; 4184 char intrinsic_name[40]; 4185 LLVMValueRef tmp; 4186 4187 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) { 4188 atomic_emit_memory(ctx, emit_data); 4189 return; 4190 } 4191 4192 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || 4193 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { 4194 snprintf(intrinsic_name, sizeof(intrinsic_name), 4195 "llvm.amdgcn.buffer.atomic.%s", action->intr_name); 4196 } else { 4197 LLVMValueRef coords; 4198 char coords_type[8]; 4199 4200 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) 4201 coords = emit_data->args[2]; 4202 else 4203 coords = emit_data->args[1]; 4204 4205 build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type)); 4206 snprintf(intrinsic_name, sizeof(intrinsic_name), 4207 "llvm.amdgcn.image.atomic.%s.%s", 4208 action->intr_name, coords_type); 4209 } 4210 4211 tmp = lp_build_intrinsic( 4212 builder, intrinsic_name, bld_base->uint_bld.elem_type, 4213 emit_data->args, emit_data->arg_count, 0); 4214 emit_data->output[emit_data->chan] = 4215 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, ""); 4216 } 4217 4218 static void resq_fetch_args( 4219 struct lp_build_tgsi_context * bld_base, 4220 struct lp_build_emit_data * emit_data) 4221 { 4222 struct si_shader_context *ctx = si_shader_context(bld_base); 4223 struct gallivm_state *gallivm = bld_base->base.gallivm; 4224 const struct tgsi_full_instruction *inst = emit_data->inst; 4225 const struct tgsi_full_src_register *reg = &inst->Src[0]; 4226 4227 
emit_data->dst_type = ctx->v4i32; 4228 4229 if (reg->Register.File == TGSI_FILE_BUFFER) { 4230 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg); 4231 emit_data->arg_count = 1; 4232 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { 4233 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, 4234 &emit_data->args[0]); 4235 emit_data->arg_count = 1; 4236 } else { 4237 emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */ 4238 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, 4239 &emit_data->args[1]); 4240 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ 4241 emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */ 4242 emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */ 4243 emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ? 4244 bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */ 4245 emit_data->args[6] = bld_base->uint_bld.zero; /* glc */ 4246 emit_data->args[7] = bld_base->uint_bld.zero; /* slc */ 4247 emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */ 4248 emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */ 4249 emit_data->arg_count = 10; 4250 } 4251 } 4252 4253 static void resq_emit( 4254 const struct lp_build_tgsi_action *action, 4255 struct lp_build_tgsi_context *bld_base, 4256 struct lp_build_emit_data *emit_data) 4257 { 4258 struct gallivm_state *gallivm = bld_base->base.gallivm; 4259 LLVMBuilderRef builder = gallivm->builder; 4260 const struct tgsi_full_instruction *inst = emit_data->inst; 4261 LLVMValueRef out; 4262 4263 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { 4264 out = LLVMBuildExtractElement(builder, emit_data->args[0], 4265 lp_build_const_int32(gallivm, 2), ""); 4266 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { 4267 out = get_buffer_size(bld_base, emit_data->args[0]); 4268 } else { 4269 out = lp_build_intrinsic( 4270 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type, 4271 emit_data->args, emit_data->arg_count, 4272 
LP_FUNC_ATTR_READNONE); 4273 4274 /* Divide the number of layers by 6 to get the number of cubes. */ 4275 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) { 4276 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2); 4277 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6); 4278 4279 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, ""); 4280 z = LLVMBuildSDiv(builder, z, imm6, ""); 4281 out = LLVMBuildInsertElement(builder, out, z, imm2, ""); 4282 } 4283 } 4284 4285 emit_data->output[emit_data->chan] = out; 4286 } 4287 4288 static void set_tex_fetch_args(struct si_shader_context *ctx, 4289 struct lp_build_emit_data *emit_data, 4290 unsigned opcode, unsigned target, 4291 LLVMValueRef res_ptr, LLVMValueRef samp_ptr, 4292 LLVMValueRef *param, unsigned count, 4293 unsigned dmask) 4294 { 4295 struct gallivm_state *gallivm = &ctx->gallivm; 4296 unsigned num_args; 4297 unsigned is_rect = target == TGSI_TEXTURE_RECT; 4298 4299 /* Pad to power of two vector */ 4300 while (count < util_next_power_of_two(count)) 4301 param[count++] = LLVMGetUndef(ctx->i32); 4302 4303 /* Texture coordinates. */ 4304 if (count > 1) 4305 emit_data->args[0] = lp_build_gather_values(gallivm, param, count); 4306 else 4307 emit_data->args[0] = param[0]; 4308 4309 /* Resource. 
*/ 4310 emit_data->args[1] = res_ptr; 4311 num_args = 2; 4312 4313 if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ) 4314 emit_data->dst_type = ctx->v4i32; 4315 else { 4316 emit_data->dst_type = ctx->v4f32; 4317 4318 emit_data->args[num_args++] = samp_ptr; 4319 } 4320 4321 emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask); 4322 emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */ 4323 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */ 4324 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 4325 tgsi_is_array_sampler(target)); /* da */ 4326 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */ 4327 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */ 4328 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */ 4329 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */ 4330 4331 emit_data->arg_count = num_args; 4332 } 4333 4334 static const struct lp_build_tgsi_action tex_action; 4335 4336 enum desc_type { 4337 DESC_IMAGE, 4338 DESC_BUFFER, 4339 DESC_FMASK, 4340 DESC_SAMPLER, 4341 }; 4342 4343 /** 4344 * Load an image view, fmask view. or sampler state descriptor. 4345 */ 4346 static LLVMValueRef load_sampler_desc_custom(struct si_shader_context *ctx, 4347 LLVMValueRef list, LLVMValueRef index, 4348 enum desc_type type) 4349 { 4350 struct gallivm_state *gallivm = &ctx->gallivm; 4351 LLVMBuilderRef builder = gallivm->builder; 4352 4353 switch (type) { 4354 case DESC_IMAGE: 4355 /* The image is at [0:7]. */ 4356 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); 4357 break; 4358 case DESC_BUFFER: 4359 /* The buffer is in [4:7]. 
*/ 4360 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); 4361 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), ""); 4362 list = LLVMBuildPointerCast(builder, list, 4363 const_array(ctx->v4i32, 0), ""); 4364 break; 4365 case DESC_FMASK: 4366 /* The FMASK is at [8:15]. */ 4367 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); 4368 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), ""); 4369 break; 4370 case DESC_SAMPLER: 4371 /* The sampler state is at [12:15]. */ 4372 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), ""); 4373 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), ""); 4374 list = LLVMBuildPointerCast(builder, list, 4375 const_array(ctx->v4i32, 0), ""); 4376 break; 4377 } 4378 4379 return build_indexed_load_const(ctx, list, index); 4380 } 4381 4382 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx, 4383 LLVMValueRef index, enum desc_type type) 4384 { 4385 LLVMValueRef list = LLVMGetParam(ctx->main_fn, 4386 SI_PARAM_SAMPLERS); 4387 4388 return load_sampler_desc_custom(ctx, list, index, type); 4389 } 4390 4391 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. 4392 * 4393 * SI-CI: 4394 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic 4395 * filtering manually. The driver sets img7 to a mask clearing 4396 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: 4397 * s_and_b32 samp0, samp0, img7 4398 * 4399 * VI: 4400 * The ANISO_OVERRIDE sampler field enables this fix in TA. 
4401 */ 4402 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx, 4403 LLVMValueRef res, LLVMValueRef samp) 4404 { 4405 LLVMBuilderRef builder = ctx->gallivm.builder; 4406 LLVMValueRef img7, samp0; 4407 4408 if (ctx->screen->b.chip_class >= VI) 4409 return samp; 4410 4411 img7 = LLVMBuildExtractElement(builder, res, 4412 LLVMConstInt(ctx->i32, 7, 0), ""); 4413 samp0 = LLVMBuildExtractElement(builder, samp, 4414 LLVMConstInt(ctx->i32, 0, 0), ""); 4415 samp0 = LLVMBuildAnd(builder, samp0, img7, ""); 4416 return LLVMBuildInsertElement(builder, samp, samp0, 4417 LLVMConstInt(ctx->i32, 0, 0), ""); 4418 } 4419 4420 static void tex_fetch_ptrs( 4421 struct lp_build_tgsi_context *bld_base, 4422 struct lp_build_emit_data *emit_data, 4423 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr) 4424 { 4425 struct si_shader_context *ctx = si_shader_context(bld_base); 4426 const struct tgsi_full_instruction *inst = emit_data->inst; 4427 unsigned target = inst->Texture.Texture; 4428 unsigned sampler_src; 4429 unsigned sampler_index; 4430 LLVMValueRef index; 4431 4432 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; 4433 sampler_index = emit_data->inst->Src[sampler_src].Register.Index; 4434 4435 if (emit_data->inst->Src[sampler_src].Register.Indirect) { 4436 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src]; 4437 4438 index = get_bounded_indirect_index(ctx, 4439 ®->Indirect, 4440 reg->Register.Index, 4441 SI_NUM_SAMPLERS); 4442 } else { 4443 index = LLVMConstInt(ctx->i32, sampler_index, 0); 4444 } 4445 4446 if (target == TGSI_TEXTURE_BUFFER) 4447 *res_ptr = load_sampler_desc(ctx, index, DESC_BUFFER); 4448 else 4449 *res_ptr = load_sampler_desc(ctx, index, DESC_IMAGE); 4450 4451 if (samp_ptr) 4452 *samp_ptr = NULL; 4453 if (fmask_ptr) 4454 *fmask_ptr = NULL; 4455 4456 if (target == TGSI_TEXTURE_2D_MSAA || 4457 target == TGSI_TEXTURE_2D_ARRAY_MSAA) { 4458 if (fmask_ptr) 4459 *fmask_ptr = 
load_sampler_desc(ctx, index, DESC_FMASK); 4460 } else if (target != TGSI_TEXTURE_BUFFER) { 4461 if (samp_ptr) { 4462 *samp_ptr = load_sampler_desc(ctx, index, DESC_SAMPLER); 4463 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); 4464 } 4465 } 4466 } 4467 4468 static void txq_fetch_args( 4469 struct lp_build_tgsi_context *bld_base, 4470 struct lp_build_emit_data *emit_data) 4471 { 4472 struct si_shader_context *ctx = si_shader_context(bld_base); 4473 const struct tgsi_full_instruction *inst = emit_data->inst; 4474 unsigned target = inst->Texture.Texture; 4475 LLVMValueRef res_ptr; 4476 LLVMValueRef address; 4477 4478 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL); 4479 4480 if (target == TGSI_TEXTURE_BUFFER) { 4481 /* Read the size from the buffer descriptor directly. */ 4482 emit_data->args[0] = get_buffer_size(bld_base, res_ptr); 4483 return; 4484 } 4485 4486 /* Textures - set the mip level. */ 4487 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X); 4488 4489 set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr, 4490 NULL, &address, 1, 0xf); 4491 } 4492 4493 static void txq_emit(const struct lp_build_tgsi_action *action, 4494 struct lp_build_tgsi_context *bld_base, 4495 struct lp_build_emit_data *emit_data) 4496 { 4497 struct lp_build_context *base = &bld_base->base; 4498 unsigned target = emit_data->inst->Texture.Texture; 4499 4500 if (target == TGSI_TEXTURE_BUFFER) { 4501 /* Just return the buffer size. */ 4502 emit_data->output[emit_data->chan] = emit_data->args[0]; 4503 return; 4504 } 4505 4506 emit_data->output[emit_data->chan] = lp_build_intrinsic( 4507 base->gallivm->builder, "llvm.SI.getresinfo.i32", 4508 emit_data->dst_type, emit_data->args, emit_data->arg_count, 4509 LP_FUNC_ATTR_READNONE); 4510 4511 /* Divide the number of layers by 6 to get the number of cubes. 
*/ 4512 if (target == TGSI_TEXTURE_CUBE_ARRAY || 4513 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4514 LLVMBuilderRef builder = bld_base->base.gallivm->builder; 4515 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2); 4516 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6); 4517 4518 LLVMValueRef v4 = emit_data->output[emit_data->chan]; 4519 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, ""); 4520 z = LLVMBuildSDiv(builder, z, six, ""); 4521 4522 emit_data->output[emit_data->chan] = 4523 LLVMBuildInsertElement(builder, v4, z, two, ""); 4524 } 4525 } 4526 4527 static void tex_fetch_args( 4528 struct lp_build_tgsi_context *bld_base, 4529 struct lp_build_emit_data *emit_data) 4530 { 4531 struct si_shader_context *ctx = si_shader_context(bld_base); 4532 struct gallivm_state *gallivm = bld_base->base.gallivm; 4533 const struct tgsi_full_instruction *inst = emit_data->inst; 4534 unsigned opcode = inst->Instruction.Opcode; 4535 unsigned target = inst->Texture.Texture; 4536 LLVMValueRef coords[5], derivs[6]; 4537 LLVMValueRef address[16]; 4538 unsigned num_coords = tgsi_util_get_texture_coord_dim(target); 4539 int ref_pos = tgsi_util_get_shadow_ref_src_index(target); 4540 unsigned count = 0; 4541 unsigned chan; 4542 unsigned num_deriv_channels = 0; 4543 bool has_offset = inst->Texture.NumOffsets > 0; 4544 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; 4545 unsigned dmask = 0xf; 4546 4547 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); 4548 4549 if (target == TGSI_TEXTURE_BUFFER) { 4550 emit_data->dst_type = ctx->v4f32; 4551 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr, 4552 ctx->v16i8, ""); 4553 emit_data->args[1] = bld_base->uint_bld.zero; 4554 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X); 4555 emit_data->arg_count = 3; 4556 return; 4557 } 4558 4559 /* Fetch and project texture coordinates */ 4560 coords[3] = lp_build_emit_fetch(bld_base, 
emit_data->inst, 0, TGSI_CHAN_W); 4561 for (chan = 0; chan < 3; chan++ ) { 4562 coords[chan] = lp_build_emit_fetch(bld_base, 4563 emit_data->inst, 0, 4564 chan); 4565 if (opcode == TGSI_OPCODE_TXP) 4566 coords[chan] = lp_build_emit_llvm_binary(bld_base, 4567 TGSI_OPCODE_DIV, 4568 coords[chan], 4569 coords[3]); 4570 } 4571 4572 if (opcode == TGSI_OPCODE_TXP) 4573 coords[3] = bld_base->base.one; 4574 4575 /* Pack offsets. */ 4576 if (has_offset && opcode != TGSI_OPCODE_TXF) { 4577 /* The offsets are six-bit signed integers packed like this: 4578 * X=[5:0], Y=[13:8], and Z=[21:16]. 4579 */ 4580 LLVMValueRef offset[3], pack; 4581 4582 assert(inst->Texture.NumOffsets == 1); 4583 4584 for (chan = 0; chan < 3; chan++) { 4585 offset[chan] = lp_build_emit_fetch_texoffset(bld_base, 4586 emit_data->inst, 0, chan); 4587 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan], 4588 lp_build_const_int32(gallivm, 0x3f), ""); 4589 if (chan) 4590 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan], 4591 lp_build_const_int32(gallivm, chan*8), ""); 4592 } 4593 4594 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], ""); 4595 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], ""); 4596 address[count++] = pack; 4597 } 4598 4599 /* Pack LOD bias value */ 4600 if (opcode == TGSI_OPCODE_TXB) 4601 address[count++] = coords[3]; 4602 if (opcode == TGSI_OPCODE_TXB2) 4603 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); 4604 4605 /* Pack depth comparison value */ 4606 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) { 4607 LLVMValueRef z; 4608 4609 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4610 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); 4611 } else { 4612 assert(ref_pos >= 0); 4613 z = coords[ref_pos]; 4614 } 4615 4616 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, 4617 * so the depth comparison value isn't clamped for Z16 and 4618 * Z24 anymore. Do it manually here. 
4619 * 4620 * It's unnecessary if the original texture format was 4621 * Z32_FLOAT, but we don't know that here. 4622 */ 4623 if (ctx->screen->b.chip_class == VI) 4624 z = si_llvm_saturate(bld_base, z); 4625 4626 address[count++] = z; 4627 } 4628 4629 /* Pack user derivatives */ 4630 if (opcode == TGSI_OPCODE_TXD) { 4631 int param, num_src_deriv_channels; 4632 4633 switch (target) { 4634 case TGSI_TEXTURE_3D: 4635 num_src_deriv_channels = 3; 4636 num_deriv_channels = 3; 4637 break; 4638 case TGSI_TEXTURE_2D: 4639 case TGSI_TEXTURE_SHADOW2D: 4640 case TGSI_TEXTURE_RECT: 4641 case TGSI_TEXTURE_SHADOWRECT: 4642 case TGSI_TEXTURE_2D_ARRAY: 4643 case TGSI_TEXTURE_SHADOW2D_ARRAY: 4644 num_src_deriv_channels = 2; 4645 num_deriv_channels = 2; 4646 break; 4647 case TGSI_TEXTURE_CUBE: 4648 case TGSI_TEXTURE_SHADOWCUBE: 4649 case TGSI_TEXTURE_CUBE_ARRAY: 4650 case TGSI_TEXTURE_SHADOWCUBE_ARRAY: 4651 /* Cube derivatives will be converted to 2D. */ 4652 num_src_deriv_channels = 3; 4653 num_deriv_channels = 2; 4654 break; 4655 case TGSI_TEXTURE_1D: 4656 case TGSI_TEXTURE_SHADOW1D: 4657 case TGSI_TEXTURE_1D_ARRAY: 4658 case TGSI_TEXTURE_SHADOW1D_ARRAY: 4659 num_src_deriv_channels = 1; 4660 num_deriv_channels = 1; 4661 break; 4662 default: 4663 unreachable("invalid target"); 4664 } 4665 4666 for (param = 0; param < 2; param++) 4667 for (chan = 0; chan < num_src_deriv_channels; chan++) 4668 derivs[param * num_src_deriv_channels + chan] = 4669 lp_build_emit_fetch(bld_base, inst, param+1, chan); 4670 } 4671 4672 if (target == TGSI_TEXTURE_CUBE || 4673 target == TGSI_TEXTURE_CUBE_ARRAY || 4674 target == TGSI_TEXTURE_SHADOWCUBE || 4675 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 4676 ac_prepare_cube_coords(&ctx->ac, 4677 opcode == TGSI_OPCODE_TXD, 4678 target == TGSI_TEXTURE_CUBE_ARRAY || 4679 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY, 4680 coords, derivs); 4681 4682 if (opcode == TGSI_OPCODE_TXD) 4683 for (int i = 0; i < num_deriv_channels * 2; i++) 4684 address[count++] = derivs[i]; 4685 
4686 /* Pack texture coordinates */ 4687 address[count++] = coords[0]; 4688 if (num_coords > 1) 4689 address[count++] = coords[1]; 4690 if (num_coords > 2) 4691 address[count++] = coords[2]; 4692 4693 /* Pack LOD or sample index */ 4694 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF) 4695 address[count++] = coords[3]; 4696 else if (opcode == TGSI_OPCODE_TXL2) 4697 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); 4698 4699 if (count > 16) { 4700 assert(!"Cannot handle more than 16 texture address parameters"); 4701 count = 16; 4702 } 4703 4704 for (chan = 0; chan < count; chan++ ) { 4705 address[chan] = LLVMBuildBitCast(gallivm->builder, 4706 address[chan], ctx->i32, ""); 4707 } 4708 4709 /* Adjust the sample index according to FMASK. 4710 * 4711 * For uncompressed MSAA surfaces, FMASK should return 0x76543210, 4712 * which is the identity mapping. Each nibble says which physical sample 4713 * should be fetched to get that sample. 4714 * 4715 * For example, 0x11111100 means there are only 2 samples stored and 4716 * the second sample covers 3/4 of the pixel. When reading samples 0 4717 * and 1, return physical sample 0 (determined by the first two 0s 4718 * in FMASK), otherwise return physical sample 1. 4719 * 4720 * The sample index should be adjusted as follows: 4721 * sample_index = (fmask >> (sample_index * 4)) & 0xF; 4722 */ 4723 if (target == TGSI_TEXTURE_2D_MSAA || 4724 target == TGSI_TEXTURE_2D_ARRAY_MSAA) { 4725 struct lp_build_context *uint_bld = &bld_base->uint_bld; 4726 struct lp_build_emit_data txf_emit_data = *emit_data; 4727 LLVMValueRef txf_address[4]; 4728 unsigned txf_count = count; 4729 struct tgsi_full_instruction inst = {}; 4730 4731 memcpy(txf_address, address, sizeof(txf_address)); 4732 4733 if (target == TGSI_TEXTURE_2D_MSAA) { 4734 txf_address[2] = bld_base->uint_bld.zero; 4735 } 4736 txf_address[3] = bld_base->uint_bld.zero; 4737 4738 /* Read FMASK using TXF. 
*/ 4739 inst.Instruction.Opcode = TGSI_OPCODE_TXF; 4740 inst.Texture.Texture = target; 4741 txf_emit_data.inst = &inst; 4742 txf_emit_data.chan = 0; 4743 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF, 4744 target, fmask_ptr, NULL, 4745 txf_address, txf_count, 0xf); 4746 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data); 4747 4748 /* Initialize some constants. */ 4749 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0); 4750 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0); 4751 4752 /* Apply the formula. */ 4753 LLVMValueRef fmask = 4754 LLVMBuildExtractElement(gallivm->builder, 4755 txf_emit_data.output[0], 4756 uint_bld->zero, ""); 4757 4758 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3; 4759 4760 LLVMValueRef sample_index4 = 4761 LLVMBuildMul(gallivm->builder, address[sample_chan], four, ""); 4762 4763 LLVMValueRef shifted_fmask = 4764 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, ""); 4765 4766 LLVMValueRef final_sample = 4767 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, ""); 4768 4769 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK 4770 * resource descriptor is 0 (invalid), 4771 */ 4772 LLVMValueRef fmask_desc = 4773 LLVMBuildBitCast(gallivm->builder, fmask_ptr, 4774 ctx->v8i32, ""); 4775 4776 LLVMValueRef fmask_word1 = 4777 LLVMBuildExtractElement(gallivm->builder, fmask_desc, 4778 uint_bld->one, ""); 4779 4780 LLVMValueRef word1_is_nonzero = 4781 LLVMBuildICmp(gallivm->builder, LLVMIntNE, 4782 fmask_word1, uint_bld->zero, ""); 4783 4784 /* Replace the MSAA sample index. 
*/ 4785 address[sample_chan] = 4786 LLVMBuildSelect(gallivm->builder, word1_is_nonzero, 4787 final_sample, address[sample_chan], ""); 4788 } 4789 4790 if (opcode == TGSI_OPCODE_TXF) { 4791 /* add tex offsets */ 4792 if (inst->Texture.NumOffsets) { 4793 struct lp_build_context *uint_bld = &bld_base->uint_bld; 4794 const struct tgsi_texture_offset *off = inst->TexOffsets; 4795 4796 assert(inst->Texture.NumOffsets == 1); 4797 4798 switch (target) { 4799 case TGSI_TEXTURE_3D: 4800 address[2] = lp_build_add(uint_bld, address[2], 4801 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]); 4802 /* fall through */ 4803 case TGSI_TEXTURE_2D: 4804 case TGSI_TEXTURE_SHADOW2D: 4805 case TGSI_TEXTURE_RECT: 4806 case TGSI_TEXTURE_SHADOWRECT: 4807 case TGSI_TEXTURE_2D_ARRAY: 4808 case TGSI_TEXTURE_SHADOW2D_ARRAY: 4809 address[1] = 4810 lp_build_add(uint_bld, address[1], 4811 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]); 4812 /* fall through */ 4813 case TGSI_TEXTURE_1D: 4814 case TGSI_TEXTURE_SHADOW1D: 4815 case TGSI_TEXTURE_1D_ARRAY: 4816 case TGSI_TEXTURE_SHADOW1D_ARRAY: 4817 address[0] = 4818 lp_build_add(uint_bld, address[0], 4819 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]); 4820 break; 4821 /* texture offsets do not apply to other texture targets */ 4822 } 4823 } 4824 } 4825 4826 if (opcode == TGSI_OPCODE_TG4) { 4827 unsigned gather_comp = 0; 4828 4829 /* DMASK was repurposed for GATHER4. 4 components are always 4830 * returned and DMASK works like a swizzle - it selects 4831 * the component to fetch. The only valid DMASK values are 4832 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns 4833 * (red,red,red,red) etc.) The ISA document doesn't mention 4834 * this. 4835 */ 4836 4837 /* Get the component index from src1.x for Gather4. 
*/ 4838 if (!tgsi_is_shadow_target(target)) { 4839 LLVMValueRef comp_imm; 4840 struct tgsi_src_register src1 = inst->Src[1].Register; 4841 4842 assert(src1.File == TGSI_FILE_IMMEDIATE); 4843 4844 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX]; 4845 gather_comp = LLVMConstIntGetZExtValue(comp_imm); 4846 gather_comp = CLAMP(gather_comp, 0, 3); 4847 } 4848 4849 dmask = 1 << gather_comp; 4850 } 4851 4852 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr, 4853 samp_ptr, address, count, dmask); 4854 } 4855 4856 /* Gather4 should follow the same rules as bilinear filtering, but the hardware 4857 * incorrectly forces nearest filtering if the texture format is integer. 4858 * The only effect it has on Gather4, which always returns 4 texels for 4859 * bilinear filtering, is that the final coordinates are off by 0.5 of 4860 * the texel size. 4861 * 4862 * The workaround is to subtract 0.5 from the unnormalized coordinates, 4863 * or (0.5 / size) from the normalized coordinates. 4864 */ 4865 static void si_lower_gather4_integer(struct si_shader_context *ctx, 4866 struct lp_build_emit_data *emit_data, 4867 const char *intr_name, 4868 unsigned coord_vgpr_index) 4869 { 4870 LLVMBuilderRef builder = ctx->gallivm.builder; 4871 LLVMValueRef coord = emit_data->args[0]; 4872 LLVMValueRef half_texel[2]; 4873 int c; 4874 4875 if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_RECT || 4876 emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) { 4877 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5); 4878 } else { 4879 struct tgsi_full_instruction txq_inst = {}; 4880 struct lp_build_emit_data txq_emit_data = {}; 4881 4882 /* Query the texture size. 
*/ 4883 txq_inst.Texture.Texture = emit_data->inst->Texture.Texture; 4884 txq_emit_data.inst = &txq_inst; 4885 txq_emit_data.dst_type = ctx->v4i32; 4886 set_tex_fetch_args(ctx, &txq_emit_data, TGSI_OPCODE_TXQ, 4887 txq_inst.Texture.Texture, 4888 emit_data->args[1], NULL, 4889 &ctx->bld_base.uint_bld.zero, 4890 1, 0xf); 4891 txq_emit(NULL, &ctx->bld_base, &txq_emit_data); 4892 4893 /* Compute -0.5 / size. */ 4894 for (c = 0; c < 2; c++) { 4895 half_texel[c] = 4896 LLVMBuildExtractElement(builder, txq_emit_data.output[0], 4897 LLVMConstInt(ctx->i32, c, 0), ""); 4898 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, ""); 4899 half_texel[c] = 4900 lp_build_emit_llvm_unary(&ctx->bld_base, 4901 TGSI_OPCODE_RCP, half_texel[c]); 4902 half_texel[c] = LLVMBuildFMul(builder, half_texel[c], 4903 LLVMConstReal(ctx->f32, -0.5), ""); 4904 } 4905 } 4906 4907 for (c = 0; c < 2; c++) { 4908 LLVMValueRef tmp; 4909 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0); 4910 4911 tmp = LLVMBuildExtractElement(builder, coord, index, ""); 4912 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, ""); 4913 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], ""); 4914 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, ""); 4915 coord = LLVMBuildInsertElement(builder, coord, tmp, index, ""); 4916 } 4917 4918 emit_data->args[0] = coord; 4919 emit_data->output[emit_data->chan] = 4920 lp_build_intrinsic(builder, intr_name, emit_data->dst_type, 4921 emit_data->args, emit_data->arg_count, 4922 LP_FUNC_ATTR_READNONE); 4923 } 4924 4925 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, 4926 struct lp_build_tgsi_context *bld_base, 4927 struct lp_build_emit_data *emit_data) 4928 { 4929 struct si_shader_context *ctx = si_shader_context(bld_base); 4930 struct lp_build_context *base = &bld_base->base; 4931 const struct tgsi_full_instruction *inst = emit_data->inst; 4932 unsigned opcode = inst->Instruction.Opcode; 4933 unsigned target = inst->Texture.Texture; 
4934 char intr_name[127]; 4935 bool has_offset = inst->Texture.NumOffsets > 0; 4936 bool is_shadow = tgsi_is_shadow_target(target); 4937 char type[64]; 4938 const char *name = "llvm.SI.image.sample"; 4939 const char *infix = ""; 4940 4941 if (target == TGSI_TEXTURE_BUFFER) { 4942 emit_data->output[emit_data->chan] = lp_build_intrinsic( 4943 base->gallivm->builder, 4944 "llvm.SI.vs.load.input", emit_data->dst_type, 4945 emit_data->args, emit_data->arg_count, 4946 LP_FUNC_ATTR_READNONE); 4947 return; 4948 } 4949 4950 switch (opcode) { 4951 case TGSI_OPCODE_TXF: 4952 name = target == TGSI_TEXTURE_2D_MSAA || 4953 target == TGSI_TEXTURE_2D_ARRAY_MSAA ? 4954 "llvm.SI.image.load" : 4955 "llvm.SI.image.load.mip"; 4956 is_shadow = false; 4957 has_offset = false; 4958 break; 4959 case TGSI_OPCODE_LODQ: 4960 name = "llvm.SI.getlod"; 4961 is_shadow = false; 4962 has_offset = false; 4963 break; 4964 case TGSI_OPCODE_TEX: 4965 case TGSI_OPCODE_TEX2: 4966 case TGSI_OPCODE_TXP: 4967 if (ctx->type != PIPE_SHADER_FRAGMENT) 4968 infix = ".lz"; 4969 break; 4970 case TGSI_OPCODE_TXB: 4971 case TGSI_OPCODE_TXB2: 4972 assert(ctx->type == PIPE_SHADER_FRAGMENT); 4973 infix = ".b"; 4974 break; 4975 case TGSI_OPCODE_TXL: 4976 case TGSI_OPCODE_TXL2: 4977 infix = ".l"; 4978 break; 4979 case TGSI_OPCODE_TXD: 4980 infix = ".d"; 4981 break; 4982 case TGSI_OPCODE_TG4: 4983 name = "llvm.SI.gather4"; 4984 infix = ".lz"; 4985 break; 4986 default: 4987 assert(0); 4988 return; 4989 } 4990 4991 /* Add the type and suffixes .c, .o if needed. */ 4992 build_type_name_for_intr(LLVMTypeOf(emit_data->args[0]), type, sizeof(type)); 4993 sprintf(intr_name, "%s%s%s%s.%s", 4994 name, is_shadow ? ".c" : "", infix, 4995 has_offset ? ".o" : "", type); 4996 4997 /* The hardware needs special lowering for Gather4 with integer formats. 
*/ 4998 if (opcode == TGSI_OPCODE_TG4) { 4999 struct tgsi_shader_info *info = &ctx->shader->selector->info; 5000 /* This will also work with non-constant indexing because of how 5001 * glsl_to_tgsi works and we intent to preserve that behavior. 5002 */ 5003 const unsigned src_idx = 2; 5004 unsigned sampler = inst->Src[src_idx].Register.Index; 5005 5006 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER); 5007 5008 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT || 5009 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT) { 5010 /* Texture coordinates start after: 5011 * {offset, bias, z-compare, derivatives} 5012 * Only the offset and z-compare can occur here. 5013 */ 5014 si_lower_gather4_integer(ctx, emit_data, intr_name, 5015 (int)has_offset + (int)is_shadow); 5016 return; 5017 } 5018 } 5019 5020 emit_data->output[emit_data->chan] = lp_build_intrinsic( 5021 base->gallivm->builder, intr_name, emit_data->dst_type, 5022 emit_data->args, emit_data->arg_count, 5023 LP_FUNC_ATTR_READNONE); 5024 } 5025 5026 static void si_llvm_emit_txqs( 5027 const struct lp_build_tgsi_action *action, 5028 struct lp_build_tgsi_context *bld_base, 5029 struct lp_build_emit_data *emit_data) 5030 { 5031 struct si_shader_context *ctx = si_shader_context(bld_base); 5032 struct gallivm_state *gallivm = bld_base->base.gallivm; 5033 LLVMBuilderRef builder = gallivm->builder; 5034 LLVMValueRef res, samples; 5035 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; 5036 5037 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr); 5038 5039 5040 /* Read the samples from the descriptor directly. 
*/ 5041 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, ""); 5042 samples = LLVMBuildExtractElement( 5043 builder, res, 5044 lp_build_const_int32(gallivm, 3), ""); 5045 samples = LLVMBuildLShr(builder, samples, 5046 lp_build_const_int32(gallivm, 16), ""); 5047 samples = LLVMBuildAnd(builder, samples, 5048 lp_build_const_int32(gallivm, 0xf), ""); 5049 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), 5050 samples, ""); 5051 5052 emit_data->output[emit_data->chan] = samples; 5053 } 5054 5055 /* 5056 * SI implements derivatives using the local data store (LDS) 5057 * All writes to the LDS happen in all executing threads at 5058 * the same time. TID is the Thread ID for the current 5059 * thread and is a value between 0 and 63, representing 5060 * the thread's position in the wavefront. 5061 * 5062 * For the pixel shader threads are grouped into quads of four pixels. 5063 * The TIDs of the pixels of a quad are: 5064 * 5065 * +------+------+ 5066 * |4n + 0|4n + 1| 5067 * +------+------+ 5068 * |4n + 2|4n + 3| 5069 * +------+------+ 5070 * 5071 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel 5072 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of 5073 * the current pixel's column, and masking with 0xfffffffe yields the TID 5074 * of the left pixel of the current pixel's row. 5075 * 5076 * Adding 1 yields the TID of the pixel to the right of the left pixel, and 5077 * adding 2 yields the TID of the pixel below the top pixel. 5078 */ 5079 /* masks for thread ID. 
*/ 5080 #define TID_MASK_TOP_LEFT 0xfffffffc 5081 #define TID_MASK_TOP 0xfffffffd 5082 #define TID_MASK_LEFT 0xfffffffe 5083 5084 static void si_llvm_emit_ddxy( 5085 const struct lp_build_tgsi_action *action, 5086 struct lp_build_tgsi_context *bld_base, 5087 struct lp_build_emit_data *emit_data) 5088 { 5089 struct si_shader_context *ctx = si_shader_context(bld_base); 5090 struct gallivm_state *gallivm = bld_base->base.gallivm; 5091 unsigned opcode = emit_data->info->opcode; 5092 LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, val, args[2]; 5093 int idx; 5094 unsigned mask; 5095 5096 thread_id = get_thread_id(ctx); 5097 5098 if (opcode == TGSI_OPCODE_DDX_FINE) 5099 mask = TID_MASK_LEFT; 5100 else if (opcode == TGSI_OPCODE_DDY_FINE) 5101 mask = TID_MASK_TOP; 5102 else 5103 mask = TID_MASK_TOP_LEFT; 5104 5105 tl_tid = LLVMBuildAnd(gallivm->builder, thread_id, 5106 lp_build_const_int32(gallivm, mask), ""); 5107 5108 /* for DDX we want to next X pixel, DDY next Y pixel. */ 5109 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 
1 : 2; 5110 trbl_tid = LLVMBuildAdd(gallivm->builder, tl_tid, 5111 lp_build_const_int32(gallivm, idx), ""); 5112 5113 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, ""); 5114 5115 if (ctx->screen->has_ds_bpermute) { 5116 args[0] = LLVMBuildMul(gallivm->builder, tl_tid, 5117 lp_build_const_int32(gallivm, 4), ""); 5118 args[1] = val; 5119 tl = lp_build_intrinsic(gallivm->builder, 5120 "llvm.amdgcn.ds.bpermute", ctx->i32, 5121 args, 2, LP_FUNC_ATTR_READNONE); 5122 5123 args[0] = LLVMBuildMul(gallivm->builder, trbl_tid, 5124 lp_build_const_int32(gallivm, 4), ""); 5125 trbl = lp_build_intrinsic(gallivm->builder, 5126 "llvm.amdgcn.ds.bpermute", ctx->i32, 5127 args, 2, LP_FUNC_ATTR_READNONE); 5128 } else { 5129 LLVMValueRef store_ptr, load_ptr0, load_ptr1; 5130 5131 store_ptr = build_gep0(ctx, ctx->lds, thread_id); 5132 load_ptr0 = build_gep0(ctx, ctx->lds, tl_tid); 5133 load_ptr1 = build_gep0(ctx, ctx->lds, trbl_tid); 5134 5135 LLVMBuildStore(gallivm->builder, val, store_ptr); 5136 tl = LLVMBuildLoad(gallivm->builder, load_ptr0, ""); 5137 trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, ""); 5138 } 5139 5140 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, ""); 5141 trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, ""); 5142 5143 emit_data->output[emit_data->chan] = 5144 LLVMBuildFSub(gallivm->builder, trbl, tl, ""); 5145 } 5146 5147 /* 5148 * this takes an I,J coordinate pair, 5149 * and works out the X and Y derivatives. 5150 * it returns DDX(I), DDX(J), DDY(I), DDY(J). 
5151 */ 5152 static LLVMValueRef si_llvm_emit_ddxy_interp( 5153 struct lp_build_tgsi_context *bld_base, 5154 LLVMValueRef interp_ij) 5155 { 5156 struct si_shader_context *ctx = si_shader_context(bld_base); 5157 struct gallivm_state *gallivm = bld_base->base.gallivm; 5158 LLVMValueRef result[4], a; 5159 unsigned i; 5160 5161 for (i = 0; i < 2; i++) { 5162 a = LLVMBuildExtractElement(gallivm->builder, interp_ij, 5163 LLVMConstInt(ctx->i32, i, 0), ""); 5164 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a); 5165 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a); 5166 } 5167 5168 return lp_build_gather_values(gallivm, result, 4); 5169 } 5170 5171 static void interp_fetch_args( 5172 struct lp_build_tgsi_context *bld_base, 5173 struct lp_build_emit_data *emit_data) 5174 { 5175 struct si_shader_context *ctx = si_shader_context(bld_base); 5176 struct gallivm_state *gallivm = bld_base->base.gallivm; 5177 const struct tgsi_full_instruction *inst = emit_data->inst; 5178 5179 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { 5180 /* offset is in second src, first two channels */ 5181 emit_data->args[0] = lp_build_emit_fetch(bld_base, 5182 emit_data->inst, 1, 5183 TGSI_CHAN_X); 5184 emit_data->args[1] = lp_build_emit_fetch(bld_base, 5185 emit_data->inst, 1, 5186 TGSI_CHAN_Y); 5187 emit_data->arg_count = 2; 5188 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 5189 LLVMValueRef sample_position; 5190 LLVMValueRef sample_id; 5191 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f); 5192 5193 /* fetch sample ID, then fetch its sample position, 5194 * and place into first two channels. 
5195 */ 5196 sample_id = lp_build_emit_fetch(bld_base, 5197 emit_data->inst, 1, TGSI_CHAN_X); 5198 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id, 5199 ctx->i32, ""); 5200 sample_position = load_sample_position(ctx, sample_id); 5201 5202 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder, 5203 sample_position, 5204 lp_build_const_int32(gallivm, 0), ""); 5205 5206 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, ""); 5207 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder, 5208 sample_position, 5209 lp_build_const_int32(gallivm, 1), ""); 5210 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, ""); 5211 emit_data->arg_count = 2; 5212 } 5213 } 5214 5215 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action, 5216 struct lp_build_tgsi_context *bld_base, 5217 struct lp_build_emit_data *emit_data) 5218 { 5219 struct si_shader_context *ctx = si_shader_context(bld_base); 5220 struct si_shader *shader = ctx->shader; 5221 struct gallivm_state *gallivm = bld_base->base.gallivm; 5222 struct lp_build_context *uint = &bld_base->uint_bld; 5223 LLVMValueRef interp_param; 5224 const struct tgsi_full_instruction *inst = emit_data->inst; 5225 int input_index = inst->Src[0].Register.Index; 5226 int chan; 5227 int i; 5228 LLVMValueRef attr_number; 5229 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK); 5230 int interp_param_idx; 5231 unsigned interp = shader->selector->info.input_interpolate[input_index]; 5232 unsigned location; 5233 5234 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT); 5235 5236 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 5237 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) 5238 location = TGSI_INTERPOLATE_LOC_CENTER; 5239 else 5240 location = TGSI_INTERPOLATE_LOC_CENTROID; 5241 5242 interp_param_idx = lookup_interp_param_index(interp, location); 5243 if (interp_param_idx == -1) 5244 return; 5245 else if 
(interp_param_idx) 5246 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx); 5247 else 5248 interp_param = NULL; 5249 5250 attr_number = lp_build_const_int32(gallivm, input_index); 5251 5252 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || 5253 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { 5254 LLVMValueRef ij_out[2]; 5255 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param); 5256 5257 /* 5258 * take the I then J parameters, and the DDX/Y for it, and 5259 * calculate the IJ inputs for the interpolator. 5260 * temp1 = ddx * offset/sample.x + I; 5261 * interp_param.I = ddy * offset/sample.y + temp1; 5262 * temp1 = ddx * offset/sample.x + J; 5263 * interp_param.J = ddy * offset/sample.y + temp1; 5264 */ 5265 for (i = 0; i < 2; i++) { 5266 LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i); 5267 LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2); 5268 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder, 5269 ddxy_out, ix_ll, ""); 5270 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder, 5271 ddxy_out, iy_ll, ""); 5272 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder, 5273 interp_param, ix_ll, ""); 5274 LLVMValueRef temp1, temp2; 5275 5276 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el, 5277 ctx->f32, ""); 5278 5279 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], ""); 5280 5281 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, ""); 5282 5283 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], ""); 5284 5285 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, ""); 5286 } 5287 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2); 5288 } 5289 5290 for (chan = 0; chan < 4; chan++) { 5291 LLVMValueRef llvm_chan; 5292 unsigned schan; 5293 5294 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan); 5295 llvm_chan = lp_build_const_int32(gallivm, schan); 5296 5297 if (interp_param) { 
5298 interp_param = LLVMBuildBitCast(gallivm->builder, 5299 interp_param, LLVMVectorType(ctx->f32, 2), ""); 5300 LLVMValueRef i = LLVMBuildExtractElement( 5301 gallivm->builder, interp_param, uint->zero, ""); 5302 LLVMValueRef j = LLVMBuildExtractElement( 5303 gallivm->builder, interp_param, uint->one, ""); 5304 emit_data->output[chan] = build_fs_interp(bld_base, 5305 llvm_chan, attr_number, params, 5306 i, j); 5307 } else { 5308 emit_data->output[chan] = build_fs_interp_mov(bld_base, 5309 lp_build_const_int32(gallivm, 2), /* P0 */ 5310 llvm_chan, attr_number, params); 5311 } 5312 } 5313 } 5314 5315 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base, 5316 struct lp_build_emit_data *emit_data) 5317 { 5318 struct si_shader_context *ctx = si_shader_context(bld_base); 5319 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register; 5320 LLVMValueRef imm; 5321 unsigned stream; 5322 5323 assert(src0.File == TGSI_FILE_IMMEDIATE); 5324 5325 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX]; 5326 stream = LLVMConstIntGetZExtValue(imm) & 0x3; 5327 return stream; 5328 } 5329 5330 /* Emit one vertex from the geometry shader */ 5331 static void si_llvm_emit_vertex( 5332 const struct lp_build_tgsi_action *action, 5333 struct lp_build_tgsi_context *bld_base, 5334 struct lp_build_emit_data *emit_data) 5335 { 5336 struct si_shader_context *ctx = si_shader_context(bld_base); 5337 struct lp_build_context *uint = &bld_base->uint_bld; 5338 struct si_shader *shader = ctx->shader; 5339 struct tgsi_shader_info *info = &shader->selector->info; 5340 struct gallivm_state *gallivm = bld_base->base.gallivm; 5341 struct lp_build_if_state if_state; 5342 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, 5343 SI_PARAM_GS2VS_OFFSET); 5344 LLVMValueRef gs_next_vertex; 5345 LLVMValueRef can_emit, kill; 5346 LLVMValueRef args[2]; 5347 unsigned chan, offset; 5348 int i; 5349 unsigned stream; 5350 5351 stream = si_llvm_get_stream(bld_base, emit_data); 5352 5353 
/* Write vertex attribute values to GSVS ring */ 5354 gs_next_vertex = LLVMBuildLoad(gallivm->builder, 5355 ctx->gs_next_vertex[stream], 5356 ""); 5357 5358 /* If this thread has already emitted the declared maximum number of 5359 * vertices, skip the write: excessive vertex emissions are not 5360 * supposed to have any effect. 5361 * 5362 * If the shader has no writes to memory, kill it instead. This skips 5363 * further memory loads and may allow LLVM to skip to the end 5364 * altogether. 5365 */ 5366 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex, 5367 lp_build_const_int32(gallivm, 5368 shader->selector->gs_max_out_vertices), ""); 5369 5370 bool use_kill = !info->writes_memory; 5371 if (use_kill) { 5372 kill = lp_build_select(&bld_base->base, can_emit, 5373 lp_build_const_float(gallivm, 1.0f), 5374 lp_build_const_float(gallivm, -1.0f)); 5375 5376 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", 5377 ctx->voidt, &kill, 1, 0); 5378 } else { 5379 lp_build_if(&if_state, gallivm, can_emit); 5380 } 5381 5382 offset = 0; 5383 for (i = 0; i < info->num_outputs; i++) { 5384 LLVMValueRef *out_ptr = ctx->outputs[i]; 5385 5386 for (chan = 0; chan < 4; chan++) { 5387 if (!(info->output_usagemask[i] & (1 << chan)) || 5388 ((info->output_streams[i] >> (2 * chan)) & 3) != stream) 5389 continue; 5390 5391 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); 5392 LLVMValueRef voffset = 5393 lp_build_const_int32(gallivm, offset * 5394 shader->selector->gs_max_out_vertices); 5395 offset++; 5396 5397 voffset = lp_build_add(uint, voffset, gs_next_vertex); 5398 voffset = lp_build_mul_imm(uint, voffset, 4); 5399 5400 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, ""); 5401 5402 build_tbuffer_store(ctx, 5403 ctx->gsvs_ring[stream], 5404 out_val, 1, 5405 voffset, soffset, 0, 5406 V_008F0C_BUF_DATA_FORMAT_32, 5407 V_008F0C_BUF_NUM_FORMAT_UINT, 5408 1, 0, 1, 1, 0); 5409 } 5410 } 5411 5412 gs_next_vertex = 
lp_build_add(uint, gs_next_vertex, 5413 lp_build_const_int32(gallivm, 1)); 5414 5415 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]); 5416 5417 /* Signal vertex emission */ 5418 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8)); 5419 args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); 5420 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", 5421 ctx->voidt, args, 2, 0); 5422 5423 if (!use_kill) 5424 lp_build_endif(&if_state); 5425 } 5426 5427 /* Cut one primitive from the geometry shader */ 5428 static void si_llvm_emit_primitive( 5429 const struct lp_build_tgsi_action *action, 5430 struct lp_build_tgsi_context *bld_base, 5431 struct lp_build_emit_data *emit_data) 5432 { 5433 struct si_shader_context *ctx = si_shader_context(bld_base); 5434 struct gallivm_state *gallivm = bld_base->base.gallivm; 5435 LLVMValueRef args[2]; 5436 unsigned stream; 5437 5438 /* Signal primitive cut */ 5439 stream = si_llvm_get_stream(bld_base, emit_data); 5440 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8)); 5441 args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); 5442 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", 5443 ctx->voidt, args, 2, 0); 5444 } 5445 5446 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, 5447 struct lp_build_tgsi_context *bld_base, 5448 struct lp_build_emit_data *emit_data) 5449 { 5450 struct si_shader_context *ctx = si_shader_context(bld_base); 5451 struct gallivm_state *gallivm = bld_base->base.gallivm; 5452 5453 /* SI only (thanks to a hw bug workaround): 5454 * The real barrier instruction isnt needed, because an entire patch 5455 * always fits into a single wave. 
5456 */ 5457 if (HAVE_LLVM >= 0x0309 && 5458 ctx->screen->b.chip_class == SI && 5459 ctx->type == PIPE_SHADER_TESS_CTRL) { 5460 emit_waitcnt(ctx, LGKM_CNT & VM_CNT); 5461 return; 5462 } 5463 5464 lp_build_intrinsic(gallivm->builder, 5465 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier" 5466 : "llvm.AMDGPU.barrier.local", 5467 ctx->voidt, NULL, 0, 0); 5468 } 5469 5470 static const struct lp_build_tgsi_action tex_action = { 5471 .fetch_args = tex_fetch_args, 5472 .emit = build_tex_intrinsic, 5473 }; 5474 5475 static const struct lp_build_tgsi_action interp_action = { 5476 .fetch_args = interp_fetch_args, 5477 .emit = build_interp_intrinsic, 5478 }; 5479 5480 static void si_create_function(struct si_shader_context *ctx, 5481 const char *name, 5482 LLVMTypeRef *returns, unsigned num_returns, 5483 LLVMTypeRef *params, unsigned num_params, 5484 int last_sgpr) 5485 { 5486 int i; 5487 5488 si_llvm_create_func(ctx, name, returns, num_returns, 5489 params, num_params); 5490 si_llvm_shader_type(ctx->main_fn, ctx->type); 5491 ctx->return_value = LLVMGetUndef(ctx->return_type); 5492 5493 for (i = 0; i <= last_sgpr; ++i) { 5494 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); 5495 5496 /* The combination of: 5497 * - ByVal 5498 * - dereferenceable 5499 * - invariant.load 5500 * allows the optimization passes to move loads and reduces 5501 * SGPR spilling significantly. 5502 */ 5503 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { 5504 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL); 5505 lp_add_attr_dereferenceable(P, UINT64_MAX); 5506 } else 5507 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG); 5508 } 5509 5510 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) { 5511 /* These were copied from some LLVM test. 
*/ 5512 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 5513 "less-precise-fpmad", 5514 "true"); 5515 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 5516 "no-infs-fp-math", 5517 "true"); 5518 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 5519 "no-nans-fp-math", 5520 "true"); 5521 LLVMAddTargetDependentFunctionAttr(ctx->main_fn, 5522 "unsafe-fp-math", 5523 "true"); 5524 } 5525 } 5526 5527 static void create_meta_data(struct si_shader_context *ctx) 5528 { 5529 struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; 5530 5531 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context, 5532 "invariant.load", 14); 5533 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context, 5534 "range", 5); 5535 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context, 5536 "amdgpu.uniform", 14); 5537 5538 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0); 5539 } 5540 5541 static void declare_streamout_params(struct si_shader_context *ctx, 5542 struct pipe_stream_output_info *so, 5543 LLVMTypeRef *params, LLVMTypeRef i32, 5544 unsigned *num_params) 5545 { 5546 int i; 5547 5548 /* Streamout SGPRs. */ 5549 if (so->num_outputs) { 5550 if (ctx->type != PIPE_SHADER_TESS_EVAL) 5551 params[ctx->param_streamout_config = (*num_params)++] = i32; 5552 else 5553 ctx->param_streamout_config = ctx->param_tess_offchip; 5554 5555 params[ctx->param_streamout_write_index = (*num_params)++] = i32; 5556 } 5557 /* A streamout buffer offset is loaded if the stride is non-zero. 
*/ 5558 for (i = 0; i < 4; i++) { 5559 if (!so->stride[i]) 5560 continue; 5561 5562 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32; 5563 } 5564 } 5565 5566 static unsigned llvm_get_type_size(LLVMTypeRef type) 5567 { 5568 LLVMTypeKind kind = LLVMGetTypeKind(type); 5569 5570 switch (kind) { 5571 case LLVMIntegerTypeKind: 5572 return LLVMGetIntTypeWidth(type) / 8; 5573 case LLVMFloatTypeKind: 5574 return 4; 5575 case LLVMPointerTypeKind: 5576 return 8; 5577 case LLVMVectorTypeKind: 5578 return LLVMGetVectorSize(type) * 5579 llvm_get_type_size(LLVMGetElementType(type)); 5580 case LLVMArrayTypeKind: 5581 return LLVMGetArrayLength(type) * 5582 llvm_get_type_size(LLVMGetElementType(type)); 5583 default: 5584 assert(0); 5585 return 0; 5586 } 5587 } 5588 5589 static void declare_tess_lds(struct si_shader_context *ctx) 5590 { 5591 struct gallivm_state *gallivm = &ctx->gallivm; 5592 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 5593 struct lp_build_context *uint = &bld_base->uint_bld; 5594 5595 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768; 5596 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, uint->zero, 5597 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE), 5598 "tess_lds"); 5599 } 5600 5601 static unsigned si_get_max_workgroup_size(struct si_shader *shader) 5602 { 5603 const unsigned *properties = shader->selector->info.properties; 5604 unsigned max_work_group_size = 5605 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * 5606 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * 5607 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; 5608 5609 if (!max_work_group_size) { 5610 /* This is a variable group size compute shader, 5611 * compile it for the maximum possible group size. 
5612 */ 5613 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; 5614 } 5615 return max_work_group_size; 5616 } 5617 5618 static void create_function(struct si_shader_context *ctx) 5619 { 5620 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 5621 struct gallivm_state *gallivm = bld_base->base.gallivm; 5622 struct si_shader *shader = ctx->shader; 5623 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32; 5624 LLVMTypeRef returns[16+32*4]; 5625 unsigned i, last_sgpr, num_params, num_return_sgprs; 5626 unsigned num_returns = 0; 5627 unsigned num_prolog_vgprs = 0; 5628 5629 v3i32 = LLVMVectorType(ctx->i32, 3); 5630 5631 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); 5632 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS); 5633 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS); 5634 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES); 5635 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS); 5636 5637 switch (ctx->type) { 5638 case PIPE_SHADER_VERTEX: 5639 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS); 5640 params[SI_PARAM_BASE_VERTEX] = ctx->i32; 5641 params[SI_PARAM_START_INSTANCE] = ctx->i32; 5642 params[SI_PARAM_DRAWID] = ctx->i32; 5643 num_params = SI_PARAM_DRAWID+1; 5644 5645 if (shader->key.as_es) { 5646 params[ctx->param_es2gs_offset = num_params++] = ctx->i32; 5647 } else if (shader->key.as_ls) { 5648 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32; 5649 num_params = SI_PARAM_LS_OUT_LAYOUT+1; 5650 } else { 5651 if (shader->is_gs_copy_shader) { 5652 num_params = SI_PARAM_RW_BUFFERS+1; 5653 } else { 5654 params[SI_PARAM_VS_STATE_BITS] = ctx->i32; 5655 num_params = SI_PARAM_VS_STATE_BITS+1; 5656 } 5657 5658 /* The locations of the other parameters are assigned dynamically. 
*/ 5659 declare_streamout_params(ctx, &shader->selector->so, 5660 params, ctx->i32, &num_params); 5661 } 5662 5663 last_sgpr = num_params-1; 5664 5665 /* VGPRs */ 5666 params[ctx->param_vertex_id = num_params++] = ctx->i32; 5667 params[ctx->param_rel_auto_id = num_params++] = ctx->i32; 5668 params[ctx->param_vs_prim_id = num_params++] = ctx->i32; 5669 params[ctx->param_instance_id = num_params++] = ctx->i32; 5670 5671 if (!shader->is_gs_copy_shader) { 5672 /* Vertex load indices. */ 5673 ctx->param_vertex_index0 = num_params; 5674 5675 for (i = 0; i < shader->selector->info.num_inputs; i++) 5676 params[num_params++] = ctx->i32; 5677 5678 num_prolog_vgprs += shader->selector->info.num_inputs; 5679 5680 /* PrimitiveID output. */ 5681 if (!shader->key.as_es && !shader->key.as_ls) 5682 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) 5683 returns[num_returns++] = ctx->f32; 5684 } 5685 break; 5686 5687 case PIPE_SHADER_TESS_CTRL: 5688 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; 5689 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; 5690 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; 5691 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32; 5692 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32; 5693 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32; 5694 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; 5695 5696 /* VGPRs */ 5697 params[SI_PARAM_PATCH_ID] = ctx->i32; 5698 params[SI_PARAM_REL_IDS] = ctx->i32; 5699 num_params = SI_PARAM_REL_IDS+1; 5700 5701 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are 5702 * placed after the user SGPRs. 
5703 */ 5704 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++) 5705 returns[num_returns++] = ctx->i32; /* SGPRs */ 5706 5707 for (i = 0; i < 3; i++) 5708 returns[num_returns++] = ctx->f32; /* VGPRs */ 5709 break; 5710 5711 case PIPE_SHADER_TESS_EVAL: 5712 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; 5713 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1; 5714 5715 if (shader->key.as_es) { 5716 params[ctx->param_oc_lds = num_params++] = ctx->i32; 5717 params[ctx->param_tess_offchip = num_params++] = ctx->i32; 5718 params[ctx->param_es2gs_offset = num_params++] = ctx->i32; 5719 } else { 5720 params[ctx->param_tess_offchip = num_params++] = ctx->i32; 5721 declare_streamout_params(ctx, &shader->selector->so, 5722 params, ctx->i32, &num_params); 5723 params[ctx->param_oc_lds = num_params++] = ctx->i32; 5724 } 5725 last_sgpr = num_params - 1; 5726 5727 /* VGPRs */ 5728 params[ctx->param_tes_u = num_params++] = ctx->f32; 5729 params[ctx->param_tes_v = num_params++] = ctx->f32; 5730 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32; 5731 params[ctx->param_tes_patch_id = num_params++] = ctx->i32; 5732 5733 /* PrimitiveID output. 
*/ 5734 if (!shader->key.as_es) 5735 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++) 5736 returns[num_returns++] = ctx->f32; 5737 break; 5738 5739 case PIPE_SHADER_GEOMETRY: 5740 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32; 5741 params[SI_PARAM_GS_WAVE_ID] = ctx->i32; 5742 last_sgpr = SI_PARAM_GS_WAVE_ID; 5743 5744 /* VGPRs */ 5745 params[SI_PARAM_VTX0_OFFSET] = ctx->i32; 5746 params[SI_PARAM_VTX1_OFFSET] = ctx->i32; 5747 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32; 5748 params[SI_PARAM_VTX2_OFFSET] = ctx->i32; 5749 params[SI_PARAM_VTX3_OFFSET] = ctx->i32; 5750 params[SI_PARAM_VTX4_OFFSET] = ctx->i32; 5751 params[SI_PARAM_VTX5_OFFSET] = ctx->i32; 5752 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32; 5753 num_params = SI_PARAM_GS_INSTANCE_ID+1; 5754 break; 5755 5756 case PIPE_SHADER_FRAGMENT: 5757 params[SI_PARAM_ALPHA_REF] = ctx->f32; 5758 params[SI_PARAM_PRIM_MASK] = ctx->i32; 5759 last_sgpr = SI_PARAM_PRIM_MASK; 5760 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32; 5761 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32; 5762 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32; 5763 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32; 5764 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32; 5765 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32; 5766 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32; 5767 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32; 5768 params[SI_PARAM_POS_X_FLOAT] = ctx->f32; 5769 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32; 5770 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32; 5771 params[SI_PARAM_POS_W_FLOAT] = ctx->f32; 5772 params[SI_PARAM_FRONT_FACE] = ctx->i32; 5773 shader->info.face_vgpr_index = 20; 5774 params[SI_PARAM_ANCILLARY] = ctx->i32; 5775 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32; 5776 params[SI_PARAM_POS_FIXED_PT] = ctx->i32; 5777 num_params = SI_PARAM_POS_FIXED_PT+1; 5778 5779 /* Color inputs from the prolog. 
*/ 5780 if (shader->selector->info.colors_read) { 5781 unsigned num_color_elements = 5782 util_bitcount(shader->selector->info.colors_read); 5783 5784 assert(num_params + num_color_elements <= ARRAY_SIZE(params)); 5785 for (i = 0; i < num_color_elements; i++) 5786 params[num_params++] = ctx->f32; 5787 5788 num_prolog_vgprs += num_color_elements; 5789 } 5790 5791 /* Outputs for the epilog. */ 5792 num_return_sgprs = SI_SGPR_ALPHA_REF + 1; 5793 num_returns = 5794 num_return_sgprs + 5795 util_bitcount(shader->selector->info.colors_written) * 4 + 5796 shader->selector->info.writes_z + 5797 shader->selector->info.writes_stencil + 5798 shader->selector->info.writes_samplemask + 5799 1 /* SampleMaskIn */; 5800 5801 num_returns = MAX2(num_returns, 5802 num_return_sgprs + 5803 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 5804 5805 for (i = 0; i < num_return_sgprs; i++) 5806 returns[i] = ctx->i32; 5807 for (; i < num_returns; i++) 5808 returns[i] = ctx->f32; 5809 break; 5810 5811 case PIPE_SHADER_COMPUTE: 5812 params[SI_PARAM_GRID_SIZE] = v3i32; 5813 params[SI_PARAM_BLOCK_SIZE] = v3i32; 5814 params[SI_PARAM_BLOCK_ID] = v3i32; 5815 last_sgpr = SI_PARAM_BLOCK_ID; 5816 5817 params[SI_PARAM_THREAD_ID] = v3i32; 5818 num_params = SI_PARAM_THREAD_ID + 1; 5819 break; 5820 default: 5821 assert(0 && "unimplemented shader"); 5822 return; 5823 } 5824 5825 assert(num_params <= ARRAY_SIZE(params)); 5826 5827 si_create_function(ctx, "main", returns, num_returns, params, 5828 num_params, last_sgpr); 5829 5830 /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ 5831 if (ctx->type == PIPE_SHADER_FRAGMENT && 5832 ctx->separate_prolog) { 5833 si_llvm_add_attribute(ctx->main_fn, 5834 "InitialPSInputAddr", 5835 S_0286D0_PERSP_SAMPLE_ENA(1) | 5836 S_0286D0_PERSP_CENTER_ENA(1) | 5837 S_0286D0_PERSP_CENTROID_ENA(1) | 5838 S_0286D0_LINEAR_SAMPLE_ENA(1) | 5839 S_0286D0_LINEAR_CENTER_ENA(1) | 5840 S_0286D0_LINEAR_CENTROID_ENA(1) | 5841 S_0286D0_FRONT_FACE_ENA(1) | 5842 S_0286D0_POS_FIXED_PT_ENA(1)); 5843 } else if (ctx->type == PIPE_SHADER_COMPUTE) { 5844 si_llvm_add_attribute(ctx->main_fn, 5845 "amdgpu-max-work-group-size", 5846 si_get_max_workgroup_size(shader)); 5847 } 5848 5849 shader->info.num_input_sgprs = 0; 5850 shader->info.num_input_vgprs = 0; 5851 5852 for (i = 0; i <= last_sgpr; ++i) 5853 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4; 5854 5855 for (; i < num_params; ++i) 5856 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4; 5857 5858 assert(shader->info.num_input_vgprs >= num_prolog_vgprs); 5859 shader->info.num_input_vgprs -= num_prolog_vgprs; 5860 5861 if (!ctx->screen->has_ds_bpermute && 5862 bld_base->info && 5863 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 || 5864 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 || 5865 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 || 5866 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 || 5867 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 || 5868 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0)) 5869 ctx->lds = 5870 LLVMAddGlobalInAddressSpace(gallivm->module, 5871 LLVMArrayType(ctx->i32, 64), 5872 "ddxy_lds", 5873 LOCAL_ADDR_SPACE); 5874 5875 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) || 5876 ctx->type == PIPE_SHADER_TESS_CTRL || 5877 ctx->type == PIPE_SHADER_TESS_EVAL) 5878 declare_tess_lds(ctx); 5879 } 5880 5881 /** 5882 * Load ESGS and GSVS ring buffer resource descriptors and save the variables 5883 * for later use. 
5884 */ 5885 static void preload_ring_buffers(struct si_shader_context *ctx) 5886 { 5887 struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; 5888 LLVMBuilderRef builder = gallivm->builder; 5889 5890 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, 5891 SI_PARAM_RW_BUFFERS); 5892 5893 if ((ctx->type == PIPE_SHADER_VERTEX && 5894 ctx->shader->key.as_es) || 5895 (ctx->type == PIPE_SHADER_TESS_EVAL && 5896 ctx->shader->key.as_es) || 5897 ctx->type == PIPE_SHADER_GEOMETRY) { 5898 unsigned ring = 5899 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS 5900 : SI_ES_RING_ESGS; 5901 LLVMValueRef offset = lp_build_const_int32(gallivm, ring); 5902 5903 ctx->esgs_ring = 5904 build_indexed_load_const(ctx, buf_ptr, offset); 5905 } 5906 5907 if (ctx->shader->is_gs_copy_shader) { 5908 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); 5909 5910 ctx->gsvs_ring[0] = 5911 build_indexed_load_const(ctx, buf_ptr, offset); 5912 } else if (ctx->type == PIPE_SHADER_GEOMETRY) { 5913 const struct si_shader_selector *sel = ctx->shader->selector; 5914 struct lp_build_context *uint = &ctx->bld_base.uint_bld; 5915 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS); 5916 LLVMValueRef base_ring; 5917 5918 base_ring = build_indexed_load_const(ctx, buf_ptr, offset); 5919 5920 /* The conceptual layout of the GSVS ring is 5921 * v0c0 .. vLv0 v0c1 .. vLc1 .. 5922 * but the real memory layout is swizzled across 5923 * threads: 5924 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL 5925 * t16v0c0 .. 5926 * Override the buffer descriptor accordingly. 
5927 */ 5928 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); 5929 uint64_t stream_offset = 0; 5930 5931 for (unsigned stream = 0; stream < 4; ++stream) { 5932 unsigned num_components; 5933 unsigned stride; 5934 unsigned num_records; 5935 LLVMValueRef ring, tmp; 5936 5937 num_components = sel->info.num_stream_output_components[stream]; 5938 if (!num_components) 5939 continue; 5940 5941 stride = 4 * num_components * sel->gs_max_out_vertices; 5942 5943 /* Limit on the stride field for <= CIK. */ 5944 assert(stride < (1 << 14)); 5945 5946 num_records = 64; 5947 5948 ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); 5949 tmp = LLVMBuildExtractElement(builder, ring, uint->zero, ""); 5950 tmp = LLVMBuildAdd(builder, tmp, 5951 LLVMConstInt(ctx->i64, 5952 stream_offset, 0), ""); 5953 stream_offset += stride * 64; 5954 5955 ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, ""); 5956 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); 5957 tmp = LLVMBuildExtractElement(builder, ring, uint->one, ""); 5958 tmp = LLVMBuildOr(builder, tmp, 5959 LLVMConstInt(ctx->i32, 5960 S_008F04_STRIDE(stride) | 5961 S_008F04_SWIZZLE_ENABLE(1), 0), ""); 5962 ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, ""); 5963 ring = LLVMBuildInsertElement(builder, ring, 5964 LLVMConstInt(ctx->i32, num_records, 0), 5965 LLVMConstInt(ctx->i32, 2, 0), ""); 5966 ring = LLVMBuildInsertElement(builder, ring, 5967 LLVMConstInt(ctx->i32, 5968 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 5969 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5970 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | 5971 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | 5972 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5973 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | 5974 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */ 5975 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ 5976 S_008F0C_ADD_TID_ENABLE(1), 5977 0), 5978 LLVMConstInt(ctx->i32, 3, 0), ""); 5979 ring = LLVMBuildBitCast(builder, ring, 
ctx->v16i8, ""); 5980 5981 ctx->gsvs_ring[stream] = ring; 5982 } 5983 } 5984 } 5985 5986 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, 5987 LLVMValueRef param_rw_buffers, 5988 unsigned param_pos_fixed_pt) 5989 { 5990 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 5991 struct gallivm_state *gallivm = bld_base->base.gallivm; 5992 LLVMBuilderRef builder = gallivm->builder; 5993 LLVMValueRef slot, desc, offset, row, bit, address[2]; 5994 5995 /* Use the fixed-point gl_FragCoord input. 5996 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits 5997 * per coordinate to get the repeating effect. 5998 */ 5999 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5); 6000 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5); 6001 6002 /* Load the buffer descriptor. */ 6003 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE); 6004 desc = build_indexed_load_const(ctx, param_rw_buffers, slot); 6005 6006 /* The stipple pattern is 32x32, each row has 32 bits. */ 6007 offset = LLVMBuildMul(builder, address[1], 6008 LLVMConstInt(ctx->i32, 4, 0), ""); 6009 row = buffer_load_const(ctx, desc, offset); 6010 row = LLVMBuildBitCast(builder, row, ctx->i32, ""); 6011 bit = LLVMBuildLShr(builder, row, address[0], ""); 6012 bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); 6013 6014 /* The intrinsic kills the thread if arg < 0. */ 6015 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0), 6016 LLVMConstReal(ctx->f32, -1), ""); 6017 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0); 6018 } 6019 6020 void si_shader_binary_read_config(struct radeon_shader_binary *binary, 6021 struct si_shader_config *conf, 6022 unsigned symbol_offset) 6023 { 6024 unsigned i; 6025 const unsigned char *config = 6026 radeon_shader_binary_config_start(binary, symbol_offset); 6027 bool really_needs_scratch = false; 6028 6029 /* LLVM adds SGPR spills to the scratch size. 
6030 * Find out if we really need the scratch buffer. 6031 */ 6032 for (i = 0; i < binary->reloc_count; i++) { 6033 const struct radeon_shader_reloc *reloc = &binary->relocs[i]; 6034 6035 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || 6036 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 6037 really_needs_scratch = true; 6038 break; 6039 } 6040 } 6041 6042 /* XXX: We may be able to emit some of these values directly rather than 6043 * extracting fields to be emitted later. 6044 */ 6045 6046 for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 6047 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); 6048 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 6049 switch (reg) { 6050 case R_00B028_SPI_SHADER_PGM_RSRC1_PS: 6051 case R_00B128_SPI_SHADER_PGM_RSRC1_VS: 6052 case R_00B228_SPI_SHADER_PGM_RSRC1_GS: 6053 case R_00B848_COMPUTE_PGM_RSRC1: 6054 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); 6055 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); 6056 conf->float_mode = G_00B028_FLOAT_MODE(value); 6057 conf->rsrc1 = value; 6058 break; 6059 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: 6060 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); 6061 break; 6062 case R_00B84C_COMPUTE_PGM_RSRC2: 6063 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); 6064 conf->rsrc2 = value; 6065 break; 6066 case R_0286CC_SPI_PS_INPUT_ENA: 6067 conf->spi_ps_input_ena = value; 6068 break; 6069 case R_0286D0_SPI_PS_INPUT_ADDR: 6070 conf->spi_ps_input_addr = value; 6071 break; 6072 case R_0286E8_SPI_TMPRING_SIZE: 6073 case R_00B860_COMPUTE_TMPRING_SIZE: 6074 /* WAVESIZE is in units of 256 dwords. 
*/ 6075 if (really_needs_scratch) 6076 conf->scratch_bytes_per_wave = 6077 G_00B860_WAVESIZE(value) * 256 * 4; 6078 break; 6079 case 0x4: /* SPILLED_SGPRS */ 6080 conf->spilled_sgprs = value; 6081 break; 6082 case 0x8: /* SPILLED_VGPRS */ 6083 conf->spilled_vgprs = value; 6084 break; 6085 default: 6086 { 6087 static bool printed; 6088 6089 if (!printed) { 6090 fprintf(stderr, "Warning: LLVM emitted unknown " 6091 "config register: 0x%x\n", reg); 6092 printed = true; 6093 } 6094 } 6095 break; 6096 } 6097 } 6098 6099 if (!conf->spi_ps_input_addr) 6100 conf->spi_ps_input_addr = conf->spi_ps_input_ena; 6101 } 6102 6103 void si_shader_apply_scratch_relocs(struct si_context *sctx, 6104 struct si_shader *shader, 6105 struct si_shader_config *config, 6106 uint64_t scratch_va) 6107 { 6108 unsigned i; 6109 uint32_t scratch_rsrc_dword0 = scratch_va; 6110 uint32_t scratch_rsrc_dword1 = 6111 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); 6112 6113 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE 6114 * correctly. 
6115 */ 6116 if (HAVE_LLVM >= 0x0309) 6117 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); 6118 else 6119 scratch_rsrc_dword1 |= 6120 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64); 6121 6122 for (i = 0 ; i < shader->binary.reloc_count; i++) { 6123 const struct radeon_shader_reloc *reloc = 6124 &shader->binary.relocs[i]; 6125 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { 6126 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 6127 &scratch_rsrc_dword0, 4); 6128 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { 6129 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset, 6130 &scratch_rsrc_dword1, 4); 6131 } 6132 } 6133 } 6134 6135 static unsigned si_get_shader_binary_size(struct si_shader *shader) 6136 { 6137 unsigned size = shader->binary.code_size; 6138 6139 if (shader->prolog) 6140 size += shader->prolog->binary.code_size; 6141 if (shader->epilog) 6142 size += shader->epilog->binary.code_size; 6143 return size; 6144 } 6145 6146 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) 6147 { 6148 const struct radeon_shader_binary *prolog = 6149 shader->prolog ? &shader->prolog->binary : NULL; 6150 const struct radeon_shader_binary *epilog = 6151 shader->epilog ? &shader->epilog->binary : NULL; 6152 const struct radeon_shader_binary *mainb = &shader->binary; 6153 unsigned bo_size = si_get_shader_binary_size(shader) + 6154 (!epilog ? mainb->rodata_size : 0); 6155 unsigned char *ptr; 6156 6157 assert(!prolog || !prolog->rodata_size); 6158 assert((!prolog && !epilog) || !mainb->rodata_size); 6159 assert(!epilog || !epilog->rodata_size); 6160 6161 r600_resource_reference(&shader->bo, NULL); 6162 shader->bo = (struct r600_resource*) 6163 pipe_buffer_create(&sscreen->b.b, 0, 6164 PIPE_USAGE_IMMUTABLE, bo_size); 6165 if (!shader->bo) 6166 return -ENOMEM; 6167 6168 /* Upload. 
*/ 6169 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL, 6170 PIPE_TRANSFER_READ_WRITE); 6171 6172 if (prolog) { 6173 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size); 6174 ptr += prolog->code_size; 6175 } 6176 6177 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size); 6178 ptr += mainb->code_size; 6179 6180 if (epilog) 6181 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size); 6182 else if (mainb->rodata_size > 0) 6183 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size); 6184 6185 sscreen->b.ws->buffer_unmap(shader->bo->buf); 6186 return 0; 6187 } 6188 6189 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary, 6190 struct pipe_debug_callback *debug, 6191 const char *name, FILE *file) 6192 { 6193 char *line, *p; 6194 unsigned i, count; 6195 6196 if (binary->disasm_string) { 6197 fprintf(file, "Shader %s disassembly:\n", name); 6198 fprintf(file, "%s", binary->disasm_string); 6199 6200 if (debug && debug->debug_message) { 6201 /* Very long debug messages are cut off, so send the 6202 * disassembly one line at a time. This causes more 6203 * overhead, but on the plus side it simplifies 6204 * parsing of resulting logs. 
6205 */ 6206 pipe_debug_message(debug, SHADER_INFO, 6207 "Shader Disassembly Begin"); 6208 6209 line = binary->disasm_string; 6210 while (*line) { 6211 p = util_strchrnul(line, '\n'); 6212 count = p - line; 6213 6214 if (count) { 6215 pipe_debug_message(debug, SHADER_INFO, 6216 "%.*s", count, line); 6217 } 6218 6219 if (!*p) 6220 break; 6221 line = p + 1; 6222 } 6223 6224 pipe_debug_message(debug, SHADER_INFO, 6225 "Shader Disassembly End"); 6226 } 6227 } else { 6228 fprintf(file, "Shader %s binary:\n", name); 6229 for (i = 0; i < binary->code_size; i += 4) { 6230 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, 6231 binary->code[i + 3], binary->code[i + 2], 6232 binary->code[i + 1], binary->code[i]); 6233 } 6234 } 6235 } 6236 6237 static void si_shader_dump_stats(struct si_screen *sscreen, 6238 struct si_shader *shader, 6239 struct pipe_debug_callback *debug, 6240 unsigned processor, 6241 FILE *file, 6242 bool check_debug_option) 6243 { 6244 struct si_shader_config *conf = &shader->config; 6245 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0; 6246 unsigned code_size = si_get_shader_binary_size(shader); 6247 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256; 6248 unsigned lds_per_wave = 0; 6249 unsigned max_simd_waves = 10; 6250 6251 /* Compute LDS usage for PS. */ 6252 switch (processor) { 6253 case PIPE_SHADER_FRAGMENT: 6254 /* The minimum usage per wave is (num_inputs * 48). The maximum 6255 * usage is (num_inputs * 48 * 16). 6256 * We can get anything in between and it varies between waves. 6257 * 6258 * The 48 bytes per input for a single primitive is equal to 6259 * 4 bytes/component * 4 components/input * 3 points. 6260 * 6261 * Other stages don't know the size at compile time or don't 6262 * allocate LDS per wave, but instead they do it per thread group. 
6263 */ 6264 lds_per_wave = conf->lds_size * lds_increment + 6265 align(num_inputs * 48, lds_increment); 6266 break; 6267 case PIPE_SHADER_COMPUTE: 6268 if (shader->selector) { 6269 unsigned max_workgroup_size = 6270 si_get_max_workgroup_size(shader); 6271 lds_per_wave = (conf->lds_size * lds_increment) / 6272 DIV_ROUND_UP(max_workgroup_size, 64); 6273 } 6274 break; 6275 } 6276 6277 /* Compute the per-SIMD wave counts. */ 6278 if (conf->num_sgprs) { 6279 if (sscreen->b.chip_class >= VI) 6280 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs); 6281 else 6282 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs); 6283 } 6284 6285 if (conf->num_vgprs) 6286 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs); 6287 6288 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above 6289 * 16KB makes some SIMDs unoccupied). */ 6290 if (lds_per_wave) 6291 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave); 6292 6293 if (!check_debug_option || 6294 r600_can_dump_shader(&sscreen->b, processor)) { 6295 if (processor == PIPE_SHADER_FRAGMENT) { 6296 fprintf(file, "*** SHADER CONFIG ***\n" 6297 "SPI_PS_INPUT_ADDR = 0x%04x\n" 6298 "SPI_PS_INPUT_ENA = 0x%04x\n", 6299 conf->spi_ps_input_addr, conf->spi_ps_input_ena); 6300 } 6301 6302 fprintf(file, "*** SHADER STATS ***\n" 6303 "SGPRS: %d\n" 6304 "VGPRS: %d\n" 6305 "Spilled SGPRs: %d\n" 6306 "Spilled VGPRs: %d\n" 6307 "Private memory VGPRs: %d\n" 6308 "Code Size: %d bytes\n" 6309 "LDS: %d blocks\n" 6310 "Scratch: %d bytes per wave\n" 6311 "Max Waves: %d\n" 6312 "********************\n\n\n", 6313 conf->num_sgprs, conf->num_vgprs, 6314 conf->spilled_sgprs, conf->spilled_vgprs, 6315 conf->private_mem_vgprs, code_size, 6316 conf->lds_size, conf->scratch_bytes_per_wave, 6317 max_simd_waves); 6318 } 6319 6320 pipe_debug_message(debug, SHADER_INFO, 6321 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " 6322 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " 6323 "Spilled VGPRs: %d PrivMem 
VGPRs: %d", 6324 conf->num_sgprs, conf->num_vgprs, code_size, 6325 conf->lds_size, conf->scratch_bytes_per_wave, 6326 max_simd_waves, conf->spilled_sgprs, 6327 conf->spilled_vgprs, conf->private_mem_vgprs); 6328 } 6329 6330 static const char *si_get_shader_name(struct si_shader *shader, 6331 unsigned processor) 6332 { 6333 switch (processor) { 6334 case PIPE_SHADER_VERTEX: 6335 if (shader->key.as_es) 6336 return "Vertex Shader as ES"; 6337 else if (shader->key.as_ls) 6338 return "Vertex Shader as LS"; 6339 else 6340 return "Vertex Shader as VS"; 6341 case PIPE_SHADER_TESS_CTRL: 6342 return "Tessellation Control Shader"; 6343 case PIPE_SHADER_TESS_EVAL: 6344 if (shader->key.as_es) 6345 return "Tessellation Evaluation Shader as ES"; 6346 else 6347 return "Tessellation Evaluation Shader as VS"; 6348 case PIPE_SHADER_GEOMETRY: 6349 if (shader->is_gs_copy_shader) 6350 return "GS Copy Shader as VS"; 6351 else 6352 return "Geometry Shader"; 6353 case PIPE_SHADER_FRAGMENT: 6354 return "Pixel Shader"; 6355 case PIPE_SHADER_COMPUTE: 6356 return "Compute Shader"; 6357 default: 6358 return "Unknown Shader"; 6359 } 6360 } 6361 6362 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, 6363 struct pipe_debug_callback *debug, unsigned processor, 6364 FILE *file, bool check_debug_option) 6365 { 6366 if (!check_debug_option || 6367 r600_can_dump_shader(&sscreen->b, processor)) 6368 si_dump_shader_key(processor, &shader->key, file); 6369 6370 if (!check_debug_option && shader->binary.llvm_ir_string) { 6371 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", 6372 si_get_shader_name(shader, processor)); 6373 fprintf(file, "%s\n", shader->binary.llvm_ir_string); 6374 } 6375 6376 if (!check_debug_option || 6377 (r600_can_dump_shader(&sscreen->b, processor) && 6378 !(sscreen->b.debug_flags & DBG_NO_ASM))) { 6379 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor)); 6380 6381 if (shader->prolog) 6382 si_shader_dump_disassembly(&shader->prolog->binary, 
6383 debug, "prolog", file); 6384 6385 si_shader_dump_disassembly(&shader->binary, debug, "main", file); 6386 6387 if (shader->epilog) 6388 si_shader_dump_disassembly(&shader->epilog->binary, 6389 debug, "epilog", file); 6390 fprintf(file, "\n"); 6391 } 6392 6393 si_shader_dump_stats(sscreen, shader, debug, processor, file, 6394 check_debug_option); 6395 } 6396 6397 int si_compile_llvm(struct si_screen *sscreen, 6398 struct radeon_shader_binary *binary, 6399 struct si_shader_config *conf, 6400 LLVMTargetMachineRef tm, 6401 LLVMModuleRef mod, 6402 struct pipe_debug_callback *debug, 6403 unsigned processor, 6404 const char *name) 6405 { 6406 int r = 0; 6407 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations); 6408 6409 if (r600_can_dump_shader(&sscreen->b, processor)) { 6410 fprintf(stderr, "radeonsi: Compiling shader %d\n", count); 6411 6412 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) { 6413 fprintf(stderr, "%s LLVM IR:\n\n", name); 6414 LLVMDumpModule(mod); 6415 fprintf(stderr, "\n"); 6416 } 6417 } 6418 6419 if (sscreen->record_llvm_ir) { 6420 char *ir = LLVMPrintModuleToString(mod); 6421 binary->llvm_ir_string = strdup(ir); 6422 LLVMDisposeMessage(ir); 6423 } 6424 6425 if (!si_replace_shader(count, binary)) { 6426 r = si_llvm_compile(mod, binary, tm, debug); 6427 if (r) 6428 return r; 6429 } 6430 6431 si_shader_binary_read_config(binary, conf, 0); 6432 6433 /* Enable 64-bit and 16-bit denormals, because there is no performance 6434 * cost. 6435 * 6436 * If denormals are enabled, all floating-point output modifiers are 6437 * ignored. 6438 * 6439 * Don't enable denormals for 32-bit floats, because: 6440 * - Floating-point output modifiers would be ignored by the hw. 6441 * - Some opcodes don't support denormals, such as v_mad_f32. We would 6442 * have to stop using those. 6443 * - SI & CI would be very slow. 
6444 */ 6445 conf->float_mode |= V_00B028_FP_64_DENORMS; 6446 6447 FREE(binary->config); 6448 FREE(binary->global_symbol_offsets); 6449 binary->config = NULL; 6450 binary->global_symbol_offsets = NULL; 6451 6452 /* Some shaders can't have rodata because their binaries can be 6453 * concatenated. 6454 */ 6455 if (binary->rodata_size && 6456 (processor == PIPE_SHADER_VERTEX || 6457 processor == PIPE_SHADER_TESS_CTRL || 6458 processor == PIPE_SHADER_TESS_EVAL || 6459 processor == PIPE_SHADER_FRAGMENT)) { 6460 fprintf(stderr, "radeonsi: The shader can't have rodata."); 6461 return -EINVAL; 6462 } 6463 6464 return r; 6465 } 6466 6467 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) 6468 { 6469 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) 6470 LLVMBuildRetVoid(ctx->gallivm.builder); 6471 else 6472 LLVMBuildRet(ctx->gallivm.builder, ret); 6473 } 6474 6475 /* Generate code for the hardware VS shader stage to go with a geometry shader */ 6476 struct si_shader * 6477 si_generate_gs_copy_shader(struct si_screen *sscreen, 6478 LLVMTargetMachineRef tm, 6479 struct si_shader_selector *gs_selector, 6480 struct pipe_debug_callback *debug) 6481 { 6482 struct si_shader_context ctx; 6483 struct si_shader *shader; 6484 struct gallivm_state *gallivm = &ctx.gallivm; 6485 LLVMBuilderRef builder; 6486 struct lp_build_tgsi_context *bld_base = &ctx.bld_base; 6487 struct lp_build_context *uint = &bld_base->uint_bld; 6488 struct si_shader_output_values *outputs; 6489 struct tgsi_shader_info *gsinfo = &gs_selector->info; 6490 LLVMValueRef args[9]; 6491 int i, r; 6492 6493 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0])); 6494 6495 if (!outputs) 6496 return NULL; 6497 6498 shader = CALLOC_STRUCT(si_shader); 6499 if (!shader) { 6500 FREE(outputs); 6501 return NULL; 6502 } 6503 6504 6505 shader->selector = gs_selector; 6506 shader->is_gs_copy_shader = true; 6507 6508 si_init_shader_ctx(&ctx, sscreen, shader, tm); 6509 ctx.type = 
PIPE_SHADER_VERTEX; 6510 6511 builder = gallivm->builder; 6512 6513 create_meta_data(&ctx); 6514 create_function(&ctx); 6515 preload_ring_buffers(&ctx); 6516 6517 args[0] = ctx.gsvs_ring[0]; 6518 args[1] = lp_build_mul_imm(uint, 6519 LLVMGetParam(ctx.main_fn, 6520 ctx.param_vertex_id), 6521 4); 6522 args[3] = uint->zero; 6523 args[4] = uint->one; /* OFFEN */ 6524 args[5] = uint->zero; /* IDXEN */ 6525 args[6] = uint->one; /* GLC */ 6526 args[7] = uint->one; /* SLC */ 6527 args[8] = uint->zero; /* TFE */ 6528 6529 /* Fetch the vertex stream ID.*/ 6530 LLVMValueRef stream_id; 6531 6532 if (gs_selector->so.num_outputs) 6533 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2); 6534 else 6535 stream_id = uint->zero; 6536 6537 /* Fill in output information. */ 6538 for (i = 0; i < gsinfo->num_outputs; ++i) { 6539 outputs[i].semantic_name = gsinfo->output_semantic_name[i]; 6540 outputs[i].semantic_index = gsinfo->output_semantic_index[i]; 6541 6542 for (int chan = 0; chan < 4; chan++) { 6543 outputs[i].vertex_stream[chan] = 6544 (gsinfo->output_streams[i] >> (2 * chan)) & 3; 6545 } 6546 } 6547 6548 LLVMBasicBlockRef end_bb; 6549 LLVMValueRef switch_inst; 6550 6551 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end"); 6552 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); 6553 6554 for (int stream = 0; stream < 4; stream++) { 6555 LLVMBasicBlockRef bb; 6556 unsigned offset; 6557 6558 if (!gsinfo->num_stream_output_components[stream]) 6559 continue; 6560 6561 if (stream > 0 && !gs_selector->so.num_outputs) 6562 continue; 6563 6564 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out"); 6565 LLVMAddCase(switch_inst, lp_build_const_int32(gallivm, stream), bb); 6566 LLVMPositionBuilderAtEnd(builder, bb); 6567 6568 /* Fetch vertex data from GSVS ring */ 6569 offset = 0; 6570 for (i = 0; i < gsinfo->num_outputs; ++i) { 6571 for (unsigned chan = 0; chan < 4; chan++) { 6572 if (!(gsinfo->output_usagemask[i] & (1 << 
chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				args[2] = lp_build_const_int32(
					gallivm,
					offset * gs_selector->gs_max_out_vertices * 16 * 4);
				offset++;

				outputs[i].values[chan] =
					LLVMBuildBitCast(gallivm->builder,
						 lp_build_intrinsic(gallivm->builder,
								 "llvm.SI.buffer.load.dword.i32.i32",
								 ctx.i32, args, 9,
								 LP_FUNC_ATTR_READONLY),
						 ctx.f32, "");
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
		LLVMDumpModule(bld_base->base.gallivm->module);

	si_llvm_finalize_module(&ctx,
		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    bld_base->base.gallivm->module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}

/**
 * Print a shader key in human-readable form.
 *
 * A "SHADER KEY" header is written first, followed by the key members that
 * are printed for the given shader type. For geometry, tessellation
 * evaluation and vertex shaders that are neither compiled as ES nor as LS,
 * the opt.hw_vs members are printed as well.
 *
 * shader: shader type, compared against the PIPE_SHADER_* values below;
 *         any other value trips the assertion in the default case.
 * key:    key whose members are printed.
 * f:      stream the text is written to.
 */
static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
			       FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		/* Comma-separated list of all instance divisors in braces. */
		fprintf(f, " part.vs.prolog.instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->part.vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->part.vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, " part.vs.epilog.export_prim_id = %u\n", key->part.vs.epilog.export_prim_id);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " mono.vs.fix_fetch = 0x%"PRIx64"\n", key->mono.vs.fix_fetch);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.tcs.inputs_to_copy = 0x%"PRIx64"\n", key->mono.tcs.inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
		fprintf(f, " as_es = %u\n", key->as_es);
		break;

	case PIPE_SHADER_GEOMETRY:
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		/* Nothing type-specific is printed for compute shaders. */
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",
			key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* The opt.hw_vs members are only printed for GS/TES/VS keys that
	 * have neither as_es nor as_ls set.
	 */
	if ((shader == PIPE_SHADER_GEOMETRY ||
	     shader == PIPE_SHADER_TESS_EVAL ||
	     shader == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
		fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
		fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
	}
}

/**
 * Initialize a shader context and install the TGSI fetch and opcode
 * handlers used for translation.
 *
 * si_llvm_context_init() receives the selector's shader info and tokens
 * when both shader and shader->selector are non-NULL, and NULL for both
 * otherwise.
 *
 * ctx:     context to initialize.
 * sscreen: screen, forwarded to si_llvm_context_init().
 * shader:  shader to compile; may be NULL.
 * tm:      LLVM target machine, forwarded to si_llvm_context_init().
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	si_llvm_context_init(ctx, sscreen, shader, tm,
		(shader && shader->selector) ? &shader->selector->info : NULL,
		(shader && shader->selector) ?
		shader->selector->tokens : NULL);

	bld_base = &ctx->bld_base;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes. */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture opcodes. TXQ and TXQS get dedicated callbacks. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Load, store and resource query opcodes. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* All atomic opcodes share the same fetch_args/emit callbacks and
	 * differ only in intr_name.
	 */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	/* All four derivative opcodes use the same emit callback. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
}

/* Return true if the PARAM export has been eliminated.
*/
/* inst is the export call to examine; its operands 5..8 are inspected
 * (presumably the four channel values - confirm against the export
 * intrinsic's signature). offset is the PARAM index of the export, used to
 * find the matching entry in shader->info.vs_output_param_offset.
 *
 * On success the instruction is erased and that entry is replaced by
 * EXP_PARAM_DEFAULT_VAL_0000 + default_val. On failure nothing is modified.
 */
static bool si_eliminate_const_output(struct si_shader_context *ctx,
				      LLVMValueRef inst, unsigned offset)
{
	struct si_shader *shader = ctx->shader;
	unsigned num_outputs = shader->selector->info.num_outputs;
	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
	bool is_zero[4] = {}, is_one[4] = {};

	for (i = 0; i < 4; i++) {
		LLVMBool loses_info;
		LLVMValueRef p = LLVMGetOperand(inst, 5 + i);

		/* It's a constant expression. Undef outputs are eliminated too. */
		if (LLVMIsUndef(p)) {
			/* Undef matches both 0 and 1. */
			is_zero[i] = true;
			is_one[i] = true;
		} else if (LLVMIsAConstantFP(p)) {
			double a = LLVMConstRealGetDouble(p, &loses_info);

			if (a == 0)
				is_zero[i] = true;
			else if (a == 1)
				is_one[i] = true;
			else
				return false; /* other constant */
		} else
			return false;
	}

	/* Only certain combinations of 0 and 1 can be eliminated.
	 *
	 * The first three operands must all be 0 or all be 1; the fourth
	 * picks between the two variants of each:
	 *   0,0,0,0 -> 0    0,0,0,1 -> 1    1,1,1,0 -> 2    1,1,1,1 -> 3
	 * An undef fourth operand is treated as 0.
	 */
	if (is_zero[0] && is_zero[1] && is_zero[2])
		default_val = is_zero[3] ? 0 : 1;
	else if (is_one[0] && is_one[1] && is_one[2])
		default_val = is_zero[3] ? 2 : 3;
	else
		return false;

	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
	LLVMInstructionEraseFromParent(inst);

	/* Change OFFSET to DEFAULT_VAL.
	 */
	for (i = 0; i < num_outputs; i++) {
		if (shader->info.vs_output_param_offset[i] == offset) {
			shader->info.vs_output_param_offset[i] =
				EXP_PARAM_DEFAULT_VAL_0000 + default_val;
			break;
		}
	}
	return true;
}

/* PARAM exports that were found but not eliminated, as collected by
 * si_eliminate_const_vs_outputs().
 */
struct si_vs_exports {
	unsigned num; /* number of valid entries in offset[] and inst[] */
	unsigned offset[SI_MAX_VS_OUTPUTS]; /* export target minus V_008DFC_SQ_EXP_PARAM */
	LLVMValueRef inst[SI_MAX_VS_OUTPUTS]; /* the export call instruction */
};

/**
 * Remove PARAM exports whose value is a constant that can be expressed as
 * a default value, then renumber the remaining PARAM exports so that they
 * are contiguous.
 *
 * Nothing is done for fragment and compute shaders, nor for shaders
 * compiled as ES or LS.
 */
static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
{
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBasicBlockRef bb;
	struct si_vs_exports exports;
	bool removed_any = false;

	exports.num = 0;

	if (ctx->type == PIPE_SHADER_FRAGMENT ||
	    ctx->type == PIPE_SHADER_COMPUTE ||
	    shader->key.as_es ||
	    shader->key.as_ls)
		return;

	/* Process all LLVM instructions. */
	bb = LLVMGetFirstBasicBlock(ctx->main_fn);
	while (bb) {
		LLVMValueRef inst = LLVMGetFirstInstruction(bb);

		while (inst) {
			/* Advance first: cur may be erased by
			 * si_eliminate_const_output() below.
			 */
			LLVMValueRef cur = inst;
			inst = LLVMGetNextInstruction(inst);

			if (LLVMGetInstructionOpcode(cur) != LLVMCall)
				continue;

			LLVMValueRef callee = lp_get_called_value(cur);

			if (!lp_is_function(callee))
				continue;

			const char *name = LLVMGetValueName(callee);
			unsigned num_args = LLVMCountParams(callee);

			/* Check if this is an export instruction. */
			if (num_args != 9 || strcmp(name, "llvm.SI.export"))
				continue;

			/* Operand 3 is the export target. */
			LLVMValueRef arg = LLVMGetOperand(cur, 3);
			unsigned target = LLVMConstIntGetZExtValue(arg);

			/* Targets below the PARAM range are left alone. */
			if (target < V_008DFC_SQ_EXP_PARAM)
				continue;

			target -= V_008DFC_SQ_EXP_PARAM;

			/* Eliminate constant value PARAM exports. */
			if (si_eliminate_const_output(ctx, cur, target)) {
				removed_any = true;
			} else {
				exports.offset[exports.num] = target;
				exports.inst[exports.num] = cur;
				exports.num++;
			}
		}
		bb = LLVMGetNextBasicBlock(bb);
	}

	/* Remove holes in export memory due to removed PARAM exports.
	 * This is done by renumbering all PARAM exports.
	 */
	if (removed_any) {
		ubyte current_offset[SI_MAX_VS_OUTPUTS];
		unsigned new_count = 0;
		unsigned out, i;

		/* Make a copy of the offsets. We need the old version while
		 * we are modifying some of them. */
		assert(sizeof(current_offset) ==
		       sizeof(shader->info.vs_output_param_offset));
		memcpy(current_offset, shader->info.vs_output_param_offset,
		       sizeof(current_offset));

		for (i = 0; i < exports.num; i++) {
			unsigned offset = exports.offset[i];

			/* Find the first output that used the old offset and
			 * give both it and the instruction the next free index.
			 */
			for (out = 0; out < info->num_outputs; out++) {
				if (current_offset[out] != offset)
					continue;

				LLVMSetOperand(exports.inst[i], 3,
					       LLVMConstInt(ctx->i32,
							    V_008DFC_SQ_EXP_PARAM + new_count, 0));
				shader->info.vs_output_param_offset[out] = new_count;
				new_count++;
				break;
			}
		}
		shader->info.nr_param_exports = new_count;
	}
}

/**
 * Accumulate the size of every alloca instruction in the main function
 * into shader->config.private_mem_vgprs.
 *
 * For each alloca, the allocated type's size divided by 4 is rounded up to
 * the alloca's alignment and added to the total, which starts at 0.
 */
static void si_count_scratch_private_memory(struct si_shader_context *ctx)
{
	ctx->shader->config.private_mem_vgprs = 0;

	/* Process all LLVM instructions. */
	LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
	while (bb) {
		LLVMValueRef next = LLVMGetFirstInstruction(bb);

		while (next) {
			LLVMValueRef inst = next;
			next = LLVMGetNextInstruction(next);

			if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
				continue;

			/* The alloca yields a pointer; take the pointee type. */
			LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
			/* No idea why LLVM aligns allocas to 4 elements.
			 */
			unsigned alignment = LLVMGetAlignment(inst);
			unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
			ctx->shader->config.private_mem_vgprs += dw_size;
		}
		bb = LLVMGetNextBasicBlock(bb);
	}
}

/**
 * Translate the main part of a shader from TGSI to LLVM IR.
 *
 * Installs the input/output callbacks for ctx->type, creates the main
 * function and translates shader->selector->tokens into it.
 *
 * Returns false if ctx->type is not one of the handled shader types or if
 * lp_build_tgsi_llvm() fails; true otherwise.
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 struct si_shader *shader)
{
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		/* The epilogue depends on whether the VS runs as LS, ES or
		 * neither.
		 */
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		bld_base->emit_epilogue = si_llvm_return_fs_outputs;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	create_meta_data(ctx);
	create_function(ctx);
	preload_ring_buffers(ctx);

	/* Geometry shaders get four i32 allocas in gs_next_vertex. */
	if (ctx->type ==
	    PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(bld_base->base.gallivm,
						ctx->i32, "");
		}
	}

	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}

/**
 * Compute the VS prolog key, which contains all the information needed to
 * build the VS prolog function, and set shader->info bits where needed.
 */
static void si_get_vs_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->vs_prolog.states = shader->key.part.vs.prolog;
	key->vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	/* Index of the last input; 0 when the shader has no inputs. */
	key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;

	/* Set the instanceID flag. */
	for (unsigned i = 0; i < info->num_inputs; i++)
		if (key->vs_prolog.states.instance_divisors[i])
			shader->info.uses_instanceid = true;
}

/**
 * Compute the VS epilog key, which contains all the information needed to
 * build the VS epilog function, and set the PrimitiveID output offset.
 *
 * Note that "states" is copied into the key, while the decision to export
 * the PrimitiveID is taken from shader->key.part.vs.epilog.
 */
static void si_get_vs_epilog_key(struct si_shader *shader,
				 struct si_vs_epilog_bits *states,
				 union si_shader_part_key *key)
{
	memset(key, 0, sizeof(*key));
	key->vs_epilog.states = *states;

	/* Set up the PrimitiveID output.
	 *
	 * It uses the slot right after the regular outputs and takes the
	 * next free PARAM export, incrementing nr_param_exports.
	 */
	if (shader->key.part.vs.epilog.export_prim_id) {
		unsigned index = shader->selector->info.num_outputs;
		unsigned offset = shader->info.nr_param_exports++;

		key->vs_epilog.prim_id_param_offset = offset;
		assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[index] = offset;
	}
}

/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 *
 * separate_prolog selects which VGPR indices are used for linearly
 * interpolated colors (6/8/10 when true, 9/11/13 when false).
 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* WQM is requested only if the shader uses derivatives and the
	 * prolog does any color or interpolation work.
	 */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input.
			 */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			/* colors_read holds four bits per color; skip
			 * colors with none of them set.
			 */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here.
				 * If both force flags are set, the later
				 * assignment (center) wins.
				 */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here.
				 */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}

/**
 * Check whether a PS prolog is required based on the key.
 *
 * A prolog is needed when any color is read or when any of the prolog
 * state bits tested below is set.
 */
static bool si_need_ps_prolog(const union si_shader_part_key *key)
{
	return key->ps_prolog.colors_read ||
	       key->ps_prolog.states.force_persp_sample_interp ||
	       key->ps_prolog.states.force_linear_sample_interp ||
	       key->ps_prolog.states.force_persp_center_interp ||
	       key->ps_prolog.states.force_linear_center_interp ||
	       key->ps_prolog.states.bc_optimize_for_persp ||
	       key->ps_prolog.states.bc_optimize_for_linear ||
	       key->ps_prolog.states.poly_stipple;
}

/**
 * Compute the PS epilog key, which contains all the information needed to
 * build the PS epilog function.
7214 */ 7215 static void si_get_ps_epilog_key(struct si_shader *shader, 7216 union si_shader_part_key *key) 7217 { 7218 struct tgsi_shader_info *info = &shader->selector->info; 7219 memset(key, 0, sizeof(*key)); 7220 key->ps_epilog.colors_written = info->colors_written; 7221 key->ps_epilog.writes_z = info->writes_z; 7222 key->ps_epilog.writes_stencil = info->writes_stencil; 7223 key->ps_epilog.writes_samplemask = info->writes_samplemask; 7224 key->ps_epilog.states = shader->key.part.ps.epilog; 7225 } 7226 7227 /** 7228 * Build the GS prolog function. Rotate the input vertices for triangle strips 7229 * with adjacency. 7230 */ 7231 static void si_build_gs_prolog_function(struct si_shader_context *ctx, 7232 union si_shader_part_key *key) 7233 { 7234 const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2; 7235 const unsigned num_vgprs = 8; 7236 struct gallivm_state *gallivm = &ctx->gallivm; 7237 LLVMBuilderRef builder = gallivm->builder; 7238 LLVMTypeRef params[32]; 7239 LLVMTypeRef returns[32]; 7240 LLVMValueRef func, ret; 7241 7242 for (unsigned i = 0; i < num_sgprs; ++i) { 7243 params[i] = ctx->i32; 7244 returns[i] = ctx->i32; 7245 } 7246 7247 for (unsigned i = 0; i < num_vgprs; ++i) { 7248 params[num_sgprs + i] = ctx->i32; 7249 returns[num_sgprs + i] = ctx->f32; 7250 } 7251 7252 /* Create the function. */ 7253 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 7254 params, num_sgprs + num_vgprs, num_sgprs - 1); 7255 func = ctx->main_fn; 7256 7257 /* Copy inputs to outputs. This should be no-op, as the registers match, 7258 * but it will prevent the compiler from overwriting them unintentionally. 
7259 */ 7260 ret = ctx->return_value; 7261 for (unsigned i = 0; i < num_sgprs; i++) { 7262 LLVMValueRef p = LLVMGetParam(func, i); 7263 ret = LLVMBuildInsertValue(builder, ret, p, i, ""); 7264 } 7265 for (unsigned i = 0; i < num_vgprs; i++) { 7266 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); 7267 p = LLVMBuildBitCast(builder, p, ctx->f32, ""); 7268 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); 7269 } 7270 7271 if (key->gs_prolog.states.tri_strip_adj_fix) { 7272 /* Remap the input vertices for every other primitive. */ 7273 const unsigned vtx_params[6] = { 7274 num_sgprs, 7275 num_sgprs + 1, 7276 num_sgprs + 3, 7277 num_sgprs + 4, 7278 num_sgprs + 5, 7279 num_sgprs + 6 7280 }; 7281 LLVMValueRef prim_id, rotate; 7282 7283 prim_id = LLVMGetParam(func, num_sgprs + 2); 7284 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); 7285 7286 for (unsigned i = 0; i < 6; ++i) { 7287 LLVMValueRef base, rotated, actual; 7288 base = LLVMGetParam(func, vtx_params[i]); 7289 rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]); 7290 actual = LLVMBuildSelect(builder, rotate, rotated, base, ""); 7291 actual = LLVMBuildBitCast(builder, actual, ctx->f32, ""); 7292 ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], ""); 7293 } 7294 } 7295 7296 LLVMBuildRet(builder, ret); 7297 } 7298 7299 /** 7300 * Given a list of shader part functions, build a wrapper function that 7301 * runs them in sequence to form a monolithic shader. 
*/
/* parts:     the shader part functions, in execution order.
 * num_parts: number of entries in parts.
 * main_part: index of the part whose parameter types are used for the
 *            wrapper's own parameters.
 *
 * All parts are marked always-inline with private linkage. The outputs of
 * each part are passed, flattened to one value per register, as the
 * arguments of the next part. The wrapper itself returns void and becomes
 * ctx->main_fn via si_create_function().
 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component */
	LLVMTypeRef param_types[48];
	LLVMValueRef out[48];
	LLVMTypeRef function_type;
	unsigned num_params;
	unsigned num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned last_sgpr_param;
	unsigned gprs;

	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_params = LLVMCountParamTypes(function_type);

	/* Count the registers of the first part: type size divided by 4
	 * per parameter. All SGPR parameters must precede the VGPR ones.
	 */
	for (unsigned i = 0; i < num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}
	assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));

	/* Take parameter types from the main part until they cover the
	 * register count of the first part.
	 */
	num_params = 0;
	last_sgpr_param = 0;
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
		unsigned size;

		param_types[num_params] = LLVMTypeOf(param);
		if (gprs < num_sgprs)
			last_sgpr_param = num_params;
		size = llvm_get_type_size(param_types[num_params]) / 4;
		num_params++;

		/* A parameter must not straddle the SGPR/VGPR boundary or
		 * run past the total register count.
		 */
		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	for (unsigned i = 0; i < num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		/* SGPR values are kept as i32, VGPR values as f32. */
		LLVMTypeRef out_type = i <= last_sgpr_param ?
			ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			/* Multi-register parameter: view it as a vector of
			 * out_type and record each element separately.
			 */
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			/* Pointers are converted to i64 first. */
			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i <= last_sgpr_param)
			num_out_sgpr = num_out;
	}

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;

		num_params = LLVMCountParams(parts[part]);
		assert(num_params <= ARRAY_SIZE(param_types));

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			/* On SGPR parameters of the part, replace the byval
			 * attribute with inreg.
			 */
			if (is_sgpr) {
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			/* SGPR parameters must be fed from recorded SGPR
			 * outputs, VGPR parameters from recorded VGPR outputs.
			 */
			assert(out_idx + param_size <= (is_sgpr ?
							num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			/* Convert to the parameter type; pointers go through
			 * an i64 intermediate.
			 */
			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
		ret_type = LLVMTypeOf(ret);

		/* Extract the returned GPRs. */
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				out[num_out++] = val;

				/* i32 elements count as SGPRs and must all
				 * come before any other element.
				 */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}

int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	struct lp_build_tgsi_context *bld_base;
	LLVMModuleRef mod;
	int r = -1;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails.
*/ 7506 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) && 7507 !(sscreen->b.debug_flags & DBG_NO_TGSI)) { 7508 tgsi_dump(sel->tokens, 0); 7509 si_dump_streamout(&sel->so); 7510 } 7511 7512 si_init_shader_ctx(&ctx, sscreen, shader, tm); 7513 ctx.separate_prolog = !is_monolithic; 7514 7515 memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED, 7516 sizeof(shader->info.vs_output_param_offset)); 7517 7518 shader->info.uses_instanceid = sel->info.uses_instanceid; 7519 7520 bld_base = &ctx.bld_base; 7521 ctx.load_system_value = declare_system_value; 7522 7523 if (!si_compile_tgsi_main(&ctx, shader)) { 7524 si_llvm_dispose(&ctx); 7525 return -1; 7526 } 7527 7528 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { 7529 LLVMValueRef parts[3]; 7530 bool need_prolog; 7531 bool need_epilog; 7532 7533 need_prolog = sel->info.num_inputs; 7534 need_epilog = !shader->key.as_es && !shader->key.as_ls; 7535 7536 parts[need_prolog ? 1 : 0] = ctx.main_fn; 7537 7538 if (need_prolog) { 7539 union si_shader_part_key prolog_key; 7540 si_get_vs_prolog_key(shader, &prolog_key); 7541 si_build_vs_prolog_function(&ctx, &prolog_key); 7542 parts[0] = ctx.main_fn; 7543 } 7544 7545 if (need_epilog) { 7546 union si_shader_part_key epilog_key; 7547 si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key); 7548 si_build_vs_epilog_function(&ctx, &epilog_key); 7549 parts[need_prolog ? 2 : 1] = ctx.main_fn; 7550 } 7551 7552 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog, 7553 need_prolog ? 
1 : 0); 7554 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { 7555 LLVMValueRef parts[2]; 7556 union si_shader_part_key epilog_key; 7557 7558 parts[0] = ctx.main_fn; 7559 7560 memset(&epilog_key, 0, sizeof(epilog_key)); 7561 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 7562 si_build_tcs_epilog_function(&ctx, &epilog_key); 7563 parts[1] = ctx.main_fn; 7564 7565 si_build_wrapper_function(&ctx, parts, 2, 0); 7566 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && 7567 !shader->key.as_es) { 7568 LLVMValueRef parts[2]; 7569 union si_shader_part_key epilog_key; 7570 7571 parts[0] = ctx.main_fn; 7572 7573 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key); 7574 si_build_vs_epilog_function(&ctx, &epilog_key); 7575 parts[1] = ctx.main_fn; 7576 7577 si_build_wrapper_function(&ctx, parts, 2, 0); 7578 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { 7579 LLVMValueRef parts[2]; 7580 union si_shader_part_key prolog_key; 7581 7582 parts[1] = ctx.main_fn; 7583 7584 memset(&prolog_key, 0, sizeof(prolog_key)); 7585 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 7586 si_build_gs_prolog_function(&ctx, &prolog_key); 7587 parts[0] = ctx.main_fn; 7588 7589 si_build_wrapper_function(&ctx, parts, 2, 1); 7590 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { 7591 LLVMValueRef parts[3]; 7592 union si_shader_part_key prolog_key; 7593 union si_shader_part_key epilog_key; 7594 bool need_prolog; 7595 7596 si_get_ps_prolog_key(shader, &prolog_key, false); 7597 need_prolog = si_need_ps_prolog(&prolog_key); 7598 7599 parts[need_prolog ? 1 : 0] = ctx.main_fn; 7600 7601 if (need_prolog) { 7602 si_build_ps_prolog_function(&ctx, &prolog_key); 7603 parts[0] = ctx.main_fn; 7604 } 7605 7606 si_get_ps_epilog_key(shader, &epilog_key); 7607 si_build_ps_epilog_function(&ctx, &epilog_key); 7608 parts[need_prolog ? 2 : 1] = ctx.main_fn; 7609 7610 si_build_wrapper_function(&ctx, parts, need_prolog ? 
3 : 2, need_prolog ? 1 : 0); 7611 } 7612 7613 mod = bld_base->base.gallivm->module; 7614 7615 /* Dump LLVM IR before any optimization passes */ 7616 if (sscreen->b.debug_flags & DBG_PREOPT_IR && 7617 r600_can_dump_shader(&sscreen->b, ctx.type)) 7618 LLVMDumpModule(mod); 7619 7620 si_llvm_finalize_module(&ctx, 7621 r600_extra_shader_checks(&sscreen->b, ctx.type)); 7622 7623 /* Post-optimization transformations and analysis. */ 7624 si_eliminate_const_vs_outputs(&ctx); 7625 7626 if ((debug && debug->debug_message) || 7627 r600_can_dump_shader(&sscreen->b, ctx.type)) 7628 si_count_scratch_private_memory(&ctx); 7629 7630 /* Compile to bytecode. */ 7631 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm, 7632 mod, debug, ctx.type, "TGSI shader"); 7633 si_llvm_dispose(&ctx); 7634 if (r) { 7635 fprintf(stderr, "LLVM failed to compile shader\n"); 7636 return r; 7637 } 7638 7639 /* Validate SGPR and VGPR usage for compute to detect compiler bugs. 7640 * LLVM 3.9svn has this bug. 7641 */ 7642 if (sel->type == PIPE_SHADER_COMPUTE) { 7643 unsigned wave_size = 64; 7644 unsigned max_vgprs = 256; 7645 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 
800 : 512; 7646 unsigned max_sgprs_per_wave = 128; 7647 unsigned max_block_threads = si_get_max_workgroup_size(shader); 7648 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); 7649 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); 7650 7651 max_vgprs = max_vgprs / min_waves_per_simd; 7652 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave); 7653 7654 if (shader->config.num_sgprs > max_sgprs || 7655 shader->config.num_vgprs > max_vgprs) { 7656 fprintf(stderr, "LLVM failed to compile a shader correctly: " 7657 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", 7658 shader->config.num_sgprs, shader->config.num_vgprs, 7659 max_sgprs, max_vgprs); 7660 7661 /* Just terminate the process, because dependent 7662 * shaders can hang due to bad input data, but use 7663 * the env var to allow shader-db to work. 7664 */ 7665 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) 7666 abort(); 7667 } 7668 } 7669 7670 /* Add the scratch offset to input SGPRs. */ 7671 if (shader->config.scratch_bytes_per_wave) 7672 shader->info.num_input_sgprs += 1; /* scratch byte offset */ 7673 7674 /* Calculate the number of fragment input VGPRs. 
*/ 7675 if (ctx.type == PIPE_SHADER_FRAGMENT) { 7676 shader->info.num_input_vgprs = 0; 7677 shader->info.face_vgpr_index = -1; 7678 7679 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 7680 shader->info.num_input_vgprs += 2; 7681 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)) 7682 shader->info.num_input_vgprs += 2; 7683 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr)) 7684 shader->info.num_input_vgprs += 2; 7685 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr)) 7686 shader->info.num_input_vgprs += 3; 7687 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr)) 7688 shader->info.num_input_vgprs += 2; 7689 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)) 7690 shader->info.num_input_vgprs += 2; 7691 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr)) 7692 shader->info.num_input_vgprs += 2; 7693 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr)) 7694 shader->info.num_input_vgprs += 1; 7695 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr)) 7696 shader->info.num_input_vgprs += 1; 7697 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr)) 7698 shader->info.num_input_vgprs += 1; 7699 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr)) 7700 shader->info.num_input_vgprs += 1; 7701 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr)) 7702 shader->info.num_input_vgprs += 1; 7703 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) { 7704 shader->info.face_vgpr_index = shader->info.num_input_vgprs; 7705 shader->info.num_input_vgprs += 1; 7706 } 7707 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) 7708 shader->info.num_input_vgprs += 1; 7709 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr)) 7710 shader->info.num_input_vgprs += 1; 7711 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)) 7712 shader->info.num_input_vgprs += 1; 7713 } 7714 7715 return 
0; 7716 } 7717 7718 /** 7719 * Create, compile and return a shader part (prolog or epilog). 7720 * 7721 * \param sscreen screen 7722 * \param list list of shader parts of the same category 7723 * \param type shader type 7724 * \param key shader part key 7725 * \param prolog whether the part being requested is a prolog 7726 * \param tm LLVM target machine 7727 * \param debug debug callback 7728 * \param build the callback responsible for building the main function 7729 * \return non-NULL on success 7730 */ 7731 static struct si_shader_part * 7732 si_get_shader_part(struct si_screen *sscreen, 7733 struct si_shader_part **list, 7734 enum pipe_shader_type type, 7735 bool prolog, 7736 union si_shader_part_key *key, 7737 LLVMTargetMachineRef tm, 7738 struct pipe_debug_callback *debug, 7739 void (*build)(struct si_shader_context *, 7740 union si_shader_part_key *), 7741 const char *name) 7742 { 7743 struct si_shader_part *result; 7744 7745 pipe_mutex_lock(sscreen->shader_parts_mutex); 7746 7747 /* Find existing. */ 7748 for (result = *list; result; result = result->next) { 7749 if (memcmp(&result->key, key, sizeof(*key)) == 0) { 7750 pipe_mutex_unlock(sscreen->shader_parts_mutex); 7751 return result; 7752 } 7753 } 7754 7755 /* Compile a new one. 
*/ 7756 result = CALLOC_STRUCT(si_shader_part); 7757 result->key = *key; 7758 7759 struct si_shader shader = {}; 7760 struct si_shader_context ctx; 7761 struct gallivm_state *gallivm = &ctx.gallivm; 7762 7763 si_init_shader_ctx(&ctx, sscreen, &shader, tm); 7764 ctx.type = type; 7765 7766 switch (type) { 7767 case PIPE_SHADER_VERTEX: 7768 break; 7769 case PIPE_SHADER_TESS_CTRL: 7770 assert(!prolog); 7771 shader.key.part.tcs.epilog = key->tcs_epilog.states; 7772 break; 7773 case PIPE_SHADER_GEOMETRY: 7774 assert(prolog); 7775 break; 7776 case PIPE_SHADER_FRAGMENT: 7777 if (prolog) 7778 shader.key.part.ps.prolog = key->ps_prolog.states; 7779 else 7780 shader.key.part.ps.epilog = key->ps_epilog.states; 7781 break; 7782 default: 7783 unreachable("bad shader part"); 7784 } 7785 7786 build(&ctx, key); 7787 7788 /* Compile. */ 7789 si_llvm_finalize_module(&ctx, 7790 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT)); 7791 7792 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm, 7793 gallivm->module, debug, ctx.type, name)) { 7794 FREE(result); 7795 result = NULL; 7796 goto out; 7797 } 7798 7799 result->next = *list; 7800 *list = result; 7801 7802 out: 7803 si_llvm_dispose(&ctx); 7804 pipe_mutex_unlock(sscreen->shader_parts_mutex); 7805 return result; 7806 } 7807 7808 /** 7809 * Build the vertex shader prolog function. 7810 * 7811 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). 7812 * All inputs are returned unmodified. The vertex load indices are 7813 * stored after them, which will be used by the API VS for fetching inputs. 
7814 * 7815 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: 7816 * input_v0, 7817 * input_v1, 7818 * input_v2, 7819 * input_v3, 7820 * (VertexID + BaseVertex), 7821 * (InstanceID + StartInstance), 7822 * (InstanceID / 2 + StartInstance) 7823 */ 7824 static void si_build_vs_prolog_function(struct si_shader_context *ctx, 7825 union si_shader_part_key *key) 7826 { 7827 struct gallivm_state *gallivm = &ctx->gallivm; 7828 LLVMTypeRef *params, *returns; 7829 LLVMValueRef ret, func; 7830 int last_sgpr, num_params, num_returns, i; 7831 7832 ctx->param_vertex_id = key->vs_prolog.num_input_sgprs; 7833 ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3; 7834 7835 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ 7836 params = alloca((key->vs_prolog.num_input_sgprs + 4) * 7837 sizeof(LLVMTypeRef)); 7838 returns = alloca((key->vs_prolog.num_input_sgprs + 4 + 7839 key->vs_prolog.last_input + 1) * 7840 sizeof(LLVMTypeRef)); 7841 num_params = 0; 7842 num_returns = 0; 7843 7844 /* Declare input and output SGPRs. */ 7845 num_params = 0; 7846 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7847 params[num_params++] = ctx->i32; 7848 returns[num_returns++] = ctx->i32; 7849 } 7850 last_sgpr = num_params - 1; 7851 7852 /* 4 preloaded VGPRs (outputs must be floats) */ 7853 for (i = 0; i < 4; i++) { 7854 params[num_params++] = ctx->i32; 7855 returns[num_returns++] = ctx->f32; 7856 } 7857 7858 /* Vertex load indices. */ 7859 for (i = 0; i <= key->vs_prolog.last_input; i++) 7860 returns[num_returns++] = ctx->f32; 7861 7862 /* Create the function. */ 7863 si_create_function(ctx, "vs_prolog", returns, num_returns, params, 7864 num_params, last_sgpr); 7865 func = ctx->main_fn; 7866 7867 /* Copy inputs to outputs. This should be no-op, as the registers match, 7868 * but it will prevent the compiler from overwriting them unintentionally. 
7869 */ 7870 ret = ctx->return_value; 7871 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { 7872 LLVMValueRef p = LLVMGetParam(func, i); 7873 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); 7874 } 7875 for (i = num_params - 4; i < num_params; i++) { 7876 LLVMValueRef p = LLVMGetParam(func, i); 7877 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, ""); 7878 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); 7879 } 7880 7881 /* Compute vertex load indices from instance divisors. */ 7882 for (i = 0; i <= key->vs_prolog.last_input; i++) { 7883 unsigned divisor = key->vs_prolog.states.instance_divisors[i]; 7884 LLVMValueRef index; 7885 7886 if (divisor) { 7887 /* InstanceID / Divisor + StartInstance */ 7888 index = get_instance_index_for_fetch(ctx, 7889 SI_SGPR_START_INSTANCE, 7890 divisor); 7891 } else { 7892 /* VertexID + BaseVertex */ 7893 index = LLVMBuildAdd(gallivm->builder, 7894 LLVMGetParam(func, ctx->param_vertex_id), 7895 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), ""); 7896 } 7897 7898 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, ""); 7899 ret = LLVMBuildInsertValue(gallivm->builder, ret, index, 7900 num_params++, ""); 7901 } 7902 7903 si_llvm_build_ret(ctx, ret); 7904 } 7905 7906 /** 7907 * Build the vertex shader epilog function. This is also used by the tessellation 7908 * evaluation shader compiled as VS. 7909 * 7910 * The input is PrimitiveID. 7911 * 7912 * If PrimitiveID is required by the pixel shader, export it. 7913 * Otherwise, do nothing. 7914 */ 7915 static void si_build_vs_epilog_function(struct si_shader_context *ctx, 7916 union si_shader_part_key *key) 7917 { 7918 struct gallivm_state *gallivm = &ctx->gallivm; 7919 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 7920 LLVMTypeRef params[5]; 7921 int num_params, i; 7922 7923 /* Declare input VGPRs. */ 7924 num_params = key->vs_epilog.states.export_prim_id ? 
7925 (VS_EPILOG_PRIMID_LOC + 1) : 0; 7926 assert(num_params <= ARRAY_SIZE(params)); 7927 7928 for (i = 0; i < num_params; i++) 7929 params[i] = ctx->f32; 7930 7931 /* Create the function. */ 7932 si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1); 7933 7934 /* Emit exports. */ 7935 if (key->vs_epilog.states.export_prim_id) { 7936 struct lp_build_context *base = &bld_base->base; 7937 struct lp_build_context *uint = &bld_base->uint_bld; 7938 LLVMValueRef args[9]; 7939 7940 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */ 7941 args[1] = uint->zero; /* whether the EXEC mask is valid */ 7942 args[2] = uint->zero; /* DONE bit */ 7943 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM + 7944 key->vs_epilog.prim_id_param_offset); 7945 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */ 7946 args[5] = LLVMGetParam(ctx->main_fn, 7947 VS_EPILOG_PRIMID_LOC); /* X */ 7948 args[6] = base->undef; /* Y */ 7949 args[7] = base->undef; /* Z */ 7950 args[8] = base->undef; /* W */ 7951 7952 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export", 7953 LLVMVoidTypeInContext(base->gallivm->context), 7954 args, 9, 0); 7955 } 7956 7957 LLVMBuildRetVoid(gallivm->builder); 7958 } 7959 7960 /** 7961 * Create & compile a vertex shader epilog. This a helper used by VS and TES. 7962 */ 7963 static bool si_get_vs_epilog(struct si_screen *sscreen, 7964 LLVMTargetMachineRef tm, 7965 struct si_shader *shader, 7966 struct pipe_debug_callback *debug, 7967 struct si_vs_epilog_bits *states) 7968 { 7969 union si_shader_part_key epilog_key; 7970 7971 si_get_vs_epilog_key(shader, states, &epilog_key); 7972 7973 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs, 7974 PIPE_SHADER_VERTEX, true, 7975 &epilog_key, tm, debug, 7976 si_build_vs_epilog_function, 7977 "Vertex Shader Epilog"); 7978 return shader->epilog != NULL; 7979 } 7980 7981 /** 7982 * Select and compile (or reuse) vertex shader parts (prolog & epilog). 
7983 */ 7984 static bool si_shader_select_vs_parts(struct si_screen *sscreen, 7985 LLVMTargetMachineRef tm, 7986 struct si_shader *shader, 7987 struct pipe_debug_callback *debug) 7988 { 7989 struct tgsi_shader_info *info = &shader->selector->info; 7990 union si_shader_part_key prolog_key; 7991 7992 /* Get the prolog. */ 7993 si_get_vs_prolog_key(shader, &prolog_key); 7994 7995 /* The prolog is a no-op if there are no inputs. */ 7996 if (info->num_inputs) { 7997 shader->prolog = 7998 si_get_shader_part(sscreen, &sscreen->vs_prologs, 7999 PIPE_SHADER_VERTEX, true, 8000 &prolog_key, tm, debug, 8001 si_build_vs_prolog_function, 8002 "Vertex Shader Prolog"); 8003 if (!shader->prolog) 8004 return false; 8005 } 8006 8007 /* Get the epilog. */ 8008 if (!shader->key.as_es && !shader->key.as_ls && 8009 !si_get_vs_epilog(sscreen, tm, shader, debug, 8010 &shader->key.part.vs.epilog)) 8011 return false; 8012 8013 return true; 8014 } 8015 8016 /** 8017 * Select and compile (or reuse) TES parts (epilog). 8018 */ 8019 static bool si_shader_select_tes_parts(struct si_screen *sscreen, 8020 LLVMTargetMachineRef tm, 8021 struct si_shader *shader, 8022 struct pipe_debug_callback *debug) 8023 { 8024 if (shader->key.as_es) 8025 return true; 8026 8027 /* TES compiled as VS. */ 8028 return si_get_vs_epilog(sscreen, tm, shader, debug, 8029 &shader->key.part.tes.epilog); 8030 } 8031 8032 /** 8033 * Compile the TCS epilog function. This writes tesselation factors to memory 8034 * based on the output primitive type of the tesselator (determined by TES). 8035 */ 8036 static void si_build_tcs_epilog_function(struct si_shader_context *ctx, 8037 union si_shader_part_key *key) 8038 { 8039 struct gallivm_state *gallivm = &ctx->gallivm; 8040 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 8041 LLVMTypeRef params[16]; 8042 LLVMValueRef func; 8043 int last_sgpr, num_params; 8044 8045 /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. 
*/ 8046 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); 8047 params[SI_PARAM_CONST_BUFFERS] = ctx->i64; 8048 params[SI_PARAM_SAMPLERS] = ctx->i64; 8049 params[SI_PARAM_IMAGES] = ctx->i64; 8050 params[SI_PARAM_SHADER_BUFFERS] = ctx->i64; 8051 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32; 8052 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32; 8053 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32; 8054 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32; 8055 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32; 8056 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32; 8057 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET; 8058 num_params = last_sgpr + 1; 8059 8060 params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */ 8061 params[num_params++] = ctx->i32; /* invocation ID within the patch */ 8062 params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */ 8063 8064 /* Create the function. */ 8065 si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr); 8066 declare_tess_lds(ctx); 8067 func = ctx->main_fn; 8068 8069 si_write_tess_factors(bld_base, 8070 LLVMGetParam(func, last_sgpr + 1), 8071 LLVMGetParam(func, last_sgpr + 2), 8072 LLVMGetParam(func, last_sgpr + 3)); 8073 8074 LLVMBuildRetVoid(gallivm->builder); 8075 } 8076 8077 /** 8078 * Select and compile (or reuse) TCS parts (epilog). 8079 */ 8080 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, 8081 LLVMTargetMachineRef tm, 8082 struct si_shader *shader, 8083 struct pipe_debug_callback *debug) 8084 { 8085 union si_shader_part_key epilog_key; 8086 8087 /* Get the epilog. 
*/ 8088 memset(&epilog_key, 0, sizeof(epilog_key)); 8089 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 8090 8091 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, 8092 PIPE_SHADER_TESS_CTRL, false, 8093 &epilog_key, tm, debug, 8094 si_build_tcs_epilog_function, 8095 "Tessellation Control Shader Epilog"); 8096 return shader->epilog != NULL; 8097 } 8098 8099 /** 8100 * Select and compile (or reuse) GS parts (prolog). 8101 */ 8102 static bool si_shader_select_gs_parts(struct si_screen *sscreen, 8103 LLVMTargetMachineRef tm, 8104 struct si_shader *shader, 8105 struct pipe_debug_callback *debug) 8106 { 8107 union si_shader_part_key prolog_key; 8108 8109 if (!shader->key.part.gs.prolog.tri_strip_adj_fix) 8110 return true; 8111 8112 memset(&prolog_key, 0, sizeof(prolog_key)); 8113 prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 8114 8115 shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs, 8116 PIPE_SHADER_GEOMETRY, true, 8117 &prolog_key, tm, debug, 8118 si_build_gs_prolog_function, 8119 "Geometry Shader Prolog"); 8120 return shader->prolog != NULL; 8121 } 8122 8123 /** 8124 * Build the pixel shader prolog function. This handles: 8125 * - two-side color selection and interpolation 8126 * - overriding interpolation parameters for the API PS 8127 * - polygon stippling 8128 * 8129 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are 8130 * overriden by other states. (e.g. per-sample interpolation) 8131 * Interpolated colors are stored after the preloaded VGPRs. 8132 */ 8133 static void si_build_ps_prolog_function(struct si_shader_context *ctx, 8134 union si_shader_part_key *key) 8135 { 8136 struct gallivm_state *gallivm = &ctx->gallivm; 8137 LLVMTypeRef *params; 8138 LLVMValueRef ret, func; 8139 int last_sgpr, num_params, num_returns, i, num_color_channels; 8140 8141 assert(si_need_ps_prolog(key)); 8142 8143 /* Number of inputs + 8 color elements. 
*/ 8144 params = alloca((key->ps_prolog.num_input_sgprs + 8145 key->ps_prolog.num_input_vgprs + 8) * 8146 sizeof(LLVMTypeRef)); 8147 8148 /* Declare inputs. */ 8149 num_params = 0; 8150 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) 8151 params[num_params++] = ctx->i32; 8152 last_sgpr = num_params - 1; 8153 8154 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) 8155 params[num_params++] = ctx->f32; 8156 8157 /* Declare outputs (same as inputs + add colors if needed) */ 8158 num_returns = num_params; 8159 num_color_channels = util_bitcount(key->ps_prolog.colors_read); 8160 for (i = 0; i < num_color_channels; i++) 8161 params[num_returns++] = ctx->f32; 8162 8163 /* Create the function. */ 8164 si_create_function(ctx, "ps_prolog", params, num_returns, params, 8165 num_params, last_sgpr); 8166 func = ctx->main_fn; 8167 8168 /* Copy inputs to outputs. This should be no-op, as the registers match, 8169 * but it will prevent the compiler from overwriting them unintentionally. 8170 */ 8171 ret = ctx->return_value; 8172 for (i = 0; i < num_params; i++) { 8173 LLVMValueRef p = LLVMGetParam(func, i); 8174 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); 8175 } 8176 8177 /* Polygon stippling. */ 8178 if (key->ps_prolog.states.poly_stipple) { 8179 /* POS_FIXED_PT is always last. */ 8180 unsigned pos = key->ps_prolog.num_input_sgprs + 8181 key->ps_prolog.num_input_vgprs - 1; 8182 LLVMValueRef ptr[2], list; 8183 8184 /* Get the pointer to rw buffers. 
*/ 8185 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS); 8186 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI); 8187 list = lp_build_gather_values(gallivm, ptr, 2); 8188 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, ""); 8189 list = LLVMBuildIntToPtr(gallivm->builder, list, 8190 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), ""); 8191 8192 si_llvm_emit_polygon_stipple(ctx, list, pos); 8193 } 8194 8195 if (key->ps_prolog.states.bc_optimize_for_persp || 8196 key->ps_prolog.states.bc_optimize_for_linear) { 8197 unsigned i, base = key->ps_prolog.num_input_sgprs; 8198 LLVMValueRef center[2], centroid[2], tmp, bc_optimize; 8199 8200 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; 8201 * The hw doesn't compute CENTROID if the whole wave only 8202 * contains fully-covered quads. 8203 * 8204 * PRIM_MASK is after user SGPRs. 8205 */ 8206 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 8207 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize, 8208 LLVMConstInt(ctx->i32, 31, 0), ""); 8209 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize, 8210 ctx->i1, ""); 8211 8212 if (key->ps_prolog.states.bc_optimize_for_persp) { 8213 /* Read PERSP_CENTER. */ 8214 for (i = 0; i < 2; i++) 8215 center[i] = LLVMGetParam(func, base + 2 + i); 8216 /* Read PERSP_CENTROID. */ 8217 for (i = 0; i < 2; i++) 8218 centroid[i] = LLVMGetParam(func, base + 4 + i); 8219 /* Select PERSP_CENTROID. */ 8220 for (i = 0; i < 2; i++) { 8221 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize, 8222 center[i], centroid[i], ""); 8223 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8224 tmp, base + 4 + i, ""); 8225 } 8226 } 8227 if (key->ps_prolog.states.bc_optimize_for_linear) { 8228 /* Read LINEAR_CENTER. */ 8229 for (i = 0; i < 2; i++) 8230 center[i] = LLVMGetParam(func, base + 8 + i); 8231 /* Read LINEAR_CENTROID. */ 8232 for (i = 0; i < 2; i++) 8233 centroid[i] = LLVMGetParam(func, base + 10 + i); 8234 /* Select LINEAR_CENTROID. 
*/ 8235 for (i = 0; i < 2; i++) { 8236 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize, 8237 center[i], centroid[i], ""); 8238 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8239 tmp, base + 10 + i, ""); 8240 } 8241 } 8242 } 8243 8244 /* Force per-sample interpolation. */ 8245 if (key->ps_prolog.states.force_persp_sample_interp) { 8246 unsigned i, base = key->ps_prolog.num_input_sgprs; 8247 LLVMValueRef persp_sample[2]; 8248 8249 /* Read PERSP_SAMPLE. */ 8250 for (i = 0; i < 2; i++) 8251 persp_sample[i] = LLVMGetParam(func, base + i); 8252 /* Overwrite PERSP_CENTER. */ 8253 for (i = 0; i < 2; i++) 8254 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8255 persp_sample[i], base + 2 + i, ""); 8256 /* Overwrite PERSP_CENTROID. */ 8257 for (i = 0; i < 2; i++) 8258 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8259 persp_sample[i], base + 4 + i, ""); 8260 } 8261 if (key->ps_prolog.states.force_linear_sample_interp) { 8262 unsigned i, base = key->ps_prolog.num_input_sgprs; 8263 LLVMValueRef linear_sample[2]; 8264 8265 /* Read LINEAR_SAMPLE. */ 8266 for (i = 0; i < 2; i++) 8267 linear_sample[i] = LLVMGetParam(func, base + 6 + i); 8268 /* Overwrite LINEAR_CENTER. */ 8269 for (i = 0; i < 2; i++) 8270 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8271 linear_sample[i], base + 8 + i, ""); 8272 /* Overwrite LINEAR_CENTROID. */ 8273 for (i = 0; i < 2; i++) 8274 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8275 linear_sample[i], base + 10 + i, ""); 8276 } 8277 8278 /* Force center interpolation. */ 8279 if (key->ps_prolog.states.force_persp_center_interp) { 8280 unsigned i, base = key->ps_prolog.num_input_sgprs; 8281 LLVMValueRef persp_center[2]; 8282 8283 /* Read PERSP_CENTER. */ 8284 for (i = 0; i < 2; i++) 8285 persp_center[i] = LLVMGetParam(func, base + 2 + i); 8286 /* Overwrite PERSP_SAMPLE. */ 8287 for (i = 0; i < 2; i++) 8288 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8289 persp_center[i], base + i, ""); 8290 /* Overwrite PERSP_CENTROID. 
*/ 8291 for (i = 0; i < 2; i++) 8292 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8293 persp_center[i], base + 4 + i, ""); 8294 } 8295 if (key->ps_prolog.states.force_linear_center_interp) { 8296 unsigned i, base = key->ps_prolog.num_input_sgprs; 8297 LLVMValueRef linear_center[2]; 8298 8299 /* Read LINEAR_CENTER. */ 8300 for (i = 0; i < 2; i++) 8301 linear_center[i] = LLVMGetParam(func, base + 8 + i); 8302 /* Overwrite LINEAR_SAMPLE. */ 8303 for (i = 0; i < 2; i++) 8304 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8305 linear_center[i], base + 6 + i, ""); 8306 /* Overwrite LINEAR_CENTROID. */ 8307 for (i = 0; i < 2; i++) 8308 ret = LLVMBuildInsertValue(gallivm->builder, ret, 8309 linear_center[i], base + 10 + i, ""); 8310 } 8311 8312 /* Interpolate colors. */ 8313 for (i = 0; i < 2; i++) { 8314 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; 8315 unsigned face_vgpr = key->ps_prolog.num_input_sgprs + 8316 key->ps_prolog.face_vgpr_index; 8317 LLVMValueRef interp[2], color[4]; 8318 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; 8319 8320 if (!writemask) 8321 continue; 8322 8323 /* If the interpolation qualifier is not CONSTANT (-1). */ 8324 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { 8325 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + 8326 key->ps_prolog.color_interp_vgpr_index[i]; 8327 8328 /* Get the (i,j) updated by bc_optimize handling. */ 8329 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret, 8330 interp_vgpr, ""); 8331 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret, 8332 interp_vgpr + 1, ""); 8333 interp_ij = lp_build_gather_values(gallivm, interp, 2); 8334 } 8335 8336 /* Use the absolute location of the input. 
*/ 8337 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); 8338 8339 if (key->ps_prolog.states.color_two_side) { 8340 face = LLVMGetParam(func, face_vgpr); 8341 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, ""); 8342 } 8343 8344 interp_fs_input(ctx, 8345 key->ps_prolog.color_attr_index[i], 8346 TGSI_SEMANTIC_COLOR, i, 8347 key->ps_prolog.num_interp_inputs, 8348 key->ps_prolog.colors_read, interp_ij, 8349 prim_mask, face, color); 8350 8351 while (writemask) { 8352 unsigned chan = u_bit_scan(&writemask); 8353 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan], 8354 num_params++, ""); 8355 } 8356 } 8357 8358 /* Tell LLVM to insert WQM instruction sequence when needed. */ 8359 if (key->ps_prolog.wqm) { 8360 LLVMAddTargetDependentFunctionAttr(func, 8361 "amdgpu-ps-wqm-outputs", ""); 8362 } 8363 8364 si_llvm_build_ret(ctx, ret); 8365 } 8366 8367 /** 8368 * Build the pixel shader epilog function. This handles everything that must be 8369 * emulated for pixel shader exports. (alpha-test, format conversions, etc) 8370 */ 8371 static void si_build_ps_epilog_function(struct si_shader_context *ctx, 8372 union si_shader_part_key *key) 8373 { 8374 struct gallivm_state *gallivm = &ctx->gallivm; 8375 struct lp_build_tgsi_context *bld_base = &ctx->bld_base; 8376 LLVMTypeRef params[16+8*4+3]; 8377 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; 8378 int last_sgpr, num_params, i; 8379 struct si_ps_exports exp = {}; 8380 8381 /* Declare input SGPRs. */ 8382 params[SI_PARAM_RW_BUFFERS] = ctx->i64; 8383 params[SI_PARAM_CONST_BUFFERS] = ctx->i64; 8384 params[SI_PARAM_SAMPLERS] = ctx->i64; 8385 params[SI_PARAM_IMAGES] = ctx->i64; 8386 params[SI_PARAM_SHADER_BUFFERS] = ctx->i64; 8387 params[SI_PARAM_ALPHA_REF] = ctx->f32; 8388 last_sgpr = SI_PARAM_ALPHA_REF; 8389 8390 /* Declare input VGPRs. 
*/ 8391 num_params = (last_sgpr + 1) + 8392 util_bitcount(key->ps_epilog.colors_written) * 4 + 8393 key->ps_epilog.writes_z + 8394 key->ps_epilog.writes_stencil + 8395 key->ps_epilog.writes_samplemask; 8396 8397 num_params = MAX2(num_params, 8398 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 8399 8400 assert(num_params <= ARRAY_SIZE(params)); 8401 8402 for (i = last_sgpr + 1; i < num_params; i++) 8403 params[i] = ctx->f32; 8404 8405 /* Create the function. */ 8406 si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr); 8407 /* Disable elimination of unused inputs. */ 8408 si_llvm_add_attribute(ctx->main_fn, 8409 "InitialPSInputAddr", 0xffffff); 8410 8411 /* Process colors. */ 8412 unsigned vgpr = last_sgpr + 1; 8413 unsigned colors_written = key->ps_epilog.colors_written; 8414 int last_color_export = -1; 8415 8416 /* Find the last color export. */ 8417 if (!key->ps_epilog.writes_z && 8418 !key->ps_epilog.writes_stencil && 8419 !key->ps_epilog.writes_samplemask) { 8420 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; 8421 8422 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ 8423 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { 8424 /* Just set this if any of the colorbuffers are enabled. */ 8425 if (spi_format & 8426 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) 8427 last_color_export = 0; 8428 } else { 8429 for (i = 0; i < 8; i++) 8430 if (colors_written & (1 << i) && 8431 (spi_format >> (i * 4)) & 0xf) 8432 last_color_export = i; 8433 } 8434 } 8435 8436 while (colors_written) { 8437 LLVMValueRef color[4]; 8438 int mrt = u_bit_scan(&colors_written); 8439 8440 for (i = 0; i < 4; i++) 8441 color[i] = LLVMGetParam(ctx->main_fn, vgpr++); 8442 8443 si_export_mrt_color(bld_base, color, mrt, 8444 num_params - 1, 8445 mrt == last_color_export, &exp); 8446 } 8447 8448 /* Process depth, stencil, samplemask. 
*/ 8449 if (key->ps_epilog.writes_z) 8450 depth = LLVMGetParam(ctx->main_fn, vgpr++); 8451 if (key->ps_epilog.writes_stencil) 8452 stencil = LLVMGetParam(ctx->main_fn, vgpr++); 8453 if (key->ps_epilog.writes_samplemask) 8454 samplemask = LLVMGetParam(ctx->main_fn, vgpr++); 8455 8456 if (depth || stencil || samplemask) 8457 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp); 8458 else if (last_color_export == -1) 8459 si_export_null(bld_base); 8460 8461 if (exp.num) 8462 si_emit_ps_exports(ctx, &exp); 8463 8464 /* Compile. */ 8465 LLVMBuildRetVoid(gallivm->builder); 8466 } 8467 8468 /** 8469 * Select and compile (or reuse) pixel shader parts (prolog & epilog). 8470 */ 8471 static bool si_shader_select_ps_parts(struct si_screen *sscreen, 8472 LLVMTargetMachineRef tm, 8473 struct si_shader *shader, 8474 struct pipe_debug_callback *debug) 8475 { 8476 union si_shader_part_key prolog_key; 8477 union si_shader_part_key epilog_key; 8478 8479 /* Get the prolog. */ 8480 si_get_ps_prolog_key(shader, &prolog_key, true); 8481 8482 /* The prolog is a no-op if these aren't set. */ 8483 if (si_need_ps_prolog(&prolog_key)) { 8484 shader->prolog = 8485 si_get_shader_part(sscreen, &sscreen->ps_prologs, 8486 PIPE_SHADER_FRAGMENT, true, 8487 &prolog_key, tm, debug, 8488 si_build_ps_prolog_function, 8489 "Fragment Shader Prolog"); 8490 if (!shader->prolog) 8491 return false; 8492 } 8493 8494 /* Get the epilog. */ 8495 si_get_ps_epilog_key(shader, &epilog_key); 8496 8497 shader->epilog = 8498 si_get_shader_part(sscreen, &sscreen->ps_epilogs, 8499 PIPE_SHADER_FRAGMENT, false, 8500 &epilog_key, tm, debug, 8501 si_build_ps_epilog_function, 8502 "Fragment Shader Epilog"); 8503 if (!shader->epilog) 8504 return false; 8505 8506 /* Enable POS_FIXED_PT if polygon stippling is enabled. 
*/ 8507 if (shader->key.part.ps.prolog.poly_stipple) { 8508 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); 8509 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); 8510 } 8511 8512 /* Set up the enable bits for per-sample shading if needed. */ 8513 if (shader->key.part.ps.prolog.force_persp_sample_interp && 8514 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || 8515 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 8516 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; 8517 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 8518 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 8519 } 8520 if (shader->key.part.ps.prolog.force_linear_sample_interp && 8521 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || 8522 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 8523 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; 8524 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 8525 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 8526 } 8527 if (shader->key.part.ps.prolog.force_persp_center_interp && 8528 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 8529 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 8530 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; 8531 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 8532 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 8533 } 8534 if (shader->key.part.ps.prolog.force_linear_center_interp && 8535 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 8536 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 8537 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; 8538 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 8539 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 8540 } 8541 8542 /* POW_W_FLOAT requires that one 
of the perspective weights is enabled. */ 8543 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && 8544 !(shader->config.spi_ps_input_ena & 0xf)) { 8545 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 8546 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); 8547 } 8548 8549 /* At least one pair of interpolation weights must be enabled. */ 8550 if (!(shader->config.spi_ps_input_ena & 0x7f)) { 8551 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 8552 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); 8553 } 8554 8555 /* The sample mask input is always enabled, because the API shader always 8556 * passes it through to the epilog. Disable it here if it's unused. 8557 */ 8558 if (!shader->key.part.ps.epilog.poly_line_smoothing && 8559 !shader->selector->info.reads_samplemask) 8560 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; 8561 8562 return true; 8563 } 8564 8565 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, 8566 unsigned *lds_size) 8567 { 8568 /* SPI barrier management bug: 8569 * Make sure we have at least 4k of LDS in use to avoid the bug. 8570 * It applies to workgroup sizes of more than one wavefront. 
8571 */ 8572 if (sscreen->b.family == CHIP_BONAIRE || 8573 sscreen->b.family == CHIP_KABINI || 8574 sscreen->b.family == CHIP_MULLINS) 8575 *lds_size = MAX2(*lds_size, 8); 8576 } 8577 8578 static void si_fix_resource_usage(struct si_screen *sscreen, 8579 struct si_shader *shader) 8580 { 8581 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ 8582 8583 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); 8584 8585 if (shader->selector->type == PIPE_SHADER_COMPUTE && 8586 si_get_max_workgroup_size(shader) > 64) { 8587 si_multiwave_lds_size_workaround(sscreen, 8588 &shader->config.lds_size); 8589 } 8590 } 8591 8592 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, 8593 struct si_shader *shader, 8594 struct pipe_debug_callback *debug) 8595 { 8596 struct si_shader_selector *sel = shader->selector; 8597 struct si_shader *mainp = sel->main_shader_part; 8598 int r; 8599 8600 /* LS, ES, VS are compiled on demand if the main part hasn't been 8601 * compiled for that stage. 8602 * 8603 * Vertex shaders are compiled on demand when a vertex fetch 8604 * workaround must be applied. 8605 */ 8606 if (shader->is_monolithic) { 8607 /* Monolithic shader (compiled as a whole, has many variants, 8608 * may take a long time to compile). 8609 */ 8610 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug); 8611 if (r) 8612 return r; 8613 } else { 8614 /* The shader consists of 2-3 parts: 8615 * 8616 * - the middle part is the user shader, it has 1 variant only 8617 * and it was compiled during the creation of the shader 8618 * selector 8619 * - the prolog part is inserted at the beginning 8620 * - the epilog part is inserted at the end 8621 * 8622 * The prolog and epilog have many (but simple) variants. 8623 */ 8624 8625 /* Copy the compiled TGSI shader data over. 
*/ 8626 shader->is_binary_shared = true; 8627 shader->binary = mainp->binary; 8628 shader->config = mainp->config; 8629 shader->info.num_input_sgprs = mainp->info.num_input_sgprs; 8630 shader->info.num_input_vgprs = mainp->info.num_input_vgprs; 8631 shader->info.face_vgpr_index = mainp->info.face_vgpr_index; 8632 memcpy(shader->info.vs_output_param_offset, 8633 mainp->info.vs_output_param_offset, 8634 sizeof(mainp->info.vs_output_param_offset)); 8635 shader->info.uses_instanceid = mainp->info.uses_instanceid; 8636 shader->info.nr_pos_exports = mainp->info.nr_pos_exports; 8637 shader->info.nr_param_exports = mainp->info.nr_param_exports; 8638 8639 /* Select prologs and/or epilogs. */ 8640 switch (sel->type) { 8641 case PIPE_SHADER_VERTEX: 8642 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug)) 8643 return -1; 8644 break; 8645 case PIPE_SHADER_TESS_CTRL: 8646 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug)) 8647 return -1; 8648 break; 8649 case PIPE_SHADER_TESS_EVAL: 8650 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug)) 8651 return -1; 8652 break; 8653 case PIPE_SHADER_GEOMETRY: 8654 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug)) 8655 return -1; 8656 break; 8657 case PIPE_SHADER_FRAGMENT: 8658 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug)) 8659 return -1; 8660 8661 /* Make sure we have at least as many VGPRs as there 8662 * are allocated inputs. 8663 */ 8664 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8665 shader->info.num_input_vgprs); 8666 break; 8667 } 8668 8669 /* Update SGPR and VGPR counts. 
*/ 8670 if (shader->prolog) { 8671 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8672 shader->prolog->config.num_sgprs); 8673 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8674 shader->prolog->config.num_vgprs); 8675 } 8676 if (shader->epilog) { 8677 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, 8678 shader->epilog->config.num_sgprs); 8679 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, 8680 shader->epilog->config.num_vgprs); 8681 } 8682 } 8683 8684 si_fix_resource_usage(sscreen, shader); 8685 si_shader_dump(sscreen, shader, debug, sel->info.processor, 8686 stderr, true); 8687 8688 /* Upload. */ 8689 r = si_shader_binary_upload(sscreen, shader); 8690 if (r) { 8691 fprintf(stderr, "LLVM failed to upload shader\n"); 8692 return r; 8693 } 8694 8695 return 0; 8696 } 8697 8698 void si_shader_destroy(struct si_shader *shader) 8699 { 8700 if (shader->scratch_bo) 8701 r600_resource_reference(&shader->scratch_bo, NULL); 8702 8703 r600_resource_reference(&shader->bo, NULL); 8704 8705 if (!shader->is_binary_shared) 8706 radeon_shader_binary_clean(&shader->binary); 8707 8708 free(shader->shader_log); 8709 } 8710