1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * Copyright 2007 VMware, Inc. 5 * All Rights Reserved. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the 9 * "Software"), to deal in the Software without restriction, including 10 * without limitation the rights to use, copy, modify, merge, publish, 11 * distribute, sub license, and/or sell copies of the Software, and to 12 * permit persons to whom the Software is furnished to do so, subject to 13 * the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the 16 * next paragraph) shall be included in all copies or substantial portions 17 * of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 **************************************************************************/ 28 29 /** 30 * @file 31 * Code generate the whole fragment pipeline. 32 * 33 * The fragment pipeline consists of the following stages: 34 * - early depth test 35 * - fragment shader 36 * - alpha test 37 * - depth/stencil test 38 * - blending 39 * 40 * This file has only the glue to assemble the fragment pipeline. The actual 41 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the 42 * lp_bld_*.[ch] files, and in a complete generic and reusable way. 
Here we 43 * muster the LLVM JIT execution engine to create a function that follows an 44 * established binary interface and that can be called from C directly. 45 * 46 * A big source of complexity here is that we often want to run different 47 * stages with different precisions and data types and precisions. For example, 48 * the fragment shader needs typically to be done in floats, but the 49 * depth/stencil test and blending is better done in the type that most closely 50 * matches the depth/stencil and color buffer respectively. 51 * 52 * Since the width of a SIMD vector register stays the same regardless of the 53 * element type, different types imply different number of elements, so we must 54 * code generate more instances of the stages with larger types to be able to 55 * feed/consume the stages with smaller types. 56 * 57 * @author Jose Fonseca <jfonseca (at) vmware.com> 58 */ 59 60 #include <limits.h> 61 #include "pipe/p_defines.h" 62 #include "util/u_inlines.h" 63 #include "util/u_memory.h" 64 #include "util/u_pointer.h" 65 #include "util/u_format.h" 66 #include "util/u_dump.h" 67 #include "util/u_string.h" 68 #include "util/simple_list.h" 69 #include "util/u_dual_blend.h" 70 #include "util/os_time.h" 71 #include "pipe/p_shader_tokens.h" 72 #include "draw/draw_context.h" 73 #include "tgsi/tgsi_dump.h" 74 #include "tgsi/tgsi_scan.h" 75 #include "tgsi/tgsi_parse.h" 76 #include "gallivm/lp_bld_type.h" 77 #include "gallivm/lp_bld_const.h" 78 #include "gallivm/lp_bld_conv.h" 79 #include "gallivm/lp_bld_init.h" 80 #include "gallivm/lp_bld_intr.h" 81 #include "gallivm/lp_bld_logic.h" 82 #include "gallivm/lp_bld_tgsi.h" 83 #include "gallivm/lp_bld_swizzle.h" 84 #include "gallivm/lp_bld_flow.h" 85 #include "gallivm/lp_bld_debug.h" 86 #include "gallivm/lp_bld_arit.h" 87 #include "gallivm/lp_bld_bitarit.h" 88 #include "gallivm/lp_bld_pack.h" 89 #include "gallivm/lp_bld_format.h" 90 #include "gallivm/lp_bld_quad.h" 91 92 #include "lp_bld_alpha.h" 93 #include 
"lp_bld_blend.h" 94 #include "lp_bld_depth.h" 95 #include "lp_bld_interp.h" 96 #include "lp_context.h" 97 #include "lp_debug.h" 98 #include "lp_perf.h" 99 #include "lp_setup.h" 100 #include "lp_state.h" 101 #include "lp_tex_sample.h" 102 #include "lp_flush.h" 103 #include "lp_state_fs.h" 104 #include "lp_rast.h" 105 106 107 /** Fragment shader number (for debugging) */ 108 static unsigned fs_no = 0; 109 110 111 /** 112 * Expand the relevant bits of mask_input to a n*4-dword mask for the 113 * n*four pixels in n 2x2 quads. This will set the n*four elements of the 114 * quad mask vector to 0 or ~0. 115 * Grouping is 01, 23 for 2 quad mode hence only 0 and 2 are valid 116 * quad arguments with fs length 8. 117 * 118 * \param first_quad which quad(s) of the quad group to test, in [0,3] 119 * \param mask_input bitwise mask for the whole 4x4 stamp 120 */ 121 static LLVMValueRef 122 generate_quad_mask(struct gallivm_state *gallivm, 123 struct lp_type fs_type, 124 unsigned first_quad, 125 LLVMValueRef mask_input) /* int32 */ 126 { 127 LLVMBuilderRef builder = gallivm->builder; 128 struct lp_type mask_type; 129 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); 130 LLVMValueRef bits[16]; 131 LLVMValueRef mask, bits_vec; 132 int shift, i; 133 134 /* 135 * XXX: We'll need a different path for 16 x u8 136 */ 137 assert(fs_type.width == 32); 138 assert(fs_type.length <= ARRAY_SIZE(bits)); 139 mask_type = lp_int_type(fs_type); 140 141 /* 142 * mask_input >>= (quad * 4) 143 */ 144 switch (first_quad) { 145 case 0: 146 shift = 0; 147 break; 148 case 1: 149 assert(fs_type.length == 4); 150 shift = 2; 151 break; 152 case 2: 153 shift = 8; 154 break; 155 case 3: 156 assert(fs_type.length == 4); 157 shift = 10; 158 break; 159 default: 160 assert(0); 161 shift = 0; 162 } 163 164 mask_input = LLVMBuildLShr(builder, 165 mask_input, 166 LLVMConstInt(i32t, shift, 0), 167 ""); 168 169 /* 170 * mask = { mask_input & (1 << i), for i in [0,3] } 171 */ 172 mask = 
lp_build_broadcast(gallivm, 173 lp_build_vec_type(gallivm, mask_type), 174 mask_input); 175 176 for (i = 0; i < fs_type.length / 4; i++) { 177 unsigned j = 2 * (i % 2) + (i / 2) * 8; 178 bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0); 179 bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0); 180 bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0); 181 bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0); 182 } 183 bits_vec = LLVMConstVector(bits, fs_type.length); 184 mask = LLVMBuildAnd(builder, mask, bits_vec, ""); 185 186 /* 187 * mask = mask == bits ? ~0 : 0 188 */ 189 mask = lp_build_compare(gallivm, 190 mask_type, PIPE_FUNC_EQUAL, 191 mask, bits_vec); 192 193 return mask; 194 } 195 196 197 #define EARLY_DEPTH_TEST 0x1 198 #define LATE_DEPTH_TEST 0x2 199 #define EARLY_DEPTH_WRITE 0x4 200 #define LATE_DEPTH_WRITE 0x8 201 202 static int 203 find_output_by_semantic( const struct tgsi_shader_info *info, 204 unsigned semantic, 205 unsigned index ) 206 { 207 int i; 208 209 for (i = 0; i < info->num_outputs; i++) 210 if (info->output_semantic_name[i] == semantic && 211 info->output_semantic_index[i] == index) 212 return i; 213 214 return -1; 215 } 216 217 218 /** 219 * Fetch the specified lp_jit_viewport structure for a given viewport_index. 
220 */ 221 static LLVMValueRef 222 lp_llvm_viewport(LLVMValueRef context_ptr, 223 struct gallivm_state *gallivm, 224 LLVMValueRef viewport_index) 225 { 226 LLVMBuilderRef builder = gallivm->builder; 227 LLVMValueRef ptr; 228 LLVMValueRef res; 229 struct lp_type viewport_type = 230 lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS); 231 232 ptr = lp_jit_context_viewports(gallivm, context_ptr); 233 ptr = LLVMBuildPointerCast(builder, ptr, 234 LLVMPointerType(lp_build_vec_type(gallivm, viewport_type), 0), ""); 235 236 res = lp_build_pointer_get(builder, ptr, viewport_index); 237 238 return res; 239 } 240 241 242 static LLVMValueRef 243 lp_build_depth_clamp(struct gallivm_state *gallivm, 244 LLVMBuilderRef builder, 245 struct lp_type type, 246 LLVMValueRef context_ptr, 247 LLVMValueRef thread_data_ptr, 248 LLVMValueRef z) 249 { 250 LLVMValueRef viewport, min_depth, max_depth; 251 LLVMValueRef viewport_index; 252 struct lp_build_context f32_bld; 253 254 assert(type.floating); 255 lp_build_context_init(&f32_bld, gallivm, type); 256 257 /* 258 * Assumes clamping of the viewport index will occur in setup/gs. Value 259 * is passed through the rasterization stage via lp_rast_shader_inputs. 260 * 261 * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping 262 * semantics. 263 */ 264 viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm, 265 thread_data_ptr); 266 267 /* 268 * Load the min and max depth from the lp_jit_context.viewports 269 * array of lp_jit_viewport structures. 
270 */ 271 viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index); 272 273 /* viewports[viewport_index].min_depth */ 274 min_depth = LLVMBuildExtractElement(builder, viewport, 275 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), ""); 276 min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth); 277 278 /* viewports[viewport_index].max_depth */ 279 max_depth = LLVMBuildExtractElement(builder, viewport, 280 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), ""); 281 max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth); 282 283 /* 284 * Clamp to the min and max depth values for the given viewport. 285 */ 286 return lp_build_clamp(&f32_bld, z, min_depth, max_depth); 287 } 288 289 290 /** 291 * Generate the fragment shader, depth/stencil test, and alpha tests. 292 */ 293 static void 294 generate_fs_loop(struct gallivm_state *gallivm, 295 struct lp_fragment_shader *shader, 296 const struct lp_fragment_shader_variant_key *key, 297 LLVMBuilderRef builder, 298 struct lp_type type, 299 LLVMValueRef context_ptr, 300 LLVMValueRef num_loop, 301 struct lp_build_interp_soa_context *interp, 302 struct lp_build_sampler_soa *sampler, 303 LLVMValueRef mask_store, 304 LLVMValueRef (*out_color)[4], 305 LLVMValueRef depth_ptr, 306 LLVMValueRef depth_stride, 307 LLVMValueRef facing, 308 LLVMValueRef thread_data_ptr) 309 { 310 const struct util_format_description *zs_format_desc = NULL; 311 const struct tgsi_token *tokens = shader->base.tokens; 312 struct lp_type int_type = lp_int_type(type); 313 LLVMTypeRef vec_type, int_vec_type; 314 LLVMValueRef mask_ptr, mask_val; 315 LLVMValueRef consts_ptr, num_consts_ptr; 316 LLVMValueRef z; 317 LLVMValueRef z_value, s_value; 318 LLVMValueRef z_fb, s_fb; 319 LLVMValueRef stencil_refs[2]; 320 LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; 321 struct lp_build_for_loop_state loop_state; 322 struct lp_build_mask_context mask; 323 /* 324 * TODO: figure out if simple_shader optimization is really 
    * worthwhile to keep. Disabled because it may hide some real bugs in the
    * (depth/stencil) code since tests tend to take another codepath than real
    * shaders.
    */
   /* note the trailing "&& 0": simple_shader is deliberately forced off */
   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
                            shader->info.base.num_inputs < 3 &&
                            shader->info.base.num_instructions < 8) && 0;
   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
                                     util_blend_state_is_dual(&key->blend, 0);
   unsigned attrib;
   unsigned chan;
   unsigned cbuf;
   unsigned depth_mode;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   /*
    * Decide where in the pipeline the depth/stencil test and write happen
    * (early vs. late), encoded as a bitmask of *_DEPTH_TEST/*_DEPTH_WRITE.
    */
   if (key->depth.enabled ||
       key->stencil[0].enabled) {

      zs_format_desc = util_format_description(key->zsbuf_format);
      assert(zs_format_desc);

      if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
         if (key->alpha.enabled ||
             key->blend.alpha_to_coverage ||
             shader->info.base.uses_kill ||
             shader->info.base.writes_samplemask) {
            /* With alpha test and kill, can do the depth test early
             * and hopefully eliminate some quads.  But need to do a
             * special deferred depth write once the final mask value
             * is known. This only works though if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
         }
         else
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      }
      else {
         /* shader computes z/stencil, so the test cannot run early */
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      /* drop the write bits entirely when nothing is actually written */
      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   }
   else {
      depth_mode = 0;
   }

   vec_type = lp_build_vec_type(gallivm, type);
   int_vec_type = lp_build_vec_type(gallivm, int_type);

   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
   /* convert scalar stencil refs into vectors */
   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);

   consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
   num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);

   /* loop over the fragment blocks covered by this invocation */
   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

   mask_ptr = LLVMBuildGEP(builder, mask_store,
                           &loop_state.counter, 1, "mask_ptr");
   mask_val = LLVMBuildLoad(builder, mask_ptr, "");

   memset(outputs, 0, sizeof outputs);

   /* per-iteration color staging buffers, one vector per channel */
   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       num_loop, "color");
      }
   }
   if (dual_source_blend) {
      /* only one rt with dual source blending; slot 1 holds src1 colors */
      assert(key->nr_cbufs <= 1);
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    num_loop, "color1");
      }
   }


   /* 'mask' will control execution based on quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
      lp_build_mask_check(&mask);

   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter);
   z = interp->pos[2];

   if (depth_mode & EARLY_DEPTH_TEST) {
      /*
       * Clamp according to ARB_depth_clamp semantics.
       */
      if (key->depth_clamp) {
         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
                                  thread_data_ptr, z);
      }
      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note mask check if stencil is enabled must be after ds write not after
       * stencil test otherwise new stencil values may not get written if all
       * fragments got killed by depth/stencil test.
       */
      if (!simple_shader && key->stencil[0].enabled)
         lp_build_mask_check(&mask);
   }

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);

   /* Build the actual shader */
   lp_build_tgsi_soa(gallivm, tokens, type, &mask,
                     consts_ptr, num_consts_ptr, &system_values,
                     interp->inputs,
                     outputs, context_ptr, thread_data_ptr,
                     sampler, &shader->info.base, NULL);

   /* Alpha test */
   if (key->alpha.enabled) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         const struct util_format_description *cbuf_format_desc;
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
         LLVMValueRef alpha_ref_value;

         alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr);
         alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);

         cbuf_format_desc = util_format_description(key->cbuf_format[0]);

         lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
                             &mask, alpha, alpha_ref_value,
                             (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   /* Emulate Alpha to Coverage with Alpha test */
   if (key->blend.alpha_to_coverage) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");

         lp_build_alpha_to_coverage(gallivm, type,
                                    &mask, alpha,
                                    (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   if (shader->info.base.writes_samplemask) {
      int smaski = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_SAMPLEMASK,
                                           0);
      LLVMValueRef smask;
      struct lp_build_context smask_bld;
      lp_build_context_init(&smask_bld, gallivm, int_type);

      assert(smaski >= 0);
      smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask");
      /*
       * Pixel is alive according to the first sample in the mask.
       */
      smask = LLVMBuildBitCast(builder, smask, smask_bld.vec_type, "");
      smask = lp_build_and(&smask_bld, smask, smask_bld.one);
      smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, smask, smask_bld.zero);
      lp_build_mask_update(&mask, smask);
   }

   /* Late Z test */
   if (depth_mode & LATE_DEPTH_TEST) {
      int pos0 = find_output_by_semantic(&shader->info.base,
                                         TGSI_SEMANTIC_POSITION,
                                         0);
      int s_out = find_output_by_semantic(&shader->info.base,
                                          TGSI_SEMANTIC_STENCIL,
                                          0);
      if (pos0 != -1 && outputs[pos0][2]) {
         /* shader-written depth overrides the interpolated one */
         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
      }
      /*
       * Clamp according to ARB_depth_clamp semantics.
       */
      if (key->depth_clamp) {
         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
                                  thread_data_ptr, z);
      }

      if (s_out != -1 && outputs[s_out][1]) {
         /* there's only one value, and spec says to discard additional bits */
         LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
         stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
         stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
         stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
         stencil_refs[1] = stencil_refs[0];
      }

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);

      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);
      /* Late Z write */
      if (depth_mode & LATE_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
   }
   else if ((depth_mode & EARLY_DEPTH_TEST) &&
            (depth_mode & LATE_DEPTH_WRITE))
   {
      /* Need to apply a reduced mask to the depth write.  Reload the
       * depth value, update from zs_value with the new mask value and
       * write that out.
       */
      lp_build_depth_stencil_write_swizzled(gallivm, type,
                                            zs_format_desc, key->resource_1d,
                                            &mask, z_fb, s_fb, loop_state.counter,
                                            depth_ptr, depth_stride,
                                            z_value, s_value);
   }


   /* Color write - copy shader outputs into the per-cbuf staging arrays */
   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
   {
      unsigned cbuf = shader->info.base.output_semantic_index[attrib];
      if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
          ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
      {
         for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
            if(outputs[attrib][chan]) {
               /* XXX: just initialize outputs to point at colors[] and
                * skip this.
                */
               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
               LLVMValueRef color_ptr;
               color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan],
                                        &loop_state.counter, 1, "");
               lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
               LLVMBuildStore(builder, out, color_ptr);
            }
         }
      }
   }

   if (key->occlusion_count) {
      LLVMValueRef counter = lp_jit_thread_data_counter(gallivm, thread_data_ptr);
      lp_build_name(counter, "counter");
      lp_build_occlusion_count(gallivm, type,
                               lp_build_mask_value(&mask), counter);
   }

   /* persist the final live mask for this iteration */
   mask_val = lp_build_mask_end(&mask);
   LLVMBuildStore(builder, mask_val, mask_ptr);
   lp_build_for_loop_end(&loop_state);
}


/**
 * This function will reorder pixels from the fragment shader SoA to memory layout AoS
 *
 * Fragment Shader outputs pixels in small 2x2 blocks
 *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
 *
 * However in memory pixels are stored in rows
 *  e.g.
 * (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
 *
 * @param type            fragment shader type (4x or 8x float)
 * @param num_fs          number of fs_src
 * @param dst_channels    number of output channels
 * @param fs_src          output from fragment shader
 * @param dst             pointer to store result
 * @param pad_inline      is channel padding inline or at end of row
 * @return                the number of dsts
 */
static int
generate_fs_twiddle(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned num_fs,
                    unsigned dst_channels,
                    LLVMValueRef fs_src[][4],
                    LLVMValueRef* dst,
                    bool pad_inline)
{
   LLVMValueRef src[16];

   bool swizzle_pad;
   bool twiddle;
   bool split;

   unsigned pixels = type.length / 4;
   unsigned reorder_group;
   unsigned src_channels;
   unsigned src_count;
   unsigned i;

   /* 3-channel outputs still carry a 4th (padding) channel here */
   src_channels = dst_channels < 3 ? dst_channels : 4;
   src_count = num_fs * src_channels;

   assert(pixels == 2 || pixels == 1);
   assert(num_fs * src_channels <= ARRAY_SIZE(src));

   /*
    * Transpose from SoA -> AoS
    */
   for (i = 0; i < num_fs; ++i) {
      lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
   }

   /*
    * Pick transformation options
    */
   swizzle_pad = false;
   twiddle = false;
   split = false;
   reorder_group = 0;

   if (dst_channels == 1) {
      twiddle = true;

      if (pixels == 2) {
         split = true;
      }
   } else if (dst_channels == 2) {
      if (pixels == 1) {
         reorder_group = 1;
      }
   } else if (dst_channels > 2) {
      if (pixels == 1) {
         reorder_group = 2;
      } else {
         twiddle = true;
      }

      if (!pad_inline && dst_channels == 3 && pixels > 1) {
         swizzle_pad = true;
      }
   }

   /*
    * Split the src in half
    */
   if (split) {
      /* iterate backwards so src[i-1] is consumed before being overwritten */
      for (i = num_fs; i > 0; --i) {
         src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
         src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
      }

      src_count *= 2;
      type.length = 4;
   }

   /*
    * Ensure pixels are in memory order
    */
   if (reorder_group) {
      /* Twiddle pixels by reordering the array, e.g.:
       *
       * src_count =  8 -> 0 2 1 3 4 6 5 7
       * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
       */
      const unsigned reorder_sw[] = { 0, 2, 1, 3 };

      for (i = 0; i < src_count; ++i) {
         unsigned group = i / reorder_group;
         unsigned block = (group / 4) * 4 * reorder_group;
         unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
         dst[i] = src[j];
      }
   } else if (twiddle) {
      /* Twiddle pixels across elements of array */
      /*
       * XXX: we should avoid this in some cases, but would need to tell
       * lp_build_conv to reorder (or deal with it ourselves).
       */
      lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
   } else {
      /* Do nothing */
      memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
   }

   /*
    * Moves any padding between pixels to the end
    * e.g. RGBXRGBX -> RGBRGBXX
    */
   if (swizzle_pad) {
      unsigned char swizzles[16];
      unsigned elems = pixels * dst_channels;

      for (i = 0; i < type.length; ++i) {
         if (i < elems)
            swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
         else
            swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
      }

      for (i = 0; i < src_count; ++i) {
         dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
      }
   }

   return src_count;
}


/*
 * Untwiddle and transpose, much like the above.
 * However, this is after conversion, so we get packed vectors.
 * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
 * the vectors will look like:
 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
 * be swizzled here). Extending to 16bit should be trivial.
 * Should also be extended to handle twice wide vectors with AVX2...
 */
static void
fs_twiddle_transpose(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef *src,
                     unsigned src_count,
                     LLVMValueRef *dst)
{
   unsigned i, j;
   struct lp_type type64, type16, type32;
   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[4], shuf[8];
   /* shuffle pattern swapping the middle two of every four elements */
   for (j = 0; j < 2; j++) {
      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
   }

   assert(src_count == 4 || src_count == 2 || src_count == 1);
   assert(type.width == 8);
   assert(type.length == 16);

   type8_t = lp_build_vec_type(gallivm, type);

   /* same 128 bits viewed at element widths of 64/16/32 bits */
   type64 = type;
   type64.length /= 8;
   type64.width *= 8;
   type64_t = lp_build_vec_type(gallivm, type64);

   type16 = type;
   type16.length /= 2;
   type16.width *= 2;
   type16_t = lp_build_vec_type(gallivm, type16);

   type32 = type;
   type32.length /= 4;
   type32.width *= 4;
   type32_t = lp_build_vec_type(gallivm, type32);

   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);

   if (src_count == 1) {
      /* transpose was no-op, just untwiddle */
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 8);
      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
   } else if (src_count == 2) {
      LLVMValueRef shuf_vec;
      /* only the first four shuffle indices are needed at 32-bit width */
      shuf_vec = LLVMConstVector(shuf, 4);

      for (i = 0; i < 2; i++) {
         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
         dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
      }
   } else {
      for (j = 0; j < 2; j++) {
         LLVMValueRef lo, hi, lo2, hi2;
         /*
          * Note that if we only really have 3 valid channels (rgb)
          * and we don't need alpha we could substitute a undef here
          * for the respective channel (causing llvm to drop conversion
          * for alpha).
          */
         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
      }
   }
}


/**
 * Load an unswizzled block of pixels from memory
 */
static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef* dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = dst_count / block_height;
   unsigned i;

   /* Ensure block exactly fits into dst */
   assert((block_width * block_height) % dst_count == 0);

   for (i = 0; i < dst_count; ++i) {
      /* map the linear vector index onto (x, y) within the block */
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef dst_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");

      dst[i] = LLVMBuildLoad(builder, dst_ptr, "");

      LLVMSetAlignment(dst[i], dst_alignment);
   }
}


/**
 * Store an unswizzled block of pixels to memory
 */
static void
store_unswizzled_block(struct gallivm_state *gallivm,
                       LLVMValueRef base_ptr,
                       LLVMValueRef stride,
                       unsigned block_width,
                       unsigned block_height,
                       LLVMValueRef* src,
                       struct lp_type src_type,
                       unsigned src_count,
                       unsigned src_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = src_count / block_height;
   unsigned i;

   /* Ensure src exactly fits into block */
   assert((block_width * block_height) % src_count == 0);

   for (i = 0; i < src_count; ++i) {
      /* map the linear vector index onto (x, y) within the block */
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef src_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      src_ptr = LLVMBuildBitCast(builder, src_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");

      /* src_ptr now holds the store instruction so its alignment can be set */
      src_ptr = LLVMBuildStore(builder, src[i], src_ptr);

      LLVMSetAlignment(src_ptr, src_alignment);
   }
}


/**
 * Checks if a format description is an arithmetic format
 *
 * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
 */
static inline boolean
is_arithmetic_format(const struct util_format_description *format_desc)
{
   boolean arith = false;
   unsigned i;

   /* arithmetic if any channel differs from channel 0 or isn't byte-aligned */
   for (i = 0; i < format_desc->nr_channels; ++i) {
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
 * SoA conversion.
 */
static inline boolean
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}


/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
 */
static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type* type)
{
   unsigned i;
   unsigned chan;

   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a uint with width of block */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = format_desc->block.bits;
      type->length = 1;
      return;
   }

   /* find the first non-void channel to base the type on */
   for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
   chan = i;

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm = format_desc->channel[chan].normalized;

1040 if (is_arithmetic_format(format_desc)) { 1041 type->width = 0; 1042 type->length = 1; 1043 1044 for (i = 0; i < format_desc->nr_channels; ++i) { 1045 type->width += format_desc->channel[i].size; 1046 } 1047 } else { 1048 type->width = format_desc->channel[chan].size; 1049 type->length = format_desc->nr_channels; 1050 } 1051 } 1052 1053 1054 /** 1055 * Retrieves the type for a format which is usable in the blending code. 1056 * 1057 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte 1058 */ 1059 static inline void 1060 lp_blend_type_from_format_desc(const struct util_format_description *format_desc, 1061 struct lp_type* type) 1062 { 1063 unsigned i; 1064 unsigned chan; 1065 1066 if (format_expands_to_float_soa(format_desc)) { 1067 /* always use ordinary floats for blending */ 1068 type->floating = true; 1069 type->fixed = false; 1070 type->sign = true; 1071 type->norm = false; 1072 type->width = 32; 1073 type->length = 4; 1074 return; 1075 } 1076 1077 for (i = 0; i < 4; i++) 1078 if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) 1079 break; 1080 chan = i; 1081 1082 memset(type, 0, sizeof(struct lp_type)); 1083 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT; 1084 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED; 1085 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED; 1086 type->norm = format_desc->channel[chan].normalized; 1087 type->width = format_desc->channel[chan].size; 1088 type->length = format_desc->nr_channels; 1089 1090 for (i = 1; i < format_desc->nr_channels; ++i) { 1091 if (format_desc->channel[i].size > type->width) 1092 type->width = format_desc->channel[i].size; 1093 } 1094 1095 if (type->floating) { 1096 type->width = 32; 1097 } else { 1098 if (type->width <= 8) { 1099 type->width = 8; 1100 } else if (type->width <= 16) { 1101 type->width = 16; 1102 } else { 1103 type->width = 32; 1104 } 1105 } 1106 1107 if (is_arithmetic_format(format_desc) && type->length == 3) { 1108 
/* (tail of lp_blend_type_from_format_desc) force length 3 -> 4 here */
      type->length = 4;
   }
}


/**
 * Scale a normalized value from src_bits to dst_bits.
 *
 * The exact calculation is
 *
 *    dst = iround(src * dst_mask / src_mask)
 *
 * or with integer rounding
 *
 *    dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
 *
 * where
 *
 *    src_mask = (1 << src_bits) - 1
 *    dst_mask = (1 << dst_bits) - 1
 *
 * but we try to avoid division and multiplication through shifts.
 */
static inline LLVMValueRef
scale_bits(struct gallivm_state *gallivm,
           int src_bits,
           int dst_bits,
           LLVMValueRef src,
           struct lp_type src_type)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef result = src;

   if (dst_bits < src_bits) {
      /* Scaling down: drop delta_bits of precision. */
      int delta_bits = src_bits - dst_bits;

      if (delta_bits <= dst_bits) {
         /*
          * Approximate the rescaling with a single shift.
          *
          * This gives the wrong rounding.
          */

         result = LLVMBuildLShr(builder,
                                src,
                                lp_build_const_int_vec(gallivm, src_type, delta_bits),
                                "");

      } else {
         /*
          * Try more accurate rescaling.
          */

         /*
          * Drop the least significant bits to make space for the multiplication.
          *
          * XXX: A better approach would be to use a wider integer type as intermediate. But
          * this is enough to convert alpha from 16bits -> 2 when rendering to
          * PIPE_FORMAT_R10G10B10A2_UNORM.
          */
         result = LLVMBuildLShr(builder,
                                src,
                                lp_build_const_int_vec(gallivm, src_type, dst_bits),
                                "");


         /* Multiply by dst_mask = (1 << dst_bits) - 1. */
         result = LLVMBuildMul(builder,
                               result,
                               lp_build_const_int_vec(gallivm, src_type, (1LL << dst_bits) - 1),
                               "");

         /*
          * Add a rounding term before the division.
          *
          * TODO: Handle signed integers too.
          */
         if (!src_type.sign) {
            result = LLVMBuildAdd(builder,
                                  result,
                                  lp_build_const_int_vec(gallivm, src_type, (1LL << (delta_bits - 1))),
                                  "");
         }

         /*
          * Approximate the division by src_mask with a src_bits shift.
          *
          * Given the src has already been shifted by dst_bits, all we need
          * to do is to shift by the difference.
          */

         result = LLVMBuildLShr(builder,
                                result,
                                lp_build_const_int_vec(gallivm, src_type, delta_bits),
                                "");
      }

   } else if (dst_bits > src_bits) {
      /* Scale up bits */
      int db = dst_bits - src_bits;

      /* Shift left by difference in bits */
      result = LLVMBuildShl(builder,
                            src,
                            lp_build_const_int_vec(gallivm, src_type, db),
                            "");

      if (db <= src_bits) {
         /* Enough bits in src to fill the remainder */
         LLVMValueRef lower = LLVMBuildLShr(builder,
                                            src,
                                            lp_build_const_int_vec(gallivm, src_type, src_bits - db),
                                            "");

         result = LLVMBuildOr(builder, result, lower, "");
      } else if (db > src_bits) {
         /* Need to repeatedly copy src bits to fill remainder in dst */
         unsigned n;

         /* Doubling replication: each pass ORs in a copy shifted right by n. */
         for (n = src_bits; n < dst_bits; n *= 2) {
            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);

            result = LLVMBuildOr(builder,
                                 result,
                                 LLVMBuildLShr(builder, result, shuv, ""),
                                 "");
         }
      }
   }

   /* dst_bits == src_bits falls through: value returned unchanged. */
   return result;
}

/**
 * If RT is a smallfloat (needing denorms) format
 */
static inline int
have_smallfloat_format(struct lp_type dst_type,
                       enum pipe_format format)
{
   return ((dst_type.floating && dst_type.width != 32) ||
           /* due to format handling hacks this format doesn't have floating set
            * here (and actually has width set to 32 too) so special case this.
            */
           (format == PIPE_FORMAT_R11G11B10_FLOAT));
}


/**
 * Convert from memory format to blending format
 *
 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
 *
 * Operates in place: src doubles as the destination array (see the dst
 * alias below).  src_type describes the vectors as loaded from memory,
 * dst_type the layout wanted by the blend code.
 */
static void
convert_to_blend_type(struct gallivm_state *gallivm,
                      unsigned block_size,
                      const struct util_format_description *src_fmt,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef* src, // and dst
                      unsigned num_srcs)
{
   LLVMValueRef *dst = src;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type blend_type;
   struct lp_type mem_type;
   unsigned i, j;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * full custom path for packed floats and srgb formats - none of the later
    * functions would do anything useful, and given the lp_type representation they
    * can't be fixed. Should really have some SoA blend path for these kind of
    * formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      LLVMValueRef tmpsrc[4];
      /*
       * This is pretty suboptimal for this case blending in SoA would be much
       * better, since conversion gets us SoA values so need to convert back.
       */
      assert(src_type.width == 32 || src_type.width == 16);
      assert(dst_type.floating);
      assert(dst_type.width == 32);
      assert(dst_type.length % 4 == 0);
      assert(num_srcs % 4 == 0);

      if (src_type.width == 16) {
         /* expand 4x16bit values to 4x32bit */
         struct lp_type type32x4 = src_type;
         LLVMTypeRef ltype32x4;
         unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type32x4.width = 32;
         ltype32x4 = lp_build_vec_type(gallivm, type32x4);
         for (i = 0; i < num_fetch; i++) {
            src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
         }
         src_type.width = 32;
      }
      /* Snapshot the inputs: the loop below overwrites src in place. */
      for (i = 0; i < 4; i++) {
         tmpsrc[i] = src[i];
      }
      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4];
         LLVMValueRef tmps = tmpsrc[i];
         if (dst_type.length == 8) {
            LLVMValueRef shuffles[8];
            unsigned j;
            /* fetch was 4 values but need 8-wide output values */
            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
            /*
             * for 8-wide aos transpose would give us wrong order not matching
             * incoming converted fs values and mask. ARGH.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
            }
            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
                                          LLVMConstVector(shuffles, 8), "");
         }
         /* Unpack to SoA floats, then transpose back to the AoS order the
          * caller expects.
          */
         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
         }
         else {
            lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
         }
         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   /* Is the format arithmetic */
   is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
   is_arith &= !(mem_type.width == 16 && mem_type.floating);

   /* Pad if necessary */
   if (!is_arith && src_type.length < dst_type.length) {
      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
      }

      src_type.length = dst_type.length;
   }

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      assert(blend_type.width == 32 && blend_type.floating);
      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
      is_arith = false;
   }

   if (!is_arith) {
      return;
   }

   /* Arithmetic path: treat each packed pixel group as one wide integer
    * lane and pull the channels apart with shifts and masks.
    */
   src_type.width = blend_type.width * blend_type.length;
   blend_type.length *= pixels;
   src_type.length *= pixels / (src_type.length / mem_type.length);

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans[4];
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
         /* Channel order within the widened lane depends on byte order. */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif

         mask = (1 << src_fmt->channel[j].size) - 1;

         /* Extract bits from source */
         chans[j] = LLVMBuildLShr(builder,
                                  dst[i],
                                  lp_build_const_int_vec(gallivm, src_type, sa),
                                  "");

         chans[j] = LLVMBuildAnd(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, mask),
                                 "");

         /* Scale bits */
         if (src_type.norm) {
            chans[j] = scale_bits(gallivm, src_fmt->channel[j].size,
                                  blend_type.width, chans[j], src_type);
         }

         /* Insert bits into correct position */
         chans[j] = LLVMBuildShl(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
                                 "");

         if (j == 0) {
            res = chans[j];
         } else {
            res = LLVMBuildOr(builder, res, chans[j], "");
         }
      }

      dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
   }
}


/**
 * Convert from blending format to memory format
 *
 * e.g.
GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
 *
 * Inverse of convert_to_blend_type: operates in place (dst aliases src),
 * repacking blend-layout vectors back into the memory layout of src_fmt.
 */
static void
convert_from_blend_type(struct gallivm_state *gallivm,
                        unsigned block_size,
                        const struct util_format_description *src_fmt,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef* src, // and dst
                        unsigned num_srcs)
{
   LLVMValueRef* dst = src;
   unsigned i, j, k;
   struct lp_type mem_type;
   struct lp_type blend_type;
   LLVMBuilderRef builder = gallivm->builder;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * full custom path for packed floats and srgb formats - none of the later
    * functions would do anything useful, and given the lp_type representation they
    * can't be fixed. Should really have some SoA blend path for these kind of
    * formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      /*
       * This is pretty suboptimal for this case blending in SoA would be much
       * better - we need to transpose the AoS values back to SoA values for
       * conversion/packing.
       */
      assert(src_type.floating);
      assert(src_type.width == 32);
      assert(src_type.length % 4 == 0);
      assert(dst_type.width == 32 || dst_type.width == 16);

      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4], tmpdst;
         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
         /* really really need SoA here */

         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
         }
         else {
            tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
                                                   src_type, tmpsoa);
         }

         if (src_type.length == 8) {
            LLVMValueRef tmpaos, shuffles[8];
            unsigned j;
            /*
             * for 8-wide aos transpose has given us wrong order not matching
             * output order. HMPF. Also need to split the output values manually.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
            }
            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
                                            LLVMConstVector(shuffles, 8), "");
            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
         }
         else {
            src[i] = tmpdst;
         }
      }
      if (dst_type.width == 16) {
         /* Narrow packed 32bit results down to 16bit lanes. */
         struct lp_type type16x8 = dst_type;
         struct lp_type type32x4 = dst_type;
         LLVMTypeRef ltype16x4, ltypei64, ltypei128;
         unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type16x8.length = 8;
         type32x4.width = 32;
         ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
         ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
         ltype16x4 = lp_build_vec_type(gallivm, dst_type);
         /* We could do vector truncation but it doesn't generate very good code */
         for (i = 0; i < num_fetch; i++) {
            src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
                                    src[i], lp_build_zero(gallivm, type32x4));
            src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
            src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
            src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
         }
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      int length = dst_type.length;
      assert(blend_type.width == 32 && blend_type.floating);

      dst_type.length = src_type.length;

      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);

      /* Restore the caller's length clobbered for the conversion above. */
      dst_type.length = length;
      is_arith = false;
   }

   /* Remove any padding */
   if (!is_arith && (src_type.length % mem_type.length)) {
      src_type.length -= (src_type.length % mem_type.length);

      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
      }
   }

   /* No bit arithmetic to do */
   if (!is_arith) {
      return;
   }

   /* Arithmetic path: view each vector as wide integer lanes (one per
    * pixel) and repack channels with shifts and masks.
    */
   src_type.length = pixels;
   src_type.width = blend_type.length * blend_type.width;
   dst_type.length = pixels;

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans[4];
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
         /* Channel order within the wide lane depends on byte order. */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif

         assert(blend_type.width > src_fmt->channel[j].size);

         /* Build a mask of blend_type.width low bits. */
         for (k = 0; k < blend_type.width; ++k) {
            mask |= 1 << k;
         }

         /* Extract bits */
         chans[j] = LLVMBuildLShr(builder,
                                  dst[i],
                                  lp_build_const_int_vec(gallivm, src_type,
                                                         from_lsb * blend_type.width),
                                  "");

         chans[j] = LLVMBuildAnd(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, mask),
                                 "");

         /* Scale down bits */
         if (src_type.norm) {
            chans[j] = scale_bits(gallivm, blend_type.width,
                                  src_fmt->channel[j].size, chans[j], src_type);
         }

         /* Insert bits */
         chans[j] = LLVMBuildShl(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, sa),
                                 "");

         /* NOTE(review): sa is re-read from channel[j].shift at the top of
          * each iteration, so this increment appears to be a dead store —
          * confirm before removing.
          */
         sa += src_fmt->channel[j].size;

         if (j == 0) {
            res = chans[j];
         } else {
            res = LLVMBuildOr(builder, res, chans[j], "");
         }
      }

      assert (dst_type.width != 24);

      dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
   }
}


/**
 * Convert alpha to same blend type as src
 *
 * src_alpha is converted in place from alpha_type to row_type and then
 * replicated so each output vector carries the alpha value of the pixel(s)
 * it covers, matching the layout the blend code expects.
 */
static void
convert_alpha(struct gallivm_state *gallivm,
              struct lp_type row_type,
              struct lp_type alpha_type,
              const unsigned block_size,
              const unsigned block_height,
              const unsigned src_count,
              const unsigned dst_channels,
              const bool pad_inline,
              LLVMValueRef* src_alpha)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned i, j;
   unsigned length = row_type.length;
   /* Temporarily narrow row_type to alpha width for the per-row conversion. */
   row_type.length = alpha_type.length;

   /* Twiddle the alpha to match pixels */
   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);

   /*
    * TODO this should use single lp_build_conv call for
    * src_count == 1 && dst_channels == 1 case (dropping the concat below)
    */
   for (i = 0; i < block_height; ++i) {
      lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
   }

   alpha_type = row_type;
   row_type.length = length;

   /* If only one channel we only need the single alpha value per pixel */
   if (src_count == 1 && dst_channels == 1) {

      lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height, src_alpha, src_count);
   } else {
      /* If there are more srcs than rows then we need to split alpha up */
      if (src_count > block_height) {
         /* Walk backwards so sources are not overwritten before being read. */
         for (i = src_count; i > 0; --i) {
            unsigned pixels = block_size / src_count;
            unsigned idx = i - 1;

            src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
                                                    (idx * pixels) % 4, pixels);
         }
      }

      /* If there is a src for each pixel broadcast the alpha across whole row */
      if (src_count == block_size) {
         for (i = 0; i < src_count; ++i) {
            src_alpha[i] = lp_build_broadcast(gallivm,
                              lp_build_vec_type(gallivm, row_type), src_alpha[i]);
         }
      } else {
         unsigned pixels = block_size / src_count;
         unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
         unsigned alpha_span = 1;
         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

         /* Check if we need 2 src_alphas for our shuffles */
         if (pixels > alpha_type.length) {
            alpha_span = 2;
         }

         /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
         for (j = 0; j < row_type.length; ++j) {
            if (j < pixels * channels) {
               shuffles[j] = lp_build_const_int32(gallivm, j / channels);
            } else {
               /* Lanes past the live pixels are don't-care. */
               shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
            }
         }

         for (i = 0; i < src_count; ++i) {
            unsigned idx1 = i, idx2 = i;

            if (alpha_span > 1){
               idx1 *= alpha_span;
               idx2 = idx1 + 1;
            }

            src_alpha[i] = LLVMBuildShuffleVector(builder,
                                                  src_alpha[idx1],
                                                  src_alpha[idx2],
                                                  LLVMConstVector(shuffles, row_type.length),
                                                  "");
         }
      }
   }
}


/**
 * Generates the blend function for unswizzled colour buffers
 * Also generates the read & write from colour buffer
 */
static void
generate_unswizzled_blend(struct gallivm_state *gallivm,
                          unsigned rt,
                          struct lp_fragment_shader_variant *variant,
                          enum pipe_format out_format,
                          unsigned int num_fs,
                          struct lp_type fs_type,
                          LLVMValueRef* fs_mask,
                          LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
                          LLVMValueRef context_ptr,
                          LLVMValueRef color_ptr,
                          LLVMValueRef stride,
                          unsigned partial_mask,
                          boolean do_branch)
{
   const unsigned alpha_channel = 3;
   const unsigned block_width = LP_RASTER_BLOCK_SIZE;
   const unsigned block_height = LP_RASTER_BLOCK_SIZE;
   const unsigned block_size = block_width * block_height;
   const unsigned lp_integer_vector_width = 128;

   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
   LLVMValueRef
fs_src1[4][TGSI_NUM_CHANNELS]; 1735 LLVMValueRef src_alpha[4 * 4]; 1736 LLVMValueRef src1_alpha[4 * 4] = { NULL }; 1737 LLVMValueRef src_mask[4 * 4]; 1738 LLVMValueRef src[4 * 4]; 1739 LLVMValueRef src1[4 * 4]; 1740 LLVMValueRef dst[4 * 4]; 1741 LLVMValueRef blend_color; 1742 LLVMValueRef blend_alpha; 1743 LLVMValueRef i32_zero; 1744 LLVMValueRef check_mask; 1745 LLVMValueRef undef_src_val; 1746 1747 struct lp_build_mask_context mask_ctx; 1748 struct lp_type mask_type; 1749 struct lp_type blend_type; 1750 struct lp_type row_type; 1751 struct lp_type dst_type; 1752 struct lp_type ls_type; 1753 1754 unsigned char swizzle[TGSI_NUM_CHANNELS]; 1755 unsigned vector_width; 1756 unsigned src_channels = TGSI_NUM_CHANNELS; 1757 unsigned dst_channels; 1758 unsigned dst_count; 1759 unsigned src_count; 1760 unsigned i, j; 1761 1762 const struct util_format_description* out_format_desc = util_format_description(out_format); 1763 1764 unsigned dst_alignment; 1765 1766 bool pad_inline = is_arithmetic_format(out_format_desc); 1767 bool has_alpha = false; 1768 const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable && 1769 util_blend_state_is_dual(&variant->key.blend, 0); 1770 1771 const boolean is_1d = variant->key.resource_1d; 1772 boolean twiddle_after_convert = FALSE; 1773 unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs; 1774 LLVMValueRef fpstate = 0; 1775 1776 /* Get type from output format */ 1777 lp_blend_type_from_format_desc(out_format_desc, &row_type); 1778 lp_mem_type_from_format_desc(out_format_desc, &dst_type); 1779 1780 /* 1781 * Technically this code should go into lp_build_smallfloat_to_float 1782 * and lp_build_float_to_smallfloat but due to the 1783 * http://llvm.org/bugs/show_bug.cgi?id=6393 1784 * llvm reorders the mxcsr intrinsics in a way that breaks the code. 
1785 * So the ordering is important here and there shouldn't be any 1786 * llvm ir instrunctions in this function before 1787 * this, otherwise half-float format conversions won't work 1788 * (again due to llvm bug #6393). 1789 */ 1790 if (have_smallfloat_format(dst_type, out_format)) { 1791 /* We need to make sure that denorms are ok for half float 1792 conversions */ 1793 fpstate = lp_build_fpstate_get(gallivm); 1794 lp_build_fpstate_set_denorms_zero(gallivm, FALSE); 1795 } 1796 1797 mask_type = lp_int32_vec4_type(); 1798 mask_type.length = fs_type.length; 1799 1800 for (i = num_fs; i < num_fullblock_fs; i++) { 1801 fs_mask[i] = lp_build_zero(gallivm, mask_type); 1802 } 1803 1804 /* Do not bother executing code when mask is empty.. */ 1805 if (do_branch) { 1806 check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type)); 1807 1808 for (i = 0; i < num_fullblock_fs; ++i) { 1809 check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], ""); 1810 } 1811 1812 lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask); 1813 lp_build_mask_check(&mask_ctx); 1814 } 1815 1816 partial_mask |= !variant->opaque; 1817 i32_zero = lp_build_const_int32(gallivm, 0); 1818 1819 undef_src_val = lp_build_undef(gallivm, fs_type); 1820 1821 row_type.length = fs_type.length; 1822 vector_width = dst_type.floating ? 
lp_native_vector_width : lp_integer_vector_width; 1823 1824 /* Compute correct swizzle and count channels */ 1825 memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS); 1826 dst_channels = 0; 1827 1828 for (i = 0; i < TGSI_NUM_CHANNELS; ++i) { 1829 /* Ensure channel is used */ 1830 if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) { 1831 continue; 1832 } 1833 1834 /* Ensure not already written to (happens in case with GL_ALPHA) */ 1835 if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) { 1836 continue; 1837 } 1838 1839 /* Ensure we havn't already found all channels */ 1840 if (dst_channels >= out_format_desc->nr_channels) { 1841 continue; 1842 } 1843 1844 swizzle[out_format_desc->swizzle[i]] = i; 1845 ++dst_channels; 1846 1847 if (i == alpha_channel) { 1848 has_alpha = true; 1849 } 1850 } 1851 1852 if (format_expands_to_float_soa(out_format_desc)) { 1853 /* 1854 * the code above can't work for layout_other 1855 * for srgb it would sort of work but we short-circuit swizzles, etc. 1856 * as that is done as part of unpack / pack. 1857 */ 1858 dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */ 1859 has_alpha = true; 1860 swizzle[0] = 0; 1861 swizzle[1] = 1; 1862 swizzle[2] = 2; 1863 swizzle[3] = 3; 1864 pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */ 1865 } 1866 1867 /* If 3 channels then pad to include alpha for 4 element transpose */ 1868 if (dst_channels == 3) { 1869 assert (!has_alpha); 1870 for (i = 0; i < TGSI_NUM_CHANNELS; i++) { 1871 if (swizzle[i] > TGSI_NUM_CHANNELS) 1872 swizzle[i] = 3; 1873 } 1874 if (out_format_desc->nr_channels == 4) { 1875 dst_channels = 4; 1876 /* 1877 * We use alpha from the color conversion, not separate one. 1878 * We had to include it for transpose, hence it will get converted 1879 * too (albeit when doing transpose after conversion, that would 1880 * no longer be the case necessarily). 1881 * (It works only with 4 channel dsts, e.g. 
rgbx formats, because 1882 * otherwise we really have padding, not alpha, included.) 1883 */ 1884 has_alpha = true; 1885 } 1886 } 1887 1888 /* 1889 * Load shader output 1890 */ 1891 for (i = 0; i < num_fullblock_fs; ++i) { 1892 /* Always load alpha for use in blending */ 1893 LLVMValueRef alpha; 1894 if (i < num_fs) { 1895 alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], ""); 1896 } 1897 else { 1898 alpha = undef_src_val; 1899 } 1900 1901 /* Load each channel */ 1902 for (j = 0; j < dst_channels; ++j) { 1903 assert(swizzle[j] < 4); 1904 if (i < num_fs) { 1905 fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], ""); 1906 } 1907 else { 1908 fs_src[i][j] = undef_src_val; 1909 } 1910 } 1911 1912 /* If 3 channels then pad to include alpha for 4 element transpose */ 1913 /* 1914 * XXX If we include that here maybe could actually use it instead of 1915 * separate alpha for blending? 1916 * (Difficult though we actually convert pad channels, not alpha.) 1917 */ 1918 if (dst_channels == 3 && !has_alpha) { 1919 fs_src[i][3] = alpha; 1920 } 1921 1922 /* We split the row_mask and row_alpha as we want 128bit interleave */ 1923 if (fs_type.length == 8) { 1924 src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 1925 0, src_channels); 1926 src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], 1927 src_channels, src_channels); 1928 1929 src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); 1930 src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, 1931 src_channels, src_channels); 1932 } else { 1933 src_mask[i] = fs_mask[i]; 1934 src_alpha[i] = alpha; 1935 } 1936 } 1937 if (dual_source_blend) { 1938 /* same as above except different src/dst, skip masks and comments... 
*/ 1939 for (i = 0; i < num_fullblock_fs; ++i) { 1940 LLVMValueRef alpha; 1941 if (i < num_fs) { 1942 alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], ""); 1943 } 1944 else { 1945 alpha = undef_src_val; 1946 } 1947 1948 for (j = 0; j < dst_channels; ++j) { 1949 assert(swizzle[j] < 4); 1950 if (i < num_fs) { 1951 fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], ""); 1952 } 1953 else { 1954 fs_src1[i][j] = undef_src_val; 1955 } 1956 } 1957 if (dst_channels == 3 && !has_alpha) { 1958 fs_src1[i][3] = alpha; 1959 } 1960 if (fs_type.length == 8) { 1961 src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); 1962 src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, 1963 src_channels, src_channels); 1964 } else { 1965 src1_alpha[i] = alpha; 1966 } 1967 } 1968 } 1969 1970 if (util_format_is_pure_integer(out_format)) { 1971 /* 1972 * In this case fs_type was really ints or uints disguised as floats, 1973 * fix that up now. 1974 */ 1975 fs_type.floating = 0; 1976 fs_type.sign = dst_type.sign; 1977 for (i = 0; i < num_fullblock_fs; ++i) { 1978 for (j = 0; j < dst_channels; ++j) { 1979 fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j], 1980 lp_build_vec_type(gallivm, fs_type), ""); 1981 } 1982 if (dst_channels == 3 && !has_alpha) { 1983 fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3], 1984 lp_build_vec_type(gallivm, fs_type), ""); 1985 } 1986 } 1987 } 1988 1989 /* 1990 * We actually should generally do conversion first (for non-1d cases) 1991 * when the blend format is 8 or 16 bits. The reason is obvious, 1992 * there's 2 or 4 times less vectors to deal with for the interleave... 1993 * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit 1994 * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit 1995 * unpack only with 128bit vectors). 
1996 * Note: for 16bit sizes really need matching pack conversion code 1997 */ 1998 if (!is_1d && dst_channels != 3 && dst_type.width == 8) { 1999 twiddle_after_convert = TRUE; 2000 } 2001 2002 /* 2003 * Pixel twiddle from fragment shader order to memory order 2004 */ 2005 if (!twiddle_after_convert) { 2006 src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, 2007 dst_channels, fs_src, src, pad_inline); 2008 if (dual_source_blend) { 2009 generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, 2010 fs_src1, src1, pad_inline); 2011 } 2012 } else { 2013 src_count = num_fullblock_fs * dst_channels; 2014 /* 2015 * We reorder things a bit here, so the cases for 4-wide and 8-wide 2016 * (AVX) turn out the same later when untwiddling/transpose (albeit 2017 * for true AVX2 path untwiddle needs to be different). 2018 * For now just order by colors first (so we can use unpack later). 2019 */ 2020 for (j = 0; j < num_fullblock_fs; j++) { 2021 for (i = 0; i < dst_channels; i++) { 2022 src[i*num_fullblock_fs + j] = fs_src[j][i]; 2023 if (dual_source_blend) { 2024 src1[i*num_fullblock_fs + j] = fs_src1[j][i]; 2025 } 2026 } 2027 } 2028 } 2029 2030 src_channels = dst_channels < 3 ? dst_channels : 4; 2031 if (src_count != num_fullblock_fs * src_channels) { 2032 unsigned ds = src_count / (num_fullblock_fs * src_channels); 2033 row_type.length /= ds; 2034 fs_type.length = row_type.length; 2035 } 2036 2037 blend_type = row_type; 2038 mask_type.length = 4; 2039 2040 /* Convert src to row_type */ 2041 if (dual_source_blend) { 2042 struct lp_type old_row_type = row_type; 2043 lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); 2044 src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1); 2045 } 2046 else { 2047 src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); 2048 } 2049 2050 /* If the rows are not an SSE vector, combine them to become SSE size! 
*/ 2051 if ((row_type.width * row_type.length) % 128) { 2052 unsigned bits = row_type.width * row_type.length; 2053 unsigned combined; 2054 2055 assert(src_count >= (vector_width / bits)); 2056 2057 dst_count = src_count / (vector_width / bits); 2058 2059 combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count); 2060 if (dual_source_blend) { 2061 lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count); 2062 } 2063 2064 row_type.length *= combined; 2065 src_count /= combined; 2066 2067 bits = row_type.width * row_type.length; 2068 assert(bits == 128 || bits == 256); 2069 } 2070 2071 if (twiddle_after_convert) { 2072 fs_twiddle_transpose(gallivm, row_type, src, src_count, src); 2073 if (dual_source_blend) { 2074 fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1); 2075 } 2076 } 2077 2078 /* 2079 * Blend Colour conversion 2080 */ 2081 blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr); 2082 blend_color = LLVMBuildPointerCast(builder, blend_color, 2083 LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), ""); 2084 blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, 2085 &i32_zero, 1, ""), ""); 2086 2087 /* Convert */ 2088 lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1); 2089 2090 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { 2091 /* 2092 * since blending is done with floats, there was no conversion. 2093 * However, the rules according to fixed point renderbuffers still 2094 * apply, that is we must clamp inputs to 0.0/1.0. 2095 * (This would apply to separate alpha conversion too but we currently 2096 * force has_alpha to be true.) 2097 * TODO: should skip this with "fake" blend, since post-blend conversion 2098 * will clamp anyway. 2099 * TODO: could also skip this if fragment color clamping is enabled. We 2100 * don't support it natively so it gets baked into the shader however, so 2101 * can't really tell here. 
2102 */ 2103 struct lp_build_context f32_bld; 2104 assert(row_type.floating); 2105 lp_build_context_init(&f32_bld, gallivm, row_type); 2106 for (i = 0; i < src_count; i++) { 2107 src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]); 2108 } 2109 if (dual_source_blend) { 2110 for (i = 0; i < src_count; i++) { 2111 src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]); 2112 } 2113 } 2114 /* probably can't be different than row_type but better safe than sorry... */ 2115 lp_build_context_init(&f32_bld, gallivm, blend_type); 2116 blend_color = lp_build_clamp(&f32_bld, blend_color, f32_bld.zero, f32_bld.one); 2117 } 2118 2119 /* Extract alpha */ 2120 blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3)); 2121 2122 /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */ 2123 pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width; 2124 if (pad_inline) { 2125 /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */ 2126 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length); 2127 } else { 2128 /* Only use dst_channels e.g. 
RGBA RGBA to RG RG xxxx */ 2129 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length); 2130 } 2131 2132 /* 2133 * Mask conversion 2134 */ 2135 lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], block_height, &src_mask[0]); 2136 2137 if (src_count < block_height) { 2138 lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count); 2139 } else if (src_count > block_height) { 2140 for (i = src_count; i > 0; --i) { 2141 unsigned pixels = block_size / src_count; 2142 unsigned idx = i - 1; 2143 2144 src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], 2145 (idx * pixels) % 4, pixels); 2146 } 2147 } 2148 2149 assert(mask_type.width == 32); 2150 2151 for (i = 0; i < src_count; ++i) { 2152 unsigned pixels = block_size / src_count; 2153 unsigned pixel_width = row_type.width * dst_channels; 2154 2155 if (pixel_width == 24) { 2156 mask_type.width = 8; 2157 mask_type.length = vector_width / mask_type.width; 2158 } else { 2159 mask_type.length = pixels; 2160 mask_type.width = row_type.width * dst_channels; 2161 2162 /* 2163 * If mask_type width is smaller than 32bit, this doesn't quite 2164 * generate the most efficient code (could use some pack). 
2165 */ 2166 src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], 2167 lp_build_int_vec_type(gallivm, mask_type), ""); 2168 2169 mask_type.length *= dst_channels; 2170 mask_type.width /= dst_channels; 2171 } 2172 2173 src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], 2174 lp_build_int_vec_type(gallivm, mask_type), ""); 2175 src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length); 2176 } 2177 2178 /* 2179 * Alpha conversion 2180 */ 2181 if (!has_alpha) { 2182 struct lp_type alpha_type = fs_type; 2183 alpha_type.length = 4; 2184 convert_alpha(gallivm, row_type, alpha_type, 2185 block_size, block_height, 2186 src_count, dst_channels, 2187 pad_inline, src_alpha); 2188 if (dual_source_blend) { 2189 convert_alpha(gallivm, row_type, alpha_type, 2190 block_size, block_height, 2191 src_count, dst_channels, 2192 pad_inline, src1_alpha); 2193 } 2194 } 2195 2196 2197 /* 2198 * Load dst from memory 2199 */ 2200 if (src_count < block_height) { 2201 dst_count = block_height; 2202 } else { 2203 dst_count = src_count; 2204 } 2205 2206 dst_type.length *= block_size / dst_count; 2207 2208 if (format_expands_to_float_soa(out_format_desc)) { 2209 /* 2210 * we need multiple values at once for the conversion, so can as well 2211 * load them vectorized here too instead of concatenating later. 2212 * (Still need concatenation later for 8-wide vectors). 2213 */ 2214 dst_count = block_height; 2215 dst_type.length = block_width; 2216 } 2217 2218 /* 2219 * Compute the alignment of the destination pointer in bytes 2220 * We fetch 1-4 pixels, if the format has pot alignment then those fetches 2221 * are always aligned by MIN2(16, fetch_width) except for buffers (not 2222 * 1d tex but can't distinguish here) so need to stick with per-pixel 2223 * alignment in this case. 
2224 */ 2225 if (is_1d) { 2226 dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8); 2227 } 2228 else { 2229 dst_alignment = dst_type.length * dst_type.width / 8; 2230 } 2231 /* Force power-of-two alignment by extracting only the least-significant-bit */ 2232 dst_alignment = 1 << (ffs(dst_alignment) - 1); 2233 /* 2234 * Resource base and stride pointers are aligned to 16 bytes, so that's 2235 * the maximum alignment we can guarantee 2236 */ 2237 dst_alignment = MIN2(16, dst_alignment); 2238 2239 ls_type = dst_type; 2240 2241 if (dst_count > src_count) { 2242 if ((dst_type.width == 8 || dst_type.width == 16) && 2243 util_is_power_of_two(dst_type.length) && 2244 dst_type.length * dst_type.width < 128) { 2245 /* 2246 * Never try to load values as 4xi8 which we will then 2247 * concatenate to larger vectors. This gives llvm a real 2248 * headache (the problem is the type legalizer (?) will 2249 * try to load that as 4xi8 zext to 4xi32 to fill the vector, 2250 * then the shuffles to concatenate are more or less impossible 2251 * - llvm is easily capable of generating a sequence of 32 2252 * pextrb/pinsrb instructions for that. Albeit it appears to 2253 * be fixed in llvm 4.0. So, load and concatenate with 32bit 2254 * width to avoid the trouble (16bit seems not as bad, llvm 2255 * probably recognizes the load+shuffle as only one shuffle 2256 * is necessary, but we can do just the same anyway). 
2257 */ 2258 ls_type.length = dst_type.length * dst_type.width / 32; 2259 ls_type.width = 32; 2260 } 2261 } 2262 2263 if (is_1d) { 2264 load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, 2265 dst, ls_type, dst_count / 4, dst_alignment); 2266 for (i = dst_count / 4; i < dst_count; i++) { 2267 dst[i] = lp_build_undef(gallivm, ls_type); 2268 } 2269 2270 } 2271 else { 2272 load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, 2273 dst, ls_type, dst_count, dst_alignment); 2274 } 2275 2276 2277 /* 2278 * Convert from dst/output format to src/blending format. 2279 * 2280 * This is necessary as we can only read 1 row from memory at a time, 2281 * so the minimum dst_count will ever be at this point is 4. 2282 * 2283 * With, for example, R8 format you can have all 16 pixels in a 128 bit vector, 2284 * this will take the 4 dsts and combine them into 1 src so we can perform blending 2285 * on all 16 pixels in that single vector at once. 2286 */ 2287 if (dst_count > src_count) { 2288 if (ls_type.length != dst_type.length && ls_type.length == 1) { 2289 LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type); 2290 LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1); 2291 for (i = 0; i < dst_count; i++) { 2292 dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, ""); 2293 } 2294 } 2295 2296 lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count); 2297 2298 if (ls_type.length != dst_type.length) { 2299 struct lp_type tmp_type = dst_type; 2300 tmp_type.length = dst_type.length * 4 / src_count; 2301 for (i = 0; i < src_count; i++) { 2302 dst[i] = LLVMBuildBitCast(builder, dst[i], 2303 lp_build_vec_type(gallivm, tmp_type), ""); 2304 } 2305 } 2306 } 2307 2308 /* 2309 * Blending 2310 */ 2311 /* XXX this is broken for RGB8 formats - 2312 * they get expanded from 12 to 16 elements (to include alpha) 2313 * by convert_to_blend_type then reduced to 15 instead of 12 2314 * by convert_from_blend_type (a simple fix though breaks A8...). 
2315 * R16G16B16 also crashes differently however something going wrong 2316 * inside llvm handling npot vector sizes seemingly. 2317 * It seems some cleanup could be done here (like skipping conversion/blend 2318 * when not needed). 2319 */ 2320 convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, 2321 row_type, dst, src_count); 2322 2323 /* 2324 * FIXME: Really should get logic ops / masks out of generic blend / row 2325 * format. Logic ops will definitely not work on the blend float format 2326 * used for SRGB here and I think OpenGL expects this to work as expected 2327 * (that is incoming values converted to srgb then logic op applied). 2328 */ 2329 for (i = 0; i < src_count; ++i) { 2330 dst[i] = lp_build_blend_aos(gallivm, 2331 &variant->key.blend, 2332 out_format, 2333 row_type, 2334 rt, 2335 src[i], 2336 has_alpha ? NULL : src_alpha[i], 2337 src1[i], 2338 has_alpha ? NULL : src1_alpha[i], 2339 dst[i], 2340 partial_mask ? src_mask[i] : NULL, 2341 blend_color, 2342 has_alpha ? NULL : blend_alpha, 2343 swizzle, 2344 pad_inline ? 
4 : dst_channels); 2345 } 2346 2347 convert_from_blend_type(gallivm, block_size, out_format_desc, 2348 row_type, dst_type, dst, src_count); 2349 2350 /* Split the blend rows back to memory rows */ 2351 if (dst_count > src_count) { 2352 row_type.length = dst_type.length * (dst_count / src_count); 2353 2354 if (src_count == 1) { 2355 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); 2356 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); 2357 2358 row_type.length /= 2; 2359 src_count *= 2; 2360 } 2361 2362 dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2); 2363 dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2); 2364 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); 2365 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); 2366 2367 row_type.length /= 2; 2368 src_count *= 2; 2369 } 2370 2371 /* 2372 * Store blend result to memory 2373 */ 2374 if (is_1d) { 2375 store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, 2376 dst, dst_type, dst_count / 4, dst_alignment); 2377 } 2378 else { 2379 store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, 2380 dst, dst_type, dst_count, dst_alignment); 2381 } 2382 2383 if (have_smallfloat_format(dst_type, out_format)) { 2384 lp_build_fpstate_set(gallivm, fpstate); 2385 } 2386 2387 if (do_branch) { 2388 lp_build_mask_end(&mask_ctx); 2389 } 2390 } 2391 2392 2393 /** 2394 * Generate the runtime callable function for the whole fragment pipeline. 2395 * Note that the function which we generate operates on a block of 16 2396 * pixels at at time. The block contains 2x2 quads. Each quad contains 2397 * 2x2 pixels. 
2398 */ 2399 static void 2400 generate_fragment(struct llvmpipe_context *lp, 2401 struct lp_fragment_shader *shader, 2402 struct lp_fragment_shader_variant *variant, 2403 unsigned partial_mask) 2404 { 2405 struct gallivm_state *gallivm = variant->gallivm; 2406 const struct lp_fragment_shader_variant_key *key = &variant->key; 2407 struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; 2408 char func_name[64]; 2409 struct lp_type fs_type; 2410 struct lp_type blend_type; 2411 LLVMTypeRef fs_elem_type; 2412 LLVMTypeRef blend_vec_type; 2413 LLVMTypeRef arg_types[13]; 2414 LLVMTypeRef func_type; 2415 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); 2416 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context); 2417 LLVMValueRef context_ptr; 2418 LLVMValueRef x; 2419 LLVMValueRef y; 2420 LLVMValueRef a0_ptr; 2421 LLVMValueRef dadx_ptr; 2422 LLVMValueRef dady_ptr; 2423 LLVMValueRef color_ptr_ptr; 2424 LLVMValueRef stride_ptr; 2425 LLVMValueRef depth_ptr; 2426 LLVMValueRef depth_stride; 2427 LLVMValueRef mask_input; 2428 LLVMValueRef thread_data_ptr; 2429 LLVMBasicBlockRef block; 2430 LLVMBuilderRef builder; 2431 struct lp_build_sampler_soa *sampler; 2432 struct lp_build_interp_soa_context interp; 2433 LLVMValueRef fs_mask[16 / 4]; 2434 LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4]; 2435 LLVMValueRef function; 2436 LLVMValueRef facing; 2437 unsigned num_fs; 2438 unsigned i; 2439 unsigned chan; 2440 unsigned cbuf; 2441 boolean cbuf0_write_all; 2442 const boolean dual_source_blend = key->blend.rt[0].blend_enable && 2443 util_blend_state_is_dual(&key->blend, 0); 2444 2445 assert(lp_native_vector_width / 32 >= 4); 2446 2447 /* Adjust color input interpolation according to flatshade state: 2448 */ 2449 memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]); 2450 for (i = 0; i < shader->info.base.num_inputs; i++) { 2451 if (inputs[i].interp == LP_INTERP_COLOR) { 2452 if (key->flatshade) 2453 
inputs[i].interp = LP_INTERP_CONSTANT; 2454 else 2455 inputs[i].interp = LP_INTERP_PERSPECTIVE; 2456 } 2457 } 2458 2459 /* check if writes to cbuf[0] are to be copied to all cbufs */ 2460 cbuf0_write_all = 2461 shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]; 2462 2463 /* TODO: actually pick these based on the fs and color buffer 2464 * characteristics. */ 2465 2466 memset(&fs_type, 0, sizeof fs_type); 2467 fs_type.floating = TRUE; /* floating point values */ 2468 fs_type.sign = TRUE; /* values are signed */ 2469 fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ 2470 fs_type.width = 32; /* 32-bit float */ 2471 fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */ 2472 2473 memset(&blend_type, 0, sizeof blend_type); 2474 blend_type.floating = FALSE; /* values are integers */ 2475 blend_type.sign = FALSE; /* values are unsigned */ 2476 blend_type.norm = TRUE; /* values are in [0,1] or [-1,1] */ 2477 blend_type.width = 8; /* 8-bit ubyte values */ 2478 blend_type.length = 16; /* 16 elements per vector */ 2479 2480 /* 2481 * Generate the function prototype. Any change here must be reflected in 2482 * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa. 2483 */ 2484 2485 fs_elem_type = lp_build_elem_type(gallivm, fs_type); 2486 2487 blend_vec_type = lp_build_vec_type(gallivm, blend_type); 2488 2489 util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", 2490 shader->no, variant->no, partial_mask ? 
"partial" : "whole"); 2491 2492 arg_types[0] = variant->jit_context_ptr_type; /* context */ 2493 arg_types[1] = int32_type; /* x */ 2494 arg_types[2] = int32_type; /* y */ 2495 arg_types[3] = int32_type; /* facing */ 2496 arg_types[4] = LLVMPointerType(fs_elem_type, 0); /* a0 */ 2497 arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* dadx */ 2498 arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */ 2499 arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0); /* color */ 2500 arg_types[8] = LLVMPointerType(int8_type, 0); /* depth */ 2501 arg_types[9] = int32_type; /* mask_input */ 2502 arg_types[10] = variant->jit_thread_data_ptr_type; /* per thread data */ 2503 arg_types[11] = LLVMPointerType(int32_type, 0); /* stride */ 2504 arg_types[12] = int32_type; /* depth_stride */ 2505 2506 func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), 2507 arg_types, ARRAY_SIZE(arg_types), 0); 2508 2509 function = LLVMAddFunction(gallivm->module, func_name, func_type); 2510 LLVMSetFunctionCallConv(function, LLVMCCallConv); 2511 2512 variant->function[partial_mask] = function; 2513 2514 /* XXX: need to propagate noalias down into color param now we are 2515 * passing a pointer-to-pointer? 
2516 */ 2517 for(i = 0; i < ARRAY_SIZE(arg_types); ++i) 2518 if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) 2519 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); 2520 2521 context_ptr = LLVMGetParam(function, 0); 2522 x = LLVMGetParam(function, 1); 2523 y = LLVMGetParam(function, 2); 2524 facing = LLVMGetParam(function, 3); 2525 a0_ptr = LLVMGetParam(function, 4); 2526 dadx_ptr = LLVMGetParam(function, 5); 2527 dady_ptr = LLVMGetParam(function, 6); 2528 color_ptr_ptr = LLVMGetParam(function, 7); 2529 depth_ptr = LLVMGetParam(function, 8); 2530 mask_input = LLVMGetParam(function, 9); 2531 thread_data_ptr = LLVMGetParam(function, 10); 2532 stride_ptr = LLVMGetParam(function, 11); 2533 depth_stride = LLVMGetParam(function, 12); 2534 2535 lp_build_name(context_ptr, "context"); 2536 lp_build_name(x, "x"); 2537 lp_build_name(y, "y"); 2538 lp_build_name(a0_ptr, "a0"); 2539 lp_build_name(dadx_ptr, "dadx"); 2540 lp_build_name(dady_ptr, "dady"); 2541 lp_build_name(color_ptr_ptr, "color_ptr_ptr"); 2542 lp_build_name(depth_ptr, "depth"); 2543 lp_build_name(mask_input, "mask_input"); 2544 lp_build_name(thread_data_ptr, "thread_data"); 2545 lp_build_name(stride_ptr, "stride_ptr"); 2546 lp_build_name(depth_stride, "depth_stride"); 2547 2548 /* 2549 * Function body 2550 */ 2551 2552 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry"); 2553 builder = gallivm->builder; 2554 assert(builder); 2555 LLVMPositionBuilderAtEnd(builder, block); 2556 2557 /* code generated texture sampling */ 2558 sampler = lp_llvm_sampler_soa_create(key->state); 2559 2560 num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ 2561 /* for 1d resources only run "upper half" of stamp */ 2562 if (key->resource_1d) 2563 num_fs /= 2; 2564 2565 { 2566 LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs); 2567 LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type); 2568 LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type, 2569 
num_loop, "mask_store"); 2570 LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS]; 2571 boolean pixel_center_integer = 2572 shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER]; 2573 2574 /* 2575 * The shader input interpolation info is not explicitely baked in the 2576 * shader key, but everything it derives from (TGSI, and flatshade) is 2577 * already included in the shader key. 2578 */ 2579 lp_build_interp_soa_init(&interp, 2580 gallivm, 2581 shader->info.base.num_inputs, 2582 inputs, 2583 pixel_center_integer, 2584 key->depth_clamp, 2585 builder, fs_type, 2586 a0_ptr, dadx_ptr, dady_ptr, 2587 x, y); 2588 2589 for (i = 0; i < num_fs; i++) { 2590 LLVMValueRef mask; 2591 LLVMValueRef indexi = lp_build_const_int32(gallivm, i); 2592 LLVMValueRef mask_ptr = LLVMBuildGEP(builder, mask_store, 2593 &indexi, 1, "mask_ptr"); 2594 2595 if (partial_mask) { 2596 mask = generate_quad_mask(gallivm, fs_type, 2597 i*fs_type.length/4, mask_input); 2598 } 2599 else { 2600 mask = lp_build_const_int_vec(gallivm, fs_type, ~0); 2601 } 2602 LLVMBuildStore(builder, mask, mask_ptr); 2603 } 2604 2605 generate_fs_loop(gallivm, 2606 shader, key, 2607 builder, 2608 fs_type, 2609 context_ptr, 2610 num_loop, 2611 &interp, 2612 sampler, 2613 mask_store, /* output */ 2614 color_store, 2615 depth_ptr, 2616 depth_stride, 2617 facing, 2618 thread_data_ptr); 2619 2620 for (i = 0; i < num_fs; i++) { 2621 LLVMValueRef indexi = lp_build_const_int32(gallivm, i); 2622 LLVMValueRef ptr = LLVMBuildGEP(builder, mask_store, 2623 &indexi, 1, ""); 2624 fs_mask[i] = LLVMBuildLoad(builder, ptr, "mask"); 2625 /* This is fucked up need to reorganize things */ 2626 for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { 2627 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { 2628 ptr = LLVMBuildGEP(builder, 2629 color_store[cbuf * !cbuf0_write_all][chan], 2630 &indexi, 1, ""); 2631 fs_out_color[cbuf][chan][i] = ptr; 2632 } 2633 } 2634 if (dual_source_blend) { 2635 /* only support one dual source 
blend target hence always use output 1 */ 2636 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { 2637 ptr = LLVMBuildGEP(builder, 2638 color_store[1][chan], 2639 &indexi, 1, ""); 2640 fs_out_color[1][chan][i] = ptr; 2641 } 2642 } 2643 } 2644 } 2645 2646 sampler->destroy(sampler); 2647 2648 /* Loop over color outputs / color buffers to do blending. 2649 */ 2650 for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { 2651 if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE) { 2652 LLVMValueRef color_ptr; 2653 LLVMValueRef stride; 2654 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf); 2655 2656 boolean do_branch = ((key->depth.enabled 2657 || key->stencil[0].enabled 2658 || key->alpha.enabled) 2659 && !shader->info.base.uses_kill); 2660 2661 color_ptr = LLVMBuildLoad(builder, 2662 LLVMBuildGEP(builder, color_ptr_ptr, 2663 &index, 1, ""), 2664 ""); 2665 2666 lp_build_name(color_ptr, "color_ptr%d", cbuf); 2667 2668 stride = LLVMBuildLoad(builder, 2669 LLVMBuildGEP(builder, stride_ptr, &index, 1, ""), 2670 ""); 2671 2672 generate_unswizzled_blend(gallivm, cbuf, variant, 2673 key->cbuf_format[cbuf], 2674 num_fs, fs_type, fs_mask, fs_out_color, 2675 context_ptr, color_ptr, stride, 2676 partial_mask, do_branch); 2677 } 2678 } 2679 2680 LLVMBuildRetVoid(builder); 2681 2682 gallivm_verify_function(gallivm, function); 2683 } 2684 2685 2686 static void 2687 dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) 2688 { 2689 unsigned i; 2690 2691 debug_printf("fs variant %p:\n", (void *) key); 2692 2693 if (key->flatshade) { 2694 debug_printf("flatshade = 1\n"); 2695 } 2696 for (i = 0; i < key->nr_cbufs; ++i) { 2697 debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i])); 2698 } 2699 if (key->depth.enabled || key->stencil[0].enabled) { 2700 debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format)); 2701 } 2702 if (key->depth.enabled) { 2703 debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE)); 2704 
debug_printf("depth.writemask = %u\n", key->depth.writemask); 2705 } 2706 2707 for (i = 0; i < 2; ++i) { 2708 if (key->stencil[i].enabled) { 2709 debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE)); 2710 debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE)); 2711 debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE)); 2712 debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE)); 2713 debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask); 2714 debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask); 2715 } 2716 } 2717 2718 if (key->alpha.enabled) { 2719 debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE)); 2720 } 2721 2722 if (key->occlusion_count) { 2723 debug_printf("occlusion_count = 1\n"); 2724 } 2725 2726 if (key->blend.logicop_enable) { 2727 debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE)); 2728 } 2729 else if (key->blend.rt[0].blend_enable) { 2730 debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, TRUE)); 2731 debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE)); 2732 debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE)); 2733 debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, TRUE)); 2734 debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE)); 2735 debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE)); 2736 } 2737 debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask); 2738 if (key->blend.alpha_to_coverage) { 2739 debug_printf("blend.alpha_to_coverage is enabled\n"); 2740 } 2741 for (i = 0; i 
< key->nr_samplers; ++i) { 2742 const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state; 2743 debug_printf("sampler[%u] = \n", i); 2744 debug_printf(" .wrap = %s %s %s\n", 2745 util_str_tex_wrap(sampler->wrap_s, TRUE), 2746 util_str_tex_wrap(sampler->wrap_t, TRUE), 2747 util_str_tex_wrap(sampler->wrap_r, TRUE)); 2748 debug_printf(" .min_img_filter = %s\n", 2749 util_str_tex_filter(sampler->min_img_filter, TRUE)); 2750 debug_printf(" .min_mip_filter = %s\n", 2751 util_str_tex_mipfilter(sampler->min_mip_filter, TRUE)); 2752 debug_printf(" .mag_img_filter = %s\n", 2753 util_str_tex_filter(sampler->mag_img_filter, TRUE)); 2754 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) 2755 debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE)); 2756 debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords); 2757 debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal); 2758 debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero); 2759 debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod); 2760 debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod); 2761 } 2762 for (i = 0; i < key->nr_sampler_views; ++i) { 2763 const struct lp_static_texture_state *texture = &key->state[i].texture_state; 2764 debug_printf("texture[%u] = \n", i); 2765 debug_printf(" .format = %s\n", 2766 util_format_name(texture->format)); 2767 debug_printf(" .target = %s\n", 2768 util_str_tex_target(texture->target, TRUE)); 2769 debug_printf(" .level_zero_only = %u\n", 2770 texture->level_zero_only); 2771 debug_printf(" .pot = %u %u %u\n", 2772 texture->pot_width, 2773 texture->pot_height, 2774 texture->pot_depth); 2775 } 2776 } 2777 2778 2779 void 2780 lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) 2781 { 2782 debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", 2783 variant->shader->no, variant->no); 2784 tgsi_dump(variant->shader->base.tokens, 0); 2785 
dump_fs_variant_key(&variant->key); 2786 debug_printf("variant->opaque = %u\n", variant->opaque); 2787 debug_printf("\n"); 2788 } 2789 2790 2791 /** 2792 * Generate a new fragment shader variant from the shader code and 2793 * other state indicated by the key. 2794 */ 2795 static struct lp_fragment_shader_variant * 2796 generate_variant(struct llvmpipe_context *lp, 2797 struct lp_fragment_shader *shader, 2798 const struct lp_fragment_shader_variant_key *key) 2799 { 2800 struct lp_fragment_shader_variant *variant; 2801 const struct util_format_description *cbuf0_format_desc = NULL; 2802 boolean fullcolormask; 2803 char module_name[64]; 2804 2805 variant = CALLOC_STRUCT(lp_fragment_shader_variant); 2806 if (!variant) 2807 return NULL; 2808 2809 util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u", 2810 shader->no, shader->variants_created); 2811 2812 variant->gallivm = gallivm_create(module_name, lp->context); 2813 if (!variant->gallivm) { 2814 FREE(variant); 2815 return NULL; 2816 } 2817 2818 variant->shader = shader; 2819 variant->list_item_global.base = variant; 2820 variant->list_item_local.base = variant; 2821 variant->no = shader->variants_created++; 2822 2823 memcpy(&variant->key, key, shader->variant_key_size); 2824 2825 /* 2826 * Determine whether we are touching all channels in the color buffer. 2827 */ 2828 fullcolormask = FALSE; 2829 if (key->nr_cbufs == 1) { 2830 cbuf0_format_desc = util_format_description(key->cbuf_format[0]); 2831 fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask); 2832 } 2833 2834 variant->opaque = 2835 !key->blend.logicop_enable && 2836 !key->blend.rt[0].blend_enable && 2837 fullcolormask && 2838 !key->stencil[0].enabled && 2839 !key->alpha.enabled && 2840 !key->blend.alpha_to_coverage && 2841 !key->depth.enabled && 2842 !shader->info.base.uses_kill && 2843 !shader->info.base.writes_samplemask 2844 ? 
TRUE : FALSE; 2845 2846 if ((shader->info.base.num_tokens <= 1) && 2847 !key->depth.enabled && !key->stencil[0].enabled) { 2848 variant->ps_inv_multiplier = 0; 2849 } else { 2850 variant->ps_inv_multiplier = 1; 2851 } 2852 2853 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { 2854 lp_debug_fs_variant(variant); 2855 } 2856 2857 lp_jit_init_types(variant); 2858 2859 if (variant->jit_function[RAST_EDGE_TEST] == NULL) 2860 generate_fragment(lp, shader, variant, RAST_EDGE_TEST); 2861 2862 if (variant->jit_function[RAST_WHOLE] == NULL) { 2863 if (variant->opaque) { 2864 /* Specialized shader, which doesn't need to read the color buffer. */ 2865 generate_fragment(lp, shader, variant, RAST_WHOLE); 2866 } 2867 } 2868 2869 /* 2870 * Compile everything 2871 */ 2872 2873 gallivm_compile_module(variant->gallivm); 2874 2875 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module); 2876 2877 if (variant->function[RAST_EDGE_TEST]) { 2878 variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func) 2879 gallivm_jit_function(variant->gallivm, 2880 variant->function[RAST_EDGE_TEST]); 2881 } 2882 2883 if (variant->function[RAST_WHOLE]) { 2884 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func) 2885 gallivm_jit_function(variant->gallivm, 2886 variant->function[RAST_WHOLE]); 2887 } else if (!variant->jit_function[RAST_WHOLE]) { 2888 variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; 2889 } 2890 2891 gallivm_free_ir(variant->gallivm); 2892 2893 return variant; 2894 } 2895 2896 2897 static void * 2898 llvmpipe_create_fs_state(struct pipe_context *pipe, 2899 const struct pipe_shader_state *templ) 2900 { 2901 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); 2902 struct lp_fragment_shader *shader; 2903 int nr_samplers; 2904 int nr_sampler_views; 2905 int i; 2906 2907 shader = CALLOC_STRUCT(lp_fragment_shader); 2908 if (!shader) 2909 return NULL; 2910 2911 shader->no = fs_no++; 2912 make_empty_list(&shader->variants); 2913 
2914 /* get/save the summary info for this shader */ 2915 lp_build_tgsi_info(templ->tokens, &shader->info); 2916 2917 /* we need to keep a local copy of the tokens */ 2918 shader->base.tokens = tgsi_dup_tokens(templ->tokens); 2919 2920 shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ); 2921 if (shader->draw_data == NULL) { 2922 FREE((void *) shader->base.tokens); 2923 FREE(shader); 2924 return NULL; 2925 } 2926 2927 nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; 2928 nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; 2929 2930 shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, 2931 state[MAX2(nr_samplers, nr_sampler_views)]); 2932 2933 for (i = 0; i < shader->info.base.num_inputs; i++) { 2934 shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i]; 2935 shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i]; 2936 2937 switch (shader->info.base.input_interpolate[i]) { 2938 case TGSI_INTERPOLATE_CONSTANT: 2939 shader->inputs[i].interp = LP_INTERP_CONSTANT; 2940 break; 2941 case TGSI_INTERPOLATE_LINEAR: 2942 shader->inputs[i].interp = LP_INTERP_LINEAR; 2943 break; 2944 case TGSI_INTERPOLATE_PERSPECTIVE: 2945 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; 2946 break; 2947 case TGSI_INTERPOLATE_COLOR: 2948 shader->inputs[i].interp = LP_INTERP_COLOR; 2949 break; 2950 default: 2951 assert(0); 2952 break; 2953 } 2954 2955 switch (shader->info.base.input_semantic_name[i]) { 2956 case TGSI_SEMANTIC_FACE: 2957 shader->inputs[i].interp = LP_INTERP_FACING; 2958 break; 2959 case TGSI_SEMANTIC_POSITION: 2960 /* Position was already emitted above 2961 */ 2962 shader->inputs[i].interp = LP_INTERP_POSITION; 2963 shader->inputs[i].src_index = 0; 2964 continue; 2965 } 2966 2967 /* XXX this is a completely pointless index map... 
*/ 2968 shader->inputs[i].src_index = i+1; 2969 } 2970 2971 if (LP_DEBUG & DEBUG_TGSI) { 2972 unsigned attrib; 2973 debug_printf("llvmpipe: Create fragment shader #%u %p:\n", 2974 shader->no, (void *) shader); 2975 tgsi_dump(templ->tokens, 0); 2976 debug_printf("usage masks:\n"); 2977 for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) { 2978 unsigned usage_mask = shader->info.base.input_usage_mask[attrib]; 2979 debug_printf(" IN[%u].%s%s%s%s\n", 2980 attrib, 2981 usage_mask & TGSI_WRITEMASK_X ? "x" : "", 2982 usage_mask & TGSI_WRITEMASK_Y ? "y" : "", 2983 usage_mask & TGSI_WRITEMASK_Z ? "z" : "", 2984 usage_mask & TGSI_WRITEMASK_W ? "w" : ""); 2985 } 2986 debug_printf("\n"); 2987 } 2988 2989 return shader; 2990 } 2991 2992 2993 static void 2994 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs) 2995 { 2996 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); 2997 2998 if (llvmpipe->fs == fs) 2999 return; 3000 3001 llvmpipe->fs = (struct lp_fragment_shader *) fs; 3002 3003 draw_bind_fragment_shader(llvmpipe->draw, 3004 (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL)); 3005 3006 llvmpipe->dirty |= LP_NEW_FS; 3007 } 3008 3009 3010 /** 3011 * Remove shader variant from two lists: the shader's variant list 3012 * and the context's variant list. 
3013 */ 3014 void 3015 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp, 3016 struct lp_fragment_shader_variant *variant) 3017 { 3018 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { 3019 debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u " 3020 "v total cached %u inst %u total inst %u\n", 3021 variant->shader->no, variant->no, 3022 variant->shader->variants_created, 3023 variant->shader->variants_cached, 3024 lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs); 3025 } 3026 3027 gallivm_destroy(variant->gallivm); 3028 3029 /* remove from shader's list */ 3030 remove_from_list(&variant->list_item_local); 3031 variant->shader->variants_cached--; 3032 3033 /* remove from context's list */ 3034 remove_from_list(&variant->list_item_global); 3035 lp->nr_fs_variants--; 3036 lp->nr_fs_instrs -= variant->nr_instrs; 3037 3038 FREE(variant); 3039 } 3040 3041 3042 static void 3043 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs) 3044 { 3045 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); 3046 struct lp_fragment_shader *shader = fs; 3047 struct lp_fs_variant_list_item *li; 3048 3049 assert(fs != llvmpipe->fs); 3050 3051 /* 3052 * XXX: we need to flush the context until we have some sort of reference 3053 * counting in fragment shaders as they may still be binned 3054 * Flushing alone might not sufficient we need to wait on it too. 
3055 */ 3056 llvmpipe_finish(pipe, __FUNCTION__); 3057 3058 /* Delete all the variants */ 3059 li = first_elem(&shader->variants); 3060 while(!at_end(&shader->variants, li)) { 3061 struct lp_fs_variant_list_item *next = next_elem(li); 3062 llvmpipe_remove_shader_variant(llvmpipe, li->base); 3063 li = next; 3064 } 3065 3066 /* Delete draw module's data */ 3067 draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data); 3068 3069 assert(shader->variants_cached == 0); 3070 FREE((void *) shader->base.tokens); 3071 FREE(shader); 3072 } 3073 3074 3075 3076 static void 3077 llvmpipe_set_constant_buffer(struct pipe_context *pipe, 3078 enum pipe_shader_type shader, uint index, 3079 const struct pipe_constant_buffer *cb) 3080 { 3081 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); 3082 struct pipe_resource *constants = cb ? cb->buffer : NULL; 3083 3084 assert(shader < PIPE_SHADER_TYPES); 3085 assert(index < ARRAY_SIZE(llvmpipe->constants[shader])); 3086 3087 /* note: reference counting */ 3088 util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb); 3089 3090 if (constants) { 3091 if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) { 3092 debug_printf("Illegal set constant without bind flag\n"); 3093 constants->bind |= PIPE_BIND_CONSTANT_BUFFER; 3094 } 3095 } 3096 3097 if (shader == PIPE_SHADER_VERTEX || 3098 shader == PIPE_SHADER_GEOMETRY) { 3099 /* Pass the constants to the 'draw' module */ 3100 const unsigned size = cb ? 
                           cb->buffer_size : 0;
      const ubyte *data;

      if (constants) {
         /* resource-backed constants: map the resource's data */
         data = (ubyte *) llvmpipe_resource_data(constants);
      }
      else if (cb && cb->user_buffer) {
         /* user-memory constants */
         data = (ubyte *) cb->user_buffer;
      }
      else {
         data = NULL;
      }

      if (data)
         data += cb->buffer_offset;

      draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
                                      index, data, size);
   }
   else {
      llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
   }

   if (cb && cb->user_buffer) {
      /* drop the local reference taken for user-buffer uploads */
      pipe_resource_reference(&constants, NULL);
   }
}


/**
 * Return the blend factor equivalent to a destination alpha of one.
 *
 * \param factor        the original blend factor
 * \param clamped_zero  TRUE if the format clamps negative values to zero
 *                      (i.e. neither float nor snorm)
 */
static inline unsigned
force_dst_alpha_one(unsigned factor, boolean clamped_zero)
{
   switch(factor) {
   case PIPE_BLENDFACTOR_DST_ALPHA:
      return PIPE_BLENDFACTOR_ONE;
   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
      return PIPE_BLENDFACTOR_ZERO;
   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      /* with dst alpha forced to one this factor collapses to zero, but
       * only for formats that clamp negative results to zero */
      if (clamped_zero)
         return PIPE_BLENDFACTOR_ZERO;
      else
         return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
   }

   return factor;
}


/**
 * We need to generate several variants of the fragment pipeline to match
 * all the combinations of the contributing state atoms.
 *
 * TODO: there is actually no reason to tie this to context state -- the
 * generated code could be cached globally in the screen.
 */
static void
make_variant_key(struct llvmpipe_context *lp,
                 struct lp_fragment_shader *shader,
                 struct lp_fragment_shader_variant_key *key)
{
   unsigned i;

   memset(key, 0, shader->variant_key_size);

   if (lp->framebuffer.zsbuf) {
      enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
      const struct util_format_description *zsbuf_desc =
         util_format_description(zsbuf_format);

      /* only record depth/stencil state the bound zsbuf can actually hold */
      if (lp->depth_stencil->depth.enabled &&
          util_format_has_depth(zsbuf_desc)) {
         key->zsbuf_format = zsbuf_format;
         memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
      }
      if (lp->depth_stencil->stencil[0].enabled &&
          util_format_has_stencil(zsbuf_desc)) {
         key->zsbuf_format = zsbuf_format;
         memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil);
      }
      if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
         key->resource_1d = TRUE;
      }
   }

   /*
    * Propagate the depth clamp setting from the rasterizer state.
    * depth_clip == 0 implies depth clamping is enabled.
    *
    * When clip_halfz is enabled, then always clamp the depth values.
    *
    * XXX: This is incorrect for GL, but correct for d3d10 (depth
    * clamp is always active in d3d10, regardless if depth clip is
    * enabled or not).
    * (GL has an always-on [0,1] clamp on fs depth output instead
    * to ensure the depth values stay in range. Doesn't look like
    * we do that, though...)
    */
   if (lp->rasterizer->clip_halfz) {
      key->depth_clamp = 1;
   } else {
      key->depth_clamp = (lp->rasterizer->depth_clip == 0) ?
                         1 : 0;
   }

   /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
   if (!lp->framebuffer.nr_cbufs ||
       !lp->framebuffer.cbufs[0] ||
       !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
      key->alpha.enabled = lp->depth_stencil->alpha.enabled;
   }
   if(key->alpha.enabled)
      key->alpha.func = lp->depth_stencil->alpha.func;
   /* alpha.ref_value is passed in jit_context */

   key->flatshade = lp->rasterizer->flatshade;
   if (lp->active_occlusion_queries) {
      key->occlusion_count = TRUE;
   }

   if (lp->framebuffer.nr_cbufs) {
      memcpy(&key->blend, lp->blend, sizeof key->blend);
   }

   key->nr_cbufs = lp->framebuffer.nr_cbufs;

   if (!key->blend.independent_blend_enable) {
      /* we always need independent blend otherwise the fixups below won't work */
      for (i = 1; i < key->nr_cbufs; i++) {
         memcpy(&key->blend.rt[i], &key->blend.rt[0], sizeof(key->blend.rt[0]));
      }
      key->blend.independent_blend_enable = 1;
   }

   /* per-colorbuffer fixups of the (now independent) blend state */
   for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
      struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];

      if (lp->framebuffer.cbufs[i]) {
         enum pipe_format format = lp->framebuffer.cbufs[i]->format;
         const struct util_format_description *format_desc;

         key->cbuf_format[i] = format;

         /*
          * Figure out if this is a 1d resource. Note that OpenGL allows crazy
          * mixing of 2d textures with height 1 and 1d textures, so make sure
          * we pick 1d if any cbuf or zsbuf is 1d.
          */
         if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
            key->resource_1d = TRUE;
         }

         format_desc = util_format_description(format);
         assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
                format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);

         /*
          * Mask out color channels not present in the color buffer.
          */
         blend_rt->colormask &= util_format_colormask(format_desc);

         /*
          * Disable blend for integer formats.
          */
         if (util_format_is_pure_integer(format)) {
            blend_rt->blend_enable = 0;
         }

         /*
          * Our swizzled render tiles always have an alpha channel, but the
          * linear render target format often does not, so force here the dst
          * alpha to be one.
          *
          * This is not a mere optimization. Wrong results will be produced if
          * the dst alpha is used, the dst format does not have alpha, and the
          * previous rendering was not flushed from the swizzled to linear
          * buffer. For example, NonPowTwo DCT.
          *
          * TODO: This should be generalized to all channels for better
          * performance, but only alpha causes correctness issues.
          *
          * Also, force rgb/alpha func/factors match, to make AoS blending
          * easier.
          */
         if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
             format_desc->swizzle[3] == format_desc->swizzle[0]) {
            /* Doesn't cover mixed snorm/unorm but can't render to them anyway */
            boolean clamped_zero = !util_format_is_float(format) &&
                                   !util_format_is_snorm(format);
            blend_rt->rgb_src_factor =
               force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
            blend_rt->rgb_dst_factor =
               force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
            blend_rt->alpha_func = blend_rt->rgb_func;
            blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
            blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
         }
      }
      else {
         /* no color buffer for this fragment output */
         key->cbuf_format[i] = PIPE_FORMAT_NONE;
         blend_rt->colormask = 0x0;
         blend_rt->blend_enable = 0;
      }
   }

   /* This value will be the same for all the variants of a given shader:
    */
   key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;

   for(i = 0; i < key->nr_samplers; ++i) {
      if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
         lp_sampler_static_sampler_state(&key->state[i].sampler_state,
                                         lp->samplers[PIPE_SHADER_FRAGMENT][i]);
      }
   }

   /*
    * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
    * are dx10-style? Can't really have mixed opcodes, at least not
    * if we want to skip the holes here (without rescanning tgsi).
    */
   if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
      key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
      for(i = 0; i < key->nr_sampler_views; ++i) {
         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
            lp_sampler_static_texture_state(&key->state[i].texture_state,
                                            lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   }
   else {
      /* old-style (non-dx10) opcodes: one view per sampler */
      key->nr_sampler_views = key->nr_samplers;
      for(i = 0; i < key->nr_sampler_views; ++i) {
         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
            lp_sampler_static_texture_state(&key->state[i].texture_state,
                                            lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   }
}



/**
 * Update fragment shader state. This is called just prior to drawing
 * something when some fragment-related state has changed.
 */
void
llvmpipe_update_fs(struct llvmpipe_context *lp)
{
   struct lp_fragment_shader *shader = lp->fs;
   struct lp_fragment_shader_variant_key key;
   struct lp_fragment_shader_variant *variant = NULL;
   struct lp_fs_variant_list_item *li;

   make_variant_key(lp, shader, &key);

   /* Search the variants for one which matches the key */
   li = first_elem(&shader->variants);
   while(!at_end(&shader->variants, li)) {
      if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
         variant = li->base;
         break;
      }
      li = next_elem(li);
   }

   if (variant) {
      /* Move this variant to the head of the list to implement LRU
       * deletion of shaders when we have too many.
       */
      move_to_head(&lp->fs_variants_list, &variant->list_item_global);
   }
   else {
      /* variant not found, create it now */
      int64_t t0, t1, dt;
      unsigned i;
      unsigned variants_to_cull;

      if (LP_DEBUG & DEBUG_FS) {
         debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                      lp->nr_fs_variants,
                      lp->nr_fs_instrs,
                      lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
      }

      /* First, check if we've exceeded the max number of shader variants.
       * If so, free 6.25% of them (the least recently used ones).
       */
      variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ?
                         LP_MAX_SHADER_VARIANTS / 16 : 0;

      if (variants_to_cull ||
          lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
         struct pipe_context *pipe = &lp->pipe;

         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
                         "\t%u instrs,\t%u instrs/variant\n",
                         shader->variants_cached,
                         lp->nr_fs_variants, lp->nr_fs_instrs,
                         lp->nr_fs_instrs / lp->nr_fs_variants);
         }

         /*
          * XXX: we need to flush the context until we have some sort of
          * reference counting in fragment shaders as they may still be binned
          * Flushing alone might not be sufficient we need to wait on it too.
          */
         llvmpipe_finish(pipe, __FUNCTION__);

         /*
          * We need to re-check lp->nr_fs_variants because an arbitrarily large
          * number of shader variants (potentially all of them) could be
          * pending for destruction on flush.
          */

         for (i = 0; i < variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) {
            struct lp_fs_variant_list_item *item;
            if (is_empty_list(&lp->fs_variants_list)) {
               break;
            }
            /* evict the least-recently-used variant (tail of the list) */
            item = last_elem(&lp->fs_variants_list);
            assert(item);
            assert(item->base);
            llvmpipe_remove_shader_variant(lp, item->base);
         }
      }

      /*
       * Generate the new variant.
       */
      t0 = os_time_get();
      variant = generate_variant(lp, shader, &key);
      t1 = os_time_get();
      dt = t1 - t0;
      LP_COUNT_ADD(llvm_compile_time, dt);
      LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */

      /* Put the new variant into the list */
      if (variant) {
         insert_at_head(&shader->variants, &variant->list_item_local);
         insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
         lp->nr_fs_variants++;
         lp->nr_fs_instrs += variant->nr_instrs;
         shader->variants_cached++;
      }
   }

   /* Bind this variant */
   lp_setup_set_fs_variant(lp->setup, variant);
}




/**
 * Plug llvmpipe's fragment-shader-related functions into the
 * pipe_context function table.
 */
void
llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
{
   llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
   llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state;
   llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;

   llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
}

/*
 * Rasterization is disabled if there is no pixel shader and
 * both depth and stencil testing are disabled:
 * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
 */
boolean
llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
{
   /* a shader with <= 1 token is effectively empty */
   boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;

   return (null_fs &&
           !lp->depth_stencil->depth.enabled &&
           !lp->depth_stencil->stencil[0].enabled);
}