1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * Copyright 2007 VMware, Inc. 5 * All Rights Reserved. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the 9 * "Software"), to deal in the Software without restriction, including 10 * without limitation the rights to use, copy, modify, merge, publish, 11 * distribute, sub license, and/or sell copies of the Software, and to 12 * permit persons to whom the Software is furnished to do so, subject to 13 * the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the 16 * next paragraph) shall be included in all copies or substantial portions 17 * of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 **************************************************************************/ 28 29 /** 30 * @file 31 * Code generate the whole fragment pipeline. 32 * 33 * The fragment pipeline consists of the following stages: 34 * - early depth test 35 * - fragment shader 36 * - alpha test 37 * - depth/stencil test 38 * - blending 39 * 40 * This file has only the glue to assemble the fragment pipeline. The actual 41 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the 42 * lp_bld_*.[ch] files, and in a complete generic and reusable way. 
Here we 43 * muster the LLVM JIT execution engine to create a function that follows an 44 * established binary interface and that can be called from C directly. 45 * 46 * A big source of complexity here is that we often want to run different 47 * stages with different precisions and data types and precisions. For example, 48 * the fragment shader needs typically to be done in floats, but the 49 * depth/stencil test and blending is better done in the type that most closely 50 * matches the depth/stencil and color buffer respectively. 51 * 52 * Since the width of a SIMD vector register stays the same regardless of the 53 * element type, different types imply different number of elements, so we must 54 * code generate more instances of the stages with larger types to be able to 55 * feed/consume the stages with smaller types. 56 * 57 * @author Jose Fonseca <jfonseca (at) vmware.com> 58 */ 59 60 #include <limits.h> 61 #include "pipe/p_defines.h" 62 #include "util/u_inlines.h" 63 #include "util/u_memory.h" 64 #include "util/u_pointer.h" 65 #include "util/u_format.h" 66 #include "util/u_dump.h" 67 #include "util/u_string.h" 68 #include "util/simple_list.h" 69 #include "util/u_dual_blend.h" 70 #include "os/os_time.h" 71 #include "pipe/p_shader_tokens.h" 72 #include "draw/draw_context.h" 73 #include "tgsi/tgsi_dump.h" 74 #include "tgsi/tgsi_scan.h" 75 #include "tgsi/tgsi_parse.h" 76 #include "gallivm/lp_bld_type.h" 77 #include "gallivm/lp_bld_const.h" 78 #include "gallivm/lp_bld_conv.h" 79 #include "gallivm/lp_bld_init.h" 80 #include "gallivm/lp_bld_intr.h" 81 #include "gallivm/lp_bld_logic.h" 82 #include "gallivm/lp_bld_tgsi.h" 83 #include "gallivm/lp_bld_swizzle.h" 84 #include "gallivm/lp_bld_flow.h" 85 #include "gallivm/lp_bld_debug.h" 86 #include "gallivm/lp_bld_arit.h" 87 #include "gallivm/lp_bld_pack.h" 88 #include "gallivm/lp_bld_format.h" 89 #include "gallivm/lp_bld_quad.h" 90 91 #include "lp_bld_alpha.h" 92 #include "lp_bld_blend.h" 93 #include "lp_bld_depth.h" 94 
#include "lp_bld_interp.h" 95 #include "lp_context.h" 96 #include "lp_debug.h" 97 #include "lp_perf.h" 98 #include "lp_setup.h" 99 #include "lp_state.h" 100 #include "lp_tex_sample.h" 101 #include "lp_flush.h" 102 #include "lp_state_fs.h" 103 #include "lp_rast.h" 104 105 106 /** Fragment shader number (for debugging) */ 107 static unsigned fs_no = 0; 108 109 110 /** 111 * Expand the relevant bits of mask_input to a n*4-dword mask for the 112 * n*four pixels in n 2x2 quads. This will set the n*four elements of the 113 * quad mask vector to 0 or ~0. 114 * Grouping is 01, 23 for 2 quad mode hence only 0 and 2 are valid 115 * quad arguments with fs length 8. 116 * 117 * \param first_quad which quad(s) of the quad group to test, in [0,3] 118 * \param mask_input bitwise mask for the whole 4x4 stamp 119 */ 120 static LLVMValueRef 121 generate_quad_mask(struct gallivm_state *gallivm, 122 struct lp_type fs_type, 123 unsigned first_quad, 124 LLVMValueRef mask_input) /* int32 */ 125 { 126 LLVMBuilderRef builder = gallivm->builder; 127 struct lp_type mask_type; 128 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); 129 LLVMValueRef bits[16]; 130 LLVMValueRef mask, bits_vec; 131 int shift, i; 132 133 /* 134 * XXX: We'll need a different path for 16 x u8 135 */ 136 assert(fs_type.width == 32); 137 assert(fs_type.length <= ARRAY_SIZE(bits)); 138 mask_type = lp_int_type(fs_type); 139 140 /* 141 * mask_input >>= (quad * 4) 142 */ 143 switch (first_quad) { 144 case 0: 145 shift = 0; 146 break; 147 case 1: 148 assert(fs_type.length == 4); 149 shift = 2; 150 break; 151 case 2: 152 shift = 8; 153 break; 154 case 3: 155 assert(fs_type.length == 4); 156 shift = 10; 157 break; 158 default: 159 assert(0); 160 shift = 0; 161 } 162 163 mask_input = LLVMBuildLShr(builder, 164 mask_input, 165 LLVMConstInt(i32t, shift, 0), 166 ""); 167 168 /* 169 * mask = { mask_input & (1 << i), for i in [0,3] } 170 */ 171 mask = lp_build_broadcast(gallivm, 172 lp_build_vec_type(gallivm, mask_type), 
173 mask_input); 174 175 for (i = 0; i < fs_type.length / 4; i++) { 176 unsigned j = 2 * (i % 2) + (i / 2) * 8; 177 bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0); 178 bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0); 179 bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0); 180 bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0); 181 } 182 bits_vec = LLVMConstVector(bits, fs_type.length); 183 mask = LLVMBuildAnd(builder, mask, bits_vec, ""); 184 185 /* 186 * mask = mask == bits ? ~0 : 0 187 */ 188 mask = lp_build_compare(gallivm, 189 mask_type, PIPE_FUNC_EQUAL, 190 mask, bits_vec); 191 192 return mask; 193 } 194 195 196 #define EARLY_DEPTH_TEST 0x1 197 #define LATE_DEPTH_TEST 0x2 198 #define EARLY_DEPTH_WRITE 0x4 199 #define LATE_DEPTH_WRITE 0x8 200 201 static int 202 find_output_by_semantic( const struct tgsi_shader_info *info, 203 unsigned semantic, 204 unsigned index ) 205 { 206 int i; 207 208 for (i = 0; i < info->num_outputs; i++) 209 if (info->output_semantic_name[i] == semantic && 210 info->output_semantic_index[i] == index) 211 return i; 212 213 return -1; 214 } 215 216 217 /** 218 * Fetch the specified lp_jit_viewport structure for a given viewport_index. 
 */
static LLVMValueRef
lp_llvm_viewport(LLVMValueRef context_ptr,
                 struct gallivm_state *gallivm,
                 LLVMValueRef viewport_index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ptr;
   LLVMValueRef res;
   /* view one lp_jit_viewport record as a small float vector so it can be
    * loaded with a single vector load */
   struct lp_type viewport_type =
      lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);

   ptr = lp_jit_context_viewports(gallivm, context_ptr);
   ptr = LLVMBuildPointerCast(builder, ptr,
            LLVMPointerType(lp_build_vec_type(gallivm, viewport_type), 0), "");

   res = lp_build_pointer_get(builder, ptr, viewport_index);

   return res;
}


/**
 * Clamp z to the [min_depth, max_depth] range of the active viewport
 * (ARB_depth_clamp semantics).  Returns the clamped z vector.
 */
static LLVMValueRef
lp_build_depth_clamp(struct gallivm_state *gallivm,
                     LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef context_ptr,
                     LLVMValueRef thread_data_ptr,
                     LLVMValueRef z)
{
   LLVMValueRef viewport, min_depth, max_depth;
   LLVMValueRef viewport_index;
   struct lp_build_context f32_bld;

   assert(type.floating);
   lp_build_context_init(&f32_bld, gallivm, type);

   /*
    * Assumes clamping of the viewport index will occur in setup/gs. Value
    * is passed through the rasterization stage via lp_rast_shader_inputs.
    *
    * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
    * semantics.
    */
   viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
                       thread_data_ptr);

   /*
    * Load the min and max depth from the lp_jit_context.viewports
    * array of lp_jit_viewport structures.
    */
   viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);

   /* viewports[viewport_index].min_depth */
   min_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
   min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);

   /* viewports[viewport_index].max_depth */
   max_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
   max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);

   /*
    * Clamp to the min and max depth values for the given viewport.
    */
   return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
}


/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 */
static void
generate_fs_loop(struct gallivm_state *gallivm,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key,
                 LLVMBuilderRef builder,
                 struct lp_type type,
                 LLVMValueRef context_ptr,
                 LLVMValueRef num_loop,
                 struct lp_build_interp_soa_context *interp,
                 struct lp_build_sampler_soa *sampler,
                 LLVMValueRef mask_store,
                 LLVMValueRef (*out_color)[4],
                 LLVMValueRef depth_ptr,
                 LLVMValueRef depth_stride,
                 LLVMValueRef facing,
                 LLVMValueRef thread_data_ptr)
{
   const struct util_format_description *zs_format_desc = NULL;
   const struct tgsi_token *tokens = shader->base.tokens;
   struct lp_type int_type = lp_int_type(type);
   LLVMTypeRef vec_type, int_vec_type;
   LLVMValueRef mask_ptr, mask_val;
   LLVMValueRef consts_ptr, num_consts_ptr;
   LLVMValueRef z;
   LLVMValueRef z_value, s_value;
   LLVMValueRef z_fb, s_fb;
   LLVMValueRef stencil_refs[2];
   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
   struct lp_build_for_loop_state loop_state;
   struct lp_build_mask_context mask;
   /*
    * TODO: figure out if simple_shader optimization is really
worthwhile to
    * keep. Disabled because it may hide some real bugs in the (depth/stencil)
    * code since tests tend to take another codepath than real shaders.
    */
   /* the trailing "&& 0" keeps the optimization disabled (see TODO above) */
   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
                            shader->info.base.num_inputs < 3 &&
                            shader->info.base.num_instructions < 8) && 0;
   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
                                     util_blend_state_is_dual(&key->blend, 0);
   unsigned attrib;
   unsigned chan;
   unsigned cbuf;
   unsigned depth_mode;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   /*
    * Decide when the depth/stencil test runs (early vs. late) and when the
    * result is written, based on what the shader writes and which tests can
    * still kill fragments after an early test.
    */
   if (key->depth.enabled ||
       key->stencil[0].enabled) {

      zs_format_desc = util_format_description(key->zsbuf_format);
      assert(zs_format_desc);

      if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
         if (key->alpha.enabled ||
             key->blend.alpha_to_coverage ||
             shader->info.base.uses_kill) {
            /* With alpha test and kill, can do the depth test early
             * and hopefully eliminate some quads.  But need to do a
             * special deferred depth write once the final mask value
             * is known. This only works though if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
         }
         else
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      }
      else {
         /* shader computes z or stencil itself -> must test after it ran */
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      /* drop the write bits entirely if neither depth nor stencil is
       * actually written back */
      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   }
   else {
      depth_mode = 0;
   }

   vec_type = lp_build_vec_type(gallivm, type);
   int_vec_type = lp_build_vec_type(gallivm, int_type);

   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
   /* convert scalar stencil refs into vectors */
   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);

   consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
   num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);

   /* iterate num_loop times; each iteration processes one fs vector */
   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

   mask_ptr = LLVMBuildGEP(builder, mask_store,
                           &loop_state.counter, 1, "mask_ptr");
   mask_val = LLVMBuildLoad(builder, mask_ptr, "");

   memset(outputs, 0, sizeof outputs);

   /* allocate per-iteration color output storage for each bound cbuf */
   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       num_loop, "color");
      }
   }
   if (dual_source_blend) {
      /* only one cbuf, but two sources to blend from */
      assert(key->nr_cbufs <= 1);
      for(chan
= 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    num_loop, "color1");
      }
   }


   /* 'mask' will control execution based on quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
      lp_build_mask_check(&mask);

   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter);
   z = interp->pos[2];

   if (depth_mode & EARLY_DEPTH_TEST) {
      /*
       * Clamp according to ARB_depth_clamp semantics.
       */
      if (key->depth_clamp) {
         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
                                  thread_data_ptr, z);
      }
      /* load current framebuffer z/stencil for the test */
      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note mask check if stencil is enabled must be after ds write not after
       * stencil test otherwise new stencil values may not get written if all
       * fragments got killed by depth/stencil test.
       */
      if (!simple_shader && key->stencil[0].enabled)
         lp_build_mask_check(&mask);
   }

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);

   /* Build the actual shader */
   lp_build_tgsi_soa(gallivm, tokens, type, &mask,
                     consts_ptr, num_consts_ptr, &system_values,
                     interp->inputs,
                     outputs, context_ptr, thread_data_ptr,
                     sampler, &shader->info.base, NULL);

   /* Alpha test */
   if (key->alpha.enabled) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         const struct util_format_description *cbuf_format_desc;
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
         LLVMValueRef alpha_ref_value;

         alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr);
         alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);

         cbuf_format_desc = util_format_description(key->cbuf_format[0]);

         lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
                             &mask, alpha, alpha_ref_value,
                             (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   /* Emulate Alpha to Coverage with Alpha test */
   if (key->blend.alpha_to_coverage) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");

         lp_build_alpha_to_coverage(gallivm, type,
                                    &mask, alpha,
                                    (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   /* Late Z test */
   if (depth_mode & LATE_DEPTH_TEST) {
      int pos0 = find_output_by_semantic(&shader->info.base,
                                         TGSI_SEMANTIC_POSITION,
                                         0);
      int s_out = find_output_by_semantic(&shader->info.base,
                                          TGSI_SEMANTIC_STENCIL,
                                          0);
      /* prefer the shader-written depth over the interpolated one */
      if (pos0 != -1 && outputs[pos0][2]) {
         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
      }

/*
       * Clamp according to ARB_depth_clamp semantics.
       */
      if (key->depth_clamp) {
         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
                                  thread_data_ptr, z);
      }

      if (s_out != -1 && outputs[s_out][1]) {
         /* there's only one value, and spec says to discard additional bits */
         LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
         stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
         stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
         stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
         stencil_refs[1] = stencil_refs[0];
      }

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);

      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);
      /* Late Z write */
      if (depth_mode & LATE_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
   }
   else if ((depth_mode & EARLY_DEPTH_TEST) &&
            (depth_mode & LATE_DEPTH_WRITE))
   {
      /* Need to apply a reduced mask to the depth write.  Reload the
       * depth value, update from zs_value with the new mask value and
       * write that out.
       */
      lp_build_depth_stencil_write_swizzled(gallivm, type,
                                            zs_format_desc, key->resource_1d,
                                            &mask, z_fb, s_fb, loop_state.counter,
                                            depth_ptr, depth_stride,
                                            z_value, s_value);
   }


   /* Color write: copy shader COLOR outputs into the out_color allocas
    * indexed by the loop counter. */
   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
   {
      unsigned cbuf = shader->info.base.output_semantic_index[attrib];
      if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
          ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
      {
         for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
            if(outputs[attrib][chan]) {
               /* XXX: just initialize outputs to point at colors[] and
                * skip this.
                */
               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
               LLVMValueRef color_ptr;
               color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan],
                                        &loop_state.counter, 1, "");
               lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
               LLVMBuildStore(builder, out, color_ptr);
            }
         }
      }
   }

   if (key->occlusion_count) {
      LLVMValueRef counter = lp_jit_thread_data_counter(gallivm, thread_data_ptr);
      lp_build_name(counter, "counter");
      lp_build_occlusion_count(gallivm, type,
                               lp_build_mask_value(&mask), counter);
   }

   /* persist the final execution mask for later pipeline stages */
   mask_val = lp_build_mask_end(&mask);
   LLVMBuildStore(builder, mask_val, mask_ptr);
   lp_build_for_loop_end(&loop_state);
}


/**
 * This function will reorder pixels from the fragment shader SoA to memory layout AoS
 *
 * Fragment Shader outputs pixels in small 2x2 blocks
 *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
 *
 * However in memory pixels are stored in rows
 *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
 *
 * @param type          fragment shader type (4x or 8x float)
 * @param num_fs        number of fs_src
 * @param is_1d         whether we're outputting to a 1d resource
 * @param dst_channels  number of output channels
 * @param fs_src        output from fragment shader
 * @param dst           pointer to store result
 * @param pad_inline    is channel padding inline or at end of row
 * @return              the number of dsts
 */
static int
generate_fs_twiddle(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned num_fs,
                    unsigned dst_channels,
                    LLVMValueRef fs_src[][4],
                    LLVMValueRef* dst,
                    bool pad_inline)
{
   LLVMValueRef src[16];

   bool swizzle_pad;
   bool twiddle;
   bool split;

   unsigned pixels = type.length / 4;
   unsigned reorder_group;
   unsigned src_channels;
   unsigned src_count;
   unsigned i;

   src_channels = dst_channels < 3 ? dst_channels : 4;
   src_count = num_fs * src_channels;

   assert(pixels == 2 || pixels == 1);
   assert(num_fs * src_channels <= ARRAY_SIZE(src));

   /*
    * Transpose from SoA -> AoS
    */
   for (i = 0; i < num_fs; ++i) {
      lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
   }

   /*
    * Pick transformation options depending on channel count and
    * pixels-per-vector; at most one of reorder_group/twiddle applies.
    */
   swizzle_pad = false;
   twiddle = false;
   split = false;
   reorder_group = 0;

   if (dst_channels == 1) {
      twiddle = true;

      if (pixels == 2) {
         split = true;
      }
   } else if (dst_channels == 2) {
      if (pixels == 1) {
         reorder_group = 1;
      }
   } else if (dst_channels > 2) {
      if (pixels == 1) {
         reorder_group = 2;
      } else {
         twiddle = true;
      }

      if (!pad_inline && dst_channels == 3 && pixels > 1) {
         swizzle_pad = true;
      }
   }

   /*
    * Split the src in half
    */
   if (split) {
      for (i = num_fs; i > 0; --i) {
         src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
         src[(i - 1)*2 + 0] =
            lp_build_extract_range(gallivm, src[i - 1], 0, 4);
      }

      src_count *= 2;
      type.length = 4;
   }

   /*
    * Ensure pixels are in memory order
    */
   if (reorder_group) {
      /* Twiddle pixels by reordering the array, e.g.:
       *
       * src_count =  8 -> 0 2 1 3 4 6 5 7
       * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
       */
      const unsigned reorder_sw[] = { 0, 2, 1, 3 };

      for (i = 0; i < src_count; ++i) {
         unsigned group = i / reorder_group;
         unsigned block = (group / 4) * 4 * reorder_group;
         unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
         dst[i] = src[j];
      }
   } else if (twiddle) {
      /* Twiddle pixels across elements of array */
      /*
       * XXX: we should avoid this in some cases, but would need to tell
       * lp_build_conv to reorder (or deal with it ourselves).
       */
      lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
   } else {
      /* Do nothing */
      memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
   }

   /*
    * Moves any padding between pixels to the end
    * e.g. RGBXRGBX -> RGBRGBXX
    */
   if (swizzle_pad) {
      unsigned char swizzles[16];
      unsigned elems = pixels * dst_channels;

      for (i = 0; i < type.length; ++i) {
         if (i < elems)
            swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
         else
            swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
      }

      for (i = 0; i < src_count; ++i) {
         dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
      }
   }

   return src_count;
}


/*
 * Untwiddle and transpose, much like the above.
 * However, this is after conversion, so we get packed vectors.
 * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
 * the vectors will look like:
 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
 * be swizzled here). Extending to 16bit should be trivial.
* Should also be extended to handle twice wide vectors with AVX2...
 */
static void
fs_twiddle_transpose(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef *src,
                     unsigned src_count,
                     LLVMValueRef *dst)
{
   unsigned i, j;
   struct lp_type type64, type16, type32;
   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[4], shuf[8];
   /* shuffle indices swapping the middle pair within each group of 4
    * (0 2 1 3 pattern), used by the 1- and 2-vector untwiddle paths */
   for (j = 0; j < 2; j++) {
      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
   }

   assert(src_count == 4 || src_count == 2 || src_count == 1);
   assert(type.width == 8);
   assert(type.length == 16);

   type8_t = lp_build_vec_type(gallivm, type);

   /* same 128 bits viewed as 2 x i64 / 8 x i16 / 4 x i32 lanes */
   type64 = type;
   type64.length /= 8;
   type64.width *= 8;
   type64_t = lp_build_vec_type(gallivm, type64);

   type16 = type;
   type16.length /= 2;
   type16.width *= 2;
   type16_t = lp_build_vec_type(gallivm, type16);

   type32 = type;
   type32.length /= 4;
   type32.width *= 4;
   type32_t = lp_build_vec_type(gallivm, type32);

   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);

   if (src_count == 1) {
      /* transpose was no-op, just untwiddle */
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 8);
      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
   } else if (src_count == 2) {
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 4);

      for (i = 0; i < 2; i++) {
         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
         dst[i] = LLVMBuildBitCast(builder,
                                   tmp[i], type8_t, "");
      }
   } else {
      for (j = 0; j < 2; j++) {
         LLVMValueRef lo, hi, lo2, hi2;
         /*
          * Note that if we only really have 3 valid channels (rgb)
          * and we don't need alpha we could substitute a undef here
          * for the respective channel (causing llvm to drop conversion
          * for alpha).
          */
         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
      }
   }
}


/**
 * Load an unswizzled block of pixels from memory
 */
static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef* dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = dst_count / block_height;
   unsigned i;

   /* Ensure block exactly fits into dst */
   assert((block_width * block_height) % dst_count == 0);

   for (i = 0; i < dst_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      /* byte offset within the row / start-of-row offset via stride */
      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef dst_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");

dst[i] = LLVMBuildLoad(builder, dst_ptr, "");

      /* communicate the true (possibly sub-natural) alignment to LLVM */
      LLVMSetAlignment(dst[i], dst_alignment);
   }
}


/**
 * Store an unswizzled block of pixels to memory
 */
static void
store_unswizzled_block(struct gallivm_state *gallivm,
                       LLVMValueRef base_ptr,
                       LLVMValueRef stride,
                       unsigned block_width,
                       unsigned block_height,
                       LLVMValueRef* src,
                       struct lp_type src_type,
                       unsigned src_count,
                       unsigned src_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = src_count / block_height;
   unsigned i;

   /* Ensure src exactly fits into block */
   assert((block_width * block_height) % src_count == 0);

   for (i = 0; i < src_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef src_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      src_ptr = LLVMBuildBitCast(builder, src_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");

      /* src_ptr now holds the store instruction itself, so its alignment
       * can be set below */
      src_ptr = LLVMBuildStore(builder, src[i], src_ptr);

      LLVMSetAlignment(src_ptr, src_alignment);
   }
}


/**
 * Checks if a format description is an arithmetic format
 *
 * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
 */
static inline boolean
is_arithmetic_format(const struct util_format_description *format_desc)
{
   boolean arith = false;
   unsigned i;

   for (i = 0; i < format_desc->nr_channels; ++i) {
      /* arithmetic if any channel differs in size from channel 0, or any
       * channel is not byte-aligned */
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
 * SoA conversion.
 */
static inline boolean
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}


/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
 */
static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type* type)
{
   unsigned i;
   unsigned chan;

   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a uint with width of block */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = format_desc->block.bits;
      type->length = 1;
      return;
   }

   /* find the first non-void channel to derive the base type from */
   for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
   chan = i;

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm = format_desc->channel[chan].normalized;

   if
(is_arithmetic_format(format_desc)) { 1020 type->width = 0; 1021 type->length = 1; 1022 1023 for (i = 0; i < format_desc->nr_channels; ++i) { 1024 type->width += format_desc->channel[i].size; 1025 } 1026 } else { 1027 type->width = format_desc->channel[chan].size; 1028 type->length = format_desc->nr_channels; 1029 } 1030 } 1031 1032 1033 /** 1034 * Retrieves the type for a format which is usable in the blending code. 1035 * 1036 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte 1037 */ 1038 static inline void 1039 lp_blend_type_from_format_desc(const struct util_format_description *format_desc, 1040 struct lp_type* type) 1041 { 1042 unsigned i; 1043 unsigned chan; 1044 1045 if (format_expands_to_float_soa(format_desc)) { 1046 /* always use ordinary floats for blending */ 1047 type->floating = true; 1048 type->fixed = false; 1049 type->sign = true; 1050 type->norm = false; 1051 type->width = 32; 1052 type->length = 4; 1053 return; 1054 } 1055 1056 for (i = 0; i < 4; i++) 1057 if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) 1058 break; 1059 chan = i; 1060 1061 memset(type, 0, sizeof(struct lp_type)); 1062 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT; 1063 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED; 1064 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED; 1065 type->norm = format_desc->channel[chan].normalized; 1066 type->width = format_desc->channel[chan].size; 1067 type->length = format_desc->nr_channels; 1068 1069 for (i = 1; i < format_desc->nr_channels; ++i) { 1070 if (format_desc->channel[i].size > type->width) 1071 type->width = format_desc->channel[i].size; 1072 } 1073 1074 if (type->floating) { 1075 type->width = 32; 1076 } else { 1077 if (type->width <= 8) { 1078 type->width = 8; 1079 } else if (type->width <= 16) { 1080 type->width = 16; 1081 } else { 1082 type->width = 32; 1083 } 1084 } 1085 1086 if (is_arithmetic_format(format_desc) && type->length == 3) { 1087 
type->length = 4; 1088 } 1089 } 1090 1091 1092 /** 1093 * Scale a normalized value from src_bits to dst_bits. 1094 * 1095 * The exact calculation is 1096 * 1097 * dst = iround(src * dst_mask / src_mask) 1098 * 1099 * or with integer rounding 1100 * 1101 * dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask) 1102 * 1103 * where 1104 * 1105 * src_mask = (1 << src_bits) - 1 1106 * dst_mask = (1 << dst_bits) - 1 1107 * 1108 * but we try to avoid division and multiplication through shifts. 1109 */ 1110 static inline LLVMValueRef 1111 scale_bits(struct gallivm_state *gallivm, 1112 int src_bits, 1113 int dst_bits, 1114 LLVMValueRef src, 1115 struct lp_type src_type) 1116 { 1117 LLVMBuilderRef builder = gallivm->builder; 1118 LLVMValueRef result = src; 1119 1120 if (dst_bits < src_bits) { 1121 int delta_bits = src_bits - dst_bits; 1122 1123 if (delta_bits <= dst_bits) { 1124 /* 1125 * Approximate the rescaling with a single shift. 1126 * 1127 * This gives the wrong rounding. 1128 */ 1129 1130 result = LLVMBuildLShr(builder, 1131 src, 1132 lp_build_const_int_vec(gallivm, src_type, delta_bits), 1133 ""); 1134 1135 } else { 1136 /* 1137 * Try more accurate rescaling. 1138 */ 1139 1140 /* 1141 * Drop the least significant bits to make space for the multiplication. 1142 * 1143 * XXX: A better approach would be to use a wider integer type as intermediate. But 1144 * this is enough to convert alpha from 16bits -> 2 when rendering to 1145 * PIPE_FORMAT_R10G10B10A2_UNORM. 1146 */ 1147 result = LLVMBuildLShr(builder, 1148 src, 1149 lp_build_const_int_vec(gallivm, src_type, dst_bits), 1150 ""); 1151 1152 1153 result = LLVMBuildMul(builder, 1154 result, 1155 lp_build_const_int_vec(gallivm, src_type, (1LL << dst_bits) - 1), 1156 ""); 1157 1158 /* 1159 * Add a rounding term before the division. 1160 * 1161 * TODO: Handle signed integers too. 
1162 */ 1163 if (!src_type.sign) { 1164 result = LLVMBuildAdd(builder, 1165 result, 1166 lp_build_const_int_vec(gallivm, src_type, (1LL << (delta_bits - 1))), 1167 ""); 1168 } 1169 1170 /* 1171 * Approximate the division by src_mask with a src_bits shift. 1172 * 1173 * Given the src has already been shifted by dst_bits, all we need 1174 * to do is to shift by the difference. 1175 */ 1176 1177 result = LLVMBuildLShr(builder, 1178 result, 1179 lp_build_const_int_vec(gallivm, src_type, delta_bits), 1180 ""); 1181 } 1182 1183 } else if (dst_bits > src_bits) { 1184 /* Scale up bits */ 1185 int db = dst_bits - src_bits; 1186 1187 /* Shift left by difference in bits */ 1188 result = LLVMBuildShl(builder, 1189 src, 1190 lp_build_const_int_vec(gallivm, src_type, db), 1191 ""); 1192 1193 if (db <= src_bits) { 1194 /* Enough bits in src to fill the remainder */ 1195 LLVMValueRef lower = LLVMBuildLShr(builder, 1196 src, 1197 lp_build_const_int_vec(gallivm, src_type, src_bits - db), 1198 ""); 1199 1200 result = LLVMBuildOr(builder, result, lower, ""); 1201 } else if (db > src_bits) { 1202 /* Need to repeatedly copy src bits to fill remainder in dst */ 1203 unsigned n; 1204 1205 for (n = src_bits; n < dst_bits; n *= 2) { 1206 LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); 1207 1208 result = LLVMBuildOr(builder, 1209 result, 1210 LLVMBuildLShr(builder, result, shuv, ""), 1211 ""); 1212 } 1213 } 1214 } 1215 1216 return result; 1217 } 1218 1219 /** 1220 * If RT is a smallfloat (needing denorms) format 1221 */ 1222 static inline int 1223 have_smallfloat_format(struct lp_type dst_type, 1224 enum pipe_format format) 1225 { 1226 return ((dst_type.floating && dst_type.width != 32) || 1227 /* due to format handling hacks this format doesn't have floating set 1228 * here (and actually has width set to 32 too) so special case this. 
            */
           (format == PIPE_FORMAT_R11G11B10_FLOAT));
}


/**
 * Convert from memory format to blending format
 *
 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
 *
 * Operates in place: src[] entries are replaced by their blend-type
 * equivalents (hence the "and dst" note on the parameter).
 */
static void
convert_to_blend_type(struct gallivm_state *gallivm,
                      unsigned block_size,
                      const struct util_format_description *src_fmt,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef* src, // and dst
                      unsigned num_srcs)
{
   LLVMValueRef *dst = src;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type blend_type;
   struct lp_type mem_type;
   unsigned i, j;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * full custom path for packed floats and srgb formats - none of the later
    * functions would do anything useful, and given the lp_type representation they
    * can't be fixed. Should really have some SoA blend path for these kind of
    * formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      LLVMValueRef tmpsrc[4];
      /*
       * This is pretty suboptimal for this case blending in SoA would be much
       * better, since conversion gets us SoA values so need to convert back.
       */
      assert(src_type.width == 32 || src_type.width == 16);
      assert(dst_type.floating);
      assert(dst_type.width == 32);
      assert(dst_type.length % 4 == 0);
      assert(num_srcs % 4 == 0);

      if (src_type.width == 16) {
         /* expand 4x16bit values to 4x32bit */
         struct lp_type type32x4 = src_type;
         LLVMTypeRef ltype32x4;
         unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type32x4.width = 32;
         ltype32x4 = lp_build_vec_type(gallivm, type32x4);
         for (i = 0; i < num_fetch; i++) {
            src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
         }
         src_type.width = 32;
      }
      /* NOTE(review): always snapshots the first 4 entries of src[] —
       * assumes num_srcs >= 4 (asserted to be a multiple of 4 above). */
      for (i = 0; i < 4; i++) {
         tmpsrc[i] = src[i];
      }
      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4];
         LLVMValueRef tmps = tmpsrc[i];
         if (dst_type.length == 8) {
            LLVMValueRef shuffles[8];
            unsigned j;
            /* fetch was 4 values but need 8-wide output values */
            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
            /*
             * for 8-wide aos transpose would give us wrong order not matching
             * incoming converted fs values and mask. ARGH.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
            }
            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
                                          LLVMConstVector(shuffles, 8), "");
         }
         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
         }
         else {
            lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
         }
         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   /* Is the format arithmetic */
   is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
   is_arith &= !(mem_type.width == 16 && mem_type.floating);

   /* Pad if necessary */
   if (!is_arith && src_type.length < dst_type.length) {
      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
      }

      src_type.length = dst_type.length;
   }

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      assert(blend_type.width == 32 && blend_type.floating);
      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
      is_arith = false;
   }

   if (!is_arith) {
      return;
   }

   /* Rebuild the types so one vector lane covers one whole pixel. */
   src_type.width = blend_type.width * blend_type.length;
   blend_type.length *= pixels;
   src_type.length *= pixels / (src_type.length / mem_type.length);

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans[4];
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif

         mask = (1 << src_fmt->channel[j].size) - 1;

         /* Extract bits from source */
         chans[j] = LLVMBuildLShr(builder,
                                  dst[i],
                                  lp_build_const_int_vec(gallivm, src_type, sa),
                                  "");

         chans[j] = LLVMBuildAnd(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, mask),
                                 "");

         /* Scale bits */
         if (src_type.norm) {
            chans[j] = scale_bits(gallivm, src_fmt->channel[j].size,
                                  blend_type.width, chans[j], src_type);
         }

         /* Insert bits into correct position */
         chans[j] = LLVMBuildShl(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
                                 "");

         if (j == 0) {
            res = chans[j];
         } else {
            res = LLVMBuildOr(builder, res, chans[j], "");
         }
      }

      dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
   }
}


/**
 * Convert from blending format to memory format
 *
 * e.g.
 * GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
 *
 * Inverse of convert_to_blend_type; also operates in place on src[]
 * (hence the "and dst" note on the parameter).
 */
static void
convert_from_blend_type(struct gallivm_state *gallivm,
                        unsigned block_size,
                        const struct util_format_description *src_fmt,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef* src, // and dst
                        unsigned num_srcs)
{
   LLVMValueRef* dst = src;
   unsigned i, j, k;
   struct lp_type mem_type;
   struct lp_type blend_type;
   LLVMBuilderRef builder = gallivm->builder;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * full custom path for packed floats and srgb formats - none of the later
    * functions would do anything useful, and given the lp_type representation they
    * can't be fixed. Should really have some SoA blend path for these kind of
    * formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      /*
       * This is pretty suboptimal for this case blending in SoA would be much
       * better - we need to transpose the AoS values back to SoA values for
       * conversion/packing.
       */
      assert(src_type.floating);
      assert(src_type.width == 32);
      assert(src_type.length % 4 == 0);
      assert(dst_type.width == 32 || dst_type.width == 16);

      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4], tmpdst;
         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
         /* really really need SoA here */

         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
         }
         else {
            tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
                                                   src_type, tmpsoa);
         }

         if (src_type.length == 8) {
            LLVMValueRef tmpaos, shuffles[8];
            unsigned j;
            /*
             * for 8-wide aos transpose has given us wrong order not matching
             * output order. HMPF. Also need to split the output values manually.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
            }
            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
                                            LLVMConstVector(shuffles, 8), "");
            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
         }
         else {
            src[i] = tmpdst;
         }
      }
      if (dst_type.width == 16) {
         /* Narrow the packed 32-bit results to 16-bit memory lanes. */
         struct lp_type type16x8 = dst_type;
         struct lp_type type32x4 = dst_type;
         LLVMTypeRef ltype16x4, ltypei64, ltypei128;
         unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type16x8.length = 8;
         type32x4.width = 32;
         ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
         ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
         ltype16x4 = lp_build_vec_type(gallivm, dst_type);
         /* We could do vector truncation but it doesn't generate very good code */
         for (i = 0; i < num_fetch; i++) {
            src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
                                    src[i], lp_build_zero(gallivm, type32x4));
            src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
            src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
            src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
         }
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      int length = dst_type.length;
      assert(blend_type.width == 32 && blend_type.floating);

      dst_type.length = src_type.length;

      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);

      dst_type.length = length;
      is_arith = false;
   }

   /* Remove any padding */
   if (!is_arith && (src_type.length % mem_type.length)) {
      src_type.length -= (src_type.length % mem_type.length);

      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
      }
   }

   /* No bit arithmetic to do */
   if (!is_arith) {
      return;
   }

   /* One vector lane per pixel for the bit repacking below. */
   src_type.length = pixels;
   src_type.width = blend_type.length * blend_type.width;
   dst_type.length = pixels;

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans[4];
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif

         assert(blend_type.width > src_fmt->channel[j].size);

         /* Mask covering the full blend channel width.
          * NOTE(review): built bit-by-bit, which sidesteps the undefined
          * shift `1 << 32` but still hits implementation-defined
          * `1 << 31` when blend_type.width == 32 — consider 1u. */
         for (k = 0; k < blend_type.width; ++k) {
            mask |= 1 << k;
         }

         /* Extract bits */
         chans[j] = LLVMBuildLShr(builder,
                                  dst[i],
                                  lp_build_const_int_vec(gallivm, src_type,
                                                         from_lsb * blend_type.width),
                                  "");

         chans[j] = LLVMBuildAnd(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, mask),
                                 "");

         /* Scale down bits */
         if (src_type.norm) {
            chans[j] = scale_bits(gallivm, blend_type.width,
                                  src_fmt->channel[j].size, chans[j], src_type);
         }

         /* Insert bits */
         chans[j] = LLVMBuildShl(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, sa),
                                 "");

         sa += src_fmt->channel[j].size;

         if (j == 0) {
            res = chans[j];
         } else {
            res = LLVMBuildOr(builder, res, chans[j], "");
         }
      }

      assert (dst_type.width != 24);

      dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
   }
}


/**
 * Convert alpha to same blend type as src
 *
 * src_alpha[] holds per-quad alpha vectors in fragment-shader order; on
 * return it holds alpha broadcast/split to match the row layout described
 * by row_type and src_count.
 */
static void
convert_alpha(struct gallivm_state *gallivm,
              struct lp_type row_type,
              struct lp_type alpha_type,
              const unsigned block_size,
              const unsigned block_height,
              const unsigned src_count,
              const unsigned dst_channels,
              const bool pad_inline,
              LLVMValueRef* src_alpha)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned i, j;
   unsigned length = row_type.length;
   /* Temporarily convert with alpha's vector length, restore below. */
   row_type.length = alpha_type.length;

   /* Twiddle the alpha to match pixels */
   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);

   /*
    * TODO this should use single lp_build_conv call for
    * src_count == 1 && dst_channels == 1 case (dropping the concat below)
    */
   for (i = 0; i < block_height; ++i) {
      lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
   }

   alpha_type = row_type;
   row_type.length = length;

   /* If only one channel we can only need the single alpha value per pixel */
   if (src_count == 1 && dst_channels == 1) {

      lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height, src_alpha, src_count);
   } else {
      /* If there are more srcs than rows then we need to split alpha up */
      if (src_count > block_height) {
         /* Iterate backwards so reads of src_alpha[...] stay ahead of writes. */
         for (i = src_count; i > 0; --i) {
            unsigned pixels = block_size / src_count;
            unsigned idx = i - 1;

            src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
                                                    (idx * pixels) % 4, pixels);
         }
      }

      /* If there is a src for each pixel broadcast the alpha across whole row */
      if (src_count == block_size) {
         for (i = 0; i < src_count; ++i) {
            src_alpha[i] = lp_build_broadcast(gallivm,
                              lp_build_vec_type(gallivm, row_type), src_alpha[i]);
         }
      } else {
         unsigned pixels = block_size / src_count;
         unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
         unsigned alpha_span = 1;
         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

         /* Check if we need 2 src_alphas for our shuffles */
         if (pixels > alpha_type.length) {
            alpha_span = 2;
         }

         /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
         for (j = 0; j < row_type.length; ++j) {
            if (j < pixels * channels) {
               shuffles[j] = lp_build_const_int32(gallivm, j / channels);
            } else {
               shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
            }
         }

         for (i = 0; i < src_count; ++i) {
            unsigned idx1 = i, idx2 = i;

            if (alpha_span > 1){
               idx1 *= alpha_span;
               idx2 = idx1 + 1;
            }

            src_alpha[i] = LLVMBuildShuffleVector(builder,
                                                  src_alpha[idx1],
                                                  src_alpha[idx2],
                                                  LLVMConstVector(shuffles, row_type.length),
                                                  "");
         }
      }
   }
}


/**
 * Generates the blend function for unswizzled colour buffers
 * Also generates the read & write from colour buffer
 */
static void
generate_unswizzled_blend(struct gallivm_state *gallivm,
                          unsigned rt,
                          struct lp_fragment_shader_variant *variant,
                          enum pipe_format out_format,
                          unsigned int num_fs,
                          struct lp_type fs_type,
                          LLVMValueRef* fs_mask,
                          LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
                          LLVMValueRef context_ptr,
                          LLVMValueRef color_ptr,
                          LLVMValueRef stride,
                          unsigned partial_mask,
                          boolean do_branch)
{
   const unsigned alpha_channel = 3;
   const unsigned block_width = LP_RASTER_BLOCK_SIZE;
   const unsigned block_height = LP_RASTER_BLOCK_SIZE;
   const unsigned block_size = block_width * block_height;
   const unsigned lp_integer_vector_width = 128;

   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
   LLVMValueRef
fs_src1[4][TGSI_NUM_CHANNELS]; 1714 LLVMValueRef src_alpha[4 * 4]; 1715 LLVMValueRef src1_alpha[4 * 4] = { NULL }; 1716 LLVMValueRef src_mask[4 * 4]; 1717 LLVMValueRef src[4 * 4]; 1718 LLVMValueRef src1[4 * 4]; 1719 LLVMValueRef dst[4 * 4]; 1720 LLVMValueRef blend_color; 1721 LLVMValueRef blend_alpha; 1722 LLVMValueRef i32_zero; 1723 LLVMValueRef check_mask; 1724 LLVMValueRef undef_src_val; 1725 1726 struct lp_build_mask_context mask_ctx; 1727 struct lp_type mask_type; 1728 struct lp_type blend_type; 1729 struct lp_type row_type; 1730 struct lp_type dst_type; 1731 struct lp_type ls_type; 1732 1733 unsigned char swizzle[TGSI_NUM_CHANNELS]; 1734 unsigned vector_width; 1735 unsigned src_channels = TGSI_NUM_CHANNELS; 1736 unsigned dst_channels; 1737 unsigned dst_count; 1738 unsigned src_count; 1739 unsigned i, j; 1740 1741 const struct util_format_description* out_format_desc = util_format_description(out_format); 1742 1743 unsigned dst_alignment; 1744 1745 bool pad_inline = is_arithmetic_format(out_format_desc); 1746 bool has_alpha = false; 1747 const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable && 1748 util_blend_state_is_dual(&variant->key.blend, 0); 1749 1750 const boolean is_1d = variant->key.resource_1d; 1751 boolean twiddle_after_convert = FALSE; 1752 unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs; 1753 LLVMValueRef fpstate = 0; 1754 1755 /* Get type from output format */ 1756 lp_blend_type_from_format_desc(out_format_desc, &row_type); 1757 lp_mem_type_from_format_desc(out_format_desc, &dst_type); 1758 1759 /* 1760 * Technically this code should go into lp_build_smallfloat_to_float 1761 * and lp_build_float_to_smallfloat but due to the 1762 * http://llvm.org/bugs/show_bug.cgi?id=6393 1763 * llvm reorders the mxcsr intrinsics in a way that breaks the code. 
1764 * So the ordering is important here and there shouldn't be any 1765 * llvm ir instrunctions in this function before 1766 * this, otherwise half-float format conversions won't work 1767 * (again due to llvm bug #6393). 1768 */ 1769 if (have_smallfloat_format(dst_type, out_format)) { 1770 /* We need to make sure that denorms are ok for half float 1771 conversions */ 1772 fpstate = lp_build_fpstate_get(gallivm); 1773 lp_build_fpstate_set_denorms_zero(gallivm, FALSE); 1774 } 1775 1776 mask_type = lp_int32_vec4_type(); 1777 mask_type.length = fs_type.length; 1778 1779 for (i = num_fs; i < num_fullblock_fs; i++) { 1780 fs_mask[i] = lp_build_zero(gallivm, mask_type); 1781 } 1782 1783 /* Do not bother executing code when mask is empty.. */ 1784 if (do_branch) { 1785 check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type)); 1786 1787 for (i = 0; i < num_fullblock_fs; ++i) { 1788 check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], ""); 1789 } 1790 1791 lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask); 1792 lp_build_mask_check(&mask_ctx); 1793 } 1794 1795 partial_mask |= !variant->opaque; 1796 i32_zero = lp_build_const_int32(gallivm, 0); 1797 1798 undef_src_val = lp_build_undef(gallivm, fs_type); 1799 1800 row_type.length = fs_type.length; 1801 vector_width = dst_type.floating ? 
lp_native_vector_width : lp_integer_vector_width; 1802 1803 /* Compute correct swizzle and count channels */ 1804 memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS); 1805 dst_channels = 0; 1806 1807 for (i = 0; i < TGSI_NUM_CHANNELS; ++i) { 1808 /* Ensure channel is used */ 1809 if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) { 1810 continue; 1811 } 1812 1813 /* Ensure not already written to (happens in case with GL_ALPHA) */ 1814 if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) { 1815 continue; 1816 } 1817 1818 /* Ensure we havn't already found all channels */ 1819 if (dst_channels >= out_format_desc->nr_channels) { 1820 continue; 1821 } 1822 1823 swizzle[out_format_desc->swizzle[i]] = i; 1824 ++dst_channels; 1825 1826 if (i == alpha_channel) { 1827 has_alpha = true; 1828 } 1829 } 1830 1831 if (format_expands_to_float_soa(out_format_desc)) { 1832 /* 1833 * the code above can't work for layout_other 1834 * for srgb it would sort of work but we short-circuit swizzles, etc. 1835 * as that is done as part of unpack / pack. 1836 */ 1837 dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */ 1838 has_alpha = true; 1839 swizzle[0] = 0; 1840 swizzle[1] = 1; 1841 swizzle[2] = 2; 1842 swizzle[3] = 3; 1843 pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */ 1844 } 1845 1846 /* If 3 channels then pad to include alpha for 4 element transpose */ 1847 if (dst_channels == 3) { 1848 assert (!has_alpha); 1849 for (i = 0; i < TGSI_NUM_CHANNELS; i++) { 1850 if (swizzle[i] > TGSI_NUM_CHANNELS) 1851 swizzle[i] = 3; 1852 } 1853 if (out_format_desc->nr_channels == 4) { 1854 dst_channels = 4; 1855 /* 1856 * We use alpha from the color conversion, not separate one. 1857 * We had to include it for transpose, hence it will get converted 1858 * too (albeit when doing transpose after conversion, that would 1859 * no longer be the case necessarily). 1860 * (It works only with 4 channel dsts, e.g. 
rgbx formats, because 1861 * otherwise we really have padding, not alpha, included.) 1862 */ 1863 has_alpha = true; 1864 } 1865 } 1866 1867 /* 1868 * Load shader output 1869 */ 1870 for (i = 0; i < num_fullblock_fs; ++i) { 1871 /* Always load alpha for use in blending */ 1872 LLVMValueRef alpha; 1873 if (i < num_fs) { 1874 alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], ""); 1875 } 1876 else { 1877 alpha = undef_src_val; 1878 } 1879 1880 /* Load each channel */ 1881 for (j = 0; j < dst_channels; ++j) { 1882 assert(swizzle[j] < 4); 1883 if (i < num_fs) { 1884 fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], ""); 1885 } 1886 else { 1887 fs_src[i][j] = undef_src_val; 1888 } 1889 } 1890 1891 /* If 3 channels then pad to include alpha for 4 element transpose */ 1892 /* 1893 * XXX If we include that here maybe could actually use it instead of 1894 * separate alpha for blending? 1895 * (Difficult though we actually convert pad channels, not alpha.) 1896 */ 1897 if (dst_channels == 3 && !has_alpha) { 1898 fs_src[i][3] = alpha; 1899 } 1900 1901 /* We split the row_mask and row_alpha as we want 128bit interleave */ 1902 if (fs_type.length == 8) { 1903 src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 1904 0, src_channels); 1905 src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], 1906 src_channels, src_channels); 1907 1908 src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); 1909 src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, 1910 src_channels, src_channels); 1911 } else { 1912 src_mask[i] = fs_mask[i]; 1913 src_alpha[i] = alpha; 1914 } 1915 } 1916 if (dual_source_blend) { 1917 /* same as above except different src/dst, skip masks and comments... 
*/ 1918 for (i = 0; i < num_fullblock_fs; ++i) { 1919 LLVMValueRef alpha; 1920 if (i < num_fs) { 1921 alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], ""); 1922 } 1923 else { 1924 alpha = undef_src_val; 1925 } 1926 1927 for (j = 0; j < dst_channels; ++j) { 1928 assert(swizzle[j] < 4); 1929 if (i < num_fs) { 1930 fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], ""); 1931 } 1932 else { 1933 fs_src1[i][j] = undef_src_val; 1934 } 1935 } 1936 if (dst_channels == 3 && !has_alpha) { 1937 fs_src1[i][3] = alpha; 1938 } 1939 if (fs_type.length == 8) { 1940 src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); 1941 src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, 1942 src_channels, src_channels); 1943 } else { 1944 src1_alpha[i] = alpha; 1945 } 1946 } 1947 } 1948 1949 if (util_format_is_pure_integer(out_format)) { 1950 /* 1951 * In this case fs_type was really ints or uints disguised as floats, 1952 * fix that up now. 1953 */ 1954 fs_type.floating = 0; 1955 fs_type.sign = dst_type.sign; 1956 for (i = 0; i < num_fullblock_fs; ++i) { 1957 for (j = 0; j < dst_channels; ++j) { 1958 fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j], 1959 lp_build_vec_type(gallivm, fs_type), ""); 1960 } 1961 if (dst_channels == 3 && !has_alpha) { 1962 fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3], 1963 lp_build_vec_type(gallivm, fs_type), ""); 1964 } 1965 } 1966 } 1967 1968 /* 1969 * We actually should generally do conversion first (for non-1d cases) 1970 * when the blend format is 8 or 16 bits. The reason is obvious, 1971 * there's 2 or 4 times less vectors to deal with for the interleave... 1972 * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit 1973 * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit 1974 * unpack only with 128bit vectors). 
1975 * Note: for 16bit sizes really need matching pack conversion code 1976 */ 1977 if (!is_1d && dst_channels != 3 && dst_type.width == 8) { 1978 twiddle_after_convert = TRUE; 1979 } 1980 1981 /* 1982 * Pixel twiddle from fragment shader order to memory order 1983 */ 1984 if (!twiddle_after_convert) { 1985 src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, 1986 dst_channels, fs_src, src, pad_inline); 1987 if (dual_source_blend) { 1988 generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, 1989 fs_src1, src1, pad_inline); 1990 } 1991 } else { 1992 src_count = num_fullblock_fs * dst_channels; 1993 /* 1994 * We reorder things a bit here, so the cases for 4-wide and 8-wide 1995 * (AVX) turn out the same later when untwiddling/transpose (albeit 1996 * for true AVX2 path untwiddle needs to be different). 1997 * For now just order by colors first (so we can use unpack later). 1998 */ 1999 for (j = 0; j < num_fullblock_fs; j++) { 2000 for (i = 0; i < dst_channels; i++) { 2001 src[i*num_fullblock_fs + j] = fs_src[j][i]; 2002 if (dual_source_blend) { 2003 src1[i*num_fullblock_fs + j] = fs_src1[j][i]; 2004 } 2005 } 2006 } 2007 } 2008 2009 src_channels = dst_channels < 3 ? dst_channels : 4; 2010 if (src_count != num_fullblock_fs * src_channels) { 2011 unsigned ds = src_count / (num_fullblock_fs * src_channels); 2012 row_type.length /= ds; 2013 fs_type.length = row_type.length; 2014 } 2015 2016 blend_type = row_type; 2017 mask_type.length = 4; 2018 2019 /* Convert src to row_type */ 2020 if (dual_source_blend) { 2021 struct lp_type old_row_type = row_type; 2022 lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); 2023 src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1); 2024 } 2025 else { 2026 src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); 2027 } 2028 2029 /* If the rows are not an SSE vector, combine them to become SSE size! 
*/ 2030 if ((row_type.width * row_type.length) % 128) { 2031 unsigned bits = row_type.width * row_type.length; 2032 unsigned combined; 2033 2034 assert(src_count >= (vector_width / bits)); 2035 2036 dst_count = src_count / (vector_width / bits); 2037 2038 combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count); 2039 if (dual_source_blend) { 2040 lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count); 2041 } 2042 2043 row_type.length *= combined; 2044 src_count /= combined; 2045 2046 bits = row_type.width * row_type.length; 2047 assert(bits == 128 || bits == 256); 2048 } 2049 2050 if (twiddle_after_convert) { 2051 fs_twiddle_transpose(gallivm, row_type, src, src_count, src); 2052 if (dual_source_blend) { 2053 fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1); 2054 } 2055 } 2056 2057 /* 2058 * Blend Colour conversion 2059 */ 2060 blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr); 2061 blend_color = LLVMBuildPointerCast(builder, blend_color, 2062 LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), ""); 2063 blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, 2064 &i32_zero, 1, ""), ""); 2065 2066 /* Convert */ 2067 lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1); 2068 2069 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { 2070 /* 2071 * since blending is done with floats, there was no conversion. 2072 * However, the rules according to fixed point renderbuffers still 2073 * apply, that is we must clamp inputs to 0.0/1.0. 2074 * (This would apply to separate alpha conversion too but we currently 2075 * force has_alpha to be true.) 2076 * TODO: should skip this with "fake" blend, since post-blend conversion 2077 * will clamp anyway. 2078 * TODO: could also skip this if fragment color clamping is enabled. We 2079 * don't support it natively so it gets baked into the shader however, so 2080 * can't really tell here. 
2081 */ 2082 struct lp_build_context f32_bld; 2083 assert(row_type.floating); 2084 lp_build_context_init(&f32_bld, gallivm, row_type); 2085 for (i = 0; i < src_count; i++) { 2086 src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]); 2087 } 2088 if (dual_source_blend) { 2089 for (i = 0; i < src_count; i++) { 2090 src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]); 2091 } 2092 } 2093 /* probably can't be different than row_type but better safe than sorry... */ 2094 lp_build_context_init(&f32_bld, gallivm, blend_type); 2095 blend_color = lp_build_clamp(&f32_bld, blend_color, f32_bld.zero, f32_bld.one); 2096 } 2097 2098 /* Extract alpha */ 2099 blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3)); 2100 2101 /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */ 2102 pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width; 2103 if (pad_inline) { 2104 /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */ 2105 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length); 2106 } else { 2107 /* Only use dst_channels e.g. 
RGBA RGBA to RG RG xxxx */ 2108 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length); 2109 } 2110 2111 /* 2112 * Mask conversion 2113 */ 2114 lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], block_height, &src_mask[0]); 2115 2116 if (src_count < block_height) { 2117 lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count); 2118 } else if (src_count > block_height) { 2119 for (i = src_count; i > 0; --i) { 2120 unsigned pixels = block_size / src_count; 2121 unsigned idx = i - 1; 2122 2123 src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], 2124 (idx * pixels) % 4, pixels); 2125 } 2126 } 2127 2128 assert(mask_type.width == 32); 2129 2130 for (i = 0; i < src_count; ++i) { 2131 unsigned pixels = block_size / src_count; 2132 unsigned pixel_width = row_type.width * dst_channels; 2133 2134 if (pixel_width == 24) { 2135 mask_type.width = 8; 2136 mask_type.length = vector_width / mask_type.width; 2137 } else { 2138 mask_type.length = pixels; 2139 mask_type.width = row_type.width * dst_channels; 2140 2141 /* 2142 * If mask_type width is smaller than 32bit, this doesn't quite 2143 * generate the most efficient code (could use some pack). 
2144 */ 2145 src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], 2146 lp_build_int_vec_type(gallivm, mask_type), ""); 2147 2148 mask_type.length *= dst_channels; 2149 mask_type.width /= dst_channels; 2150 } 2151 2152 src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], 2153 lp_build_int_vec_type(gallivm, mask_type), ""); 2154 src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length); 2155 } 2156 2157 /* 2158 * Alpha conversion 2159 */ 2160 if (!has_alpha) { 2161 struct lp_type alpha_type = fs_type; 2162 alpha_type.length = 4; 2163 convert_alpha(gallivm, row_type, alpha_type, 2164 block_size, block_height, 2165 src_count, dst_channels, 2166 pad_inline, src_alpha); 2167 if (dual_source_blend) { 2168 convert_alpha(gallivm, row_type, alpha_type, 2169 block_size, block_height, 2170 src_count, dst_channels, 2171 pad_inline, src1_alpha); 2172 } 2173 } 2174 2175 2176 /* 2177 * Load dst from memory 2178 */ 2179 if (src_count < block_height) { 2180 dst_count = block_height; 2181 } else { 2182 dst_count = src_count; 2183 } 2184 2185 dst_type.length *= block_size / dst_count; 2186 2187 if (format_expands_to_float_soa(out_format_desc)) { 2188 /* 2189 * we need multiple values at once for the conversion, so can as well 2190 * load them vectorized here too instead of concatenating later. 2191 * (Still need concatenation later for 8-wide vectors). 2192 */ 2193 dst_count = block_height; 2194 dst_type.length = block_width; 2195 } 2196 2197 /* 2198 * Compute the alignment of the destination pointer in bytes 2199 * We fetch 1-4 pixels, if the format has pot alignment then those fetches 2200 * are always aligned by MIN2(16, fetch_width) except for buffers (not 2201 * 1d tex but can't distinguish here) so need to stick with per-pixel 2202 * alignment in this case. 
2203 */ 2204 if (is_1d) { 2205 dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8); 2206 } 2207 else { 2208 dst_alignment = dst_type.length * dst_type.width / 8; 2209 } 2210 /* Force power-of-two alignment by extracting only the least-significant-bit */ 2211 dst_alignment = 1 << (ffs(dst_alignment) - 1); 2212 /* 2213 * Resource base and stride pointers are aligned to 16 bytes, so that's 2214 * the maximum alignment we can guarantee 2215 */ 2216 dst_alignment = MIN2(16, dst_alignment); 2217 2218 ls_type = dst_type; 2219 2220 if (dst_count > src_count) { 2221 if ((dst_type.width == 8 || dst_type.width == 16) && 2222 util_is_power_of_two(dst_type.length) && 2223 dst_type.length * dst_type.width < 128) { 2224 /* 2225 * Never try to load values as 4xi8 which we will then 2226 * concatenate to larger vectors. This gives llvm a real 2227 * headache (the problem is the type legalizer (?) will 2228 * try to load that as 4xi8 zext to 4xi32 to fill the vector, 2229 * then the shuffles to concatenate are more or less impossible 2230 * - llvm is easily capable of generating a sequence of 32 2231 * pextrb/pinsrb instructions for that. Albeit it appears to 2232 * be fixed in llvm 4.0. So, load and concatenate with 32bit 2233 * width to avoid the trouble (16bit seems not as bad, llvm 2234 * probably recognizes the load+shuffle as only one shuffle 2235 * is necessary, but we can do just the same anyway). 
2236 */ 2237 ls_type.length = dst_type.length * dst_type.width / 32; 2238 ls_type.width = 32; 2239 } 2240 } 2241 2242 if (is_1d) { 2243 load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, 2244 dst, ls_type, dst_count / 4, dst_alignment); 2245 for (i = dst_count / 4; i < dst_count; i++) { 2246 dst[i] = lp_build_undef(gallivm, ls_type); 2247 } 2248 2249 } 2250 else { 2251 load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, 2252 dst, ls_type, dst_count, dst_alignment); 2253 } 2254 2255 2256 /* 2257 * Convert from dst/output format to src/blending format. 2258 * 2259 * This is necessary as we can only read 1 row from memory at a time, 2260 * so the minimum dst_count will ever be at this point is 4. 2261 * 2262 * With, for example, R8 format you can have all 16 pixels in a 128 bit vector, 2263 * this will take the 4 dsts and combine them into 1 src so we can perform blending 2264 * on all 16 pixels in that single vector at once. 2265 */ 2266 if (dst_count > src_count) { 2267 if (ls_type.length != dst_type.length && ls_type.length == 1) { 2268 LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type); 2269 LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1); 2270 for (i = 0; i < dst_count; i++) { 2271 dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, ""); 2272 } 2273 } 2274 2275 lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count); 2276 2277 if (ls_type.length != dst_type.length) { 2278 struct lp_type tmp_type = dst_type; 2279 tmp_type.length = dst_type.length * 4 / src_count; 2280 for (i = 0; i < src_count; i++) { 2281 dst[i] = LLVMBuildBitCast(builder, dst[i], 2282 lp_build_vec_type(gallivm, tmp_type), ""); 2283 } 2284 } 2285 } 2286 2287 /* 2288 * Blending 2289 */ 2290 /* XXX this is broken for RGB8 formats - 2291 * they get expanded from 12 to 16 elements (to include alpha) 2292 * by convert_to_blend_type then reduced to 15 instead of 12 2293 * by convert_from_blend_type (a simple fix though breaks A8...). 
2294 * R16G16B16 also crashes differently however something going wrong 2295 * inside llvm handling npot vector sizes seemingly. 2296 * It seems some cleanup could be done here (like skipping conversion/blend 2297 * when not needed). 2298 */ 2299 convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, 2300 row_type, dst, src_count); 2301 2302 /* 2303 * FIXME: Really should get logic ops / masks out of generic blend / row 2304 * format. Logic ops will definitely not work on the blend float format 2305 * used for SRGB here and I think OpenGL expects this to work as expected 2306 * (that is incoming values converted to srgb then logic op applied). 2307 */ 2308 for (i = 0; i < src_count; ++i) { 2309 dst[i] = lp_build_blend_aos(gallivm, 2310 &variant->key.blend, 2311 out_format, 2312 row_type, 2313 rt, 2314 src[i], 2315 has_alpha ? NULL : src_alpha[i], 2316 src1[i], 2317 has_alpha ? NULL : src1_alpha[i], 2318 dst[i], 2319 partial_mask ? src_mask[i] : NULL, 2320 blend_color, 2321 has_alpha ? NULL : blend_alpha, 2322 swizzle, 2323 pad_inline ? 
4 : dst_channels); 2324 } 2325 2326 convert_from_blend_type(gallivm, block_size, out_format_desc, 2327 row_type, dst_type, dst, src_count); 2328 2329 /* Split the blend rows back to memory rows */ 2330 if (dst_count > src_count) { 2331 row_type.length = dst_type.length * (dst_count / src_count); 2332 2333 if (src_count == 1) { 2334 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); 2335 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); 2336 2337 row_type.length /= 2; 2338 src_count *= 2; 2339 } 2340 2341 dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2); 2342 dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2); 2343 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); 2344 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); 2345 2346 row_type.length /= 2; 2347 src_count *= 2; 2348 } 2349 2350 /* 2351 * Store blend result to memory 2352 */ 2353 if (is_1d) { 2354 store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, 2355 dst, dst_type, dst_count / 4, dst_alignment); 2356 } 2357 else { 2358 store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, 2359 dst, dst_type, dst_count, dst_alignment); 2360 } 2361 2362 if (have_smallfloat_format(dst_type, out_format)) { 2363 lp_build_fpstate_set(gallivm, fpstate); 2364 } 2365 2366 if (do_branch) { 2367 lp_build_mask_end(&mask_ctx); 2368 } 2369 } 2370 2371 2372 /** 2373 * Generate the runtime callable function for the whole fragment pipeline. 2374 * Note that the function which we generate operates on a block of 16 2375 * pixels at at time. The block contains 2x2 quads. Each quad contains 2376 * 2x2 pixels. 
 */
static void
generate_fragment(struct llvmpipe_context *lp,
                  struct lp_fragment_shader *shader,
                  struct lp_fragment_shader_variant *variant,
                  unsigned partial_mask)
{
   struct gallivm_state *gallivm = variant->gallivm;
   const struct lp_fragment_shader_variant_key *key = &variant->key;
   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
   char func_name[64];
   struct lp_type fs_type;
   struct lp_type blend_type;
   LLVMTypeRef fs_elem_type;
   LLVMTypeRef blend_vec_type;
   LLVMTypeRef arg_types[13];
   LLVMTypeRef func_type;
   LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
   LLVMValueRef context_ptr;
   LLVMValueRef x;
   LLVMValueRef y;
   LLVMValueRef a0_ptr;
   LLVMValueRef dadx_ptr;
   LLVMValueRef dady_ptr;
   LLVMValueRef color_ptr_ptr;
   LLVMValueRef stride_ptr;
   LLVMValueRef depth_ptr;
   LLVMValueRef depth_stride;
   LLVMValueRef mask_input;
   LLVMValueRef thread_data_ptr;
   LLVMBasicBlockRef block;
   LLVMBuilderRef builder;
   struct lp_build_sampler_soa *sampler;
   struct lp_build_interp_soa_context interp;
   LLVMValueRef fs_mask[16 / 4];
   LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
   LLVMValueRef function;
   LLVMValueRef facing;
   unsigned num_fs;
   unsigned i;
   unsigned chan;
   unsigned cbuf;
   boolean cbuf0_write_all;
   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
                                     util_blend_state_is_dual(&key->blend, 0);

   assert(lp_native_vector_width / 32 >= 4);

   /* Adjust color input interpolation according to flatshade state:
    */
   memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
   for (i = 0; i < shader->info.base.num_inputs; i++) {
      if (inputs[i].interp == LP_INTERP_COLOR) {
         if (key->flatshade)
            inputs[i].interp = LP_INTERP_CONSTANT;
         else
            inputs[i].interp = LP_INTERP_PERSPECTIVE;
      }
   }

   /* check if writes to cbuf[0] are to be copied to all cbufs */
   cbuf0_write_all =
     shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];

   /* TODO: actually pick these based on the fs and color buffer
    * characteristics. */

   memset(&fs_type, 0, sizeof fs_type);
   fs_type.floating = TRUE;      /* floating point values */
   fs_type.sign = TRUE;          /* values are signed */
   fs_type.norm = FALSE;         /* values are not limited to [0,1] or [-1,1] */
   fs_type.width = 32;           /* 32-bit float */
   fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */

   memset(&blend_type, 0, sizeof blend_type);
   blend_type.floating = FALSE;  /* values are integers */
   blend_type.sign = FALSE;      /* values are unsigned */
   blend_type.norm = TRUE;       /* values are in [0,1] or [-1,1] */
   blend_type.width = 8;         /* 8-bit ubyte values */
   blend_type.length = 16;       /* 16 elements per vector */

   /*
    * Generate the function prototype. Any change here must be reflected in
    * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
    */

   fs_elem_type = lp_build_elem_type(gallivm, fs_type);

   blend_vec_type = lp_build_vec_type(gallivm, blend_type);

   util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
                 shader->no, variant->no, partial_mask ? "partial" : "whole");

   arg_types[0] = variant->jit_context_ptr_type;       /* context */
   arg_types[1] = int32_type;                          /* x */
   arg_types[2] = int32_type;                          /* y */
   arg_types[3] = int32_type;                          /* facing */
   arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
   arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
   arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
   arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
   arg_types[8] = LLVMPointerType(int8_type, 0);       /* depth */
   arg_types[9] = int32_type;                          /* mask_input */
   arg_types[10] = variant->jit_thread_data_ptr_type;  /* per thread data */
   arg_types[11] = LLVMPointerType(int32_type, 0);     /* stride */
   arg_types[12] = int32_type;                         /* depth_stride */

   func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
                                arg_types, ARRAY_SIZE(arg_types), 0);

   function = LLVMAddFunction(gallivm->module, func_name, func_type);
   LLVMSetFunctionCallConv(function, LLVMCCallConv);

   variant->function[partial_mask] = function;

   /* XXX: need to propagate noalias down into color param now we are
    * passing a pointer-to-pointer?
    */
   for(i = 0; i < ARRAY_SIZE(arg_types); ++i)
      if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
         lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);

   context_ptr  = LLVMGetParam(function, 0);
   x            = LLVMGetParam(function, 1);
   y            = LLVMGetParam(function, 2);
   facing       = LLVMGetParam(function, 3);
   a0_ptr       = LLVMGetParam(function, 4);
   dadx_ptr     = LLVMGetParam(function, 5);
   dady_ptr     = LLVMGetParam(function, 6);
   color_ptr_ptr = LLVMGetParam(function, 7);
   depth_ptr    = LLVMGetParam(function, 8);
   mask_input   = LLVMGetParam(function, 9);
   thread_data_ptr = LLVMGetParam(function, 10);
   stride_ptr   = LLVMGetParam(function, 11);
   depth_stride = LLVMGetParam(function, 12);

   lp_build_name(context_ptr, "context");
   lp_build_name(x, "x");
   lp_build_name(y, "y");
   lp_build_name(a0_ptr, "a0");
   lp_build_name(dadx_ptr, "dadx");
   lp_build_name(dady_ptr, "dady");
   lp_build_name(color_ptr_ptr, "color_ptr_ptr");
   lp_build_name(depth_ptr, "depth");
   lp_build_name(mask_input, "mask_input");
   lp_build_name(thread_data_ptr, "thread_data");
   lp_build_name(stride_ptr, "stride_ptr");
   lp_build_name(depth_stride, "depth_stride");

   /*
    * Function body
    */

   block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
   builder = gallivm->builder;
   assert(builder);
   LLVMPositionBuilderAtEnd(builder, block);

   /* code generated texture sampling */
   sampler = lp_llvm_sampler_soa_create(key->state);

   num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
   /* for 1d resources only run "upper half" of stamp */
   if (key->resource_1d)
      num_fs /= 2;

   {
      LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
      LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
      LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type,
                                                      num_loop, "mask_store");
      LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
      boolean pixel_center_integer =
         shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER];

      /*
       * The shader input interpolation info is not explicitly baked in the
       * shader key, but everything it derives from (TGSI, and flatshade) is
       * already included in the shader key.
       */
      lp_build_interp_soa_init(&interp,
                               gallivm,
                               shader->info.base.num_inputs,
                               inputs,
                               pixel_center_integer,
                               key->depth_clamp,
                               builder, fs_type,
                               a0_ptr, dadx_ptr, dady_ptr,
                               x, y);

      /* Initialize the per-loop execution mask: either decoded from the
       * incoming mask_input (partial blocks) or all-ones (whole blocks). */
      for (i = 0; i < num_fs; i++) {
         LLVMValueRef mask;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
         LLVMValueRef mask_ptr = LLVMBuildGEP(builder, mask_store,
                                              &indexi, 1, "mask_ptr");

         if (partial_mask) {
            mask = generate_quad_mask(gallivm, fs_type,
                                      i*fs_type.length/4, mask_input);
         }
         else {
            mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
         }
         LLVMBuildStore(builder, mask, mask_ptr);
      }

      generate_fs_loop(gallivm,
                       shader, key,
                       builder,
                       fs_type,
                       context_ptr,
                       num_loop,
                       &interp,
                       sampler,
                       mask_store, /* output */
                       color_store,
                       depth_ptr,
                       depth_stride,
                       facing,
                       thread_data_ptr);

      for (i = 0; i < num_fs; i++) {
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
         LLVMValueRef ptr = LLVMBuildGEP(builder, mask_store,
                                         &indexi, 1, "");
         fs_mask[i] = LLVMBuildLoad(builder, ptr, "mask");
         /* FIXME: this data layout is awkward and needs to be reorganized */
         for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
            for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
               /* cbuf * !cbuf0_write_all: collapse to color_store[0] when
                * cbuf0 writes are broadcast to all color buffers. */
               ptr = LLVMBuildGEP(builder,
                                  color_store[cbuf * !cbuf0_write_all][chan],
                                  &indexi, 1, "");
               fs_out_color[cbuf][chan][i] = ptr;
            }
         }
         if (dual_source_blend) {
            /* only support one dual source blend target hence always use output 1 */
            for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
               ptr = LLVMBuildGEP(builder,
                                  color_store[1][chan],
                                  &indexi, 1, "");
               fs_out_color[1][chan][i] = ptr;
            }
         }
      }
   }

   sampler->destroy(sampler);

   /* Loop over color outputs / color buffers to do blending.
    */
   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE) {
         LLVMValueRef color_ptr;
         LLVMValueRef stride;
         LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);

         boolean do_branch = ((key->depth.enabled
                               || key->stencil[0].enabled
                               || key->alpha.enabled)
                              && !shader->info.base.uses_kill);

         color_ptr = LLVMBuildLoad(builder,
                                   LLVMBuildGEP(builder, color_ptr_ptr,
                                                &index, 1, ""),
                                   "");

         lp_build_name(color_ptr, "color_ptr%d", cbuf);

         stride = LLVMBuildLoad(builder,
                                LLVMBuildGEP(builder, stride_ptr, &index, 1, ""),
                                "");

         generate_unswizzled_blend(gallivm, cbuf, variant,
                                   key->cbuf_format[cbuf],
                                   num_fs, fs_type, fs_mask, fs_out_color,
                                   context_ptr, color_ptr, stride,
                                   partial_mask, do_branch);
      }
   }

   LLVMBuildRetVoid(builder);

   gallivm_verify_function(gallivm, function);
}


/**
 * Print the contents of a fragment shader variant key via debug_printf,
 * for debugging.
 */
static void
dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
{
   unsigned i;

   debug_printf("fs variant %p:\n", (void *) key);

   if (key->flatshade) {
      debug_printf("flatshade = 1\n");
   }
   for (i = 0; i < key->nr_cbufs; ++i) {
      debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
   }
   if (key->depth.enabled || key->stencil[0].enabled) {
      debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
   }
   if (key->depth.enabled) {
      debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
      debug_printf("depth.writemask = %u\n", key->depth.writemask);
   }

   for (i = 0; i < 2; ++i) {
      if (key->stencil[i].enabled) {
         debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
         debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
         debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
         debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
         debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
         debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
      }
   }

   if (key->alpha.enabled) {
      debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
   }

   if (key->occlusion_count) {
      debug_printf("occlusion_count = 1\n");
   }

   if (key->blend.logicop_enable) {
      debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
   }
   else if (key->blend.rt[0].blend_enable) {
      debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
      debug_printf("blend.rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
      debug_printf("blend.rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
      debug_printf("blend.alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
      debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
      debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
   }
   debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
   if (key->blend.alpha_to_coverage) {
      debug_printf("blend.alpha_to_coverage is enabled\n");
   }
   for (i = 0; i < key->nr_samplers; ++i) {
      const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state;
      debug_printf("sampler[%u] = \n", i);
      debug_printf("  .wrap = %s %s %s\n",
                   util_dump_tex_wrap(sampler->wrap_s, TRUE),
                   util_dump_tex_wrap(sampler->wrap_t, TRUE),
                   util_dump_tex_wrap(sampler->wrap_r, TRUE));
      debug_printf("  .min_img_filter = %s\n",
                   util_dump_tex_filter(sampler->min_img_filter, TRUE));
      debug_printf("  .min_mip_filter = %s\n",
                   util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE));
      debug_printf("  .mag_img_filter = %s\n",
                   util_dump_tex_filter(sampler->mag_img_filter, TRUE));
      if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
         debug_printf("  .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE));
      debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
      debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
      debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
      debug_printf("  .apply_min_lod = %u\n", sampler->apply_min_lod);
      debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
   }
   for (i = 0; i < key->nr_sampler_views; ++i) {
      const struct lp_static_texture_state *texture = &key->state[i].texture_state;
      debug_printf("texture[%u] = \n", i);
      debug_printf("  .format = %s\n",
                   util_format_name(texture->format));
      debug_printf("  .target = %s\n",
                   util_dump_tex_target(texture->target, TRUE));
      debug_printf("  .level_zero_only = %u\n",
                   texture->level_zero_only);
      debug_printf("  .pot = %u %u %u\n",
                   texture->pot_width,
                   texture->pot_height,
                   texture->pot_depth);
   }
}


/**
 * Dump a fragment shader variant (TGSI tokens plus variant key) via
 * debug_printf, for debugging.
 */
void
lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
{
   debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
                variant->shader->no, variant->no);
   tgsi_dump(variant->shader->base.tokens, 0);
   dump_fs_variant_key(&variant->key);
   debug_printf("variant->opaque = %u\n", variant->opaque);
   debug_printf("\n");
}


/**
 * Generate a new fragment shader variant from the shader code and
 * other state indicated by the key.
 */
static struct lp_fragment_shader_variant *
generate_variant(struct llvmpipe_context *lp,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key)
{
   struct lp_fragment_shader_variant *variant;
   const struct util_format_description *cbuf0_format_desc;
   boolean fullcolormask;
   char module_name[64];

   variant = CALLOC_STRUCT(lp_fragment_shader_variant);
   if (!variant)
      return NULL;

   util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
                 shader->no, shader->variants_created);

   variant->gallivm = gallivm_create(module_name, lp->context);
   if (!variant->gallivm) {
      FREE(variant);
      return NULL;
   }

   variant->shader = shader;
   variant->list_item_global.base = variant;
   variant->list_item_local.base = variant;
   variant->no = shader->variants_created++;

   memcpy(&variant->key, key, shader->variant_key_size);

   /*
    * Determine whether we are touching all channels in the color buffer.
    */
   fullcolormask = FALSE;
   if (key->nr_cbufs == 1) {
      cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
      fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask);
   }

   /* "opaque" means the fragment pipeline never needs to read back the
    * color buffer: no blending/logicop, full colormask, no kill, and no
    * depth/stencil/alpha testing. */
   variant->opaque =
         !key->blend.logicop_enable &&
         !key->blend.rt[0].blend_enable &&
         fullcolormask &&
         !key->stencil[0].enabled &&
         !key->alpha.enabled &&
         !key->blend.alpha_to_coverage &&
         !key->depth.enabled &&
         !shader->info.base.uses_kill
         ? TRUE : FALSE;

   if ((shader->info.base.num_tokens <= 1) &&
       !key->depth.enabled && !key->stencil[0].enabled) {
      variant->ps_inv_multiplier = 0;
   } else {
      variant->ps_inv_multiplier = 1;
   }

   if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
      lp_debug_fs_variant(variant);
   }

   lp_jit_init_types(variant);

   if (variant->jit_function[RAST_EDGE_TEST] == NULL)
      generate_fragment(lp, shader, variant, RAST_EDGE_TEST);

   if (variant->jit_function[RAST_WHOLE] == NULL) {
      if (variant->opaque) {
         /* Specialized shader, which doesn't need to read the color buffer. */
         generate_fragment(lp, shader, variant, RAST_WHOLE);
      }
   }

   /*
    * Compile everything
    */

   gallivm_compile_module(variant->gallivm);

   variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);

   if (variant->function[RAST_EDGE_TEST]) {
      variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
            gallivm_jit_function(variant->gallivm,
                                 variant->function[RAST_EDGE_TEST]);
   }

   if (variant->function[RAST_WHOLE]) {
      variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
            gallivm_jit_function(variant->gallivm,
                                 variant->function[RAST_WHOLE]);
   } else if (!variant->jit_function[RAST_WHOLE]) {
      /* No specialized whole-block shader: fall back to the edge-test one. */
      variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
   }

   gallivm_free_ir(variant->gallivm);

   return variant;
}


/**
 * Create a new fragment shader CSO (pipe_context::create_fs_state).
 * Saves a scan of the TGSI tokens plus a local copy, and sets up the
 * per-input interpolation info used when generating variants.
 */
static void *
llvmpipe_create_fs_state(struct pipe_context *pipe,
                         const struct pipe_shader_state *templ)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   struct lp_fragment_shader *shader;
   int nr_samplers;
   int nr_sampler_views;
   int i;

   shader = CALLOC_STRUCT(lp_fragment_shader);
   if (!shader)
      return NULL;

   shader->no = fs_no++;
   make_empty_list(&shader->variants);

   /* get/save the summary info for this shader */
   lp_build_tgsi_info(templ->tokens, &shader->info);

   /* we need to keep a local copy of the tokens */
   shader->base.tokens = tgsi_dup_tokens(templ->tokens);

   shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
   if (shader->draw_data == NULL) {
      FREE((void *) shader->base.tokens);
      FREE(shader);
      return NULL;
   }

   nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
   nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;

   /* Variant keys are variable sized: only as many sampler/texture state
    * slots as this shader can actually reference. */
   shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
                                     state[MAX2(nr_samplers, nr_sampler_views)]);

   for (i = 0; i < shader->info.base.num_inputs; i++) {
      shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
      shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i];

      switch (shader->info.base.input_interpolate[i]) {
      case TGSI_INTERPOLATE_CONSTANT:
         shader->inputs[i].interp = LP_INTERP_CONSTANT;
         break;
      case TGSI_INTERPOLATE_LINEAR:
         shader->inputs[i].interp = LP_INTERP_LINEAR;
         break;
      case TGSI_INTERPOLATE_PERSPECTIVE:
         shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
         break;
      case TGSI_INTERPOLATE_COLOR:
         shader->inputs[i].interp = LP_INTERP_COLOR;
         break;
      default:
         assert(0);
         break;
      }

      /* No default case on purpose: other semantics keep the interpolation
       * mode chosen above. */
      switch (shader->info.base.input_semantic_name[i]) {
      case TGSI_SEMANTIC_FACE:
         shader->inputs[i].interp = LP_INTERP_FACING;
         break;
      case TGSI_SEMANTIC_POSITION:
         /* Position was already emitted above
          */
         shader->inputs[i].interp = LP_INTERP_POSITION;
         shader->inputs[i].src_index = 0;
         continue;
      }

      /* XXX this is a completely pointless index map... */
      shader->inputs[i].src_index = i+1;
   }

   if (LP_DEBUG & DEBUG_TGSI) {
      unsigned attrib;
      debug_printf("llvmpipe: Create fragment shader #%u %p:\n",
                   shader->no, (void *) shader);
      tgsi_dump(templ->tokens, 0);
      debug_printf("usage masks:\n");
      for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
         unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
         debug_printf("  IN[%u].%s%s%s%s\n",
                      attrib,
                      usage_mask & TGSI_WRITEMASK_X ? "x" : "",
                      usage_mask & TGSI_WRITEMASK_Y ? "y" : "",
                      usage_mask & TGSI_WRITEMASK_Z ? "z" : "",
                      usage_mask & TGSI_WRITEMASK_W ? "w" : "");
      }
      debug_printf("\n");
   }

   return shader;
}


/**
 * Bind a fragment shader CSO (pipe_context::bind_fs_state).
 * Also forwards the shader's draw-module data to the draw module.
 */
static void
llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);

   if (llvmpipe->fs == fs)
      return;

   llvmpipe->fs = (struct lp_fragment_shader *) fs;

   draw_bind_fragment_shader(llvmpipe->draw,
                             (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));

   llvmpipe->dirty |= LP_NEW_FS;
}


/**
 * Remove shader variant from two lists: the shader's variant list
 * and the context's variant list.
 * Also destroys the variant's gallivm/JIT state and updates the shader's
 * and the context's variant bookkeeping counters.
 */
void
llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
                               struct lp_fragment_shader_variant *variant)
{
   if (gallivm_debug & GALLIVM_DEBUG_IR) {
      debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached"
                   " #%u v total cached #%u\n",
                   variant->shader->no,
                   variant->no,
                   variant->shader->variants_created,
                   variant->shader->variants_cached,
                   lp->nr_fs_variants);
   }

   /* free all LLVM/JIT state associated with this variant */
   gallivm_destroy(variant->gallivm);

   /* remove from shader's list */
   remove_from_list(&variant->list_item_local);
   variant->shader->variants_cached--;

   /* remove from context's list */
   remove_from_list(&variant->list_item_global);
   lp->nr_fs_variants--;
   lp->nr_fs_instrs -= variant->nr_instrs;

   FREE(variant);
}


/**
 * Delete a fragment shader (pipe_context::delete_fs_state).
 * The shader must not be currently bound (see the assert below).
 */
static void
llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   struct lp_fragment_shader *shader = fs;
   struct lp_fs_variant_list_item *li;

   assert(fs != llvmpipe->fs);

   /*
    * XXX: we need to flush the context until we have some sort of reference
    * counting in fragment shaders as they may still be binned.
    * Flushing alone might not be sufficient; we need to wait on it too.
    */
   llvmpipe_finish(pipe, __FUNCTION__);

   /* Delete all the variants */
   li = first_elem(&shader->variants);
   while(!at_end(&shader->variants, li)) {
      /* grab the next element before the current one is freed */
      struct lp_fs_variant_list_item *next = next_elem(li);
      llvmpipe_remove_shader_variant(llvmpipe, li->base);
      li = next;
   }

   /* Delete draw module's data */
   draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);

   assert(shader->variants_cached == 0);
   FREE((void *) shader->base.tokens);
   FREE(shader);
}



/**
 * Set a constant buffer (pipe_context::set_constant_buffer).
 *
 * Vertex/geometry constants are mapped and handed to the draw module right
 * away; fragment constants are deferred via the LP_NEW_FS_CONSTANTS dirty
 * flag.
 */
static void
llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                             uint shader, uint index,
                             const struct pipe_constant_buffer *cb)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   struct pipe_resource *constants = cb ? cb->buffer : NULL;

   assert(shader < PIPE_SHADER_TYPES);
   assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));

   /* note: reference counting */
   util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb);

   if (constants) {
      if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) {
         /* tolerate, but warn about, a missing bind flag */
         debug_printf("Illegal set constant without bind flag\n");
         constants->bind |= PIPE_BIND_CONSTANT_BUFFER;
      }
   }

   if (shader == PIPE_SHADER_VERTEX ||
       shader == PIPE_SHADER_GEOMETRY) {
      /* Pass the constants to the 'draw' module */
      const unsigned size = cb ?
                            cb->buffer_size : 0;
      const ubyte *data;

      if (constants) {
         /* constants live in a resource: use its backing storage directly */
         data = (ubyte *) llvmpipe_resource_data(constants);
      }
      else if (cb && cb->user_buffer) {
         /* constants supplied as a raw user-memory pointer */
         data = (ubyte *) cb->user_buffer;
      }
      else {
         data = NULL;
      }

      if (data)
         data += cb->buffer_offset;

      draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
                                      index, data, size);
   }
   else {
      /* fragment constants are picked up later via the dirty flag */
      llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
   }

   if (cb && cb->user_buffer) {
      /* NOTE(review): releases the local reference to cb->buffer for the
       * user-buffer path — presumably util_copy_constant_buffer took its
       * own reference above; confirm against its implementation.
       */
      pipe_resource_reference(&constants, NULL);
   }
}


/**
 * Return the blend factor equivalent to a destination alpha of one.
 *
 * Used below when the color buffer format has no real alpha channel, so
 * the destination alpha is implicitly 1.0.
 */
static inline unsigned
force_dst_alpha_one(unsigned factor, boolean clamped_zero)
{
   switch(factor) {
   case PIPE_BLENDFACTOR_DST_ALPHA:
      /* dst alpha == 1 */
      return PIPE_BLENDFACTOR_ONE;
   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
      /* 1 - dst alpha == 0 */
      return PIPE_BLENDFACTOR_ZERO;
   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      /* min(src_alpha, 1 - dst_alpha) == min(src_alpha, 0) == 0 when the
       * format clamps values to a minimum of zero (non-float, non-snorm).
       */
      if (clamped_zero)
         return PIPE_BLENDFACTOR_ZERO;
      else
         return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
   }

   /* all other factors are unaffected by dst alpha */
   return factor;
}


/**
 * We need to generate several variants of the fragment pipeline to match
 * all the combinations of the contributing state atoms.
 *
 * TODO: there is actually no reason to tie this to context state -- the
 * generated code could be cached globally in the screen.
 *
 * This function gathers all state that influences the generated fragment
 * pipeline code into a compact, memcmp-able key.
 */
static void
make_variant_key(struct llvmpipe_context *lp,
                 struct lp_fragment_shader *shader,
                 struct lp_fragment_shader_variant_key *key)
{
   unsigned i;

   /* only the first variant_key_size bytes are ever compared; zero them all */
   memset(key, 0, shader->variant_key_size);

   if (lp->framebuffer.zsbuf) {
      enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
      const struct util_format_description *zsbuf_desc =
         util_format_description(zsbuf_format);

      /* include depth/stencil state only when the bound zsbuf can supply it */
      if (lp->depth_stencil->depth.enabled &&
          util_format_has_depth(zsbuf_desc)) {
         key->zsbuf_format = zsbuf_format;
         memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
      }
      if (lp->depth_stencil->stencil[0].enabled &&
          util_format_has_stencil(zsbuf_desc)) {
         key->zsbuf_format = zsbuf_format;
         memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil);
      }
      if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
         key->resource_1d = TRUE;
      }
   }

   /*
    * Propagate the depth clamp setting from the rasterizer state.
    * depth_clip == 0 implies depth clamping is enabled.
    *
    * When clip_halfz is enabled, then always clamp the depth values.
    *
    * XXX: This is incorrect for GL, but correct for d3d10 (depth
    * clamp is always active in d3d10, regardless if depth clip is
    * enabled or not).
    * (GL has an always-on [0,1] clamp on fs depth output instead
    * to ensure the depth values stay in range. Doesn't look like
    * we do that, though...)
    */
   if (lp->rasterizer->clip_halfz) {
      key->depth_clamp = 1;
   } else {
      key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 1 : 0;
   }

   /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
   if (!lp->framebuffer.nr_cbufs ||
       !lp->framebuffer.cbufs[0] ||
       !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
      key->alpha.enabled = lp->depth_stencil->alpha.enabled;
   }
   if(key->alpha.enabled)
      key->alpha.func = lp->depth_stencil->alpha.func;
   /* alpha.ref_value is passed in jit_context */

   key->flatshade = lp->rasterizer->flatshade;
   if (lp->active_occlusion_queries) {
      key->occlusion_count = TRUE;
   }

   if (lp->framebuffer.nr_cbufs) {
      memcpy(&key->blend, lp->blend, sizeof key->blend);
   }

   key->nr_cbufs = lp->framebuffer.nr_cbufs;

   if (!key->blend.independent_blend_enable) {
      /* we always need independent blend otherwise the fixups below won't work */
      for (i = 1; i < key->nr_cbufs; i++) {
         memcpy(&key->blend.rt[i], &key->blend.rt[0], sizeof(key->blend.rt[0]));
      }
      key->blend.independent_blend_enable = 1;
   }

   /* per-render-target fixups based on the actual color buffer formats */
   for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
      struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];

      if (lp->framebuffer.cbufs[i]) {
         enum pipe_format format = lp->framebuffer.cbufs[i]->format;
         const struct util_format_description *format_desc;

         key->cbuf_format[i] = format;

         /*
          * Figure out if this is a 1d resource. Note that OpenGL allows crazy
          * mixing of 2d textures with height 1 and 1d textures, so make sure
          * we pick 1d if any cbuf or zsbuf is 1d.
          */
         if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
            key->resource_1d = TRUE;
         }

         format_desc = util_format_description(format);
         assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
                format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);

         /*
          * Mask out color channels not present in the color buffer.
          */
         blend_rt->colormask &= util_format_colormask(format_desc);

         /*
          * Disable blend for integer formats.
          */
         if (util_format_is_pure_integer(format)) {
            blend_rt->blend_enable = 0;
         }

         /*
          * Our swizzled render tiles always have an alpha channel, but the
          * linear render target format often does not, so force here the dst
          * alpha to be one.
          *
          * This is not a mere optimization. Wrong results will be produced if
          * the dst alpha is used, the dst format does not have alpha, and the
          * previous rendering was not flushed from the swizzled to linear
          * buffer. For example, NonPowTwo DCT.
          *
          * TODO: This should be generalized to all channels for better
          * performance, but only alpha causes correctness issues.
          *
          * Also, force rgb/alpha func/factors match, to make AoS blending
          * easier.
          */
         if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
             format_desc->swizzle[3] == format_desc->swizzle[0]) {
            /* Doesn't cover mixed snorm/unorm but can't render to them anyway */
            boolean clamped_zero = !util_format_is_float(format) &&
                                   !util_format_is_snorm(format);
            blend_rt->rgb_src_factor =
               force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
            blend_rt->rgb_dst_factor =
               force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
            blend_rt->alpha_func = blend_rt->rgb_func;
            blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
            blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
         }
      }
      else {
         /* no color buffer for this fragment output */
         key->cbuf_format[i] = PIPE_FORMAT_NONE;
         blend_rt->colormask = 0x0;
         blend_rt->blend_enable = 0;
      }
   }

   /* This value will be the same for all the variants of a given shader:
    */
   key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;

   for(i = 0; i < key->nr_samplers; ++i) {
      /* skip holes: only capture state for sampler slots actually declared */
      if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
         lp_sampler_static_sampler_state(&key->state[i].sampler_state,
                                         lp->samplers[PIPE_SHADER_FRAGMENT][i]);
      }
   }

   /*
    * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
    * are dx10-style? Can't really have mixed opcodes, at least not
    * if we want to skip the holes here (without rescanning tgsi).
    */
   if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
      key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
      for(i = 0; i < key->nr_sampler_views; ++i) {
         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
            lp_sampler_static_texture_state(&key->state[i].texture_state,
                                            lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   }
   else {
      /* no explicit sampler views: mirror the sampler slots instead */
      key->nr_sampler_views = key->nr_samplers;
      for(i = 0; i < key->nr_sampler_views; ++i) {
         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
            lp_sampler_static_texture_state(&key->state[i].texture_state,
                                            lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   }
}



/**
 * Update fragment shader state. This is called just prior to drawing
 * something when some fragment-related state has changed.
 *
 * Finds (or generates) the shader variant matching the current state key
 * and binds it to the setup/rasterization module.
 */
void
llvmpipe_update_fs(struct llvmpipe_context *lp)
{
   struct lp_fragment_shader *shader = lp->fs;
   struct lp_fragment_shader_variant_key key;
   struct lp_fragment_shader_variant *variant = NULL;
   struct lp_fs_variant_list_item *li;

   /* build the key describing all state that affects the generated code */
   make_variant_key(lp, shader, &key);

   /* Search the variants for one which matches the key */
   li = first_elem(&shader->variants);
   while(!at_end(&shader->variants, li)) {
      if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
         variant = li->base;
         break;
      }
      li = next_elem(li);
   }

   if (variant) {
      /* Move this variant to the head of the list to implement LRU
       * deletion of shaders when we have too many.
       */
      move_to_head(&lp->fs_variants_list, &variant->list_item_global);
   }
   else {
      /* variant not found, create it now */
      int64_t t0, t1, dt;
      unsigned i;
      unsigned variants_to_cull;

      if (0) {
         debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                      lp->nr_fs_variants,
                      lp->nr_fs_instrs,
                      lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
      }

      /* First, check if we've exceeded the max number of shader variants.
       * If so, free 25% of them (the least recently used ones).
       */
      variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0;

      if (variants_to_cull ||
          lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
         struct pipe_context *pipe = &lp->pipe;

         /*
          * XXX: we need to flush the context until we have some sort of
          * reference counting in fragment shaders as they may still be binned.
          * Flushing alone might not be sufficient; we need to wait on it too.
          */
         llvmpipe_finish(pipe, __FUNCTION__);

         /*
          * We need to re-check lp->nr_fs_variants because an arbitrarily large
          * number of shader variants (potentially all of them) could be
          * pending for destruction on flush.
          */

         for (i = 0; i < variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) {
            struct lp_fs_variant_list_item *item;
            if (is_empty_list(&lp->fs_variants_list)) {
               break;
            }
            /* cull from the tail: the least recently used variant */
            item = last_elem(&lp->fs_variants_list);
            assert(item);
            assert(item->base);
            llvmpipe_remove_shader_variant(lp, item->base);
         }
      }

      /*
       * Generate the new variant.
       */
      t0 = os_time_get();
      variant = generate_variant(lp, shader, &key);
      t1 = os_time_get();
      dt = t1 - t0;
      LP_COUNT_ADD(llvm_compile_time, dt);
      LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */

      /* Put the new variant into the list */
      if (variant) {
         insert_at_head(&shader->variants, &variant->list_item_local);
         insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
         lp->nr_fs_variants++;
         lp->nr_fs_instrs += variant->nr_instrs;
         shader->variants_cached++;
      }
   }

   /* Bind this variant */
   lp_setup_set_fs_variant(lp->setup, variant);
}




/**
 * Plug llvmpipe's fragment-shader entry points into the pipe_context vtable.
 */
void
llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
{
   llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
   llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state;
   llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;

   llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
}

/*
 * Rasterization is disabled if there is no pixel shader and
 * both depth and stencil testing are disabled:
 * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
 */
boolean
llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
{
   /* NOTE(review): num_tokens <= 1 presumably means the shader contains only
    * the TGSI header and no instructions — confirm against tgsi_scan.
    */
   boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;

   return (null_fs &&
           !lp->depth_stencil->depth.enabled &&
           !lp->depth_stencil->stencil[0].enabled);
}