1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * Copyright 2007-2008 VMware, Inc. 5 * All Rights Reserved. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the 9 * "Software"), to deal in the Software without restriction, including 10 * without limitation the rights to use, copy, modify, merge, publish, 11 * distribute, sub license, and/or sell copies of the Software, and to 12 * permit persons to whom the Software is furnished to do so, subject to 13 * the following conditions: 14 * 15 * The above copyright notice and this permission notice (including the 16 * next paragraph) shall be included in all copies or substantial portions 17 * of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 **************************************************************************/ 28 29 /** 30 * @file 31 * Position and shader input interpolation. 32 * 33 * @author Jose Fonseca <jfonseca (at) vmware.com> 34 */ 35 36 #include "pipe/p_shader_tokens.h" 37 #include "util/u_debug.h" 38 #include "util/u_memory.h" 39 #include "util/u_math.h" 40 #include "tgsi/tgsi_scan.h" 41 #include "gallivm/lp_bld_debug.h" 42 #include "gallivm/lp_bld_const.h" 43 #include "gallivm/lp_bld_arit.h" 44 #include "gallivm/lp_bld_swizzle.h" 45 #include "gallivm/lp_bld_flow.h" 46 #include "lp_bld_interp.h" 47 48 49 /* 50 * The shader JIT function operates on blocks of quads. 51 * Each block has 2x2 quads and each quad has 2x2 pixels. 52 * 53 * We iterate over the quads in order 0, 1, 2, 3: 54 * 55 * ################# 56 * # | # | # 57 * #---0---#---1---# 58 * # | # | # 59 * ################# 60 * # | # | # 61 * #---2---#---3---# 62 * # | # | # 63 * ################# 64 * 65 * If we iterate over multiple quads at once, quads 01 and 23 are processed 66 * together. 67 * 68 * Within each quad, we have four pixels which are represented in SOA 69 * order: 70 * 71 * ######### 72 * # 0 | 1 # 73 * #---+---# 74 * # 2 | 3 # 75 * ######### 76 * 77 * So the green channel (for example) of the four pixels is stored in 78 * a single vector register: {g0, g1, g2, g3}. 79 * The order stays the same even with multiple quads: 80 * 0 1 4 5 81 * 2 3 6 7 82 * is stored as g0..g7 83 */ 84 85 86 /** 87 * Do one perspective divide per quad. 88 * 89 * For perspective interpolation, the final attribute value is given 90 * 91 * a' = a/w = a * oow 92 * 93 * where 94 * 95 * a = a0 + dadx*x + dady*y 96 * w = w0 + dwdx*x + dwdy*y 97 * oow = 1/w = 1/(w0 + dwdx*x + dwdy*y) 98 * 99 * Instead of computing the division per pixel, with this macro we compute the 100 * division on the upper left pixel of each quad, and use a linear 101 * approximation in the remaining pixels, given by: 102 * 103 * da'dx = (dadx - dwdx*a)*oow 104 * da'dy = (dady - dwdy*a)*oow 105 * 106 * Ironically, this actually makes things slower -- probably because the 107 * divide hardware unit is rarely used, whereas the multiply unit is typically 108 * already saturated. 109 */ 110 #define PERSPECTIVE_DIVIDE_PER_QUAD 0 111 112 113 static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3}; 114 static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3}; 115 116 117 static void 118 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix) 119 { 120 if(attrib == 0) 121 lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix); 122 else 123 lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix); 124 } 125 126 static void 127 calc_offsets(struct lp_build_context *coeff_bld, 128 unsigned quad_start_index, 129 LLVMValueRef *pixoffx, 130 LLVMValueRef *pixoffy) 131 { 132 unsigned i; 133 unsigned num_pix = coeff_bld->type.length; 134 struct gallivm_state *gallivm = coeff_bld->gallivm; 135 LLVMBuilderRef builder = coeff_bld->gallivm->builder; 136 LLVMValueRef nr, pixxf, pixyf; 137 138 *pixoffx = coeff_bld->undef; 139 *pixoffy = coeff_bld->undef; 140 141 for (i = 0; i < num_pix; i++) { 142 nr = lp_build_const_int32(gallivm, i); 143 pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] + 144 (quad_start_index & 1) * 2); 145 pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] + 146 (quad_start_index & 2)); 147 *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, ""); 148 *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, ""); 149 } 150 } 151 152 153 /* Much easier, and significantly less instructions in the per-stamp 154 * part (less than half) but overall more instructions so a loss if 155 * most quads are active. Might be a win though with larger vectors. 156 * No ability to do per-quad divide (doable but not implemented) 157 * Could be made to work with passed in pixel offsets (i.e. active quad merging). 158 */ 159 static void 160 coeffs_init_simple(struct lp_build_interp_soa_context *bld, 161 LLVMValueRef a0_ptr, 162 LLVMValueRef dadx_ptr, 163 LLVMValueRef dady_ptr) 164 { 165 struct lp_build_context *coeff_bld = &bld->coeff_bld; 166 struct lp_build_context *setup_bld = &bld->setup_bld; 167 struct gallivm_state *gallivm = coeff_bld->gallivm; 168 LLVMBuilderRef builder = gallivm->builder; 169 unsigned attrib; 170 171 for (attrib = 0; attrib < bld->num_attribs; ++attrib) { 172 /* 173 * always fetch all 4 values for performance/simplicity 174 * Note: we do that here because it seems to generate better 175 * code. It generates a lot of moves initially but less 176 * moves later. As far as I can tell this looks like a 177 * llvm issue, instead of simply reloading the values from 178 * the passed in pointers it if it runs out of registers 179 * it spills/reloads them. Maybe some optimization passes 180 * would help. 181 * Might want to investigate this again later. 182 */ 183 const unsigned interp = bld->interp[attrib]; 184 LLVMValueRef index = lp_build_const_int32(gallivm, 185 attrib * TGSI_NUM_CHANNELS); 186 LLVMValueRef ptr; 187 LLVMValueRef dadxaos = setup_bld->zero; 188 LLVMValueRef dadyaos = setup_bld->zero; 189 LLVMValueRef a0aos = setup_bld->zero; 190 191 switch (interp) { 192 case LP_INTERP_PERSPECTIVE: 193 /* fall-through */ 194 195 case LP_INTERP_LINEAR: 196 ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""); 197 ptr = LLVMBuildBitCast(builder, ptr, 198 LLVMPointerType(setup_bld->vec_type, 0), ""); 199 dadxaos = LLVMBuildLoad(builder, ptr, ""); 200 201 ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, ""); 202 ptr = LLVMBuildBitCast(builder, ptr, 203 LLVMPointerType(setup_bld->vec_type, 0), ""); 204 dadyaos = LLVMBuildLoad(builder, ptr, ""); 205 206 attrib_name(dadxaos, attrib, 0, ".dadxaos"); 207 attrib_name(dadyaos, attrib, 0, ".dadyaos"); 208 /* fall-through */ 209 210 case LP_INTERP_CONSTANT: 211 case LP_INTERP_FACING: 212 ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, ""); 213 ptr = LLVMBuildBitCast(builder, ptr, 214 LLVMPointerType(setup_bld->vec_type, 0), ""); 215 a0aos = LLVMBuildLoad(builder, ptr, ""); 216 attrib_name(a0aos, attrib, 0, ".a0aos"); 217 break; 218 219 case LP_INTERP_POSITION: 220 /* Nothing to do as the position coeffs are already setup in slot 0 */ 221 continue; 222 223 default: 224 assert(0); 225 break; 226 } 227 bld->a0aos[attrib] = a0aos; 228 bld->dadxaos[attrib] = dadxaos; 229 bld->dadyaos[attrib] = dadyaos; 230 } 231 } 232 233 /** 234 * Interpolate the shader input attribute values. 235 * This is called for each (group of) quad(s). 236 */ 237 static void 238 attribs_update_simple(struct lp_build_interp_soa_context *bld, 239 struct gallivm_state *gallivm, 240 LLVMValueRef loop_iter, 241 int start, 242 int end) 243 { 244 LLVMBuilderRef builder = gallivm->builder; 245 struct lp_build_context *coeff_bld = &bld->coeff_bld; 246 struct lp_build_context *setup_bld = &bld->setup_bld; 247 LLVMValueRef oow = NULL; 248 unsigned attrib; 249 LLVMValueRef pixoffx; 250 LLVMValueRef pixoffy; 251 LLVMValueRef ptr; 252 253 /* could do this with code-generated passed in pixel offsets too */ 254 255 assert(loop_iter); 256 ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, ""); 257 pixoffx = LLVMBuildLoad(builder, ptr, ""); 258 ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, ""); 259 pixoffy = LLVMBuildLoad(builder, ptr, ""); 260 261 pixoffx = LLVMBuildFAdd(builder, pixoffx, 262 lp_build_broadcast_scalar(coeff_bld, bld->x), ""); 263 pixoffy = LLVMBuildFAdd(builder, pixoffy, 264 lp_build_broadcast_scalar(coeff_bld, bld->y), ""); 265 266 for (attrib = start; attrib < end; attrib++) { 267 const unsigned mask = bld->mask[attrib]; 268 const unsigned interp = bld->interp[attrib]; 269 unsigned chan; 270 271 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { 272 if (mask & (1 << chan)) { 273 LLVMValueRef index; 274 LLVMValueRef dadx = coeff_bld->zero; 275 LLVMValueRef dady = coeff_bld->zero; 276 LLVMValueRef a = coeff_bld->zero; 277 278 index = lp_build_const_int32(gallivm, chan); 279 switch (interp) { 280 case LP_INTERP_PERSPECTIVE: 281 /* fall-through */ 282 283 case LP_INTERP_LINEAR: 284 if (attrib == 0 && chan == 0) { 285 dadx = coeff_bld->one; 286 if (bld->pos_offset) { 287 a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset); 288 } 289 } 290 else if (attrib == 0 && chan == 1) { 291 dady = coeff_bld->one; 292 if (bld->pos_offset) { 293 a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset); 294 } 295 } 296 else { 297 dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, 298 coeff_bld->type, bld->dadxaos[attrib], 299 index); 300 dady = lp_build_extract_broadcast(gallivm, setup_bld->type, 301 coeff_bld->type, bld->dadyaos[attrib], 302 index); 303 a = lp_build_extract_broadcast(gallivm, setup_bld->type, 304 coeff_bld->type, bld->a0aos[attrib], 305 index); 306 } 307 /* 308 * a = a0 + (x * dadx + y * dady) 309 */ 310 a = lp_build_fmuladd(builder, dadx, pixoffx, a); 311 a = lp_build_fmuladd(builder, dady, pixoffy, a); 312 313 if (interp == LP_INTERP_PERSPECTIVE) { 314 if (oow == NULL) { 315 LLVMValueRef w = bld->attribs[0][3]; 316 assert(attrib != 0); 317 assert(bld->mask[0] & TGSI_WRITEMASK_W); 318 oow = lp_build_rcp(coeff_bld, w); 319 } 320 a = lp_build_mul(coeff_bld, a, oow); 321 } 322 break; 323 324 case LP_INTERP_CONSTANT: 325 case LP_INTERP_FACING: 326 a = lp_build_extract_broadcast(gallivm, setup_bld->type, 327 coeff_bld->type, bld->a0aos[attrib], 328 index); 329 break; 330 331 case LP_INTERP_POSITION: 332 assert(attrib > 0); 333 a = bld->attribs[0][chan]; 334 break; 335 336 default: 337 assert(0); 338 break; 339 } 340 341 if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){ 342 /* FIXME: Depth values can exceed 1.0, due to the fact that 343 * setup interpolation coefficients refer to (0,0) which causes 344 * precision loss. So we must clamp to 1.0 here to avoid artifacts. 345 * Note though values outside [0,1] are perfectly valid with 346 * depth clip disabled. 347 * XXX: If depth clip is disabled but we force depth clamp 348 * we may get values larger than 1.0 in the fs (but not in 349 * depth test). Not sure if that's an issue... 350 * Also, on a similar note, it is not obvious if the depth values 351 * appearing in fs (with depth clip disabled) should be clamped 352 * to [0,1], clamped to near/far or not be clamped at all... 353 */ 354 a = lp_build_min(coeff_bld, a, coeff_bld->one); 355 } 356 bld->attribs[attrib][chan] = a; 357 } 358 } 359 } 360 } 361 362 /** 363 * Initialize the bld->a, dadq fields. This involves fetching 364 * those values from the arrays which are passed into the JIT function. 365 */ 366 static void 367 coeffs_init(struct lp_build_interp_soa_context *bld, 368 LLVMValueRef a0_ptr, 369 LLVMValueRef dadx_ptr, 370 LLVMValueRef dady_ptr) 371 { 372 struct lp_build_context *coeff_bld = &bld->coeff_bld; 373 struct lp_build_context *setup_bld = &bld->setup_bld; 374 struct gallivm_state *gallivm = coeff_bld->gallivm; 375 LLVMBuilderRef builder = gallivm->builder; 376 LLVMValueRef pixoffx, pixoffy; 377 unsigned attrib; 378 unsigned chan; 379 unsigned i; 380 381 pixoffx = coeff_bld->undef; 382 pixoffy = coeff_bld->undef; 383 for (i = 0; i < coeff_bld->type.length; i++) { 384 LLVMValueRef nr = lp_build_const_int32(gallivm, i); 385 LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]); 386 LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]); 387 pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, ""); 388 pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, ""); 389 } 390 391 392 for (attrib = 0; attrib < bld->num_attribs; ++attrib) { 393 const unsigned mask = bld->mask[attrib]; 394 const unsigned interp = bld->interp[attrib]; 395 LLVMValueRef index = lp_build_const_int32(gallivm, 396 attrib * TGSI_NUM_CHANNELS); 397 LLVMValueRef ptr; 398 LLVMValueRef dadxaos = setup_bld->zero; 399 LLVMValueRef dadyaos = setup_bld->zero; 400 LLVMValueRef a0aos = setup_bld->zero; 401 402 /* always fetch all 4 values for performance/simplicity */ 403 switch (interp) { 404 case LP_INTERP_PERSPECTIVE: 405 /* fall-through */ 406 407 case LP_INTERP_LINEAR: 408 ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""); 409 ptr = LLVMBuildBitCast(builder, ptr, 410 LLVMPointerType(setup_bld->vec_type, 0), ""); 411 dadxaos = LLVMBuildLoad(builder, ptr, ""); 412 413 ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, ""); 414 ptr = LLVMBuildBitCast(builder, ptr, 415 LLVMPointerType(setup_bld->vec_type, 0), ""); 416 dadyaos = LLVMBuildLoad(builder, ptr, ""); 417 418 attrib_name(dadxaos, attrib, 0, ".dadxaos"); 419 attrib_name(dadyaos, attrib, 0, ".dadyaos"); 420 /* fall-through */ 421 422 case LP_INTERP_CONSTANT: 423 case LP_INTERP_FACING: 424 ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, ""); 425 ptr = LLVMBuildBitCast(builder, ptr, 426 LLVMPointerType(setup_bld->vec_type, 0), ""); 427 a0aos = LLVMBuildLoad(builder, ptr, ""); 428 attrib_name(a0aos, attrib, 0, ".a0aos"); 429 break; 430 431 case LP_INTERP_POSITION: 432 /* Nothing to do as the position coeffs are already setup in slot 0 */ 433 continue; 434 435 default: 436 assert(0); 437 break; 438 } 439 440 /* 441 * a = a0 + (x * dadx + y * dady) 442 * a0aos is the attrib value at top left corner of stamp 443 */ 444 if (interp != LP_INTERP_CONSTANT && 445 interp != LP_INTERP_FACING) { 446 LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x); 447 LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y); 448 a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos); 449 a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos); 450 } 451 452 /* 453 * dadq = {0, dadx, dady, dadx + dady} 454 * for two quads (side by side) this is: 455 * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady} 456 */ 457 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { 458 /* this generates a CRAPLOAD of shuffles... */ 459 if (mask & (1 << chan)) { 460 LLVMValueRef dadx, dady; 461 LLVMValueRef dadq, dadq2; 462 LLVMValueRef a; 463 LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan); 464 465 if (attrib == 0 && chan == 0) { 466 a = bld->x; 467 if (bld->pos_offset) { 468 a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), ""); 469 } 470 a = lp_build_broadcast_scalar(coeff_bld, a); 471 dadx = coeff_bld->one; 472 dady = coeff_bld->zero; 473 } 474 else if (attrib == 0 && chan == 1) { 475 a = bld->y; 476 if (bld->pos_offset) { 477 a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), ""); 478 } 479 a = lp_build_broadcast_scalar(coeff_bld, a); 480 dady = coeff_bld->one; 481 dadx = coeff_bld->zero; 482 } 483 else { 484 dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, 485 coeff_bld->type, dadxaos, chan_index); 486 dady = lp_build_extract_broadcast(gallivm, setup_bld->type, 487 coeff_bld->type, dadyaos, chan_index); 488 489 /* 490 * a = {a, a, a, a} 491 */ 492 a = lp_build_extract_broadcast(gallivm, setup_bld->type, 493 coeff_bld->type, a0aos, chan_index); 494 } 495 496 dadx = LLVMBuildFMul(builder, dadx, pixoffx, ""); 497 dady = LLVMBuildFMul(builder, dady, pixoffy, ""); 498 dadq = LLVMBuildFAdd(builder, dadx, dady, ""); 499 500 /* 501 * Compute the attrib values on the upper-left corner of each 502 * group of quads. 503 * Note that if we process 2 quads at once this doesn't 504 * really exactly to what we want. 505 * We need to access elem 0 and 2 respectively later if we process 506 * 2 quads at once. 507 */ 508 509 if (interp != LP_INTERP_CONSTANT && 510 interp != LP_INTERP_FACING) { 511 dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); 512 a = LLVMBuildFAdd(builder, a, dadq2, ""); 513 } 514 515 #if PERSPECTIVE_DIVIDE_PER_QUAD 516 /* 517 * a *= 1 / w 518 */ 519 520 /* 521 * XXX since we're only going to access elements 0,2 out of 8 522 * if we have 8-wide vectors we should do the division only 4-wide. 523 * a is really a 2-elements in a 4-wide vector disguised as 8-wide 524 * in this case. 525 */ 526 if (interp == LP_INTERP_PERSPECTIVE) { 527 LLVMValueRef w = bld->a[0][3]; 528 assert(attrib != 0); 529 assert(bld->mask[0] & TGSI_WRITEMASK_W); 530 if (!bld->oow) { 531 bld->oow = lp_build_rcp(coeff_bld, w); 532 lp_build_name(bld->oow, "oow"); 533 } 534 a = lp_build_mul(coeff_bld, a, bld->oow); 535 } 536 #endif 537 538 attrib_name(a, attrib, chan, ".a"); 539 attrib_name(dadq, attrib, chan, ".dadq"); 540 541 bld->a[attrib][chan] = lp_build_alloca(gallivm, 542 LLVMTypeOf(a), ""); 543 LLVMBuildStore(builder, a, bld->a[attrib][chan]); 544 bld->dadq[attrib][chan] = dadq; 545 } 546 } 547 } 548 } 549 550 551 /** 552 * Increment the shader input attribute values. 553 * This is called when we move from one quad to the next. 554 */ 555 static void 556 attribs_update(struct lp_build_interp_soa_context *bld, 557 struct gallivm_state *gallivm, 558 LLVMValueRef loop_iter, 559 int start, 560 int end) 561 { 562 LLVMBuilderRef builder = gallivm->builder; 563 struct lp_build_context *coeff_bld = &bld->coeff_bld; 564 LLVMValueRef oow = NULL; 565 unsigned attrib; 566 unsigned chan; 567 568 for(attrib = start; attrib < end; ++attrib) { 569 const unsigned mask = bld->mask[attrib]; 570 const unsigned interp = bld->interp[attrib]; 571 for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { 572 if(mask & (1 << chan)) { 573 LLVMValueRef a; 574 if (interp == LP_INTERP_CONSTANT || 575 interp == LP_INTERP_FACING) { 576 a = LLVMBuildLoad(builder, bld->a[attrib][chan], ""); 577 } 578 else if (interp == LP_INTERP_POSITION) { 579 assert(attrib > 0); 580 a = bld->attribs[0][chan]; 581 } 582 else { 583 LLVMValueRef dadq; 584 585 a = bld->a[attrib][chan]; 586 587 /* 588 * Broadcast the attribute value for this quad into all elements 589 */ 590 591 { 592 /* stored as vector load as float */ 593 LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext( 594 gallivm->context), 0); 595 LLVMValueRef ptr; 596 a = LLVMBuildBitCast(builder, a, ptr_type, ""); 597 ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, ""); 598 a = LLVMBuildLoad(builder, ptr, ""); 599 a = lp_build_broadcast_scalar(&bld->coeff_bld, a); 600 } 601 602 /* 603 * Get the derivatives. 604 */ 605 606 dadq = bld->dadq[attrib][chan]; 607 608 #if PERSPECTIVE_DIVIDE_PER_QUAD 609 if (interp == LP_INTERP_PERSPECTIVE) { 610 LLVMValueRef dwdq = bld->dadq[0][3]; 611 612 if (oow == NULL) { 613 assert(bld->oow); 614 oow = LLVMBuildShuffleVector(coeff_bld->builder, 615 bld->oow, coeff_bld->undef, 616 shuffle, ""); 617 } 618 619 dadq = lp_build_sub(coeff_bld, 620 dadq, 621 lp_build_mul(coeff_bld, a, dwdq)); 622 dadq = lp_build_mul(coeff_bld, dadq, oow); 623 } 624 #endif 625 626 /* 627 * Add the derivatives 628 */ 629 630 a = lp_build_add(coeff_bld, a, dadq); 631 632 #if !PERSPECTIVE_DIVIDE_PER_QUAD 633 if (interp == LP_INTERP_PERSPECTIVE) { 634 if (oow == NULL) { 635 LLVMValueRef w = bld->attribs[0][3]; 636 assert(attrib != 0); 637 assert(bld->mask[0] & TGSI_WRITEMASK_W); 638 oow = lp_build_rcp(coeff_bld, w); 639 } 640 a = lp_build_mul(coeff_bld, a, oow); 641 } 642 #endif 643 644 if (attrib == 0 && chan == 2 && !bld->depth_clamp) { 645 /* FIXME: Depth values can exceed 1.0, due to the fact that 646 * setup interpolation coefficients refer to (0,0) which causes 647 * precision loss. So we must clamp to 1.0 here to avoid artifacts. 648 * Note though values outside [0,1] are perfectly valid with 649 * depth clip disabled.. 650 * XXX: If depth clip is disabled but we force depth clamp 651 * we may get values larger than 1.0 in the fs (but not in 652 * depth test). Not sure if that's an issue... 653 * Also, on a similar note, it is not obvious if the depth values 654 * appearing in fs (with depth clip disabled) should be clamped 655 * to [0,1], clamped to near/far or not be clamped at all... 656 */ 657 a = lp_build_min(coeff_bld, a, coeff_bld->one); 658 } 659 660 attrib_name(a, attrib, chan, ""); 661 } 662 bld->attribs[attrib][chan] = a; 663 } 664 } 665 } 666 } 667 668 669 /** 670 * Generate the position vectors. 671 * 672 * Parameter x0, y0 are the integer values with upper left coordinates. 673 */ 674 static void 675 pos_init(struct lp_build_interp_soa_context *bld, 676 LLVMValueRef x0, 677 LLVMValueRef y0) 678 { 679 LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder; 680 struct lp_build_context *coeff_bld = &bld->coeff_bld; 681 682 bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, ""); 683 bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, ""); 684 } 685 686 687 /** 688 * Initialize fragment shader input attribute info. 689 */ 690 void 691 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, 692 struct gallivm_state *gallivm, 693 unsigned num_inputs, 694 const struct lp_shader_input *inputs, 695 boolean pixel_center_integer, 696 boolean depth_clamp, 697 LLVMBuilderRef builder, 698 struct lp_type type, 699 LLVMValueRef a0_ptr, 700 LLVMValueRef dadx_ptr, 701 LLVMValueRef dady_ptr, 702 LLVMValueRef x0, 703 LLVMValueRef y0) 704 { 705 struct lp_type coeff_type; 706 struct lp_type setup_type; 707 unsigned attrib; 708 unsigned chan; 709 710 memset(bld, 0, sizeof *bld); 711 712 memset(&coeff_type, 0, sizeof coeff_type); 713 coeff_type.floating = TRUE; 714 coeff_type.sign = TRUE; 715 coeff_type.width = 32; 716 coeff_type.length = type.length; 717 718 memset(&setup_type, 0, sizeof setup_type); 719 setup_type.floating = TRUE; 720 setup_type.sign = TRUE; 721 setup_type.width = 32; 722 setup_type.length = TGSI_NUM_CHANNELS; 723 724 725 /* XXX: we don't support interpolating into any other types */ 726 assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0); 727 728 lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type); 729 lp_build_context_init(&bld->setup_bld, gallivm, setup_type); 730 731 /* For convenience */ 732 bld->pos = bld->attribs[0]; 733 bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1]; 734 735 /* Position */ 736 bld->mask[0] = TGSI_WRITEMASK_XYZW; 737 bld->interp[0] = LP_INTERP_LINEAR; 738 739 /* Inputs */ 740 for (attrib = 0; attrib < num_inputs; ++attrib) { 741 bld->mask[1 + attrib] = inputs[attrib].usage_mask; 742 bld->interp[1 + attrib] = inputs[attrib].interp; 743 } 744 bld->num_attribs = 1 + num_inputs; 745 746 /* Ensure all masked out input channels have a valid value */ 747 for (attrib = 0; attrib < bld->num_attribs; ++attrib) { 748 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { 749 bld->attribs[attrib][chan] = bld->coeff_bld.undef; 750 } 751 } 752 753 if (pixel_center_integer) { 754 bld->pos_offset = 0.0; 755 } else { 756 bld->pos_offset = 0.5; 757 } 758 bld->depth_clamp = depth_clamp; 759 760 pos_init(bld, x0, y0); 761 762 /* 763 * Simple method (single step interpolation) may be slower if vector length 764 * is just 4, but the results are different (generally less accurate) with 765 * the other method, so always use more accurate version. 766 */ 767 if (1) { 768 bld->simple_interp = TRUE; 769 { 770 /* XXX this should use a global static table */ 771 unsigned i; 772 unsigned num_loops = 16 / type.length; 773 LLVMValueRef pixoffx, pixoffy, index; 774 LLVMValueRef ptr; 775 776 bld->xoffset_store = lp_build_array_alloca(gallivm, 777 lp_build_vec_type(gallivm, type), 778 lp_build_const_int32(gallivm, num_loops), 779 ""); 780 bld->yoffset_store = lp_build_array_alloca(gallivm, 781 lp_build_vec_type(gallivm, type), 782 lp_build_const_int32(gallivm, num_loops), 783 ""); 784 for (i = 0; i < num_loops; i++) { 785 index = lp_build_const_int32(gallivm, i); 786 calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy); 787 ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, ""); 788 LLVMBuildStore(builder, pixoffx, ptr); 789 ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, ""); 790 LLVMBuildStore(builder, pixoffy, ptr); 791 } 792 } 793 coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr); 794 } 795 else { 796 bld->simple_interp = FALSE; 797 coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr); 798 } 799 800 } 801 802 803 /* 804 * Advance the position and inputs to the given quad within the block. 805 */ 806 807 void 808 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld, 809 struct gallivm_state *gallivm, 810 LLVMValueRef quad_start_index) 811 { 812 if (bld->simple_interp) { 813 attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs); 814 } 815 else { 816 attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs); 817 } 818 } 819 820 void 821 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld, 822 struct gallivm_state *gallivm, 823 LLVMValueRef quad_start_index) 824 { 825 if (bld->simple_interp) { 826 attribs_update_simple(bld, gallivm, quad_start_index, 0, 1); 827 } 828 else { 829 attribs_update(bld, gallivm, quad_start_index, 0, 1); 830 } 831 } 832 833