1 /* 2 Copyright (C) Intel Corp. 2006. All Rights Reserved. 3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to 4 develop this 3D driver. 5 6 Permission is hereby granted, free of charge, to any person obtaining 7 a copy of this software and associated documentation files (the 8 "Software"), to deal in the Software without restriction, including 9 without limitation the rights to use, copy, modify, merge, publish, 10 distribute, sublicense, and/or sell copies of the Software, and to 11 permit persons to whom the Software is furnished to do so, subject to 12 the following conditions: 13 14 The above copyright notice and this permission notice (including the 15 next paragraph) shall be included in all copies or substantial 16 portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE 22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
25 26 **********************************************************************/ 27 /* 28 * Authors: 29 * Keith Whitwell <keith (at) tungstengraphics.com> 30 */ 31 32 33 #include "main/macros.h" 34 #include "brw_context.h" 35 #include "brw_wm.h" 36 37 static bool 38 can_do_pln(struct intel_context *intel, const struct brw_reg *deltas) 39 { 40 struct brw_context *brw = brw_context(&intel->ctx); 41 42 if (!brw->has_pln) 43 return false; 44 45 if (deltas[1].nr != deltas[0].nr + 1) 46 return false; 47 48 if (intel->gen < 6 && ((deltas[0].nr & 1) != 0)) 49 return false; 50 51 return true; 52 } 53 54 /* Return the SrcReg index of the channels that can be immediate float operands 55 * instead of usage of PROGRAM_CONSTANT values through push/pull. 56 */ 57 bool 58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg) 59 { 60 int opcode_array[] = { 61 [OPCODE_ADD] = 2, 62 [OPCODE_CMP] = 3, 63 [OPCODE_DP3] = 2, 64 [OPCODE_DP4] = 2, 65 [OPCODE_DPH] = 2, 66 [OPCODE_MAX] = 2, 67 [OPCODE_MIN] = 2, 68 [OPCODE_MOV] = 1, 69 [OPCODE_MUL] = 2, 70 [OPCODE_SEQ] = 2, 71 [OPCODE_SGE] = 2, 72 [OPCODE_SGT] = 2, 73 [OPCODE_SLE] = 2, 74 [OPCODE_SLT] = 2, 75 [OPCODE_SNE] = 2, 76 [OPCODE_SWZ] = 1, 77 [OPCODE_XPD] = 2, 78 }; 79 80 /* These opcodes get broken down in a way that allow two 81 * args to be immediates. 82 */ 83 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) { 84 if (arg == 1 || arg == 2) 85 return true; 86 } 87 88 if (opcode > ARRAY_SIZE(opcode_array)) 89 return false; 90 91 return arg == opcode_array[opcode] - 1; 92 } 93 94 /** 95 * Computes the screen-space x,y position of the pixels. 96 * 97 * This will be used by emit_delta_xy() or emit_wpos_xy() for 98 * interpolation of attributes.. 99 * 100 * Payload R0: 101 * 102 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles, 103 * corresponding to each of the 16 execution channels. 104 * R0.1..8 -- ? 
105 * R1.0 -- triangle vertex 0.X 106 * R1.1 -- triangle vertex 0.Y 107 * R1.2 -- tile 0 x,y coords (2 packed uwords) 108 * R1.3 -- tile 1 x,y coords (2 packed uwords) 109 * R1.4 -- tile 2 x,y coords (2 packed uwords) 110 * R1.5 -- tile 3 x,y coords (2 packed uwords) 111 * R1.6 -- ? 112 * R1.7 -- ? 113 * R1.8 -- ? 114 */ 115 void emit_pixel_xy(struct brw_wm_compile *c, 116 const struct brw_reg *dst, 117 GLuint mask) 118 { 119 struct brw_compile *p = &c->func; 120 struct brw_reg r1 = brw_vec1_grf(1, 0); 121 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW); 122 struct brw_reg dst0_uw, dst1_uw; 123 124 brw_push_insn_state(p); 125 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 126 127 if (c->dispatch_width == 16) { 128 dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)); 129 dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)); 130 } else { 131 dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW)); 132 dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW)); 133 } 134 135 /* Calculate pixel centers by adding 1 or 0 to each of the 136 * micro-tile coordinates passed in r1. 137 */ 138 if (mask & WRITEMASK_X) { 139 brw_ADD(p, 140 dst0_uw, 141 stride(suboffset(r1_uw, 4), 2, 4, 0), 142 brw_imm_v(0x10101010)); 143 } 144 145 if (mask & WRITEMASK_Y) { 146 brw_ADD(p, 147 dst1_uw, 148 stride(suboffset(r1_uw,5), 2, 4, 0), 149 brw_imm_v(0x11001100)); 150 } 151 brw_pop_insn_state(p); 152 } 153 154 /** 155 * Computes the screen-space x,y distance of the pixels from the start 156 * vertex. 157 * 158 * This will be used in linterp or pinterp with the start vertex value 159 * and the Cx, Cy, and C0 coefficients passed in from the setup engine 160 * to produce interpolated attribute values. 
161 */ 162 void emit_delta_xy(struct brw_compile *p, 163 const struct brw_reg *dst, 164 GLuint mask, 165 const struct brw_reg *arg0) 166 { 167 struct intel_context *intel = &p->brw->intel; 168 struct brw_reg r1 = brw_vec1_grf(1, 0); 169 170 if (mask == 0) 171 return; 172 173 assert(mask == WRITEMASK_XY); 174 175 if (intel->gen >= 6) { 176 /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1. 177 Just add them with 0.0 for dst reg.. */ 178 r1 = brw_imm_v(0x00000000); 179 brw_ADD(p, 180 dst[0], 181 retype(arg0[0], BRW_REGISTER_TYPE_UW), 182 r1); 183 brw_ADD(p, 184 dst[1], 185 retype(arg0[1], BRW_REGISTER_TYPE_UW), 186 r1); 187 return; 188 } 189 190 /* Calc delta X,Y by subtracting origin in r1 from the pixel 191 * centers produced by emit_pixel_xy(). 192 */ 193 brw_ADD(p, 194 dst[0], 195 retype(arg0[0], BRW_REGISTER_TYPE_UW), 196 negate(r1)); 197 brw_ADD(p, 198 dst[1], 199 retype(arg0[1], BRW_REGISTER_TYPE_UW), 200 negate(suboffset(r1,1))); 201 } 202 203 /** 204 * Computes the pixel offset from the window origin for gl_FragCoord(). 
205 */ 206 void emit_wpos_xy(struct brw_wm_compile *c, 207 const struct brw_reg *dst, 208 GLuint mask, 209 const struct brw_reg *arg0) 210 { 211 struct brw_compile *p = &c->func; 212 struct intel_context *intel = &p->brw->intel; 213 struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W); 214 struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W); 215 216 if (mask & WRITEMASK_X) { 217 if (intel->gen >= 6) { 218 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F); 219 brw_MOV(p, delta_x_f, delta_x); 220 delta_x = delta_x_f; 221 } 222 223 if (c->fp->program.PixelCenterInteger) { 224 /* X' = X */ 225 brw_MOV(p, dst[0], delta_x); 226 } else { 227 /* X' = X + 0.5 */ 228 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5)); 229 } 230 } 231 232 if (mask & WRITEMASK_Y) { 233 if (intel->gen >= 6) { 234 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F); 235 brw_MOV(p, delta_y_f, delta_y); 236 delta_y = delta_y_f; 237 } 238 239 if (c->fp->program.OriginUpperLeft) { 240 if (c->fp->program.PixelCenterInteger) { 241 /* Y' = Y */ 242 brw_MOV(p, dst[1], delta_y); 243 } else { 244 brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5)); 245 } 246 } else { 247 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5; 248 249 /* Y' = (height - 1) - Y + center */ 250 brw_ADD(p, dst[1], negate(delta_y), 251 brw_imm_f(c->key.drawable_height - 1 + center_offset)); 252 } 253 } 254 } 255 256 257 void emit_pixel_w(struct brw_wm_compile *c, 258 const struct brw_reg *dst, 259 GLuint mask, 260 const struct brw_reg *arg0, 261 const struct brw_reg *deltas) 262 { 263 struct brw_compile *p = &c->func; 264 struct intel_context *intel = &p->brw->intel; 265 struct brw_reg src; 266 struct brw_reg temp_dst; 267 268 if (intel->gen >= 6) 269 temp_dst = dst[3]; 270 else 271 temp_dst = brw_message_reg(2); 272 273 assert(intel->gen < 6); 274 275 /* Don't need this if all you are doing is interpolating color, for 276 * instance. 
277 */ 278 if (mask & WRITEMASK_W) { 279 struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4); 280 281 /* Calc 1/w - just linterp wpos[3] optimized by putting the 282 * result straight into a message reg. 283 */ 284 if (can_do_pln(intel, deltas)) { 285 brw_PLN(p, temp_dst, interp3, deltas[0]); 286 } else { 287 brw_LINE(p, brw_null_reg(), interp3, deltas[0]); 288 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]); 289 } 290 291 /* Calc w */ 292 if (intel->gen >= 6) 293 src = temp_dst; 294 else 295 src = brw_null_reg(); 296 297 if (c->dispatch_width == 16) { 298 brw_math_16(p, dst[3], 299 BRW_MATH_FUNCTION_INV, 300 2, src, 301 BRW_MATH_PRECISION_FULL); 302 } else { 303 brw_math(p, dst[3], 304 BRW_MATH_FUNCTION_INV, 305 2, src, 306 BRW_MATH_DATA_VECTOR, 307 BRW_MATH_PRECISION_FULL); 308 } 309 } 310 } 311 312 void emit_linterp(struct brw_compile *p, 313 const struct brw_reg *dst, 314 GLuint mask, 315 const struct brw_reg *arg0, 316 const struct brw_reg *deltas) 317 { 318 struct intel_context *intel = &p->brw->intel; 319 struct brw_reg interp[4]; 320 GLuint nr = arg0[0].nr; 321 GLuint i; 322 323 interp[0] = brw_vec1_grf(nr, 0); 324 interp[1] = brw_vec1_grf(nr, 4); 325 interp[2] = brw_vec1_grf(nr+1, 0); 326 interp[3] = brw_vec1_grf(nr+1, 4); 327 328 for (i = 0; i < 4; i++) { 329 if (mask & (1<<i)) { 330 if (intel->gen >= 6) { 331 brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0)); 332 } else if (can_do_pln(intel, deltas)) { 333 brw_PLN(p, dst[i], interp[i], deltas[0]); 334 } else { 335 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); 336 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); 337 } 338 } 339 } 340 } 341 342 343 void emit_pinterp(struct brw_compile *p, 344 const struct brw_reg *dst, 345 GLuint mask, 346 const struct brw_reg *arg0, 347 const struct brw_reg *deltas, 348 const struct brw_reg *w) 349 { 350 struct intel_context *intel = &p->brw->intel; 351 struct brw_reg interp[4]; 352 GLuint nr = arg0[0].nr; 353 GLuint i; 354 355 if (intel->gen >= 6) { 
356 emit_linterp(p, dst, mask, arg0, interp); 357 return; 358 } 359 360 interp[0] = brw_vec1_grf(nr, 0); 361 interp[1] = brw_vec1_grf(nr, 4); 362 interp[2] = brw_vec1_grf(nr+1, 0); 363 interp[3] = brw_vec1_grf(nr+1, 4); 364 365 for (i = 0; i < 4; i++) { 366 if (mask & (1<<i)) { 367 if (can_do_pln(intel, deltas)) { 368 brw_PLN(p, dst[i], interp[i], deltas[0]); 369 } else { 370 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); 371 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); 372 } 373 } 374 } 375 for (i = 0; i < 4; i++) { 376 if (mask & (1<<i)) { 377 brw_MUL(p, dst[i], dst[i], w[3]); 378 } 379 } 380 } 381 382 383 void emit_cinterp(struct brw_compile *p, 384 const struct brw_reg *dst, 385 GLuint mask, 386 const struct brw_reg *arg0) 387 { 388 struct brw_reg interp[4]; 389 GLuint nr = arg0[0].nr; 390 GLuint i; 391 392 interp[0] = brw_vec1_grf(nr, 0); 393 interp[1] = brw_vec1_grf(nr, 4); 394 interp[2] = brw_vec1_grf(nr+1, 0); 395 interp[3] = brw_vec1_grf(nr+1, 4); 396 397 for (i = 0; i < 4; i++) { 398 if (mask & (1<<i)) { 399 brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */ 400 } 401 } 402 } 403 404 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. 
*/
void emit_frontfacing(struct brw_compile *p,
                      const struct brw_reg *dst,
                      GLuint mask)
{
   struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
   GLuint i;

   /* No channel requested: emit nothing. */
   if (!(mask & WRITEMASK_XYZW))
      return;

   /* Start every selected channel at 0.0 ... */
   for (i = 0; i < 4; i++) {
      if (mask & (1<<i)) {
         brw_MOV(p, dst[i], brw_imm_f(0.0));
      }
   }

   /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
    * us front face
    *
    * The constant is written as an unsigned shift: shifting a signed 1 into
    * the sign bit is undefined behaviour in C.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1u << 31));
   /* ... then write 1.0 following the comparison above. */
   for (i = 0; i < 4; i++) {
      if (mask & (1<<i)) {
         brw_MOV(p, dst[i], brw_imm_f(1.0));
      }
   }
   brw_set_predicate_control_flag_value(p, 0xff);
}

/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * and we're trying to produce:
 *
 *           DDX                     DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 * between each other.  We could probably do it like ddx and swizzle the right
 * order later, but bail for now and just produce
 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 *
 * The negate_value boolean is used to negate the d/dy computation for FBOs,
 * since they place the origin at the upper left instead of the lower left.
461 */ 462 void emit_ddxy(struct brw_compile *p, 463 const struct brw_reg *dst, 464 GLuint mask, 465 bool is_ddx, 466 const struct brw_reg *arg0, 467 bool negate_value) 468 { 469 int i; 470 struct brw_reg src0, src1; 471 472 if (mask & SATURATE) 473 brw_set_saturate(p, 1); 474 for (i = 0; i < 4; i++ ) { 475 if (mask & (1<<i)) { 476 if (is_ddx) { 477 src0 = brw_reg(arg0[i].file, arg0[i].nr, 1, 478 BRW_REGISTER_TYPE_F, 479 BRW_VERTICAL_STRIDE_2, 480 BRW_WIDTH_2, 481 BRW_HORIZONTAL_STRIDE_0, 482 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 483 src1 = brw_reg(arg0[i].file, arg0[i].nr, 0, 484 BRW_REGISTER_TYPE_F, 485 BRW_VERTICAL_STRIDE_2, 486 BRW_WIDTH_2, 487 BRW_HORIZONTAL_STRIDE_0, 488 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 489 } else { 490 src0 = brw_reg(arg0[i].file, arg0[i].nr, 0, 491 BRW_REGISTER_TYPE_F, 492 BRW_VERTICAL_STRIDE_4, 493 BRW_WIDTH_4, 494 BRW_HORIZONTAL_STRIDE_0, 495 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 496 src1 = brw_reg(arg0[i].file, arg0[i].nr, 2, 497 BRW_REGISTER_TYPE_F, 498 BRW_VERTICAL_STRIDE_4, 499 BRW_WIDTH_4, 500 BRW_HORIZONTAL_STRIDE_0, 501 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 502 } 503 if (negate_value) 504 brw_ADD(p, dst[i], src1, negate(src0)); 505 else 506 brw_ADD(p, dst[i], src0, negate(src1)); 507 } 508 } 509 if (mask & SATURATE) 510 brw_set_saturate(p, 0); 511 } 512 513 void emit_alu1(struct brw_compile *p, 514 struct brw_instruction *(*func)(struct brw_compile *, 515 struct brw_reg, 516 struct brw_reg), 517 const struct brw_reg *dst, 518 GLuint mask, 519 const struct brw_reg *arg0) 520 { 521 GLuint i; 522 523 if (mask & SATURATE) 524 brw_set_saturate(p, 1); 525 526 for (i = 0; i < 4; i++) { 527 if (mask & (1<<i)) { 528 func(p, dst[i], arg0[i]); 529 } 530 } 531 532 if (mask & SATURATE) 533 brw_set_saturate(p, 0); 534 } 535 536 537 void emit_alu2(struct brw_compile *p, 538 struct brw_instruction *(*func)(struct brw_compile *, 539 struct brw_reg, 540 struct brw_reg, 541 struct brw_reg), 542 const struct brw_reg *dst, 543 GLuint mask, 544 const struct 
brw_reg *arg0, 545 const struct brw_reg *arg1) 546 { 547 GLuint i; 548 549 if (mask & SATURATE) 550 brw_set_saturate(p, 1); 551 552 for (i = 0; i < 4; i++) { 553 if (mask & (1<<i)) { 554 func(p, dst[i], arg0[i], arg1[i]); 555 } 556 } 557 558 if (mask & SATURATE) 559 brw_set_saturate(p, 0); 560 } 561 562 563 void emit_mad(struct brw_compile *p, 564 const struct brw_reg *dst, 565 GLuint mask, 566 const struct brw_reg *arg0, 567 const struct brw_reg *arg1, 568 const struct brw_reg *arg2) 569 { 570 GLuint i; 571 572 for (i = 0; i < 4; i++) { 573 if (mask & (1<<i)) { 574 brw_MUL(p, dst[i], arg0[i], arg1[i]); 575 576 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 577 brw_ADD(p, dst[i], dst[i], arg2[i]); 578 brw_set_saturate(p, 0); 579 } 580 } 581 } 582 583 void emit_lrp(struct brw_compile *p, 584 const struct brw_reg *dst, 585 GLuint mask, 586 const struct brw_reg *arg0, 587 const struct brw_reg *arg1, 588 const struct brw_reg *arg2) 589 { 590 GLuint i; 591 592 /* Uses dst as a temporary: 593 */ 594 for (i = 0; i < 4; i++) { 595 if (mask & (1<<i)) { 596 /* Can I use the LINE instruction for this? 597 */ 598 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0)); 599 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]); 600 601 brw_set_saturate(p, (mask & SATURATE) ? 
1 : 0); 602 brw_MAC(p, dst[i], arg0[i], arg1[i]); 603 brw_set_saturate(p, 0); 604 } 605 } 606 } 607 608 void emit_sop(struct brw_compile *p, 609 const struct brw_reg *dst, 610 GLuint mask, 611 GLuint cond, 612 const struct brw_reg *arg0, 613 const struct brw_reg *arg1) 614 { 615 GLuint i; 616 617 for (i = 0; i < 4; i++) { 618 if (mask & (1<<i)) { 619 brw_push_insn_state(p); 620 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]); 621 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 622 brw_MOV(p, dst[i], brw_imm_f(0)); 623 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); 624 brw_MOV(p, dst[i], brw_imm_f(1.0)); 625 brw_pop_insn_state(p); 626 } 627 } 628 } 629 630 static void emit_slt( struct brw_compile *p, 631 const struct brw_reg *dst, 632 GLuint mask, 633 const struct brw_reg *arg0, 634 const struct brw_reg *arg1 ) 635 { 636 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1); 637 } 638 639 static void emit_sle( struct brw_compile *p, 640 const struct brw_reg *dst, 641 GLuint mask, 642 const struct brw_reg *arg0, 643 const struct brw_reg *arg1 ) 644 { 645 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1); 646 } 647 648 static void emit_sgt( struct brw_compile *p, 649 const struct brw_reg *dst, 650 GLuint mask, 651 const struct brw_reg *arg0, 652 const struct brw_reg *arg1 ) 653 { 654 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1); 655 } 656 657 static void emit_sge( struct brw_compile *p, 658 const struct brw_reg *dst, 659 GLuint mask, 660 const struct brw_reg *arg0, 661 const struct brw_reg *arg1 ) 662 { 663 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1); 664 } 665 666 static void emit_seq( struct brw_compile *p, 667 const struct brw_reg *dst, 668 GLuint mask, 669 const struct brw_reg *arg0, 670 const struct brw_reg *arg1 ) 671 { 672 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1); 673 } 674 675 static void emit_sne( struct brw_compile *p, 676 const struct brw_reg *dst, 677 GLuint mask, 678 const struct brw_reg *arg0, 679 const 
struct brw_reg *arg1 ) 680 { 681 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1); 682 } 683 684 void emit_cmp(struct brw_compile *p, 685 const struct brw_reg *dst, 686 GLuint mask, 687 const struct brw_reg *arg0, 688 const struct brw_reg *arg1, 689 const struct brw_reg *arg2) 690 { 691 GLuint i; 692 693 for (i = 0; i < 4; i++) { 694 if (mask & (1<<i)) { 695 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0)); 696 697 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 698 brw_SEL(p, dst[i], arg1[i], arg2[i]); 699 brw_set_saturate(p, 0); 700 brw_set_predicate_control_flag_value(p, 0xff); 701 } 702 } 703 } 704 705 void emit_sign(struct brw_compile *p, 706 const struct brw_reg *dst, 707 GLuint mask, 708 const struct brw_reg *arg0) 709 { 710 GLuint i; 711 712 for (i = 0; i < 4; i++) { 713 if (mask & (1<<i)) { 714 brw_MOV(p, dst[i], brw_imm_f(0.0)); 715 716 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0)); 717 brw_MOV(p, dst[i], brw_imm_f(-1.0)); 718 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 719 720 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0)); 721 brw_MOV(p, dst[i], brw_imm_f(1.0)); 722 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 723 } 724 } 725 } 726 727 void emit_max(struct brw_compile *p, 728 const struct brw_reg *dst, 729 GLuint mask, 730 const struct brw_reg *arg0, 731 const struct brw_reg *arg1) 732 { 733 GLuint i; 734 735 for (i = 0; i < 4; i++) { 736 if (mask & (1<<i)) { 737 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]); 738 739 brw_set_saturate(p, (mask & SATURATE) ? 
1 : 0); 740 brw_SEL(p, dst[i], arg0[i], arg1[i]); 741 brw_set_saturate(p, 0); 742 brw_set_predicate_control_flag_value(p, 0xff); 743 } 744 } 745 } 746 747 void emit_min(struct brw_compile *p, 748 const struct brw_reg *dst, 749 GLuint mask, 750 const struct brw_reg *arg0, 751 const struct brw_reg *arg1) 752 { 753 GLuint i; 754 755 for (i = 0; i < 4; i++) { 756 if (mask & (1<<i)) { 757 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]); 758 759 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 760 brw_SEL(p, dst[i], arg0[i], arg1[i]); 761 brw_set_saturate(p, 0); 762 brw_set_predicate_control_flag_value(p, 0xff); 763 } 764 } 765 } 766 767 768 void emit_dp2(struct brw_compile *p, 769 const struct brw_reg *dst, 770 GLuint mask, 771 const struct brw_reg *arg0, 772 const struct brw_reg *arg1) 773 { 774 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; 775 776 if (!(mask & WRITEMASK_XYZW)) 777 return; /* Do not emit dead code */ 778 779 assert(is_power_of_two(mask & WRITEMASK_XYZW)); 780 781 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); 782 783 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 784 brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]); 785 brw_set_saturate(p, 0); 786 } 787 788 789 void emit_dp3(struct brw_compile *p, 790 const struct brw_reg *dst, 791 GLuint mask, 792 const struct brw_reg *arg0, 793 const struct brw_reg *arg1) 794 { 795 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; 796 797 if (!(mask & WRITEMASK_XYZW)) 798 return; /* Do not emit dead code */ 799 800 assert(is_power_of_two(mask & WRITEMASK_XYZW)); 801 802 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); 803 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); 804 805 brw_set_saturate(p, (mask & SATURATE) ? 
1 : 0); 806 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]); 807 brw_set_saturate(p, 0); 808 } 809 810 811 void emit_dp4(struct brw_compile *p, 812 const struct brw_reg *dst, 813 GLuint mask, 814 const struct brw_reg *arg0, 815 const struct brw_reg *arg1) 816 { 817 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; 818 819 if (!(mask & WRITEMASK_XYZW)) 820 return; /* Do not emit dead code */ 821 822 assert(is_power_of_two(mask & WRITEMASK_XYZW)); 823 824 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); 825 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); 826 brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]); 827 828 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 829 brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]); 830 brw_set_saturate(p, 0); 831 } 832 833 834 void emit_dph(struct brw_compile *p, 835 const struct brw_reg *dst, 836 GLuint mask, 837 const struct brw_reg *arg0, 838 const struct brw_reg *arg1) 839 { 840 const int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; 841 842 if (!(mask & WRITEMASK_XYZW)) 843 return; /* Do not emit dead code */ 844 845 assert(is_power_of_two(mask & WRITEMASK_XYZW)); 846 847 brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); 848 brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); 849 brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]); 850 851 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 852 brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]); 853 brw_set_saturate(p, 0); 854 } 855 856 857 void emit_xpd(struct brw_compile *p, 858 const struct brw_reg *dst, 859 GLuint mask, 860 const struct brw_reg *arg0, 861 const struct brw_reg *arg1) 862 { 863 GLuint i; 864 865 assert((mask & WRITEMASK_W) != WRITEMASK_W); 866 867 for (i = 0 ; i < 3; i++) { 868 if (mask & (1<<i)) { 869 GLuint i2 = (i+2)%3; 870 GLuint i1 = (i+1)%3; 871 872 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]); 873 874 brw_set_saturate(p, (mask & SATURATE) ? 
1 : 0); 875 brw_MAC(p, dst[i], arg0[i1], arg1[i2]); 876 brw_set_saturate(p, 0); 877 } 878 } 879 } 880 881 882 void emit_math1(struct brw_wm_compile *c, 883 GLuint function, 884 const struct brw_reg *dst, 885 GLuint mask, 886 const struct brw_reg *arg0) 887 { 888 struct brw_compile *p = &c->func; 889 struct intel_context *intel = &p->brw->intel; 890 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; 891 struct brw_reg src; 892 893 if (!(mask & WRITEMASK_XYZW)) 894 return; /* Do not emit dead code */ 895 896 assert(is_power_of_two(mask & WRITEMASK_XYZW)); 897 898 if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 || 899 arg0[0].file != BRW_GENERAL_REGISTER_FILE) || 900 arg0[0].negate || arg0[0].abs)) { 901 /* Gen6 math requires that source and dst horizontal stride be 1, 902 * and that the argument be in the GRF. 903 * 904 * The hardware ignores source modifiers (negate and abs) on math 905 * instructions, so we also move to a temp to set those up. 906 */ 907 src = dst[dst_chan]; 908 brw_MOV(p, src, arg0[0]); 909 } else { 910 src = arg0[0]; 911 } 912 913 /* Send two messages to perform all 16 operations: 914 */ 915 brw_push_insn_state(p); 916 brw_set_saturate(p, (mask & SATURATE) ? 
1 : 0); 917 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 918 brw_math(p, 919 dst[dst_chan], 920 function, 921 2, 922 src, 923 BRW_MATH_DATA_VECTOR, 924 BRW_MATH_PRECISION_FULL); 925 926 if (c->dispatch_width == 16) { 927 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 928 brw_math(p, 929 offset(dst[dst_chan],1), 930 function, 931 3, 932 sechalf(src), 933 BRW_MATH_DATA_VECTOR, 934 BRW_MATH_PRECISION_FULL); 935 } 936 brw_pop_insn_state(p); 937 } 938 939 940 void emit_math2(struct brw_wm_compile *c, 941 GLuint function, 942 const struct brw_reg *dst, 943 GLuint mask, 944 const struct brw_reg *arg0, 945 const struct brw_reg *arg1) 946 { 947 struct brw_compile *p = &c->func; 948 struct intel_context *intel = &p->brw->intel; 949 int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; 950 951 if (!(mask & WRITEMASK_XYZW)) 952 return; /* Do not emit dead code */ 953 954 assert(is_power_of_two(mask & WRITEMASK_XYZW)); 955 956 brw_push_insn_state(p); 957 958 /* math can only operate on up to a vec8 at a time, so in 959 * dispatch_width==16 we have to do the second half manually. 960 */ 961 if (intel->gen >= 6) { 962 struct brw_reg src0 = arg0[0]; 963 struct brw_reg src1 = arg1[0]; 964 struct brw_reg temp_dst = dst[dst_chan]; 965 966 if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) { 967 brw_MOV(p, temp_dst, src0); 968 src0 = temp_dst; 969 } 970 971 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) { 972 /* This is a heinous hack to get a temporary register for use 973 * in case both arg0 and arg1 are constants. Why you're 974 * doing exponentiation on constant values in the shader, we 975 * don't know. 976 * 977 * max_wm_grf is almost surely less than the maximum GRF, and 978 * gen6 doesn't care about the number of GRFs used in a 979 * shader like pre-gen6 did. 980 */ 981 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0); 982 brw_MOV(p, temp, src1); 983 src1 = temp; 984 } 985 986 brw_set_saturate(p, (mask & SATURATE) ? 
1 : 0); 987 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 988 brw_math2(p, 989 temp_dst, 990 function, 991 src0, 992 src1); 993 if (c->dispatch_width == 16) { 994 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 995 brw_math2(p, 996 sechalf(temp_dst), 997 function, 998 sechalf(src0), 999 sechalf(src1)); 1000 } 1001 } else { 1002 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1003 brw_MOV(p, brw_message_reg(3), arg1[0]); 1004 if (c->dispatch_width == 16) { 1005 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 1006 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0])); 1007 } 1008 1009 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 1010 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1011 brw_math(p, 1012 dst[dst_chan], 1013 function, 1014 2, 1015 arg0[0], 1016 BRW_MATH_DATA_VECTOR, 1017 BRW_MATH_PRECISION_FULL); 1018 1019 /* Send two messages to perform all 16 operations: 1020 */ 1021 if (c->dispatch_width == 16) { 1022 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 1023 brw_math(p, 1024 offset(dst[dst_chan],1), 1025 function, 1026 4, 1027 sechalf(arg0[0]), 1028 BRW_MATH_DATA_VECTOR, 1029 BRW_MATH_PRECISION_FULL); 1030 } 1031 } 1032 brw_pop_insn_state(p); 1033 } 1034 1035 1036 void emit_tex(struct brw_wm_compile *c, 1037 struct brw_reg *dst, 1038 GLuint dst_flags, 1039 struct brw_reg *arg, 1040 struct brw_reg depth_payload, 1041 GLuint tex_idx, 1042 GLuint sampler, 1043 bool shadow) 1044 { 1045 struct brw_compile *p = &c->func; 1046 struct intel_context *intel = &p->brw->intel; 1047 struct brw_reg dst_retyped; 1048 GLuint cur_mrf = 2, response_length; 1049 GLuint i, nr_texcoords; 1050 GLuint emit; 1051 GLuint msg_type; 1052 GLuint mrf_per_channel; 1053 GLuint simd_mode; 1054 1055 if (c->dispatch_width == 16) { 1056 mrf_per_channel = 2; 1057 response_length = 8; 1058 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW); 1059 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1060 } else { 1061 mrf_per_channel = 1; 1062 
response_length = 4; 1063 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW); 1064 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1065 } 1066 1067 /* How many input regs are there? 1068 */ 1069 switch (tex_idx) { 1070 case TEXTURE_1D_INDEX: 1071 emit = WRITEMASK_X; 1072 nr_texcoords = 1; 1073 break; 1074 case TEXTURE_2D_INDEX: 1075 case TEXTURE_1D_ARRAY_INDEX: 1076 case TEXTURE_RECT_INDEX: 1077 case TEXTURE_EXTERNAL_INDEX: 1078 emit = WRITEMASK_XY; 1079 nr_texcoords = 2; 1080 break; 1081 case TEXTURE_3D_INDEX: 1082 case TEXTURE_2D_ARRAY_INDEX: 1083 case TEXTURE_CUBE_INDEX: 1084 emit = WRITEMASK_XYZ; 1085 nr_texcoords = 3; 1086 break; 1087 default: 1088 /* unexpected target */ 1089 abort(); 1090 } 1091 1092 /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */ 1093 if (intel->gen < 5 && c->dispatch_width == 8) 1094 nr_texcoords = 3; 1095 1096 if (shadow) { 1097 if (intel->gen < 7) { 1098 /* For shadow comparisons, we have to supply u,v,r. */ 1099 nr_texcoords = 3; 1100 } else { 1101 /* On Ivybridge, the shadow comparitor comes first. Just load it. */ 1102 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]); 1103 cur_mrf += mrf_per_channel; 1104 } 1105 } 1106 1107 /* Emit the texcoords. */ 1108 for (i = 0; i < nr_texcoords; i++) { 1109 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) 1110 brw_set_saturate(p, true); 1111 1112 if (emit & (1<<i)) 1113 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]); 1114 else 1115 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); 1116 cur_mrf += mrf_per_channel; 1117 1118 brw_set_saturate(p, false); 1119 } 1120 1121 /* Fill in the shadow comparison reference value. */ 1122 if (shadow && intel->gen < 7) { 1123 if (intel->gen >= 5) { 1124 /* Fill in the cube map array index value. */ 1125 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); 1126 cur_mrf += mrf_per_channel; 1127 } else if (c->dispatch_width == 8) { 1128 /* Fill in the LOD bias value. 
*/ 1129 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); 1130 cur_mrf += mrf_per_channel; 1131 } 1132 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]); 1133 cur_mrf += mrf_per_channel; 1134 } 1135 1136 if (intel->gen >= 5) { 1137 if (shadow) 1138 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 1139 else 1140 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 1141 } else { 1142 /* Note that G45 and older determines shadow compare and dispatch width 1143 * from message length for most messages. 1144 */ 1145 if (c->dispatch_width == 16 && shadow) 1146 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; 1147 else 1148 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; 1149 } 1150 1151 brw_SAMPLE(p, 1152 dst_retyped, 1153 1, 1154 retype(depth_payload, BRW_REGISTER_TYPE_UW), 1155 SURF_INDEX_TEXTURE(sampler), 1156 sampler, 1157 dst_flags & WRITEMASK_XYZW, 1158 msg_type, 1159 response_length, 1160 cur_mrf - 1, 1161 1, 1162 simd_mode, 1163 BRW_SAMPLER_RETURN_FORMAT_FLOAT32); 1164 } 1165 1166 1167 void emit_txb(struct brw_wm_compile *c, 1168 struct brw_reg *dst, 1169 GLuint dst_flags, 1170 struct brw_reg *arg, 1171 struct brw_reg depth_payload, 1172 GLuint tex_idx, 1173 GLuint sampler) 1174 { 1175 struct brw_compile *p = &c->func; 1176 struct intel_context *intel = &p->brw->intel; 1177 GLuint msgLength; 1178 GLuint msg_type; 1179 GLuint mrf_per_channel; 1180 GLuint response_length; 1181 struct brw_reg dst_retyped; 1182 1183 /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased 1184 * samples, so we'll use the 16-wide instruction, leave the second halves 1185 * undefined, and trust the execution mask to keep the undefined pixels 1186 * from mattering. 
1187 */ 1188 if (c->dispatch_width == 16 || intel->gen < 5) { 1189 if (intel->gen >= 5) 1190 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 1191 else 1192 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 1193 mrf_per_channel = 2; 1194 dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW); 1195 response_length = 8; 1196 } else { 1197 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 1198 mrf_per_channel = 1; 1199 dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW); 1200 response_length = 4; 1201 } 1202 1203 /* Shadow ignored for txb. */ 1204 switch (tex_idx) { 1205 case TEXTURE_1D_INDEX: 1206 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]); 1207 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0)); 1208 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0)); 1209 break; 1210 case TEXTURE_2D_INDEX: 1211 case TEXTURE_RECT_INDEX: 1212 case TEXTURE_EXTERNAL_INDEX: 1213 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]); 1214 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]); 1215 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0)); 1216 break; 1217 case TEXTURE_3D_INDEX: 1218 case TEXTURE_CUBE_INDEX: 1219 brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]); 1220 brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]); 1221 brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]); 1222 break; 1223 default: 1224 /* unexpected target */ 1225 abort(); 1226 } 1227 1228 brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]); 1229 msgLength = 2 + 4 * mrf_per_channel - 1; 1230 1231 brw_SAMPLE(p, 1232 dst_retyped, 1233 1, 1234 retype(depth_payload, BRW_REGISTER_TYPE_UW), 1235 SURF_INDEX_TEXTURE(sampler), 1236 sampler, 1237 dst_flags & WRITEMASK_XYZW, 1238 msg_type, 1239 response_length, 1240 msgLength, 1241 1, 1242 BRW_SAMPLER_SIMD_MODE_SIMD16, 1243 BRW_SAMPLER_RETURN_FORMAT_FLOAT32); 1244 } 1245 1246 1247 static void emit_lit(struct brw_wm_compile *c, 1248 const struct brw_reg 
*dst, 1249 GLuint mask, 1250 const struct brw_reg *arg0) 1251 { 1252 struct brw_compile *p = &c->func; 1253 1254 assert((mask & WRITEMASK_XW) == 0); 1255 1256 if (mask & WRITEMASK_Y) { 1257 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); 1258 brw_MOV(p, dst[1], arg0[0]); 1259 brw_set_saturate(p, 0); 1260 } 1261 1262 if (mask & WRITEMASK_Z) { 1263 emit_math2(c, BRW_MATH_FUNCTION_POW, 1264 &dst[2], 1265 WRITEMASK_X | (mask & SATURATE), 1266 &arg0[1], 1267 &arg0[3]); 1268 } 1269 1270 /* Ordinarily you'd use an iff statement to skip or shortcircuit 1271 * some of the POW calculations above, but 16-wide iff statements 1272 * seem to lock c1 hardware, so this is a nasty workaround: 1273 */ 1274 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0)); 1275 { 1276 if (mask & WRITEMASK_Y) 1277 brw_MOV(p, dst[1], brw_imm_f(0)); 1278 1279 if (mask & WRITEMASK_Z) 1280 brw_MOV(p, dst[2], brw_imm_f(0)); 1281 } 1282 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 1283 } 1284 1285 1286 /* Kill pixel - set execution mask to zero for those pixels which 1287 * fail. 1288 */ 1289 static void emit_kil( struct brw_wm_compile *c, 1290 struct brw_reg *arg0) 1291 { 1292 struct brw_compile *p = &c->func; 1293 struct intel_context *intel = &p->brw->intel; 1294 struct brw_reg pixelmask; 1295 GLuint i, j; 1296 1297 if (intel->gen >= 6) 1298 pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 1299 else 1300 pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 1301 1302 for (i = 0; i < 4; i++) { 1303 /* Check if we've already done the comparison for this reg 1304 * -- common when someone does KIL TEMP.wwww. 
1305 */ 1306 for (j = 0; j < i; j++) { 1307 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0) 1308 break; 1309 } 1310 if (j != i) 1311 continue; 1312 1313 brw_push_insn_state(p); 1314 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0)); 1315 brw_set_predicate_control_flag_value(p, 0xff); 1316 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1317 brw_AND(p, pixelmask, brw_flag_reg(), pixelmask); 1318 brw_pop_insn_state(p); 1319 } 1320 } 1321 1322 static void fire_fb_write( struct brw_wm_compile *c, 1323 GLuint base_reg, 1324 GLuint nr, 1325 GLuint target, 1326 GLuint eot ) 1327 { 1328 struct brw_compile *p = &c->func; 1329 struct intel_context *intel = &p->brw->intel; 1330 uint32_t msg_control; 1331 1332 /* Pass through control information: 1333 * 1334 * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case. 1335 */ 1336 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ 1337 if (intel->gen < 6) 1338 { 1339 brw_push_insn_state(p); 1340 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? 
*/ 1341 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1342 brw_MOV(p, 1343 brw_message_reg(base_reg + 1), 1344 brw_vec8_grf(1, 0)); 1345 brw_pop_insn_state(p); 1346 } 1347 1348 if (c->dispatch_width == 16) 1349 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; 1350 else 1351 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; 1352 1353 /* Send framebuffer write message: */ 1354 /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */ 1355 brw_fb_WRITE(p, 1356 c->dispatch_width, 1357 base_reg, 1358 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), 1359 msg_control, 1360 target, 1361 nr, 1362 0, 1363 eot, 1364 true); 1365 } 1366 1367 1368 static void emit_aa( struct brw_wm_compile *c, 1369 struct brw_reg *arg1, 1370 GLuint reg ) 1371 { 1372 struct brw_compile *p = &c->func; 1373 GLuint comp = c->aa_dest_stencil_reg / 2; 1374 GLuint off = c->aa_dest_stencil_reg % 2; 1375 struct brw_reg aa = offset(arg1[comp], off); 1376 1377 brw_push_insn_state(p); 1378 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */ 1379 brw_MOV(p, brw_message_reg(reg), aa); 1380 brw_pop_insn_state(p); 1381 } 1382 1383 1384 /* Post-fragment-program processing. Send the results to the 1385 * framebuffer. 
1386 * \param arg0 the fragment color 1387 * \param arg1 the pass-through depth value 1388 * \param arg2 the shader-computed depth value 1389 */ 1390 void emit_fb_write(struct brw_wm_compile *c, 1391 struct brw_reg *arg0, 1392 struct brw_reg *arg1, 1393 struct brw_reg *arg2, 1394 GLuint target, 1395 GLuint eot) 1396 { 1397 struct brw_compile *p = &c->func; 1398 struct brw_context *brw = p->brw; 1399 struct intel_context *intel = &brw->intel; 1400 GLuint nr = 2; 1401 GLuint channel; 1402 1403 /* Reserve a space for AA - may not be needed: 1404 */ 1405 if (c->aa_dest_stencil_reg) 1406 nr += 1; 1407 1408 /* I don't really understand how this achieves the color interleave 1409 * (ie RGBARGBA) in the result: [Do the saturation here] 1410 */ 1411 brw_push_insn_state(p); 1412 1413 if (c->key.clamp_fragment_color) 1414 brw_set_saturate(p, 1); 1415 1416 for (channel = 0; channel < 4; channel++) { 1417 if (intel->gen >= 6) { 1418 /* gen6 SIMD16 single source DP write looks like: 1419 * m + 0: r0 1420 * m + 1: r1 1421 * m + 2: g0 1422 * m + 3: g1 1423 * m + 4: b0 1424 * m + 5: b1 1425 * m + 6: a0 1426 * m + 7: a1 1427 */ 1428 if (c->dispatch_width == 16) { 1429 brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]); 1430 } else { 1431 brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]); 1432 } 1433 } else if (c->dispatch_width == 16 && brw->has_compr4) { 1434 /* pre-gen6 SIMD16 single source DP write looks like: 1435 * m + 0: r0 1436 * m + 1: g0 1437 * m + 2: b0 1438 * m + 3: a0 1439 * m + 4: r1 1440 * m + 5: g1 1441 * m + 6: b1 1442 * m + 7: a1 1443 * 1444 * By setting the high bit of the MRF register number, we indicate 1445 * that we want COMPR4 mode - instead of doing the usual destination 1446 * + 1 for the second half we get destination + 4. 
1447 */ 1448 brw_MOV(p, 1449 brw_message_reg(nr + channel + BRW_MRF_COMPR4), 1450 arg0[channel]); 1451 } else { 1452 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ 1453 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */ 1454 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1455 brw_MOV(p, 1456 brw_message_reg(nr + channel), 1457 arg0[channel]); 1458 1459 if (c->dispatch_width == 16) { 1460 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 1461 brw_MOV(p, 1462 brw_message_reg(nr + channel + 4), 1463 sechalf(arg0[channel])); 1464 } 1465 } 1466 } 1467 1468 brw_set_saturate(p, 0); 1469 1470 /* skip over the regs populated above: 1471 */ 1472 if (c->dispatch_width == 16) 1473 nr += 8; 1474 else 1475 nr += 4; 1476 1477 brw_pop_insn_state(p); 1478 1479 if (c->source_depth_to_render_target) 1480 { 1481 if (c->computes_depth) 1482 brw_MOV(p, brw_message_reg(nr), arg2[2]); 1483 else 1484 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */ 1485 1486 nr += 2; 1487 } 1488 1489 if (c->dest_depth_reg) 1490 { 1491 GLuint comp = c->dest_depth_reg / 2; 1492 GLuint off = c->dest_depth_reg % 2; 1493 1494 if (off != 0) { 1495 brw_push_insn_state(p); 1496 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1497 1498 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1)); 1499 /* 2nd half? */ 1500 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]); 1501 brw_pop_insn_state(p); 1502 } 1503 else { 1504 brw_MOV(p, brw_message_reg(nr), arg1[comp]); 1505 } 1506 nr += 2; 1507 } 1508 1509 if (intel->gen >= 6) { 1510 /* Load the message header. There's no implied move from src0 1511 * to the base mrf on gen6. 
1512 */ 1513 brw_push_insn_state(p); 1514 brw_set_mask_control(p, BRW_MASK_DISABLE); 1515 brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD), 1516 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 1517 brw_pop_insn_state(p); 1518 1519 if (target != 0) { 1520 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1521 0, 1522 2), BRW_REGISTER_TYPE_UD), 1523 brw_imm_ud(target)); 1524 } 1525 } 1526 1527 if (!c->runtime_check_aads_emit) { 1528 if (c->aa_dest_stencil_reg) 1529 emit_aa(c, arg1, 2); 1530 1531 fire_fb_write(c, 0, nr, target, eot); 1532 } 1533 else { 1534 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 1535 struct brw_reg ip = brw_ip_reg(); 1536 int jmp; 1537 1538 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 1539 brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); 1540 brw_AND(p, 1541 v1_null_ud, 1542 get_element_ud(brw_vec8_grf(1,0), 6), 1543 brw_imm_ud(1<<26)); 1544 1545 jmp = brw_JMPI(p, ip, ip, brw_imm_w(0)) - p->store; 1546 { 1547 emit_aa(c, arg1, 2); 1548 fire_fb_write(c, 0, nr, target, eot); 1549 /* note - thread killed in subroutine */ 1550 } 1551 brw_land_fwd_jump(p, jmp); 1552 1553 /* ELSE: Shuffle up one register to fill in the hole left for AA: 1554 */ 1555 fire_fb_write(c, 1, nr-1, target, eot); 1556 } 1557 } 1558 1559 /** 1560 * Move a GPR to scratch memory. 1561 */ 1562 static void emit_spill( struct brw_wm_compile *c, 1563 struct brw_reg reg, 1564 GLuint slot ) 1565 { 1566 struct brw_compile *p = &c->func; 1567 1568 /* 1569 mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr } 1570 */ 1571 brw_MOV(p, brw_message_reg(2), reg); 1572 1573 /* 1574 mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask } 1575 send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 } 1576 */ 1577 brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot); 1578 } 1579 1580 1581 /** 1582 * Load a GPR from scratch memory. 
1583 */ 1584 static void emit_unspill( struct brw_wm_compile *c, 1585 struct brw_reg reg, 1586 GLuint slot ) 1587 { 1588 struct brw_compile *p = &c->func; 1589 1590 /* Slot 0 is the undef value. 1591 */ 1592 if (slot == 0) { 1593 brw_MOV(p, reg, brw_imm_f(0)); 1594 return; 1595 } 1596 1597 /* 1598 mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask } 1599 send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 } 1600 */ 1601 1602 brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot); 1603 } 1604 1605 1606 /** 1607 * Retrieve up to 4 GEN4 register pairs for the given wm reg: 1608 * Args with unspill_reg != 0 will be loaded from scratch memory. 1609 */ 1610 static void get_argument_regs( struct brw_wm_compile *c, 1611 struct brw_wm_ref *arg[], 1612 struct brw_reg *regs ) 1613 { 1614 GLuint i; 1615 1616 for (i = 0; i < 4; i++) { 1617 if (arg[i]) { 1618 if (arg[i]->unspill_reg) 1619 emit_unspill(c, 1620 brw_vec8_grf(arg[i]->unspill_reg, 0), 1621 arg[i]->value->spill_slot); 1622 1623 regs[i] = arg[i]->hw_reg; 1624 } 1625 else { 1626 regs[i] = brw_null_reg(); 1627 } 1628 } 1629 } 1630 1631 1632 /** 1633 * For values that have a spill_slot!=0, write those regs to scratch memory. 1634 */ 1635 static void spill_values( struct brw_wm_compile *c, 1636 struct brw_wm_value *values, 1637 GLuint nr ) 1638 { 1639 GLuint i; 1640 1641 for (i = 0; i < nr; i++) 1642 if (values[i].spill_slot) 1643 emit_spill(c, values[i].hw_reg, values[i].spill_slot); 1644 } 1645 1646 1647 /* Emit the fragment program instructions here. 
1648 */ 1649 void brw_wm_emit( struct brw_wm_compile *c ) 1650 { 1651 struct brw_compile *p = &c->func; 1652 struct intel_context *intel = &p->brw->intel; 1653 GLuint insn; 1654 1655 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1656 if (intel->gen >= 6) 1657 brw_set_acc_write_control(p, 1); 1658 1659 /* Check if any of the payload regs need to be spilled: 1660 */ 1661 spill_values(c, c->payload.depth, 4); 1662 spill_values(c, c->creg, c->nr_creg); 1663 spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX); 1664 1665 1666 for (insn = 0; insn < c->nr_insns; insn++) { 1667 1668 struct brw_wm_instruction *inst = &c->instruction[insn]; 1669 struct brw_reg args[3][4], dst[4]; 1670 GLuint i, dst_flags; 1671 1672 /* Get argument regs: 1673 */ 1674 for (i = 0; i < 3; i++) 1675 get_argument_regs(c, inst->src[i], args[i]); 1676 1677 /* Get dest regs: 1678 */ 1679 for (i = 0; i < 4; i++) 1680 if (inst->dst[i]) 1681 dst[i] = inst->dst[i]->hw_reg; 1682 else 1683 dst[i] = brw_null_reg(); 1684 1685 /* Flags 1686 */ 1687 dst_flags = inst->writemask; 1688 if (inst->saturate) 1689 dst_flags |= SATURATE; 1690 1691 switch (inst->opcode) { 1692 /* Generated instructions for calculating triangle interpolants: 1693 */ 1694 case WM_PIXELXY: 1695 emit_pixel_xy(c, dst, dst_flags); 1696 break; 1697 1698 case WM_DELTAXY: 1699 emit_delta_xy(p, dst, dst_flags, args[0]); 1700 break; 1701 1702 case WM_WPOSXY: 1703 emit_wpos_xy(c, dst, dst_flags, args[0]); 1704 break; 1705 1706 case WM_PIXELW: 1707 emit_pixel_w(c, dst, dst_flags, args[0], args[1]); 1708 break; 1709 1710 case WM_LINTERP: 1711 emit_linterp(p, dst, dst_flags, args[0], args[1]); 1712 break; 1713 1714 case WM_PINTERP: 1715 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]); 1716 break; 1717 1718 case WM_CINTERP: 1719 emit_cinterp(p, dst, dst_flags, args[0]); 1720 break; 1721 1722 case WM_FB_WRITE: 1723 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot); 1724 break; 1725 1726 case 
WM_FRONTFACING: 1727 emit_frontfacing(p, dst, dst_flags); 1728 break; 1729 1730 /* Straightforward arithmetic: 1731 */ 1732 case OPCODE_ADD: 1733 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]); 1734 break; 1735 1736 case OPCODE_FRC: 1737 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]); 1738 break; 1739 1740 case OPCODE_FLR: 1741 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]); 1742 break; 1743 1744 case OPCODE_DDX: 1745 emit_ddxy(p, dst, dst_flags, true, args[0], false); 1746 break; 1747 1748 case OPCODE_DDY: 1749 /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no 1750 * guarantee that c->key.render_to_fbo is set). 1751 */ 1752 assert(c->fp->program.UsesDFdy); 1753 emit_ddxy(p, dst, dst_flags, false, args[0], c->key.render_to_fbo); 1754 break; 1755 1756 case OPCODE_DP2: 1757 emit_dp2(p, dst, dst_flags, args[0], args[1]); 1758 break; 1759 1760 case OPCODE_DP3: 1761 emit_dp3(p, dst, dst_flags, args[0], args[1]); 1762 break; 1763 1764 case OPCODE_DP4: 1765 emit_dp4(p, dst, dst_flags, args[0], args[1]); 1766 break; 1767 1768 case OPCODE_DPH: 1769 emit_dph(p, dst, dst_flags, args[0], args[1]); 1770 break; 1771 1772 case OPCODE_TRUNC: 1773 for (i = 0; i < 4; i++) { 1774 if (dst_flags & (1<<i)) { 1775 brw_RNDZ(p, dst[i], args[0][i]); 1776 } 1777 } 1778 break; 1779 1780 case OPCODE_LRP: 1781 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]); 1782 break; 1783 1784 case OPCODE_MAD: 1785 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]); 1786 break; 1787 1788 case OPCODE_MOV: 1789 case OPCODE_SWZ: 1790 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]); 1791 break; 1792 1793 case OPCODE_MUL: 1794 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]); 1795 break; 1796 1797 case OPCODE_XPD: 1798 emit_xpd(p, dst, dst_flags, args[0], args[1]); 1799 break; 1800 1801 /* Higher math functions: 1802 */ 1803 case OPCODE_RCP: 1804 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]); 1805 break; 1806 1807 case OPCODE_RSQ: 1808 emit_math1(c, 
BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]); 1809 break; 1810 1811 case OPCODE_SIN: 1812 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]); 1813 break; 1814 1815 case OPCODE_COS: 1816 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]); 1817 break; 1818 1819 case OPCODE_EX2: 1820 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]); 1821 break; 1822 1823 case OPCODE_LG2: 1824 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]); 1825 break; 1826 1827 case OPCODE_SCS: 1828 /* There is an scs math function, but it would need some 1829 * fixup for 16-element execution. 1830 */ 1831 if (dst_flags & WRITEMASK_X) 1832 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); 1833 if (dst_flags & WRITEMASK_Y) 1834 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); 1835 break; 1836 1837 case OPCODE_POW: 1838 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]); 1839 break; 1840 1841 /* Comparisons: 1842 */ 1843 case OPCODE_CMP: 1844 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]); 1845 break; 1846 1847 case OPCODE_MAX: 1848 emit_max(p, dst, dst_flags, args[0], args[1]); 1849 break; 1850 1851 case OPCODE_MIN: 1852 emit_min(p, dst, dst_flags, args[0], args[1]); 1853 break; 1854 1855 case OPCODE_SLT: 1856 emit_slt(p, dst, dst_flags, args[0], args[1]); 1857 break; 1858 1859 case OPCODE_SLE: 1860 emit_sle(p, dst, dst_flags, args[0], args[1]); 1861 break; 1862 case OPCODE_SGT: 1863 emit_sgt(p, dst, dst_flags, args[0], args[1]); 1864 break; 1865 case OPCODE_SGE: 1866 emit_sge(p, dst, dst_flags, args[0], args[1]); 1867 break; 1868 case OPCODE_SEQ: 1869 emit_seq(p, dst, dst_flags, args[0], args[1]); 1870 break; 1871 case OPCODE_SNE: 1872 emit_sne(p, dst, dst_flags, args[0], args[1]); 1873 break; 1874 1875 case OPCODE_SSG: 1876 emit_sign(p, dst, dst_flags, args[0]); 1877 break; 1878 1879 case OPCODE_LIT: 1880 emit_lit(c, dst, dst_flags, args[0]); 1881 
break; 1882 1883 /* Texturing operations: 1884 */ 1885 case OPCODE_TEX: 1886 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg, 1887 inst->tex_idx, inst->tex_unit, 1888 inst->tex_shadow); 1889 break; 1890 1891 case OPCODE_TXB: 1892 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg, 1893 inst->tex_idx, inst->tex_unit); 1894 break; 1895 1896 case OPCODE_KIL: 1897 emit_kil(c, args[0]); 1898 break; 1899 1900 default: 1901 printf("Unsupported opcode %i (%s) in fragment shader\n", 1902 inst->opcode, inst->opcode < MAX_OPCODE ? 1903 _mesa_opcode_string(inst->opcode) : 1904 "unknown"); 1905 } 1906 1907 for (i = 0; i < 4; i++) 1908 if (inst->dst[i] && inst->dst[i]->spill_slot) 1909 emit_spill(c, 1910 inst->dst[i]->hw_reg, 1911 inst->dst[i]->spill_slot); 1912 } 1913 1914 /* Only properly tested on ILK */ 1915 if (p->brw->intel.gen == 5) { 1916 brw_remove_duplicate_mrf_moves(p); 1917 if (c->dispatch_width == 16) 1918 brw_remove_grf_to_mrf_moves(p); 1919 } 1920 1921 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1922 int i; 1923 1924 printf("wm-native:\n"); 1925 for (i = 0; i < p->nr_insn; i++) 1926 brw_disasm(stdout, &p->store[i], p->brw->intel.gen); 1927 printf("\n"); 1928 } 1929 } 1930 1931