Home | History | Annotate | Download | only in i965
      1 /*
      2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
      3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
      4  develop this 3D driver.
      5 
      6  Permission is hereby granted, free of charge, to any person obtaining
      7  a copy of this software and associated documentation files (the
      8  "Software"), to deal in the Software without restriction, including
      9  without limitation the rights to use, copy, modify, merge, publish,
     10  distribute, sublicense, and/or sell copies of the Software, and to
     11  permit persons to whom the Software is furnished to do so, subject to
     12  the following conditions:
     13 
     14  The above copyright notice and this permission notice (including the
     15  next paragraph) shall be included in all copies or substantial
     16  portions of the Software.
     17 
     18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
     22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26  **********************************************************************/
     27  /*
     28   * Authors:
     29   *   Keith Whitwell <keith (at) tungstengraphics.com>
     30   */
     31 
     32 
     33 #include "main/macros.h"
     34 #include "brw_context.h"
     35 #include "brw_wm.h"
     36 
     37 static bool
     38 can_do_pln(struct intel_context *intel, const struct brw_reg *deltas)
     39 {
     40    struct brw_context *brw = brw_context(&intel->ctx);
     41 
     42    if (!brw->has_pln)
     43       return false;
     44 
     45    if (deltas[1].nr != deltas[0].nr + 1)
     46       return false;
     47 
     48    if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
     49       return false;
     50 
     51    return true;
     52 }
     53 
     54 /* Return the SrcReg index of the channels that can be immediate float operands
     55  * instead of usage of PROGRAM_CONSTANT values through push/pull.
     56  */
     57 bool
     58 brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
     59 {
     60    int opcode_array[] = {
     61       [OPCODE_ADD] = 2,
     62       [OPCODE_CMP] = 3,
     63       [OPCODE_DP3] = 2,
     64       [OPCODE_DP4] = 2,
     65       [OPCODE_DPH] = 2,
     66       [OPCODE_MAX] = 2,
     67       [OPCODE_MIN] = 2,
     68       [OPCODE_MOV] = 1,
     69       [OPCODE_MUL] = 2,
     70       [OPCODE_SEQ] = 2,
     71       [OPCODE_SGE] = 2,
     72       [OPCODE_SGT] = 2,
     73       [OPCODE_SLE] = 2,
     74       [OPCODE_SLT] = 2,
     75       [OPCODE_SNE] = 2,
     76       [OPCODE_SWZ] = 1,
     77       [OPCODE_XPD] = 2,
     78    };
     79 
     80    /* These opcodes get broken down in a way that allow two
     81     * args to be immediates.
     82     */
     83    if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
     84       if (arg == 1 || arg == 2)
     85 	 return true;
     86    }
     87 
     88    if (opcode > ARRAY_SIZE(opcode_array))
     89       return false;
     90 
     91    return arg == opcode_array[opcode] - 1;
     92 }
     93 
     94 /**
     95  * Computes the screen-space x,y position of the pixels.
     96  *
     97  * This will be used by emit_delta_xy() or emit_wpos_xy() for
     98  * interpolation of attributes..
     99  *
    100  * Payload R0:
    101  *
    102  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
    103  *         corresponding to each of the 16 execution channels.
    104  * R0.1..8 -- ?
    105  * R1.0 -- triangle vertex 0.X
    106  * R1.1 -- triangle vertex 0.Y
    107  * R1.2 -- tile 0 x,y coords (2 packed uwords)
    108  * R1.3 -- tile 1 x,y coords (2 packed uwords)
    109  * R1.4 -- tile 2 x,y coords (2 packed uwords)
    110  * R1.5 -- tile 3 x,y coords (2 packed uwords)
    111  * R1.6 -- ?
    112  * R1.7 -- ?
    113  * R1.8 -- ?
    114  */
    115 void emit_pixel_xy(struct brw_wm_compile *c,
    116 		   const struct brw_reg *dst,
    117 		   GLuint mask)
    118 {
    119    struct brw_compile *p = &c->func;
    120    struct brw_reg r1 = brw_vec1_grf(1, 0);
    121    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
    122    struct brw_reg dst0_uw, dst1_uw;
    123 
    124    brw_push_insn_state(p);
    125    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    126 
    127    if (c->dispatch_width == 16) {
    128       dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
    129       dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
    130    } else {
    131       dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
    132       dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
    133    }
    134 
    135    /* Calculate pixel centers by adding 1 or 0 to each of the
    136     * micro-tile coordinates passed in r1.
    137     */
    138    if (mask & WRITEMASK_X) {
    139       brw_ADD(p,
    140 	      dst0_uw,
    141 	      stride(suboffset(r1_uw, 4), 2, 4, 0),
    142 	      brw_imm_v(0x10101010));
    143    }
    144 
    145    if (mask & WRITEMASK_Y) {
    146       brw_ADD(p,
    147 	      dst1_uw,
    148 	      stride(suboffset(r1_uw,5), 2, 4, 0),
    149 	      brw_imm_v(0x11001100));
    150    }
    151    brw_pop_insn_state(p);
    152 }
    153 
    154 /**
    155  * Computes the screen-space x,y distance of the pixels from the start
    156  * vertex.
    157  *
    158  * This will be used in linterp or pinterp with the start vertex value
    159  * and the Cx, Cy, and C0 coefficients passed in from the setup engine
    160  * to produce interpolated attribute values.
    161  */
    162 void emit_delta_xy(struct brw_compile *p,
    163 		   const struct brw_reg *dst,
    164 		   GLuint mask,
    165 		   const struct brw_reg *arg0)
    166 {
    167    struct intel_context *intel = &p->brw->intel;
    168    struct brw_reg r1 = brw_vec1_grf(1, 0);
    169 
    170    if (mask == 0)
    171       return;
    172 
    173    assert(mask == WRITEMASK_XY);
    174 
    175    if (intel->gen >= 6) {
    176        /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
    177 	  Just add them with 0.0 for dst reg.. */
    178        r1 = brw_imm_v(0x00000000);
    179        brw_ADD(p,
    180 	       dst[0],
    181 	       retype(arg0[0], BRW_REGISTER_TYPE_UW),
    182 	       r1);
    183        brw_ADD(p,
    184 	       dst[1],
    185 	       retype(arg0[1], BRW_REGISTER_TYPE_UW),
    186 	       r1);
    187        return;
    188    }
    189 
    190    /* Calc delta X,Y by subtracting origin in r1 from the pixel
    191     * centers produced by emit_pixel_xy().
    192     */
    193    brw_ADD(p,
    194 	   dst[0],
    195 	   retype(arg0[0], BRW_REGISTER_TYPE_UW),
    196 	   negate(r1));
    197    brw_ADD(p,
    198 	   dst[1],
    199 	   retype(arg0[1], BRW_REGISTER_TYPE_UW),
    200 	   negate(suboffset(r1,1)));
    201 }
    202 
    203 /**
    204  * Computes the pixel offset from the window origin for gl_FragCoord().
    205  */
    206 void emit_wpos_xy(struct brw_wm_compile *c,
    207 		  const struct brw_reg *dst,
    208 		  GLuint mask,
    209 		  const struct brw_reg *arg0)
    210 {
    211    struct brw_compile *p = &c->func;
    212    struct intel_context *intel = &p->brw->intel;
    213    struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
    214    struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
    215 
    216    if (mask & WRITEMASK_X) {
    217       if (intel->gen >= 6) {
    218 	 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
    219 	 brw_MOV(p, delta_x_f, delta_x);
    220 	 delta_x = delta_x_f;
    221       }
    222 
    223       if (c->fp->program.PixelCenterInteger) {
    224 	 /* X' = X */
    225 	 brw_MOV(p, dst[0], delta_x);
    226       } else {
    227 	 /* X' = X + 0.5 */
    228 	 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
    229       }
    230    }
    231 
    232    if (mask & WRITEMASK_Y) {
    233       if (intel->gen >= 6) {
    234 	 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
    235 	 brw_MOV(p, delta_y_f, delta_y);
    236 	 delta_y = delta_y_f;
    237       }
    238 
    239       if (c->fp->program.OriginUpperLeft) {
    240 	 if (c->fp->program.PixelCenterInteger) {
    241 	    /* Y' = Y */
    242 	    brw_MOV(p, dst[1], delta_y);
    243 	 } else {
    244 	    brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
    245 	 }
    246       } else {
    247 	 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
    248 
    249 	 /* Y' = (height - 1) - Y + center */
    250 	 brw_ADD(p, dst[1], negate(delta_y),
    251 		 brw_imm_f(c->key.drawable_height - 1 + center_offset));
    252       }
    253    }
    254 }
    255 
    256 
    257 void emit_pixel_w(struct brw_wm_compile *c,
    258 		  const struct brw_reg *dst,
    259 		  GLuint mask,
    260 		  const struct brw_reg *arg0,
    261 		  const struct brw_reg *deltas)
    262 {
    263    struct brw_compile *p = &c->func;
    264    struct intel_context *intel = &p->brw->intel;
    265    struct brw_reg src;
    266    struct brw_reg temp_dst;
    267 
    268    if (intel->gen >= 6)
    269 	temp_dst = dst[3];
    270    else
    271 	temp_dst = brw_message_reg(2);
    272 
    273    assert(intel->gen < 6);
    274 
    275    /* Don't need this if all you are doing is interpolating color, for
    276     * instance.
    277     */
    278    if (mask & WRITEMASK_W) {
    279       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
    280 
    281       /* Calc 1/w - just linterp wpos[3] optimized by putting the
    282        * result straight into a message reg.
    283        */
    284       if (can_do_pln(intel, deltas)) {
    285 	 brw_PLN(p, temp_dst, interp3, deltas[0]);
    286       } else {
    287 	 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
    288 	 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
    289       }
    290 
    291       /* Calc w */
    292       if (intel->gen >= 6)
    293 	 src = temp_dst;
    294       else
    295 	 src = brw_null_reg();
    296 
    297       if (c->dispatch_width == 16) {
    298 	 brw_math_16(p, dst[3],
    299 		     BRW_MATH_FUNCTION_INV,
    300 		     2, src,
    301 		     BRW_MATH_PRECISION_FULL);
    302       } else {
    303 	 brw_math(p, dst[3],
    304 		  BRW_MATH_FUNCTION_INV,
    305 		  2, src,
    306 		  BRW_MATH_DATA_VECTOR,
    307 		  BRW_MATH_PRECISION_FULL);
    308       }
    309    }
    310 }
    311 
    312 void emit_linterp(struct brw_compile *p,
    313 		  const struct brw_reg *dst,
    314 		  GLuint mask,
    315 		  const struct brw_reg *arg0,
    316 		  const struct brw_reg *deltas)
    317 {
    318    struct intel_context *intel = &p->brw->intel;
    319    struct brw_reg interp[4];
    320    GLuint nr = arg0[0].nr;
    321    GLuint i;
    322 
    323    interp[0] = brw_vec1_grf(nr, 0);
    324    interp[1] = brw_vec1_grf(nr, 4);
    325    interp[2] = brw_vec1_grf(nr+1, 0);
    326    interp[3] = brw_vec1_grf(nr+1, 4);
    327 
    328    for (i = 0; i < 4; i++) {
    329       if (mask & (1<<i)) {
    330 	 if (intel->gen >= 6) {
    331 	    brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
    332 	 } else if (can_do_pln(intel, deltas)) {
    333 	    brw_PLN(p, dst[i], interp[i], deltas[0]);
    334 	 } else {
    335 	    brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
    336 	    brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
    337 	 }
    338       }
    339    }
    340 }
    341 
    342 
    343 void emit_pinterp(struct brw_compile *p,
    344 		  const struct brw_reg *dst,
    345 		  GLuint mask,
    346 		  const struct brw_reg *arg0,
    347 		  const struct brw_reg *deltas,
    348 		  const struct brw_reg *w)
    349 {
    350    struct intel_context *intel = &p->brw->intel;
    351    struct brw_reg interp[4];
    352    GLuint nr = arg0[0].nr;
    353    GLuint i;
    354 
    355    if (intel->gen >= 6) {
    356       emit_linterp(p, dst, mask, arg0, interp);
    357       return;
    358    }
    359 
    360    interp[0] = brw_vec1_grf(nr, 0);
    361    interp[1] = brw_vec1_grf(nr, 4);
    362    interp[2] = brw_vec1_grf(nr+1, 0);
    363    interp[3] = brw_vec1_grf(nr+1, 4);
    364 
    365    for (i = 0; i < 4; i++) {
    366       if (mask & (1<<i)) {
    367 	 if (can_do_pln(intel, deltas)) {
    368 	    brw_PLN(p, dst[i], interp[i], deltas[0]);
    369 	 } else {
    370 	    brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
    371 	    brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
    372 	 }
    373       }
    374    }
    375    for (i = 0; i < 4; i++) {
    376       if (mask & (1<<i)) {
    377 	 brw_MUL(p, dst[i], dst[i], w[3]);
    378       }
    379    }
    380 }
    381 
    382 
    383 void emit_cinterp(struct brw_compile *p,
    384 		  const struct brw_reg *dst,
    385 		  GLuint mask,
    386 		  const struct brw_reg *arg0)
    387 {
    388    struct brw_reg interp[4];
    389    GLuint nr = arg0[0].nr;
    390    GLuint i;
    391 
    392    interp[0] = brw_vec1_grf(nr, 0);
    393    interp[1] = brw_vec1_grf(nr, 4);
    394    interp[2] = brw_vec1_grf(nr+1, 0);
    395    interp[3] = brw_vec1_grf(nr+1, 4);
    396 
    397    for (i = 0; i < 4; i++) {
    398       if (mask & (1<<i)) {
    399          brw_MOV(p, dst[i], suboffset(interp[i],3));	/* TODO: optimize away like other moves */
    400       }
    401    }
    402 }
    403 
    404 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
    405 void emit_frontfacing(struct brw_compile *p,
    406 		      const struct brw_reg *dst,
    407 		      GLuint mask)
    408 {
    409    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    410    GLuint i;
    411 
    412    if (!(mask & WRITEMASK_XYZW))
    413       return;
    414 
    415    for (i = 0; i < 4; i++) {
    416       if (mask & (1<<i)) {
    417 	 brw_MOV(p, dst[i], brw_imm_f(0.0));
    418       }
    419    }
    420 
    421    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
    422     * us front face
    423     */
    424    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
    425    for (i = 0; i < 4; i++) {
    426       if (mask & (1<<i)) {
    427 	 brw_MOV(p, dst[i], brw_imm_f(1.0));
    428       }
    429    }
    430    brw_set_predicate_control_flag_value(p, 0xff);
    431 }
    432 
    433 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
    434  * looking like:
    435  *
    436  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
    437  *
    438  * and we're trying to produce:
    439  *
    440  *           DDX                     DDY
    441  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
    442  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
    443  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
    444  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
    445  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
    446  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
    447  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
    448  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
    449  *
    450  * and add another set of two more subspans if in 16-pixel dispatch mode.
    451  *
    452  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
    453  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
    454  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
    455  * between each other.  We could probably do it like ddx and swizzle the right
    456  * order later, but bail for now and just produce
    457  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
    458  *
    459  * The negate_value boolean is used to negate the d/dy computation for FBOs,
    460  * since they place the origin at the upper left instead of the lower left.
    461  */
    462 void emit_ddxy(struct brw_compile *p,
    463 	       const struct brw_reg *dst,
    464 	       GLuint mask,
    465 	       bool is_ddx,
    466 	       const struct brw_reg *arg0,
    467                bool negate_value)
    468 {
    469    int i;
    470    struct brw_reg src0, src1;
    471 
    472    if (mask & SATURATE)
    473       brw_set_saturate(p, 1);
    474    for (i = 0; i < 4; i++ ) {
    475       if (mask & (1<<i)) {
    476 	 if (is_ddx) {
    477 	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
    478 			   BRW_REGISTER_TYPE_F,
    479 			   BRW_VERTICAL_STRIDE_2,
    480 			   BRW_WIDTH_2,
    481 			   BRW_HORIZONTAL_STRIDE_0,
    482 			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    483 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
    484 			   BRW_REGISTER_TYPE_F,
    485 			   BRW_VERTICAL_STRIDE_2,
    486 			   BRW_WIDTH_2,
    487 			   BRW_HORIZONTAL_STRIDE_0,
    488 			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    489 	 } else {
    490 	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
    491 			   BRW_REGISTER_TYPE_F,
    492 			   BRW_VERTICAL_STRIDE_4,
    493 			   BRW_WIDTH_4,
    494 			   BRW_HORIZONTAL_STRIDE_0,
    495 			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    496 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
    497 			   BRW_REGISTER_TYPE_F,
    498 			   BRW_VERTICAL_STRIDE_4,
    499 			   BRW_WIDTH_4,
    500 			   BRW_HORIZONTAL_STRIDE_0,
    501 			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    502 	 }
    503          if (negate_value)
    504             brw_ADD(p, dst[i], src1, negate(src0));
    505          else
    506             brw_ADD(p, dst[i], src0, negate(src1));
    507       }
    508    }
    509    if (mask & SATURATE)
    510       brw_set_saturate(p, 0);
    511 }
    512 
    513 void emit_alu1(struct brw_compile *p,
    514 	       struct brw_instruction *(*func)(struct brw_compile *,
    515 					       struct brw_reg,
    516 					       struct brw_reg),
    517 	       const struct brw_reg *dst,
    518 	       GLuint mask,
    519 	       const struct brw_reg *arg0)
    520 {
    521    GLuint i;
    522 
    523    if (mask & SATURATE)
    524       brw_set_saturate(p, 1);
    525 
    526    for (i = 0; i < 4; i++) {
    527       if (mask & (1<<i)) {
    528 	 func(p, dst[i], arg0[i]);
    529       }
    530    }
    531 
    532    if (mask & SATURATE)
    533       brw_set_saturate(p, 0);
    534 }
    535 
    536 
    537 void emit_alu2(struct brw_compile *p,
    538 	       struct brw_instruction *(*func)(struct brw_compile *,
    539 					       struct brw_reg,
    540 					       struct brw_reg,
    541 					       struct brw_reg),
    542 	       const struct brw_reg *dst,
    543 	       GLuint mask,
    544 	       const struct brw_reg *arg0,
    545 	       const struct brw_reg *arg1)
    546 {
    547    GLuint i;
    548 
    549    if (mask & SATURATE)
    550       brw_set_saturate(p, 1);
    551 
    552    for (i = 0; i < 4; i++) {
    553       if (mask & (1<<i)) {
    554 	 func(p, dst[i], arg0[i], arg1[i]);
    555       }
    556    }
    557 
    558    if (mask & SATURATE)
    559       brw_set_saturate(p, 0);
    560 }
    561 
    562 
    563 void emit_mad(struct brw_compile *p,
    564 	      const struct brw_reg *dst,
    565 	      GLuint mask,
    566 	      const struct brw_reg *arg0,
    567 	      const struct brw_reg *arg1,
    568 	      const struct brw_reg *arg2)
    569 {
    570    GLuint i;
    571 
    572    for (i = 0; i < 4; i++) {
    573       if (mask & (1<<i)) {
    574 	 brw_MUL(p, dst[i], arg0[i], arg1[i]);
    575 
    576 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    577 	 brw_ADD(p, dst[i], dst[i], arg2[i]);
    578 	 brw_set_saturate(p, 0);
    579       }
    580    }
    581 }
    582 
    583 void emit_lrp(struct brw_compile *p,
    584 	      const struct brw_reg *dst,
    585 	      GLuint mask,
    586 	      const struct brw_reg *arg0,
    587 	      const struct brw_reg *arg1,
    588 	      const struct brw_reg *arg2)
    589 {
    590    GLuint i;
    591 
    592    /* Uses dst as a temporary:
    593     */
    594    for (i = 0; i < 4; i++) {
    595       if (mask & (1<<i)) {
    596 	 /* Can I use the LINE instruction for this?
    597 	  */
    598 	 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
    599 	 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
    600 
    601 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    602 	 brw_MAC(p, dst[i], arg0[i], arg1[i]);
    603 	 brw_set_saturate(p, 0);
    604       }
    605    }
    606 }
    607 
    608 void emit_sop(struct brw_compile *p,
    609 	      const struct brw_reg *dst,
    610 	      GLuint mask,
    611 	      GLuint cond,
    612 	      const struct brw_reg *arg0,
    613 	      const struct brw_reg *arg1)
    614 {
    615    GLuint i;
    616 
    617    for (i = 0; i < 4; i++) {
    618       if (mask & (1<<i)) {
    619 	 brw_push_insn_state(p);
    620 	 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
    621 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    622 	 brw_MOV(p, dst[i], brw_imm_f(0));
    623 	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
    624 	 brw_MOV(p, dst[i], brw_imm_f(1.0));
    625 	 brw_pop_insn_state(p);
    626       }
    627    }
    628 }
    629 
    630 static void emit_slt( struct brw_compile *p,
    631 		      const struct brw_reg *dst,
    632 		      GLuint mask,
    633 		      const struct brw_reg *arg0,
    634 		      const struct brw_reg *arg1 )
    635 {
    636    emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
    637 }
    638 
    639 static void emit_sle( struct brw_compile *p,
    640 		      const struct brw_reg *dst,
    641 		      GLuint mask,
    642 		      const struct brw_reg *arg0,
    643 		      const struct brw_reg *arg1 )
    644 {
    645    emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
    646 }
    647 
    648 static void emit_sgt( struct brw_compile *p,
    649 		      const struct brw_reg *dst,
    650 		      GLuint mask,
    651 		      const struct brw_reg *arg0,
    652 		      const struct brw_reg *arg1 )
    653 {
    654    emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
    655 }
    656 
    657 static void emit_sge( struct brw_compile *p,
    658 		      const struct brw_reg *dst,
    659 		      GLuint mask,
    660 		      const struct brw_reg *arg0,
    661 		      const struct brw_reg *arg1 )
    662 {
    663    emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
    664 }
    665 
    666 static void emit_seq( struct brw_compile *p,
    667 		      const struct brw_reg *dst,
    668 		      GLuint mask,
    669 		      const struct brw_reg *arg0,
    670 		      const struct brw_reg *arg1 )
    671 {
    672    emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
    673 }
    674 
    675 static void emit_sne( struct brw_compile *p,
    676 		      const struct brw_reg *dst,
    677 		      GLuint mask,
    678 		      const struct brw_reg *arg0,
    679 		      const struct brw_reg *arg1 )
    680 {
    681    emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
    682 }
    683 
    684 void emit_cmp(struct brw_compile *p,
    685 	      const struct brw_reg *dst,
    686 	      GLuint mask,
    687 	      const struct brw_reg *arg0,
    688 	      const struct brw_reg *arg1,
    689 	      const struct brw_reg *arg2)
    690 {
    691    GLuint i;
    692 
    693    for (i = 0; i < 4; i++) {
    694       if (mask & (1<<i)) {
    695 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
    696 
    697 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    698 	 brw_SEL(p, dst[i], arg1[i], arg2[i]);
    699 	 brw_set_saturate(p, 0);
    700 	 brw_set_predicate_control_flag_value(p, 0xff);
    701       }
    702    }
    703 }
    704 
    705 void emit_sign(struct brw_compile *p,
    706 	       const struct brw_reg *dst,
    707 	       GLuint mask,
    708 	       const struct brw_reg *arg0)
    709 {
    710    GLuint i;
    711 
    712    for (i = 0; i < 4; i++) {
    713       if (mask & (1<<i)) {
    714 	 brw_MOV(p, dst[i], brw_imm_f(0.0));
    715 
    716 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
    717 	 brw_MOV(p, dst[i], brw_imm_f(-1.0));
    718 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    719 
    720 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
    721 	 brw_MOV(p, dst[i], brw_imm_f(1.0));
    722 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    723       }
    724    }
    725 }
    726 
    727 void emit_max(struct brw_compile *p,
    728 	      const struct brw_reg *dst,
    729 	      GLuint mask,
    730 	      const struct brw_reg *arg0,
    731 	      const struct brw_reg *arg1)
    732 {
    733    GLuint i;
    734 
    735    for (i = 0; i < 4; i++) {
    736       if (mask & (1<<i)) {
    737 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
    738 
    739 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    740 	 brw_SEL(p, dst[i], arg0[i], arg1[i]);
    741 	 brw_set_saturate(p, 0);
    742 	 brw_set_predicate_control_flag_value(p, 0xff);
    743       }
    744    }
    745 }
    746 
    747 void emit_min(struct brw_compile *p,
    748 	      const struct brw_reg *dst,
    749 	      GLuint mask,
    750 	      const struct brw_reg *arg0,
    751 	      const struct brw_reg *arg1)
    752 {
    753    GLuint i;
    754 
    755    for (i = 0; i < 4; i++) {
    756       if (mask & (1<<i)) {
    757 	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
    758 
    759 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    760 	 brw_SEL(p, dst[i], arg0[i], arg1[i]);
    761 	 brw_set_saturate(p, 0);
    762 	 brw_set_predicate_control_flag_value(p, 0xff);
    763       }
    764    }
    765 }
    766 
    767 
    768 void emit_dp2(struct brw_compile *p,
    769 	      const struct brw_reg *dst,
    770 	      GLuint mask,
    771 	      const struct brw_reg *arg0,
    772 	      const struct brw_reg *arg1)
    773 {
    774    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
    775 
    776    if (!(mask & WRITEMASK_XYZW))
    777       return; /* Do not emit dead code */
    778 
    779    assert(is_power_of_two(mask & WRITEMASK_XYZW));
    780 
    781    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    782 
    783    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    784    brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
    785    brw_set_saturate(p, 0);
    786 }
    787 
    788 
    789 void emit_dp3(struct brw_compile *p,
    790 	      const struct brw_reg *dst,
    791 	      GLuint mask,
    792 	      const struct brw_reg *arg0,
    793 	      const struct brw_reg *arg1)
    794 {
    795    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
    796 
    797    if (!(mask & WRITEMASK_XYZW))
    798       return; /* Do not emit dead code */
    799 
    800    assert(is_power_of_two(mask & WRITEMASK_XYZW));
    801 
    802    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    803    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
    804 
    805    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    806    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
    807    brw_set_saturate(p, 0);
    808 }
    809 
    810 
    811 void emit_dp4(struct brw_compile *p,
    812 	      const struct brw_reg *dst,
    813 	      GLuint mask,
    814 	      const struct brw_reg *arg0,
    815 	      const struct brw_reg *arg1)
    816 {
    817    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
    818 
    819    if (!(mask & WRITEMASK_XYZW))
    820       return; /* Do not emit dead code */
    821 
    822    assert(is_power_of_two(mask & WRITEMASK_XYZW));
    823 
    824    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    825    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
    826    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
    827 
    828    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    829    brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
    830    brw_set_saturate(p, 0);
    831 }
    832 
    833 
    834 void emit_dph(struct brw_compile *p,
    835 	      const struct brw_reg *dst,
    836 	      GLuint mask,
    837 	      const struct brw_reg *arg0,
    838 	      const struct brw_reg *arg1)
    839 {
    840    const int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
    841 
    842    if (!(mask & WRITEMASK_XYZW))
    843       return; /* Do not emit dead code */
    844 
    845    assert(is_power_of_two(mask & WRITEMASK_XYZW));
    846 
    847    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    848    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
    849    brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
    850 
    851    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    852    brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
    853    brw_set_saturate(p, 0);
    854 }
    855 
    856 
    857 void emit_xpd(struct brw_compile *p,
    858 	      const struct brw_reg *dst,
    859 	      GLuint mask,
    860 	      const struct brw_reg *arg0,
    861 	      const struct brw_reg *arg1)
    862 {
    863    GLuint i;
    864 
    865    assert((mask & WRITEMASK_W) != WRITEMASK_W);
    866 
    867    for (i = 0 ; i < 3; i++) {
    868       if (mask & (1<<i)) {
    869 	 GLuint i2 = (i+2)%3;
    870 	 GLuint i1 = (i+1)%3;
    871 
    872 	 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
    873 
    874 	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    875 	 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
    876 	 brw_set_saturate(p, 0);
    877       }
    878    }
    879 }
    880 
    881 
    882 void emit_math1(struct brw_wm_compile *c,
    883 		GLuint function,
    884 		const struct brw_reg *dst,
    885 		GLuint mask,
    886 		const struct brw_reg *arg0)
    887 {
    888    struct brw_compile *p = &c->func;
    889    struct intel_context *intel = &p->brw->intel;
    890    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
    891    struct brw_reg src;
    892 
    893    if (!(mask & WRITEMASK_XYZW))
    894       return; /* Do not emit dead code */
    895 
    896    assert(is_power_of_two(mask & WRITEMASK_XYZW));
    897 
    898    if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
    899 			    arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
    900 			   arg0[0].negate || arg0[0].abs)) {
    901       /* Gen6 math requires that source and dst horizontal stride be 1,
    902        * and that the argument be in the GRF.
    903        *
    904        * The hardware ignores source modifiers (negate and abs) on math
    905        * instructions, so we also move to a temp to set those up.
    906        */
    907       src = dst[dst_chan];
    908       brw_MOV(p, src, arg0[0]);
    909    } else {
    910       src = arg0[0];
    911    }
    912 
    913    /* Send two messages to perform all 16 operations:
    914     */
    915    brw_push_insn_state(p);
    916    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    917    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    918    brw_math(p,
    919 	    dst[dst_chan],
    920 	    function,
    921 	    2,
    922 	    src,
    923 	    BRW_MATH_DATA_VECTOR,
    924 	    BRW_MATH_PRECISION_FULL);
    925 
    926    if (c->dispatch_width == 16) {
    927       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    928       brw_math(p,
    929 	       offset(dst[dst_chan],1),
    930 	       function,
    931 	       3,
    932 	       sechalf(src),
    933 	       BRW_MATH_DATA_VECTOR,
    934 	       BRW_MATH_PRECISION_FULL);
    935    }
    936    brw_pop_insn_state(p);
    937 }
    938 
    939 
    940 void emit_math2(struct brw_wm_compile *c,
    941 		GLuint function,
    942 		const struct brw_reg *dst,
    943 		GLuint mask,
    944 		const struct brw_reg *arg0,
    945 		const struct brw_reg *arg1)
    946 {
    947    struct brw_compile *p = &c->func;
    948    struct intel_context *intel = &p->brw->intel;
    949    int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
    950 
    951    if (!(mask & WRITEMASK_XYZW))
    952       return; /* Do not emit dead code */
    953 
    954    assert(is_power_of_two(mask & WRITEMASK_XYZW));
    955 
    956    brw_push_insn_state(p);
    957 
    958    /* math can only operate on up to a vec8 at a time, so in
    959     * dispatch_width==16 we have to do the second half manually.
    960     */
    961    if (intel->gen >= 6) {
    962       struct brw_reg src0 = arg0[0];
    963       struct brw_reg src1 = arg1[0];
    964       struct brw_reg temp_dst = dst[dst_chan];
    965 
    966       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
    967 	 brw_MOV(p, temp_dst, src0);
    968 	 src0 = temp_dst;
    969       }
    970 
    971       if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
    972 	 /* This is a heinous hack to get a temporary register for use
    973 	  * in case both arg0 and arg1 are constants.  Why you're
    974 	  * doing exponentiation on constant values in the shader, we
    975 	  * don't know.
    976 	  *
    977 	  * max_wm_grf is almost surely less than the maximum GRF, and
    978 	  * gen6 doesn't care about the number of GRFs used in a
    979 	  * shader like pre-gen6 did.
    980 	  */
    981 	 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
    982 	 brw_MOV(p, temp, src1);
    983 	 src1 = temp;
    984       }
    985 
    986       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
    987       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    988       brw_math2(p,
    989 		temp_dst,
    990 		function,
    991 		src0,
    992 		src1);
    993       if (c->dispatch_width == 16) {
    994 	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    995 	 brw_math2(p,
    996 		   sechalf(temp_dst),
    997 		   function,
    998 		   sechalf(src0),
    999 		   sechalf(src1));
   1000       }
   1001    } else {
   1002       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1003       brw_MOV(p, brw_message_reg(3), arg1[0]);
   1004       if (c->dispatch_width == 16) {
   1005 	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
   1006 	 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
   1007       }
   1008 
   1009       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
   1010       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1011       brw_math(p,
   1012 	       dst[dst_chan],
   1013 	       function,
   1014 	       2,
   1015 	       arg0[0],
   1016 	       BRW_MATH_DATA_VECTOR,
   1017 	       BRW_MATH_PRECISION_FULL);
   1018 
   1019       /* Send two messages to perform all 16 operations:
   1020        */
   1021       if (c->dispatch_width == 16) {
   1022 	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
   1023 	 brw_math(p,
   1024 		  offset(dst[dst_chan],1),
   1025 		  function,
   1026 		  4,
   1027 		  sechalf(arg0[0]),
   1028 		  BRW_MATH_DATA_VECTOR,
   1029 		  BRW_MATH_PRECISION_FULL);
   1030       }
   1031    }
   1032    brw_pop_insn_state(p);
   1033 }
   1034 
   1035 
   1036 void emit_tex(struct brw_wm_compile *c,
   1037 	      struct brw_reg *dst,
   1038 	      GLuint dst_flags,
   1039 	      struct brw_reg *arg,
   1040 	      struct brw_reg depth_payload,
   1041 	      GLuint tex_idx,
   1042 	      GLuint sampler,
   1043 	      bool shadow)
   1044 {
   1045    struct brw_compile *p = &c->func;
   1046    struct intel_context *intel = &p->brw->intel;
   1047    struct brw_reg dst_retyped;
   1048    GLuint cur_mrf = 2, response_length;
   1049    GLuint i, nr_texcoords;
   1050    GLuint emit;
   1051    GLuint msg_type;
   1052    GLuint mrf_per_channel;
   1053    GLuint simd_mode;
   1054 
   1055    if (c->dispatch_width == 16) {
   1056       mrf_per_channel = 2;
   1057       response_length = 8;
   1058       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
   1059       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   1060    } else {
   1061       mrf_per_channel = 1;
   1062       response_length = 4;
   1063       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
   1064       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   1065    }
   1066 
   1067    /* How many input regs are there?
   1068     */
   1069    switch (tex_idx) {
   1070    case TEXTURE_1D_INDEX:
   1071       emit = WRITEMASK_X;
   1072       nr_texcoords = 1;
   1073       break;
   1074    case TEXTURE_2D_INDEX:
   1075    case TEXTURE_1D_ARRAY_INDEX:
   1076    case TEXTURE_RECT_INDEX:
   1077    case TEXTURE_EXTERNAL_INDEX:
   1078       emit = WRITEMASK_XY;
   1079       nr_texcoords = 2;
   1080       break;
   1081    case TEXTURE_3D_INDEX:
   1082    case TEXTURE_2D_ARRAY_INDEX:
   1083    case TEXTURE_CUBE_INDEX:
   1084       emit = WRITEMASK_XYZ;
   1085       nr_texcoords = 3;
   1086       break;
   1087    default:
   1088       /* unexpected target */
   1089       abort();
   1090    }
   1091 
   1092    /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
   1093    if (intel->gen < 5 && c->dispatch_width == 8)
   1094       nr_texcoords = 3;
   1095 
   1096    if (shadow) {
   1097       if (intel->gen < 7) {
   1098 	 /* For shadow comparisons, we have to supply u,v,r. */
   1099 	 nr_texcoords = 3;
   1100       } else {
   1101 	 /* On Ivybridge, the shadow comparitor comes first. Just load it. */
   1102 	 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
   1103 	 cur_mrf += mrf_per_channel;
   1104       }
   1105    }
   1106 
   1107    /* Emit the texcoords. */
   1108    for (i = 0; i < nr_texcoords; i++) {
   1109       if (c->key.tex.gl_clamp_mask[i] & (1 << sampler))
   1110 	 brw_set_saturate(p, true);
   1111 
   1112       if (emit & (1<<i))
   1113 	 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
   1114       else
   1115 	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
   1116       cur_mrf += mrf_per_channel;
   1117 
   1118       brw_set_saturate(p, false);
   1119    }
   1120 
   1121    /* Fill in the shadow comparison reference value. */
   1122    if (shadow && intel->gen < 7) {
   1123       if (intel->gen >= 5) {
   1124 	 /* Fill in the cube map array index value. */
   1125 	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
   1126 	 cur_mrf += mrf_per_channel;
   1127       } else if (c->dispatch_width == 8) {
   1128 	 /* Fill in the LOD bias value. */
   1129 	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
   1130 	 cur_mrf += mrf_per_channel;
   1131       }
   1132       brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
   1133       cur_mrf += mrf_per_channel;
   1134    }
   1135 
   1136    if (intel->gen >= 5) {
   1137       if (shadow)
   1138 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
   1139       else
   1140 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
   1141    } else {
   1142       /* Note that G45 and older determines shadow compare and dispatch width
   1143        * from message length for most messages.
   1144        */
   1145       if (c->dispatch_width == 16 && shadow)
   1146 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
   1147       else
   1148 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
   1149    }
   1150 
   1151    brw_SAMPLE(p,
   1152 	      dst_retyped,
   1153 	      1,
   1154 	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
   1155               SURF_INDEX_TEXTURE(sampler),
   1156 	      sampler,
   1157 	      dst_flags & WRITEMASK_XYZW,
   1158 	      msg_type,
   1159 	      response_length,
   1160 	      cur_mrf - 1,
   1161 	      1,
   1162 	      simd_mode,
   1163 	      BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
   1164 }
   1165 
   1166 
   1167 void emit_txb(struct brw_wm_compile *c,
   1168 	      struct brw_reg *dst,
   1169 	      GLuint dst_flags,
   1170 	      struct brw_reg *arg,
   1171 	      struct brw_reg depth_payload,
   1172 	      GLuint tex_idx,
   1173 	      GLuint sampler)
   1174 {
   1175    struct brw_compile *p = &c->func;
   1176    struct intel_context *intel = &p->brw->intel;
   1177    GLuint msgLength;
   1178    GLuint msg_type;
   1179    GLuint mrf_per_channel;
   1180    GLuint response_length;
   1181    struct brw_reg dst_retyped;
   1182 
   1183    /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
   1184     * samples, so we'll use the 16-wide instruction, leave the second halves
   1185     * undefined, and trust the execution mask to keep the undefined pixels
   1186     * from mattering.
   1187     */
   1188    if (c->dispatch_width == 16 || intel->gen < 5) {
   1189       if (intel->gen >= 5)
   1190 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   1191       else
   1192 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
   1193       mrf_per_channel = 2;
   1194       dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
   1195       response_length = 8;
   1196    } else {
   1197       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   1198       mrf_per_channel = 1;
   1199       dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
   1200       response_length = 4;
   1201    }
   1202 
   1203    /* Shadow ignored for txb. */
   1204    switch (tex_idx) {
   1205    case TEXTURE_1D_INDEX:
   1206       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
   1207       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
   1208       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
   1209       break;
   1210    case TEXTURE_2D_INDEX:
   1211    case TEXTURE_RECT_INDEX:
   1212    case TEXTURE_EXTERNAL_INDEX:
   1213       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
   1214       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
   1215       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
   1216       break;
   1217    case TEXTURE_3D_INDEX:
   1218    case TEXTURE_CUBE_INDEX:
   1219       brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
   1220       brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
   1221       brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
   1222       break;
   1223    default:
   1224       /* unexpected target */
   1225       abort();
   1226    }
   1227 
   1228    brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
   1229    msgLength = 2 + 4 * mrf_per_channel - 1;
   1230 
   1231    brw_SAMPLE(p,
   1232 	      dst_retyped,
   1233 	      1,
   1234 	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
   1235               SURF_INDEX_TEXTURE(sampler),
   1236 	      sampler,
   1237 	      dst_flags & WRITEMASK_XYZW,
   1238 	      msg_type,
   1239 	      response_length,
   1240 	      msgLength,
   1241 	      1,
   1242 	      BRW_SAMPLER_SIMD_MODE_SIMD16,
   1243 	      BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
   1244 }
   1245 
   1246 
   1247 static void emit_lit(struct brw_wm_compile *c,
   1248 		     const struct brw_reg *dst,
   1249 		     GLuint mask,
   1250 		     const struct brw_reg *arg0)
   1251 {
   1252    struct brw_compile *p = &c->func;
   1253 
   1254    assert((mask & WRITEMASK_XW) == 0);
   1255 
   1256    if (mask & WRITEMASK_Y) {
   1257       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
   1258       brw_MOV(p, dst[1], arg0[0]);
   1259       brw_set_saturate(p, 0);
   1260    }
   1261 
   1262    if (mask & WRITEMASK_Z) {
   1263       emit_math2(c, BRW_MATH_FUNCTION_POW,
   1264 		 &dst[2],
   1265 		 WRITEMASK_X | (mask & SATURATE),
   1266 		 &arg0[1],
   1267 		 &arg0[3]);
   1268    }
   1269 
   1270    /* Ordinarily you'd use an iff statement to skip or shortcircuit
   1271     * some of the POW calculations above, but 16-wide iff statements
   1272     * seem to lock c1 hardware, so this is a nasty workaround:
   1273     */
   1274    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
   1275    {
   1276       if (mask & WRITEMASK_Y)
   1277 	 brw_MOV(p, dst[1], brw_imm_f(0));
   1278 
   1279       if (mask & WRITEMASK_Z)
   1280 	 brw_MOV(p, dst[2], brw_imm_f(0));
   1281    }
   1282    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   1283 }
   1284 
   1285 
   1286 /* Kill pixel - set execution mask to zero for those pixels which
   1287  * fail.
   1288  */
   1289 static void emit_kil( struct brw_wm_compile *c,
   1290 		      struct brw_reg *arg0)
   1291 {
   1292    struct brw_compile *p = &c->func;
   1293    struct intel_context *intel = &p->brw->intel;
   1294    struct brw_reg pixelmask;
   1295    GLuint i, j;
   1296 
   1297    if (intel->gen >= 6)
   1298       pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   1299    else
   1300       pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   1301 
   1302    for (i = 0; i < 4; i++) {
   1303       /* Check if we've already done the comparison for this reg
   1304        * -- common when someone does KIL TEMP.wwww.
   1305        */
   1306       for (j = 0; j < i; j++) {
   1307 	 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
   1308 	    break;
   1309       }
   1310       if (j != i)
   1311 	 continue;
   1312 
   1313       brw_push_insn_state(p);
   1314       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
   1315       brw_set_predicate_control_flag_value(p, 0xff);
   1316       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1317       brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
   1318       brw_pop_insn_state(p);
   1319    }
   1320 }
   1321 
   1322 static void fire_fb_write( struct brw_wm_compile *c,
   1323 			   GLuint base_reg,
   1324 			   GLuint nr,
   1325 			   GLuint target,
   1326 			   GLuint eot )
   1327 {
   1328    struct brw_compile *p = &c->func;
   1329    struct intel_context *intel = &p->brw->intel;
   1330    uint32_t msg_control;
   1331 
   1332    /* Pass through control information:
   1333     *
   1334     * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
   1335     */
   1336 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
   1337    if (intel->gen < 6)
   1338    {
   1339       brw_push_insn_state(p);
   1340       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
   1341       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1342       brw_MOV(p,
   1343 	       brw_message_reg(base_reg + 1),
   1344 	       brw_vec8_grf(1, 0));
   1345       brw_pop_insn_state(p);
   1346    }
   1347 
   1348    if (c->dispatch_width == 16)
   1349       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   1350    else
   1351       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
   1352 
   1353    /* Send framebuffer write message: */
   1354 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
   1355    brw_fb_WRITE(p,
   1356 		c->dispatch_width,
   1357 		base_reg,
   1358 		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
   1359 		msg_control,
   1360 		target,
   1361 		nr,
   1362 		0,
   1363 		eot,
   1364 		true);
   1365 }
   1366 
   1367 
   1368 static void emit_aa( struct brw_wm_compile *c,
   1369 		     struct brw_reg *arg1,
   1370 		     GLuint reg )
   1371 {
   1372    struct brw_compile *p = &c->func;
   1373    GLuint comp = c->aa_dest_stencil_reg / 2;
   1374    GLuint off = c->aa_dest_stencil_reg % 2;
   1375    struct brw_reg aa = offset(arg1[comp], off);
   1376 
   1377    brw_push_insn_state(p);
   1378    brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
   1379    brw_MOV(p, brw_message_reg(reg), aa);
   1380    brw_pop_insn_state(p);
   1381 }
   1382 
   1383 
   1384 /* Post-fragment-program processing.  Send the results to the
   1385  * framebuffer.
   1386  * \param arg0  the fragment color
   1387  * \param arg1  the pass-through depth value
   1388  * \param arg2  the shader-computed depth value
   1389  */
   1390 void emit_fb_write(struct brw_wm_compile *c,
   1391 		   struct brw_reg *arg0,
   1392 		   struct brw_reg *arg1,
   1393 		   struct brw_reg *arg2,
   1394 		   GLuint target,
   1395 		   GLuint eot)
   1396 {
   1397    struct brw_compile *p = &c->func;
   1398    struct brw_context *brw = p->brw;
   1399    struct intel_context *intel = &brw->intel;
   1400    GLuint nr = 2;
   1401    GLuint channel;
   1402 
   1403    /* Reserve a space for AA - may not be needed:
   1404     */
   1405    if (c->aa_dest_stencil_reg)
   1406       nr += 1;
   1407 
   1408    /* I don't really understand how this achieves the color interleave
   1409     * (ie RGBARGBA) in the result:  [Do the saturation here]
   1410     */
   1411    brw_push_insn_state(p);
   1412 
   1413    if (c->key.clamp_fragment_color)
   1414       brw_set_saturate(p, 1);
   1415 
   1416    for (channel = 0; channel < 4; channel++) {
   1417       if (intel->gen >= 6) {
   1418 	 /* gen6 SIMD16 single source DP write looks like:
   1419 	  * m + 0: r0
   1420 	  * m + 1: r1
   1421 	  * m + 2: g0
   1422 	  * m + 3: g1
   1423 	  * m + 4: b0
   1424 	  * m + 5: b1
   1425 	  * m + 6: a0
   1426 	  * m + 7: a1
   1427 	  */
   1428 	 if (c->dispatch_width == 16) {
   1429 	    brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
   1430 	 } else {
   1431 	    brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
   1432 	 }
   1433       } else if (c->dispatch_width == 16 && brw->has_compr4) {
   1434 	 /* pre-gen6 SIMD16 single source DP write looks like:
   1435 	  * m + 0: r0
   1436 	  * m + 1: g0
   1437 	  * m + 2: b0
   1438 	  * m + 3: a0
   1439 	  * m + 4: r1
   1440 	  * m + 5: g1
   1441 	  * m + 6: b1
   1442 	  * m + 7: a1
   1443 	  *
   1444 	  * By setting the high bit of the MRF register number, we indicate
   1445 	  * that we want COMPR4 mode - instead of doing the usual destination
   1446 	  * + 1 for the second half we get destination + 4.
   1447 	  */
   1448 	 brw_MOV(p,
   1449 		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
   1450 		 arg0[channel]);
   1451       } else {
   1452 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
   1453 	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
   1454 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1455 	 brw_MOV(p,
   1456 		 brw_message_reg(nr + channel),
   1457 		 arg0[channel]);
   1458 
   1459 	 if (c->dispatch_width == 16) {
   1460 	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
   1461 	    brw_MOV(p,
   1462 		    brw_message_reg(nr + channel + 4),
   1463 		    sechalf(arg0[channel]));
   1464 	 }
   1465       }
   1466    }
   1467 
   1468    brw_set_saturate(p, 0);
   1469 
   1470    /* skip over the regs populated above:
   1471     */
   1472    if (c->dispatch_width == 16)
   1473       nr += 8;
   1474    else
   1475       nr += 4;
   1476 
   1477    brw_pop_insn_state(p);
   1478 
   1479    if (c->source_depth_to_render_target)
   1480    {
   1481       if (c->computes_depth)
   1482 	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
   1483       else
   1484 	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
   1485 
   1486       nr += 2;
   1487    }
   1488 
   1489    if (c->dest_depth_reg)
   1490    {
   1491       GLuint comp = c->dest_depth_reg / 2;
   1492       GLuint off = c->dest_depth_reg % 2;
   1493 
   1494       if (off != 0) {
   1495          brw_push_insn_state(p);
   1496          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1497 
   1498          brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
   1499          /* 2nd half? */
   1500          brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
   1501          brw_pop_insn_state(p);
   1502       }
   1503       else {
   1504          brw_MOV(p, brw_message_reg(nr), arg1[comp]);
   1505       }
   1506       nr += 2;
   1507    }
   1508 
   1509    if (intel->gen >= 6) {
   1510       /* Load the message header.  There's no implied move from src0
   1511        * to the base mrf on gen6.
   1512        */
   1513       brw_push_insn_state(p);
   1514       brw_set_mask_control(p, BRW_MASK_DISABLE);
   1515       brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
   1516 	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   1517       brw_pop_insn_state(p);
   1518 
   1519       if (target != 0) {
   1520 	 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   1521 					0,
   1522 					2), BRW_REGISTER_TYPE_UD),
   1523 		 brw_imm_ud(target));
   1524       }
   1525    }
   1526 
   1527    if (!c->runtime_check_aads_emit) {
   1528       if (c->aa_dest_stencil_reg)
   1529 	 emit_aa(c, arg1, 2);
   1530 
   1531       fire_fb_write(c, 0, nr, target, eot);
   1532    }
   1533    else {
   1534       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
   1535       struct brw_reg ip = brw_ip_reg();
   1536       int jmp;
   1537 
   1538       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1539       brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
   1540       brw_AND(p,
   1541 	      v1_null_ud,
   1542 	      get_element_ud(brw_vec8_grf(1,0), 6),
   1543 	      brw_imm_ud(1<<26));
   1544 
   1545       jmp = brw_JMPI(p, ip, ip, brw_imm_w(0)) - p->store;
   1546       {
   1547 	 emit_aa(c, arg1, 2);
   1548 	 fire_fb_write(c, 0, nr, target, eot);
   1549 	 /* note - thread killed in subroutine */
   1550       }
   1551       brw_land_fwd_jump(p, jmp);
   1552 
   1553       /* ELSE: Shuffle up one register to fill in the hole left for AA:
   1554        */
   1555       fire_fb_write(c, 1, nr-1, target, eot);
   1556    }
   1557 }
   1558 
   1559 /**
   1560  * Move a GPR to scratch memory.
   1561  */
   1562 static void emit_spill( struct brw_wm_compile *c,
   1563 			struct brw_reg reg,
   1564 			GLuint slot )
   1565 {
   1566    struct brw_compile *p = &c->func;
   1567 
   1568    /*
   1569      mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
   1570    */
   1571    brw_MOV(p, brw_message_reg(2), reg);
   1572 
   1573    /*
   1574      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
   1575      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
   1576    */
   1577    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
   1578 }
   1579 
   1580 
   1581 /**
   1582  * Load a GPR from scratch memory.
   1583  */
   1584 static void emit_unspill( struct brw_wm_compile *c,
   1585 			  struct brw_reg reg,
   1586 			  GLuint slot )
   1587 {
   1588    struct brw_compile *p = &c->func;
   1589 
   1590    /* Slot 0 is the undef value.
   1591     */
   1592    if (slot == 0) {
   1593       brw_MOV(p, reg, brw_imm_f(0));
   1594       return;
   1595    }
   1596 
   1597    /*
   1598      mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
   1599      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
   1600    */
   1601 
   1602    brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
   1603 }
   1604 
   1605 
   1606 /**
   1607  * Retrieve up to 4 GEN4 register pairs for the given wm reg:
   1608  * Args with unspill_reg != 0 will be loaded from scratch memory.
   1609  */
   1610 static void get_argument_regs( struct brw_wm_compile *c,
   1611 			       struct brw_wm_ref *arg[],
   1612 			       struct brw_reg *regs )
   1613 {
   1614    GLuint i;
   1615 
   1616    for (i = 0; i < 4; i++) {
   1617       if (arg[i]) {
   1618 	 if (arg[i]->unspill_reg)
   1619 	    emit_unspill(c,
   1620 			 brw_vec8_grf(arg[i]->unspill_reg, 0),
   1621 			 arg[i]->value->spill_slot);
   1622 
   1623 	 regs[i] = arg[i]->hw_reg;
   1624       }
   1625       else {
   1626 	 regs[i] = brw_null_reg();
   1627       }
   1628    }
   1629 }
   1630 
   1631 
   1632 /**
   1633  * For values that have a spill_slot!=0, write those regs to scratch memory.
   1634  */
   1635 static void spill_values( struct brw_wm_compile *c,
   1636 			  struct brw_wm_value *values,
   1637 			  GLuint nr )
   1638 {
   1639    GLuint i;
   1640 
   1641    for (i = 0; i < nr; i++)
   1642       if (values[i].spill_slot)
   1643 	 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
   1644 }
   1645 
   1646 
   1647 /* Emit the fragment program instructions here.
   1648  */
   1649 void brw_wm_emit( struct brw_wm_compile *c )
   1650 {
   1651    struct brw_compile *p = &c->func;
   1652    struct intel_context *intel = &p->brw->intel;
   1653    GLuint insn;
   1654 
   1655    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   1656    if (intel->gen >= 6)
   1657 	brw_set_acc_write_control(p, 1);
   1658 
   1659    /* Check if any of the payload regs need to be spilled:
   1660     */
   1661    spill_values(c, c->payload.depth, 4);
   1662    spill_values(c, c->creg, c->nr_creg);
   1663    spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
   1664 
   1665 
   1666    for (insn = 0; insn < c->nr_insns; insn++) {
   1667 
   1668       struct brw_wm_instruction *inst = &c->instruction[insn];
   1669       struct brw_reg args[3][4], dst[4];
   1670       GLuint i, dst_flags;
   1671 
   1672       /* Get argument regs:
   1673        */
   1674       for (i = 0; i < 3; i++)
   1675 	 get_argument_regs(c, inst->src[i], args[i]);
   1676 
   1677       /* Get dest regs:
   1678        */
   1679       for (i = 0; i < 4; i++)
   1680 	 if (inst->dst[i])
   1681 	    dst[i] = inst->dst[i]->hw_reg;
   1682 	 else
   1683 	    dst[i] = brw_null_reg();
   1684 
   1685       /* Flags
   1686        */
   1687       dst_flags = inst->writemask;
   1688       if (inst->saturate)
   1689 	 dst_flags |= SATURATE;
   1690 
   1691       switch (inst->opcode) {
   1692 	 /* Generated instructions for calculating triangle interpolants:
   1693 	  */
   1694       case WM_PIXELXY:
   1695 	 emit_pixel_xy(c, dst, dst_flags);
   1696 	 break;
   1697 
   1698       case WM_DELTAXY:
   1699 	 emit_delta_xy(p, dst, dst_flags, args[0]);
   1700 	 break;
   1701 
   1702       case WM_WPOSXY:
   1703 	 emit_wpos_xy(c, dst, dst_flags, args[0]);
   1704 	 break;
   1705 
   1706       case WM_PIXELW:
   1707 	 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
   1708 	 break;
   1709 
   1710       case WM_LINTERP:
   1711 	 emit_linterp(p, dst, dst_flags, args[0], args[1]);
   1712 	 break;
   1713 
   1714       case WM_PINTERP:
   1715 	 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
   1716 	 break;
   1717 
   1718       case WM_CINTERP:
   1719 	 emit_cinterp(p, dst, dst_flags, args[0]);
   1720 	 break;
   1721 
   1722       case WM_FB_WRITE:
   1723 	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
   1724 	 break;
   1725 
   1726       case WM_FRONTFACING:
   1727 	 emit_frontfacing(p, dst, dst_flags);
   1728 	 break;
   1729 
   1730 	 /* Straightforward arithmetic:
   1731 	  */
   1732       case OPCODE_ADD:
   1733 	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
   1734 	 break;
   1735 
   1736       case OPCODE_FRC:
   1737 	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
   1738 	 break;
   1739 
   1740       case OPCODE_FLR:
   1741 	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
   1742 	 break;
   1743 
   1744       case OPCODE_DDX:
   1745 	 emit_ddxy(p, dst, dst_flags, true, args[0], false);
   1746 	 break;
   1747 
   1748       case OPCODE_DDY:
   1749          /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no
   1750           * guarantee that c->key.render_to_fbo is set).
   1751           */
   1752          assert(c->fp->program.UsesDFdy);
   1753 	 emit_ddxy(p, dst, dst_flags, false, args[0], c->key.render_to_fbo);
   1754 	 break;
   1755 
   1756       case OPCODE_DP2:
   1757 	 emit_dp2(p, dst, dst_flags, args[0], args[1]);
   1758 	 break;
   1759 
   1760       case OPCODE_DP3:
   1761 	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
   1762 	 break;
   1763 
   1764       case OPCODE_DP4:
   1765 	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
   1766 	 break;
   1767 
   1768       case OPCODE_DPH:
   1769 	 emit_dph(p, dst, dst_flags, args[0], args[1]);
   1770 	 break;
   1771 
   1772       case OPCODE_TRUNC:
   1773 	 for (i = 0; i < 4; i++) {
   1774 	    if (dst_flags & (1<<i)) {
   1775 	       brw_RNDZ(p, dst[i], args[0][i]);
   1776 	    }
   1777 	 }
   1778 	 break;
   1779 
   1780       case OPCODE_LRP:
   1781 	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
   1782 	 break;
   1783 
   1784       case OPCODE_MAD:
   1785 	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
   1786 	 break;
   1787 
   1788       case OPCODE_MOV:
   1789       case OPCODE_SWZ:
   1790 	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
   1791 	 break;
   1792 
   1793       case OPCODE_MUL:
   1794 	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
   1795 	 break;
   1796 
   1797       case OPCODE_XPD:
   1798 	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
   1799 	 break;
   1800 
   1801 	 /* Higher math functions:
   1802 	  */
   1803       case OPCODE_RCP:
   1804 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
   1805 	 break;
   1806 
   1807       case OPCODE_RSQ:
   1808 	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
   1809 	 break;
   1810 
   1811       case OPCODE_SIN:
   1812 	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
   1813 	 break;
   1814 
   1815       case OPCODE_COS:
   1816 	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
   1817 	 break;
   1818 
   1819       case OPCODE_EX2:
   1820 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
   1821 	 break;
   1822 
   1823       case OPCODE_LG2:
   1824 	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
   1825 	 break;
   1826 
   1827       case OPCODE_SCS:
   1828 	 /* There is an scs math function, but it would need some
   1829 	  * fixup for 16-element execution.
   1830 	  */
   1831 	 if (dst_flags & WRITEMASK_X)
   1832 	    emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
   1833 	 if (dst_flags & WRITEMASK_Y)
   1834 	    emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
   1835 	 break;
   1836 
   1837       case OPCODE_POW:
   1838 	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
   1839 	 break;
   1840 
   1841 	 /* Comparisons:
   1842 	  */
   1843       case OPCODE_CMP:
   1844 	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
   1845 	 break;
   1846 
   1847       case OPCODE_MAX:
   1848 	 emit_max(p, dst, dst_flags, args[0], args[1]);
   1849 	 break;
   1850 
   1851       case OPCODE_MIN:
   1852 	 emit_min(p, dst, dst_flags, args[0], args[1]);
   1853 	 break;
   1854 
   1855       case OPCODE_SLT:
   1856 	 emit_slt(p, dst, dst_flags, args[0], args[1]);
   1857 	 break;
   1858 
   1859       case OPCODE_SLE:
   1860 	 emit_sle(p, dst, dst_flags, args[0], args[1]);
   1861 	break;
   1862       case OPCODE_SGT:
   1863 	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
   1864 	break;
   1865       case OPCODE_SGE:
   1866 	 emit_sge(p, dst, dst_flags, args[0], args[1]);
   1867 	 break;
   1868       case OPCODE_SEQ:
   1869 	 emit_seq(p, dst, dst_flags, args[0], args[1]);
   1870 	break;
   1871       case OPCODE_SNE:
   1872 	 emit_sne(p, dst, dst_flags, args[0], args[1]);
   1873 	break;
   1874 
   1875       case OPCODE_SSG:
   1876 	 emit_sign(p, dst, dst_flags, args[0]);
   1877 	 break;
   1878 
   1879       case OPCODE_LIT:
   1880 	 emit_lit(c, dst, dst_flags, args[0]);
   1881 	 break;
   1882 
   1883 	 /* Texturing operations:
   1884 	  */
   1885       case OPCODE_TEX:
   1886 	 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
   1887 		  inst->tex_idx, inst->tex_unit,
   1888 		  inst->tex_shadow);
   1889 	 break;
   1890 
   1891       case OPCODE_TXB:
   1892 	 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
   1893 		  inst->tex_idx, inst->tex_unit);
   1894 	 break;
   1895 
   1896       case OPCODE_KIL:
   1897 	 emit_kil(c, args[0]);
   1898 	 break;
   1899 
   1900       default:
   1901 	 printf("Unsupported opcode %i (%s) in fragment shader\n",
   1902 		inst->opcode, inst->opcode < MAX_OPCODE ?
   1903 		_mesa_opcode_string(inst->opcode) :
   1904 		"unknown");
   1905       }
   1906 
   1907       for (i = 0; i < 4; i++)
   1908 	if (inst->dst[i] && inst->dst[i]->spill_slot)
   1909 	   emit_spill(c,
   1910 		      inst->dst[i]->hw_reg,
   1911 		      inst->dst[i]->spill_slot);
   1912    }
   1913 
   1914    /* Only properly tested on ILK */
   1915    if (p->brw->intel.gen == 5) {
   1916      brw_remove_duplicate_mrf_moves(p);
   1917      if (c->dispatch_width == 16)
   1918 	brw_remove_grf_to_mrf_moves(p);
   1919    }
   1920 
   1921    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   1922       int i;
   1923 
   1924      printf("wm-native:\n");
   1925      for (i = 0; i < p->nr_insn; i++)
   1926 	 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
   1927       printf("\n");
   1928    }
   1929 }
   1930 
   1931