Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 /** @file brw_fs_emit.cpp
     25  *
     26  * This file supports emitting code from the FS LIR to the actual
     27  * native instructions.
     28  */
     29 
     30 extern "C" {
     31 #include "main/macros.h"
     32 #include "brw_context.h"
     33 #include "brw_eu.h"
     34 } /* extern "C" */
     35 
     36 #include "brw_fs.h"
     37 #include "brw_fs_cfg.h"
     38 #include "glsl/ir_print_visitor.h"
     39 
     40 void
     41 fs_visitor::generate_fb_write(fs_inst *inst)
     42 {
     43    bool eot = inst->eot;
     44    struct brw_reg implied_header;
     45    uint32_t msg_control;
     46 
     47    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
     48     * move, here's g1.
     49     */
     50    brw_push_insn_state(p);
     51    brw_set_mask_control(p, BRW_MASK_DISABLE);
     52    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     53 
     54    if (inst->header_present) {
     55       if (intel->gen >= 6) {
     56 	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
     57 	 brw_MOV(p,
     58 		 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
     59 		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
     60 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     61 
     62          if (inst->target > 0 &&
     63 	     c->key.nr_color_regions > 1 &&
     64 	     c->key.sample_alpha_to_coverage) {
     65             /* Set "Source0 Alpha Present to RenderTarget" bit in message
     66              * header.
     67              */
     68             brw_OR(p,
     69 		   vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
     70 		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
     71 		   brw_imm_ud(0x1 << 11));
     72          }
     73 
     74 	 if (inst->target > 0) {
     75 	    /* Set the render target index for choosing BLEND_STATE. */
     76 	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
     77 					   inst->base_mrf, 2),
     78 			      BRW_REGISTER_TYPE_UD),
     79 		    brw_imm_ud(inst->target));
     80 	 }
     81 
     82 	 implied_header = brw_null_reg();
     83       } else {
     84 	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
     85 
     86 	 brw_MOV(p,
     87 		 brw_message_reg(inst->base_mrf + 1),
     88 		 brw_vec8_grf(1, 0));
     89       }
     90    } else {
     91       implied_header = brw_null_reg();
     92    }
     93 
     94    if (this->dual_src_output.file != BAD_FILE)
     95       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
     96    else if (c->dispatch_width == 16)
     97       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
     98    else
     99       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
    100 
    101    brw_pop_insn_state(p);
    102 
    103    brw_fb_WRITE(p,
    104 		c->dispatch_width,
    105 		inst->base_mrf,
    106 		implied_header,
    107 		msg_control,
    108 		inst->target,
    109 		inst->mlen,
    110 		0,
    111 		eot,
    112 		inst->header_present);
    113 }
    114 
    115 /* Computes the integer pixel x,y values from the origin.
    116  *
    117  * This is the basis of gl_FragCoord computation, but is also used
    118  * pre-gen6 for computing the deltas from v0 for computing
    119  * interpolation.
    120  */
    121 void
    122 fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
    123 {
    124    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
    125    struct brw_reg src;
    126    struct brw_reg deltas;
    127 
    128    if (is_x) {
    129       src = stride(suboffset(g1_uw, 4), 2, 4, 0);
    130       deltas = brw_imm_v(0x10101010);
    131    } else {
    132       src = stride(suboffset(g1_uw, 5), 2, 4, 0);
    133       deltas = brw_imm_v(0x11001100);
    134    }
    135 
    136    if (c->dispatch_width == 16) {
    137       dst = vec16(dst);
    138    }
    139 
    140    /* We do this 8 or 16-wide, but since the destination is UW we
    141     * don't do compression in the 16-wide case.
    142     */
    143    brw_push_insn_state(p);
    144    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    145    brw_ADD(p, dst, src, deltas);
    146    brw_pop_insn_state(p);
    147 }
    148 
    149 void
    150 fs_visitor::generate_linterp(fs_inst *inst,
    151 			     struct brw_reg dst, struct brw_reg *src)
    152 {
    153    struct brw_reg delta_x = src[0];
    154    struct brw_reg delta_y = src[1];
    155    struct brw_reg interp = src[2];
    156 
    157    if (brw->has_pln &&
    158        delta_y.nr == delta_x.nr + 1 &&
    159        (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
    160       brw_PLN(p, dst, interp, delta_x);
    161    } else {
    162       brw_LINE(p, brw_null_reg(), interp, delta_x);
    163       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
    164    }
    165 }
    166 
    167 void
    168 fs_visitor::generate_math1_gen7(fs_inst *inst,
    169 			        struct brw_reg dst,
    170 			        struct brw_reg src0)
    171 {
    172    assert(inst->mlen == 0);
    173    brw_math(p, dst,
    174 	    brw_math_function(inst->opcode),
    175 	    0, src0,
    176 	    BRW_MATH_DATA_VECTOR,
    177 	    BRW_MATH_PRECISION_FULL);
    178 }
    179 
    180 void
    181 fs_visitor::generate_math2_gen7(fs_inst *inst,
    182 			        struct brw_reg dst,
    183 			        struct brw_reg src0,
    184 			        struct brw_reg src1)
    185 {
    186    assert(inst->mlen == 0);
    187    brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
    188 }
    189 
    190 void
    191 fs_visitor::generate_math1_gen6(fs_inst *inst,
    192 			        struct brw_reg dst,
    193 			        struct brw_reg src0)
    194 {
    195    int op = brw_math_function(inst->opcode);
    196 
    197    assert(inst->mlen == 0);
    198 
    199    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    200    brw_math(p, dst,
    201 	    op,
    202 	    0, src0,
    203 	    BRW_MATH_DATA_VECTOR,
    204 	    BRW_MATH_PRECISION_FULL);
    205 
    206    if (c->dispatch_width == 16) {
    207       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    208       brw_math(p, sechalf(dst),
    209 	       op,
    210 	       0, sechalf(src0),
    211 	       BRW_MATH_DATA_VECTOR,
    212 	       BRW_MATH_PRECISION_FULL);
    213       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
    214    }
    215 }
    216 
    217 void
    218 fs_visitor::generate_math2_gen6(fs_inst *inst,
    219 			        struct brw_reg dst,
    220 			        struct brw_reg src0,
    221 			        struct brw_reg src1)
    222 {
    223    int op = brw_math_function(inst->opcode);
    224 
    225    assert(inst->mlen == 0);
    226 
    227    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    228    brw_math2(p, dst, op, src0, src1);
    229 
    230    if (c->dispatch_width == 16) {
    231       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    232       brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
    233       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
    234    }
    235 }
    236 
    237 void
    238 fs_visitor::generate_math_gen4(fs_inst *inst,
    239 			       struct brw_reg dst,
    240 			       struct brw_reg src)
    241 {
    242    int op = brw_math_function(inst->opcode);
    243 
    244    assert(inst->mlen >= 1);
    245 
    246    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    247    brw_math(p, dst,
    248 	    op,
    249 	    inst->base_mrf, src,
    250 	    BRW_MATH_DATA_VECTOR,
    251 	    BRW_MATH_PRECISION_FULL);
    252 
    253    if (c->dispatch_width == 16) {
    254       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    255       brw_math(p, sechalf(dst),
    256 	       op,
    257 	       inst->base_mrf + 1, sechalf(src),
    258 	       BRW_MATH_DATA_VECTOR,
    259 	       BRW_MATH_PRECISION_FULL);
    260 
    261       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
    262    }
    263 }
    264 
    265 void
    266 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
    267 {
    268    int msg_type = -1;
    269    int rlen = 4;
    270    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
    271    uint32_t return_format;
    272 
    273    switch (dst.type) {
    274    case BRW_REGISTER_TYPE_D:
    275       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
    276       break;
    277    case BRW_REGISTER_TYPE_UD:
    278       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
    279       break;
    280    default:
    281       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
    282       break;
    283    }
    284 
    285    if (c->dispatch_width == 16)
    286       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    287 
    288    if (intel->gen >= 5) {
    289       switch (inst->opcode) {
    290       case SHADER_OPCODE_TEX:
    291 	 if (inst->shadow_compare) {
    292 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
    293 	 } else {
    294 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
    295 	 }
    296 	 break;
    297       case FS_OPCODE_TXB:
    298 	 if (inst->shadow_compare) {
    299 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
    300 	 } else {
    301 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
    302 	 }
    303 	 break;
    304       case SHADER_OPCODE_TXL:
    305 	 if (inst->shadow_compare) {
    306 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
    307 	 } else {
    308 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
    309 	 }
    310 	 break;
    311       case SHADER_OPCODE_TXS:
    312 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
    313 	 break;
    314       case SHADER_OPCODE_TXD:
    315          if (inst->shadow_compare) {
    316             /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
    317             assert(intel->is_haswell);
    318             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
    319          } else {
    320             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
    321          }
    322 	 break;
    323       case SHADER_OPCODE_TXF:
    324 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
    325 	 break;
    326       default:
    327 	 assert(!"not reached");
    328 	 break;
    329       }
    330    } else {
    331       switch (inst->opcode) {
    332       case SHADER_OPCODE_TEX:
    333 	 /* Note that G45 and older determines shadow compare and dispatch width
    334 	  * from message length for most messages.
    335 	  */
    336 	 assert(c->dispatch_width == 8);
    337 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
    338 	 if (inst->shadow_compare) {
    339 	    assert(inst->mlen == 6);
    340 	 } else {
    341 	    assert(inst->mlen <= 4);
    342 	 }
    343 	 break;
    344       case FS_OPCODE_TXB:
    345 	 if (inst->shadow_compare) {
    346 	    assert(inst->mlen == 6);
    347 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
    348 	 } else {
    349 	    assert(inst->mlen == 9);
    350 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
    351 	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    352 	 }
    353 	 break;
    354       case SHADER_OPCODE_TXL:
    355 	 if (inst->shadow_compare) {
    356 	    assert(inst->mlen == 6);
    357 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
    358 	 } else {
    359 	    assert(inst->mlen == 9);
    360 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
    361 	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    362 	 }
    363 	 break;
    364       case SHADER_OPCODE_TXD:
    365 	 /* There is no sample_d_c message; comparisons are done manually */
    366 	 assert(inst->mlen == 7 || inst->mlen == 10);
    367 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
    368 	 break;
    369       case SHADER_OPCODE_TXF:
    370 	 assert(inst->mlen == 9);
    371 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
    372 	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    373 	 break;
    374       case SHADER_OPCODE_TXS:
    375 	 assert(inst->mlen == 3);
    376 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
    377 	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    378 	 break;
    379       default:
    380 	 assert(!"not reached");
    381 	 break;
    382       }
    383    }
    384    assert(msg_type != -1);
    385 
    386    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
    387       rlen = 8;
    388       dst = vec16(dst);
    389    }
    390 
    391    /* Load the message header if present.  If there's a texture offset,
    392     * we need to set it up explicitly and load the offset bitfield.
    393     * Otherwise, we can use an implied move from g0 to the first message reg.
    394     */
    395    if (inst->texture_offset) {
    396       brw_push_insn_state(p);
    397       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    398       /* Explicitly set up the message header by copying g0 to the MRF. */
    399       brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
    400                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    401 
    402       /* Then set the offset bits in DWord 2. */
    403       brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
    404                                      inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
    405                  brw_imm_ud(inst->texture_offset));
    406       brw_pop_insn_state(p);
    407    } else if (inst->header_present) {
    408       /* Set up an implied move from g0 to the MRF. */
    409       src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
    410    }
    411 
    412    brw_SAMPLE(p,
    413 	      retype(dst, BRW_REGISTER_TYPE_UW),
    414 	      inst->base_mrf,
    415 	      src,
    416               SURF_INDEX_TEXTURE(inst->sampler),
    417 	      inst->sampler,
    418 	      WRITEMASK_XYZW,
    419 	      msg_type,
    420 	      rlen,
    421 	      inst->mlen,
    422 	      inst->header_present,
    423 	      simd_mode,
    424 	      return_format);
    425 }
    426 
    427 
    428 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
    429  * looking like:
    430  *
    431  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
    432  *
    433  * and we're trying to produce:
    434  *
    435  *           DDX                     DDY
    436  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
    437  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
    438  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
    439  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
    440  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
    441  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
    442  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
    443  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
    444  *
    445  * and add another set of two more subspans if in 16-pixel dispatch mode.
    446  *
    447  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
    448  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
    449  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
    450  * between each other.  We could probably do it like ddx and swizzle the right
    451  * order later, but bail for now and just produce
    452  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
    453  */
    454 void
    455 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
    456 {
    457    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
    458 				 BRW_REGISTER_TYPE_F,
    459 				 BRW_VERTICAL_STRIDE_2,
    460 				 BRW_WIDTH_2,
    461 				 BRW_HORIZONTAL_STRIDE_0,
    462 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    463    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
    464 				 BRW_REGISTER_TYPE_F,
    465 				 BRW_VERTICAL_STRIDE_2,
    466 				 BRW_WIDTH_2,
    467 				 BRW_HORIZONTAL_STRIDE_0,
    468 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    469    brw_ADD(p, dst, src0, negate(src1));
    470 }
    471 
    472 /* The negate_value boolean is used to negate the derivative computation for
    473  * FBOs, since they place the origin at the upper left instead of the lower
    474  * left.
    475  */
    476 void
    477 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
    478                          bool negate_value)
    479 {
    480    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
    481 				 BRW_REGISTER_TYPE_F,
    482 				 BRW_VERTICAL_STRIDE_4,
    483 				 BRW_WIDTH_4,
    484 				 BRW_HORIZONTAL_STRIDE_0,
    485 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    486    struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
    487 				 BRW_REGISTER_TYPE_F,
    488 				 BRW_VERTICAL_STRIDE_4,
    489 				 BRW_WIDTH_4,
    490 				 BRW_HORIZONTAL_STRIDE_0,
    491 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    492    if (negate_value)
    493       brw_ADD(p, dst, src1, negate(src0));
    494    else
    495       brw_ADD(p, dst, src0, negate(src1));
    496 }
    497 
    498 void
    499 fs_visitor::generate_discard(fs_inst *inst)
    500 {
    501    struct brw_reg f0 = brw_flag_reg();
    502 
    503    if (intel->gen >= 6) {
    504       struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
    505       struct brw_reg some_register;
    506 
    507       /* As of gen6, we no longer have the mask register to look at,
    508        * so life gets a bit more complicated.
    509        */
    510 
    511       /* Load the flag register with all ones. */
    512       brw_push_insn_state(p);
    513       brw_set_mask_control(p, BRW_MASK_DISABLE);
    514       brw_MOV(p, f0, brw_imm_uw(0xffff));
    515       brw_pop_insn_state(p);
    516 
    517       /* Do a comparison that should always fail, to produce 0s in the flag
    518        * reg where we have active channels.
    519        */
    520       some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
    521       brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
    522 	      BRW_CONDITIONAL_NZ, some_register, some_register);
    523 
    524       /* Undo CMP's whacking of predication*/
    525       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    526 
    527       brw_push_insn_state(p);
    528       brw_set_mask_control(p, BRW_MASK_DISABLE);
    529       brw_AND(p, g1, f0, g1);
    530       brw_pop_insn_state(p);
    531    } else {
    532       struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
    533 
    534       brw_push_insn_state(p);
    535       brw_set_mask_control(p, BRW_MASK_DISABLE);
    536       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    537 
    538       /* Unlike the 965, we have the mask reg, so we just need
    539        * somewhere to invert that (containing channels to be disabled)
    540        * so it can be ANDed with the mask of pixels still to be
    541        * written. Use the flag reg for consistency with gen6+.
    542        */
    543       brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
    544       brw_AND(p, g0, f0, g0);
    545 
    546       brw_pop_insn_state(p);
    547    }
    548 }
    549 
    550 void
    551 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
    552 {
    553    assert(inst->mlen != 0);
    554 
    555    brw_MOV(p,
    556 	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
    557 	   retype(src, BRW_REGISTER_TYPE_UD));
    558    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
    559 				 inst->offset);
    560 }
    561 
    562 void
    563 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
    564 {
    565    assert(inst->mlen != 0);
    566 
    567    /* Clear any post destination dependencies that would be ignored by
    568     * the block read.  See the B-Spec for pre-gen5 send instruction.
    569     *
    570     * This could use a better solution, since texture sampling and
    571     * math reads could potentially run into it as well -- anywhere
    572     * that we have a SEND with a destination that is a register that
    573     * was written but not read within the last N instructions (what's
    574     * N?  unsure).  This is rare because of dead code elimination, but
    575     * not impossible.
    576     */
    577    if (intel->gen == 4 && !intel->is_g4x)
    578       brw_MOV(p, brw_null_reg(), dst);
    579 
    580    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
    581 				inst->offset);
    582 
    583    if (intel->gen == 4 && !intel->is_g4x) {
    584       /* gen4 errata: destination from a send can't be used as a
    585        * destination until it's been read.  Just read it so we don't
    586        * have to worry.
    587        */
    588       brw_MOV(p, brw_null_reg(), dst);
    589    }
    590 }
    591 
    592 void
    593 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst,
    594 					struct brw_reg index,
    595 					struct brw_reg offset)
    596 {
    597    assert(inst->mlen != 0);
    598 
    599    /* Clear any post destination dependencies that would be ignored by
    600     * the block read.  See the B-Spec for pre-gen5 send instruction.
    601     *
    602     * This could use a better solution, since texture sampling and
    603     * math reads could potentially run into it as well -- anywhere
    604     * that we have a SEND with a destination that is a register that
    605     * was written but not read within the last N instructions (what's
    606     * N?  unsure).  This is rare because of dead code elimination, but
    607     * not impossible.
    608     */
    609    if (intel->gen == 4 && !intel->is_g4x)
    610       brw_MOV(p, brw_null_reg(), dst);
    611 
    612    assert(index.file == BRW_IMMEDIATE_VALUE &&
    613 	  index.type == BRW_REGISTER_TYPE_UD);
    614    uint32_t surf_index = index.dw1.ud;
    615 
    616    assert(offset.file == BRW_IMMEDIATE_VALUE &&
    617 	  offset.type == BRW_REGISTER_TYPE_UD);
    618    uint32_t read_offset = offset.dw1.ud;
    619 
    620    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
    621 			read_offset, surf_index);
    622 
    623    if (intel->gen == 4 && !intel->is_g4x) {
    624       /* gen4 errata: destination from a send can't be used as a
    625        * destination until it's been read.  Just read it so we don't
    626        * have to worry.
    627        */
    628       brw_MOV(p, brw_null_reg(), dst);
    629    }
    630 }
    631 
    632 
    633 /**
    634  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
    635  * into the flags register (f0.0).
    636  *
    637  * Used only on Gen6 and above.
    638  */
    639 void
    640 fs_visitor::generate_mov_dispatch_to_flags()
    641 {
    642    struct brw_reg f0 = brw_flag_reg();
    643    struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
    644 
    645    assert (intel->gen >= 6);
    646    brw_push_insn_state(p);
    647    brw_set_mask_control(p, BRW_MASK_DISABLE);
    648    brw_MOV(p, f0, g1);
    649    brw_pop_insn_state(p);
    650 }
    651 
    652 
    653 static uint32_t brw_file_from_reg(fs_reg *reg)
    654 {
    655    switch (reg->file) {
    656    case ARF:
    657       return BRW_ARCHITECTURE_REGISTER_FILE;
    658    case GRF:
    659       return BRW_GENERAL_REGISTER_FILE;
    660    case MRF:
    661       return BRW_MESSAGE_REGISTER_FILE;
    662    case IMM:
    663       return BRW_IMMEDIATE_VALUE;
    664    default:
    665       assert(!"not reached");
    666       return BRW_GENERAL_REGISTER_FILE;
    667    }
    668 }
    669 
    670 static struct brw_reg
    671 brw_reg_from_fs_reg(fs_reg *reg)
    672 {
    673    struct brw_reg brw_reg;
    674 
    675    switch (reg->file) {
    676    case GRF:
    677    case ARF:
    678    case MRF:
    679       if (reg->smear == -1) {
    680 	 brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
    681       } else {
    682 	 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
    683       }
    684       brw_reg = retype(brw_reg, reg->type);
    685       if (reg->sechalf)
    686 	 brw_reg = sechalf(brw_reg);
    687       break;
    688    case IMM:
    689       switch (reg->type) {
    690       case BRW_REGISTER_TYPE_F:
    691 	 brw_reg = brw_imm_f(reg->imm.f);
    692 	 break;
    693       case BRW_REGISTER_TYPE_D:
    694 	 brw_reg = brw_imm_d(reg->imm.i);
    695 	 break;
    696       case BRW_REGISTER_TYPE_UD:
    697 	 brw_reg = brw_imm_ud(reg->imm.u);
    698 	 break;
    699       default:
    700 	 assert(!"not reached");
    701 	 brw_reg = brw_null_reg();
    702 	 break;
    703       }
    704       break;
    705    case FIXED_HW_REG:
    706       brw_reg = reg->fixed_hw_reg;
    707       break;
    708    case BAD_FILE:
    709       /* Probably unused. */
    710       brw_reg = brw_null_reg();
    711       break;
    712    case UNIFORM:
    713       assert(!"not reached");
    714       brw_reg = brw_null_reg();
    715       break;
    716    default:
    717       assert(!"not reached");
    718       brw_reg = brw_null_reg();
    719       break;
    720    }
    721    if (reg->abs)
    722       brw_reg = brw_abs(brw_reg);
    723    if (reg->negate)
    724       brw_reg = negate(brw_reg);
    725 
    726    return brw_reg;
    727 }
    728 
    729 void
    730 fs_visitor::generate_code()
    731 {
    732    int last_native_inst = p->nr_insn;
    733    const char *last_annotation_string = NULL;
    734    ir_instruction *last_annotation_ir = NULL;
    735 
    736    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
    737       printf("Native code for fragment shader %d (%d-wide dispatch):\n",
    738 	     prog->Name, c->dispatch_width);
    739    }
    740 
    741    fs_cfg *cfg = NULL;
    742    if (unlikely(INTEL_DEBUG & DEBUG_WM))
    743       cfg = new(mem_ctx) fs_cfg(this);
    744 
    745    foreach_list(node, &this->instructions) {
    746       fs_inst *inst = (fs_inst *)node;
    747       struct brw_reg src[3], dst;
    748 
    749       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
    750 	 foreach_list(node, &cfg->block_list) {
    751 	    fs_bblock_link *link = (fs_bblock_link *)node;
    752 	    fs_bblock *block = link->block;
    753 
    754 	    if (block->start == inst) {
    755 	       printf("   START B%d", block->block_num);
    756 	       foreach_list(predecessor_node, &block->parents) {
    757 		  fs_bblock_link *predecessor_link =
    758 		     (fs_bblock_link *)predecessor_node;
    759 		  fs_bblock *predecessor_block = predecessor_link->block;
    760 		  printf(" <-B%d", predecessor_block->block_num);
    761 	       }
    762 	       printf("\n");
    763 	    }
    764 	 }
    765 
    766 	 if (last_annotation_ir != inst->ir) {
    767 	    last_annotation_ir = inst->ir;
    768 	    if (last_annotation_ir) {
    769 	       printf("   ");
    770 	       last_annotation_ir->print();
    771 	       printf("\n");
    772 	    }
    773 	 }
    774 	 if (last_annotation_string != inst->annotation) {
    775 	    last_annotation_string = inst->annotation;
    776 	    if (last_annotation_string)
    777 	       printf("   %s\n", last_annotation_string);
    778 	 }
    779       }
    780 
    781       for (unsigned int i = 0; i < 3; i++) {
    782 	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
    783 
    784 	 /* The accumulator result appears to get used for the
    785 	  * conditional modifier generation.  When negating a UD
    786 	  * value, there is a 33rd bit generated for the sign in the
    787 	  * accumulator value, so now you can't check, for example,
    788 	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
    789 	  */
    790 	 assert(!inst->conditional_mod ||
    791 		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
    792 		!inst->src[i].negate);
    793       }
    794       dst = brw_reg_from_fs_reg(&inst->dst);
    795 
    796       brw_set_conditionalmod(p, inst->conditional_mod);
    797       brw_set_predicate_control(p, inst->predicated);
    798       brw_set_predicate_inverse(p, inst->predicate_inverse);
    799       brw_set_saturate(p, inst->saturate);
    800 
    801       if (inst->force_uncompressed || c->dispatch_width == 8) {
    802 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    803       } else if (inst->force_sechalf) {
    804 	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    805       } else {
    806 	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
    807       }
    808 
    809       switch (inst->opcode) {
    810       case BRW_OPCODE_MOV:
    811 	 brw_MOV(p, dst, src[0]);
    812 	 break;
    813       case BRW_OPCODE_ADD:
    814 	 brw_ADD(p, dst, src[0], src[1]);
    815 	 break;
    816       case BRW_OPCODE_MUL:
    817 	 brw_MUL(p, dst, src[0], src[1]);
    818 	 break;
    819       case BRW_OPCODE_MACH:
    820 	 brw_set_acc_write_control(p, 1);
    821 	 brw_MACH(p, dst, src[0], src[1]);
    822 	 brw_set_acc_write_control(p, 0);
    823 	 break;
    824 
    825       case BRW_OPCODE_MAD:
    826 	 brw_set_access_mode(p, BRW_ALIGN_16);
    827 	 if (c->dispatch_width == 16) {
    828 	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    829 	    brw_MAD(p, dst, src[0], src[1], src[2]);
    830 	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    831 	    brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
    832 	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
    833 	 } else {
    834 	    brw_MAD(p, dst, src[0], src[1], src[2]);
    835 	 }
    836 	 brw_set_access_mode(p, BRW_ALIGN_1);
    837 	 break;
    838 
    839       case BRW_OPCODE_FRC:
    840 	 brw_FRC(p, dst, src[0]);
    841 	 break;
    842       case BRW_OPCODE_RNDD:
    843 	 brw_RNDD(p, dst, src[0]);
    844 	 break;
    845       case BRW_OPCODE_RNDE:
    846 	 brw_RNDE(p, dst, src[0]);
    847 	 break;
    848       case BRW_OPCODE_RNDZ:
    849 	 brw_RNDZ(p, dst, src[0]);
    850 	 break;
    851 
    852       case BRW_OPCODE_AND:
    853 	 brw_AND(p, dst, src[0], src[1]);
    854 	 break;
    855       case BRW_OPCODE_OR:
    856 	 brw_OR(p, dst, src[0], src[1]);
    857 	 break;
    858       case BRW_OPCODE_XOR:
    859 	 brw_XOR(p, dst, src[0], src[1]);
    860 	 break;
    861       case BRW_OPCODE_NOT:
    862 	 brw_NOT(p, dst, src[0]);
    863 	 break;
    864       case BRW_OPCODE_ASR:
    865 	 brw_ASR(p, dst, src[0], src[1]);
    866 	 break;
    867       case BRW_OPCODE_SHR:
    868 	 brw_SHR(p, dst, src[0], src[1]);
    869 	 break;
    870       case BRW_OPCODE_SHL:
    871 	 brw_SHL(p, dst, src[0], src[1]);
    872 	 break;
    873 
    874       case BRW_OPCODE_CMP:
    875 	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
    876 	 break;
    877       case BRW_OPCODE_SEL:
    878 	 brw_SEL(p, dst, src[0], src[1]);
    879 	 break;
    880 
    881       case BRW_OPCODE_IF:
    882 	 if (inst->src[0].file != BAD_FILE) {
    883 	    /* The instruction has an embedded compare (only allowed on gen6) */
    884 	    assert(intel->gen == 6);
    885 	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
    886 	 } else {
    887 	    brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
    888 	 }
    889 	 break;
    890 
    891       case BRW_OPCODE_ELSE:
    892 	 brw_ELSE(p);
    893 	 break;
    894       case BRW_OPCODE_ENDIF:
    895 	 brw_ENDIF(p);
    896 	 break;
    897 
    898       case BRW_OPCODE_DO:
    899 	 brw_DO(p, BRW_EXECUTE_8);
    900 	 break;
    901 
    902       case BRW_OPCODE_BREAK:
    903 	 brw_BREAK(p);
    904 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    905 	 break;
    906       case BRW_OPCODE_CONTINUE:
    907 	 /* FINISHME: We need to write the loop instruction support still. */
    908 	 if (intel->gen >= 6)
    909 	    gen6_CONT(p);
    910 	 else
    911 	    brw_CONT(p);
    912 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    913 	 break;
    914 
    915       case BRW_OPCODE_WHILE:
    916 	 brw_WHILE(p);
    917 	 break;
    918 
    919       case SHADER_OPCODE_RCP:
    920       case SHADER_OPCODE_RSQ:
    921       case SHADER_OPCODE_SQRT:
    922       case SHADER_OPCODE_EXP2:
    923       case SHADER_OPCODE_LOG2:
    924       case SHADER_OPCODE_SIN:
    925       case SHADER_OPCODE_COS:
    926 	 if (intel->gen >= 7) {
    927 	    generate_math1_gen7(inst, dst, src[0]);
    928 	 } else if (intel->gen == 6) {
    929 	    generate_math1_gen6(inst, dst, src[0]);
    930 	 } else {
    931 	    generate_math_gen4(inst, dst, src[0]);
    932 	 }
    933 	 break;
    934       case SHADER_OPCODE_INT_QUOTIENT:
    935       case SHADER_OPCODE_INT_REMAINDER:
    936       case SHADER_OPCODE_POW:
    937 	 if (intel->gen >= 7) {
    938 	    generate_math2_gen7(inst, dst, src[0], src[1]);
    939 	 } else if (intel->gen == 6) {
    940 	    generate_math2_gen6(inst, dst, src[0], src[1]);
    941 	 } else {
    942 	    generate_math_gen4(inst, dst, src[0]);
    943 	 }
    944 	 break;
    945       case FS_OPCODE_PIXEL_X:
    946 	 generate_pixel_xy(dst, true);
    947 	 break;
    948       case FS_OPCODE_PIXEL_Y:
    949 	 generate_pixel_xy(dst, false);
    950 	 break;
    951       case FS_OPCODE_CINTERP:
    952 	 brw_MOV(p, dst, src[0]);
    953 	 break;
    954       case FS_OPCODE_LINTERP:
    955 	 generate_linterp(inst, dst, src);
    956 	 break;
    957       case SHADER_OPCODE_TEX:
    958       case FS_OPCODE_TXB:
    959       case SHADER_OPCODE_TXD:
    960       case SHADER_OPCODE_TXF:
    961       case SHADER_OPCODE_TXL:
    962       case SHADER_OPCODE_TXS:
    963 	 generate_tex(inst, dst, src[0]);
    964 	 break;
    965       case FS_OPCODE_DISCARD:
    966 	 generate_discard(inst);
    967 	 break;
    968       case FS_OPCODE_DDX:
    969 	 generate_ddx(inst, dst, src[0]);
    970 	 break;
    971       case FS_OPCODE_DDY:
    972          /* Make sure fp->UsesDFdy flag got set (otherwise there's no
    973           * guarantee that c->key.render_to_fbo is set).
    974           */
    975          assert(fp->UsesDFdy);
    976 	 generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
    977 	 break;
    978 
    979       case FS_OPCODE_SPILL:
    980 	 generate_spill(inst, src[0]);
    981 	 break;
    982 
    983       case FS_OPCODE_UNSPILL:
    984 	 generate_unspill(inst, dst);
    985 	 break;
    986 
    987       case FS_OPCODE_PULL_CONSTANT_LOAD:
    988 	 generate_pull_constant_load(inst, dst, src[0], src[1]);
    989 	 break;
    990 
    991       case FS_OPCODE_FB_WRITE:
    992 	 generate_fb_write(inst);
    993 	 break;
    994 
    995       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
    996          generate_mov_dispatch_to_flags();
    997          break;
    998 
    999       default:
   1000 	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
   1001 	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
   1002 			  brw_opcodes[inst->opcode].name);
   1003 	 } else {
   1004 	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
   1005 	 }
   1006 	 fail("unsupported opcode in FS\n");
   1007       }
   1008 
   1009       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   1010 	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
   1011 	    if (0) {
   1012 	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
   1013 		      ((uint32_t *)&p->store[i])[3],
   1014 		      ((uint32_t *)&p->store[i])[2],
   1015 		      ((uint32_t *)&p->store[i])[1],
   1016 		      ((uint32_t *)&p->store[i])[0]);
   1017 	    }
   1018 	    brw_disasm(stdout, &p->store[i], intel->gen);
   1019 	 }
   1020 
   1021 	 foreach_list(node, &cfg->block_list) {
   1022 	    fs_bblock_link *link = (fs_bblock_link *)node;
   1023 	    fs_bblock *block = link->block;
   1024 
   1025 	    if (block->end == inst) {
   1026 	       printf("   END B%d", block->block_num);
   1027 	       foreach_list(successor_node, &block->children) {
   1028 		  fs_bblock_link *successor_link =
   1029 		     (fs_bblock_link *)successor_node;
   1030 		  fs_bblock *successor_block = successor_link->block;
   1031 		  printf(" ->B%d", successor_block->block_num);
   1032 	       }
   1033 	       printf("\n");
   1034 	    }
   1035 	 }
   1036       }
   1037 
   1038       last_native_inst = p->nr_insn;
   1039    }
   1040 
   1041    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   1042       printf("\n");
   1043    }
   1044 
   1045    brw_set_uip_jip(p);
   1046 
   1047    /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
   1048     * emit issues, it doesn't get the jump distances into the output,
   1049     * which is often something we want to debug.  So this is here in
   1050     * case you're doing that.
   1051     */
   1052    if (0) {
   1053       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   1054 	 for (unsigned int i = 0; i < p->nr_insn; i++) {
   1055 	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
   1056 		   ((uint32_t *)&p->store[i])[3],
   1057 		   ((uint32_t *)&p->store[i])[2],
   1058 		   ((uint32_t *)&p->store[i])[1],
   1059 		   ((uint32_t *)&p->store[i])[0]);
   1060 	    brw_disasm(stdout, &p->store[i], intel->gen);
   1061 	 }
   1062       }
   1063    }
   1064 }
   1065