Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 /** @file brw_fs_visitor.cpp
     25  *
     26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
     27  * makes it easier to do backend-specific optimizations than doing so
     28  * in the GLSL IR or in the native code.
     29  */
     30 extern "C" {
     31 
     32 #include <sys/types.h>
     33 
     34 #include "main/macros.h"
     35 #include "main/shaderobj.h"
     36 #include "main/uniforms.h"
     37 #include "program/prog_parameter.h"
     38 #include "program/prog_print.h"
     39 #include "program/prog_optimize.h"
     40 #include "program/register_allocate.h"
     41 #include "program/sampler.h"
     42 #include "program/hash_table.h"
     43 #include "brw_context.h"
     44 #include "brw_eu.h"
     45 #include "brw_wm.h"
     46 }
     47 #include "brw_shader.h"
     48 #include "brw_fs.h"
     49 #include "glsl/glsl_types.h"
     50 #include "glsl/ir_optimization.h"
     51 #include "glsl/ir_print_visitor.h"
     52 
     53 void
     54 fs_visitor::visit(ir_variable *ir)
     55 {
     56    fs_reg *reg = NULL;
     57 
     58    if (variable_storage(ir))
     59       return;
     60 
     61    if (ir->mode == ir_var_in) {
     62       if (!strcmp(ir->name, "gl_FragCoord")) {
     63 	 reg = emit_fragcoord_interpolation(ir);
     64       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
     65 	 reg = emit_frontfacing_interpolation(ir);
     66       } else {
     67 	 reg = emit_general_interpolation(ir);
     68       }
     69       assert(reg);
     70       hash_table_insert(this->variable_ht, reg, ir);
     71       return;
     72    } else if (ir->mode == ir_var_out) {
     73       reg = new(this->mem_ctx) fs_reg(this, ir->type);
     74 
     75       if (ir->index > 0) {
     76 	 assert(ir->location == FRAG_RESULT_DATA0);
     77 	 assert(ir->index == 1);
     78 	 this->dual_src_output = *reg;
     79       } else if (ir->location == FRAG_RESULT_COLOR) {
     80 	 /* Writing gl_FragColor outputs to all color regions. */
     81 	 for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
     82 	    this->outputs[i] = *reg;
     83 	    this->output_components[i] = 4;
     84 	 }
     85       } else if (ir->location == FRAG_RESULT_DEPTH) {
     86 	 this->frag_depth = ir;
     87       } else {
     88 	 /* gl_FragData or a user-defined FS output */
     89 	 assert(ir->location >= FRAG_RESULT_DATA0 &&
     90 		ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
     91 
     92 	 int vector_elements =
     93 	    ir->type->is_array() ? ir->type->fields.array->vector_elements
     94 				 : ir->type->vector_elements;
     95 
     96 	 /* General color output. */
     97 	 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
     98 	    int output = ir->location - FRAG_RESULT_DATA0 + i;
     99 	    this->outputs[output] = *reg;
    100 	    this->outputs[output].reg_offset += vector_elements * i;
    101 	    this->output_components[output] = vector_elements;
    102 	 }
    103       }
    104    } else if (ir->mode == ir_var_uniform) {
    105       int param_index = c->prog_data.nr_params;
    106 
    107       /* Thanks to the lower_ubo_reference pass, we will see only
    108        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
    109        * variables, so no need for them to be in variable_ht.
    110        */
    111       if (ir->uniform_block != -1)
    112          return;
    113 
    114       if (c->dispatch_width == 16) {
    115 	 if (!variable_storage(ir)) {
    116 	    fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
    117 	 }
    118 	 return;
    119       }
    120 
    121       if (!strncmp(ir->name, "gl_", 3)) {
    122 	 setup_builtin_uniform_values(ir);
    123       } else {
    124 	 setup_uniform_values(ir->location, ir->type);
    125       }
    126 
    127       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
    128       reg->type = brw_type_for_base_type(ir->type);
    129    }
    130 
    131    if (!reg)
    132       reg = new(this->mem_ctx) fs_reg(this, ir->type);
    133 
    134    hash_table_insert(this->variable_ht, reg, ir);
    135 }
    136 
    137 void
    138 fs_visitor::visit(ir_dereference_variable *ir)
    139 {
    140    fs_reg *reg = variable_storage(ir->var);
    141    this->result = *reg;
    142 }
    143 
    144 void
    145 fs_visitor::visit(ir_dereference_record *ir)
    146 {
    147    const glsl_type *struct_type = ir->record->type;
    148 
    149    ir->record->accept(this);
    150 
    151    unsigned int offset = 0;
    152    for (unsigned int i = 0; i < struct_type->length; i++) {
    153       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
    154 	 break;
    155       offset += type_size(struct_type->fields.structure[i].type);
    156    }
    157    this->result.reg_offset += offset;
    158    this->result.type = brw_type_for_base_type(ir->type);
    159 }
    160 
    161 void
    162 fs_visitor::visit(ir_dereference_array *ir)
    163 {
    164    ir_constant *index;
    165    int element_size;
    166 
    167    ir->array->accept(this);
    168    index = ir->array_index->as_constant();
    169 
    170    element_size = type_size(ir->type);
    171    this->result.type = brw_type_for_base_type(ir->type);
    172 
    173    if (index) {
    174       assert(this->result.file == UNIFORM || this->result.file == GRF);
    175       this->result.reg_offset += index->value.i[0] * element_size;
    176    } else {
    177       assert(!"FINISHME: non-constant array element");
    178    }
    179 }
    180 
    181 /* Instruction selection: Produce a MOV.sat instead of
    182  * MIN(MAX(val, 0), 1) when possible.
    183  */
    184 bool
    185 fs_visitor::try_emit_saturate(ir_expression *ir)
    186 {
    187    ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
    188 
    189    if (!sat_val)
    190       return false;
    191 
    192    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
    193 
    194    sat_val->accept(this);
    195    fs_reg src = this->result;
    196 
    197    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
    198 
    199    /* If the last instruction from our accept() didn't generate our
    200     * src, generate a saturated MOV
    201     */
    202    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
    203    if (!modify || modify->regs_written() != 1) {
    204       this->result = fs_reg(this, ir->type);
    205       fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
    206       inst->saturate = true;
    207    } else {
    208       modify->saturate = true;
    209       this->result = src;
    210    }
    211 
    212 
    213    return true;
    214 }
    215 
    216 bool
    217 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
    218 {
    219    /* 3-src instructions were introduced in gen6. */
    220    if (intel->gen < 6)
    221       return false;
    222 
    223    /* MAD can only handle floating-point data. */
    224    if (ir->type != glsl_type::float_type)
    225       return false;
    226 
    227    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
    228    ir_expression *mul = ir->operands[mul_arg]->as_expression();
    229 
    230    if (!mul || mul->operation != ir_binop_mul)
    231       return false;
    232 
    233    if (nonmul->as_constant() ||
    234        mul->operands[0]->as_constant() ||
    235        mul->operands[1]->as_constant())
    236       return false;
    237 
    238    nonmul->accept(this);
    239    fs_reg src0 = this->result;
    240 
    241    mul->operands[0]->accept(this);
    242    fs_reg src1 = this->result;
    243 
    244    mul->operands[1]->accept(this);
    245    fs_reg src2 = this->result;
    246 
    247    this->result = fs_reg(this, ir->type);
    248    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
    249 
    250    return true;
    251 }
    252 
    253 void
    254 fs_visitor::visit(ir_expression *ir)
    255 {
    256    unsigned int operand;
    257    fs_reg op[2], temp;
    258    fs_inst *inst;
    259 
    260    assert(ir->get_num_operands() <= 2);
    261 
    262    if (try_emit_saturate(ir))
    263       return;
    264    if (ir->operation == ir_binop_add) {
    265       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
    266 	 return;
    267    }
    268 
    269    for (operand = 0; operand < ir->get_num_operands(); operand++) {
    270       ir->operands[operand]->accept(this);
    271       if (this->result.file == BAD_FILE) {
    272 	 ir_print_visitor v;
    273 	 fail("Failed to get tree for expression operand:\n");
    274 	 ir->operands[operand]->accept(&v);
    275       }
    276       op[operand] = this->result;
    277 
    278       /* Matrix expression operands should have been broken down to vector
    279        * operations already.
    280        */
    281       assert(!ir->operands[operand]->type->is_matrix());
    282       /* And then those vector operands should have been broken down to scalar.
    283        */
    284       assert(!ir->operands[operand]->type->is_vector());
    285    }
    286 
    287    /* Storage for our result.  If our result goes into an assignment, it will
    288     * just get copy-propagated out, so no worries.
    289     */
    290    this->result = fs_reg(this, ir->type);
    291 
    292    switch (ir->operation) {
    293    case ir_unop_logic_not:
    294       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
    295        * ones complement of the whole register, not just bit 0.
    296        */
    297       emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
    298       break;
    299    case ir_unop_neg:
    300       op[0].negate = !op[0].negate;
    301       this->result = op[0];
    302       break;
    303    case ir_unop_abs:
    304       op[0].abs = true;
    305       op[0].negate = false;
    306       this->result = op[0];
    307       break;
    308    case ir_unop_sign:
    309       temp = fs_reg(this, ir->type);
    310 
    311       emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
    312 
    313       inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
    314       inst->conditional_mod = BRW_CONDITIONAL_G;
    315       inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
    316       inst->predicated = true;
    317 
    318       inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
    319       inst->conditional_mod = BRW_CONDITIONAL_L;
    320       inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
    321       inst->predicated = true;
    322 
    323       break;
    324    case ir_unop_rcp:
    325       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
    326       break;
    327 
    328    case ir_unop_exp2:
    329       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
    330       break;
    331    case ir_unop_log2:
    332       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
    333       break;
    334    case ir_unop_exp:
    335    case ir_unop_log:
    336       assert(!"not reached: should be handled by ir_explog_to_explog2");
    337       break;
    338    case ir_unop_sin:
    339    case ir_unop_sin_reduced:
    340       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
    341       break;
    342    case ir_unop_cos:
    343    case ir_unop_cos_reduced:
    344       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
    345       break;
    346 
    347    case ir_unop_dFdx:
    348       emit(FS_OPCODE_DDX, this->result, op[0]);
    349       break;
    350    case ir_unop_dFdy:
    351       emit(FS_OPCODE_DDY, this->result, op[0]);
    352       break;
    353 
    354    case ir_binop_add:
    355       emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
    356       break;
    357    case ir_binop_sub:
    358       assert(!"not reached: should be handled by ir_sub_to_add_neg");
    359       break;
    360 
    361    case ir_binop_mul:
    362       if (ir->type->is_integer()) {
    363 	 /* For integer multiplication, the MUL uses the low 16 bits
    364 	  * of one of the operands (src0 on gen6, src1 on gen7).  The
    365 	  * MACH accumulates in the contribution of the upper 16 bits
    366 	  * of that operand.
    367 	  *
    368 	  * FINISHME: Emit just the MUL if we know an operand is small
    369 	  * enough.
    370 	  */
    371 	 if (intel->gen >= 7 && c->dispatch_width == 16)
    372 	    fail("16-wide explicit accumulator operands unsupported\n");
    373 
    374 	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
    375 
    376 	 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
    377 	 emit(BRW_OPCODE_MACH, reg_null_d, op[0], op[1]);
    378 	 emit(BRW_OPCODE_MOV, this->result, fs_reg(acc));
    379       } else {
    380 	 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
    381       }
    382       break;
    383    case ir_binop_div:
    384       if (intel->gen >= 7 && c->dispatch_width == 16)
    385 	 fail("16-wide INTDIV unsupported\n");
    386 
    387       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
    388       assert(ir->type->is_integer());
    389       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
    390       break;
    391    case ir_binop_mod:
    392       if (intel->gen >= 7 && c->dispatch_width == 16)
    393 	 fail("16-wide INTDIV unsupported\n");
    394 
    395       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
    396       assert(ir->type->is_integer());
    397       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
    398       break;
    399 
    400    case ir_binop_less:
    401    case ir_binop_greater:
    402    case ir_binop_lequal:
    403    case ir_binop_gequal:
    404    case ir_binop_equal:
    405    case ir_binop_all_equal:
    406    case ir_binop_nequal:
    407    case ir_binop_any_nequal:
    408       temp = this->result;
    409       /* original gen4 does implicit conversion before comparison. */
    410       if (intel->gen < 5)
    411 	 temp.type = op[0].type;
    412 
    413       resolve_ud_negate(&op[0]);
    414       resolve_ud_negate(&op[1]);
    415 
    416       resolve_bool_comparison(ir->operands[0], &op[0]);
    417       resolve_bool_comparison(ir->operands[1], &op[1]);
    418 
    419       inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
    420       inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
    421       break;
    422 
    423    case ir_binop_logic_xor:
    424       emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
    425       break;
    426 
    427    case ir_binop_logic_or:
    428       emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
    429       break;
    430 
    431    case ir_binop_logic_and:
    432       emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
    433       break;
    434 
    435    case ir_binop_dot:
    436    case ir_unop_any:
    437       assert(!"not reached: should be handled by brw_fs_channel_expressions");
    438       break;
    439 
    440    case ir_unop_noise:
    441       assert(!"not reached: should be handled by lower_noise");
    442       break;
    443 
    444    case ir_quadop_vector:
    445       assert(!"not reached: should be handled by lower_quadop_vector");
    446       break;
    447 
    448    case ir_unop_sqrt:
    449       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
    450       break;
    451 
    452    case ir_unop_rsq:
    453       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
    454       break;
    455 
    456    case ir_unop_bitcast_i2f:
    457    case ir_unop_bitcast_u2f:
    458       op[0].type = BRW_REGISTER_TYPE_F;
    459       this->result = op[0];
    460       break;
    461    case ir_unop_i2u:
    462    case ir_unop_bitcast_f2u:
    463       op[0].type = BRW_REGISTER_TYPE_UD;
    464       this->result = op[0];
    465       break;
    466    case ir_unop_u2i:
    467    case ir_unop_bitcast_f2i:
    468       op[0].type = BRW_REGISTER_TYPE_D;
    469       this->result = op[0];
    470       break;
    471    case ir_unop_i2f:
    472    case ir_unop_u2f:
    473    case ir_unop_f2i:
    474    case ir_unop_f2u:
    475       emit(BRW_OPCODE_MOV, this->result, op[0]);
    476       break;
    477 
    478    case ir_unop_b2i:
    479       inst = emit(BRW_OPCODE_AND, this->result, op[0], fs_reg(1));
    480       break;
    481    case ir_unop_b2f:
    482       temp = fs_reg(this, glsl_type::int_type);
    483       emit(BRW_OPCODE_AND, temp, op[0], fs_reg(1));
    484       emit(BRW_OPCODE_MOV, this->result, temp);
    485       break;
    486 
    487    case ir_unop_f2b:
    488       inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f));
    489       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    490       emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
    491       break;
    492    case ir_unop_i2b:
    493       assert(op[0].type == BRW_REGISTER_TYPE_D);
    494 
    495       inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0));
    496       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    497       emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
    498       break;
    499 
    500    case ir_unop_trunc:
    501       emit(BRW_OPCODE_RNDZ, this->result, op[0]);
    502       break;
    503    case ir_unop_ceil:
    504       op[0].negate = !op[0].negate;
    505       inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
    506       this->result.negate = true;
    507       break;
    508    case ir_unop_floor:
    509       inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
    510       break;
    511    case ir_unop_fract:
    512       inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
    513       break;
    514    case ir_unop_round_even:
    515       emit(BRW_OPCODE_RNDE, this->result, op[0]);
    516       break;
    517 
    518    case ir_binop_min:
    519       resolve_ud_negate(&op[0]);
    520       resolve_ud_negate(&op[1]);
    521 
    522       if (intel->gen >= 6) {
    523 	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
    524 	 inst->conditional_mod = BRW_CONDITIONAL_L;
    525       } else {
    526 	 /* Unalias the destination */
    527 	 this->result = fs_reg(this, ir->type);
    528 
    529 	 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
    530 	 inst->conditional_mod = BRW_CONDITIONAL_L;
    531 
    532 	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
    533 	 inst->predicated = true;
    534       }
    535       break;
    536    case ir_binop_max:
    537       resolve_ud_negate(&op[0]);
    538       resolve_ud_negate(&op[1]);
    539 
    540       if (intel->gen >= 6) {
    541 	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
    542 	 inst->conditional_mod = BRW_CONDITIONAL_GE;
    543       } else {
    544 	 /* Unalias the destination */
    545 	 this->result = fs_reg(this, ir->type);
    546 
    547 	 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
    548 	 inst->conditional_mod = BRW_CONDITIONAL_G;
    549 
    550 	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
    551 	 inst->predicated = true;
    552       }
    553       break;
    554 
    555    case ir_binop_pow:
    556       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
    557       break;
    558 
    559    case ir_unop_bit_not:
    560       inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
    561       break;
    562    case ir_binop_bit_and:
    563       inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
    564       break;
    565    case ir_binop_bit_xor:
    566       inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
    567       break;
    568    case ir_binop_bit_or:
    569       inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
    570       break;
    571 
    572    case ir_binop_lshift:
    573       inst = emit(BRW_OPCODE_SHL, this->result, op[0], op[1]);
    574       break;
    575 
    576    case ir_binop_rshift:
    577       if (ir->type->base_type == GLSL_TYPE_INT)
    578 	 inst = emit(BRW_OPCODE_ASR, this->result, op[0], op[1]);
    579       else
    580 	 inst = emit(BRW_OPCODE_SHR, this->result, op[0], op[1]);
    581       break;
    582 
    583    case ir_binop_ubo_load:
    584       ir_constant *uniform_block = ir->operands[0]->as_constant();
    585       ir_constant *offset = ir->operands[1]->as_constant();
    586 
    587       fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
    588       packed_consts.type = result.type;
    589       fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_WM_UBO(uniform_block->value.u[0]));
    590       fs_inst *pull = emit(fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
    591                                    packed_consts,
    592                                    surf_index,
    593                                    fs_reg(offset->value.u[0])));
    594       pull->base_mrf = 14;
    595       pull->mlen = 1;
    596 
    597       packed_consts.smear = offset->value.u[0] % 16 / 4;
    598       for (int i = 0; i < ir->type->vector_elements; i++) {
    599          /* UBO bools are any nonzero value.  We consider bools to be
    600           * values with the low bit set to 1.  Convert them using CMP.
    601           */
    602          if (ir->type->base_type == GLSL_TYPE_BOOL) {
    603             fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, result,
    604                                          packed_consts, fs_reg(0u)));
    605             inst->conditional_mod = BRW_CONDITIONAL_NZ;
    606          } else {
    607             emit(fs_inst(BRW_OPCODE_MOV, result, packed_consts));
    608          }
    609 
    610          packed_consts.smear++;
    611          result.reg_offset++;
    612 
    613          /* The std140 packing rules don't allow vectors to cross 16-byte
    614           * boundaries, and a reg is 32 bytes.
    615           */
    616          assert(packed_consts.smear < 8);
    617       }
    618       result.reg_offset = 0;
    619       break;
    620    }
    621 }
    622 
    623 void
    624 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
    625 				   const glsl_type *type, bool predicated)
    626 {
    627    switch (type->base_type) {
    628    case GLSL_TYPE_FLOAT:
    629    case GLSL_TYPE_UINT:
    630    case GLSL_TYPE_INT:
    631    case GLSL_TYPE_BOOL:
    632       for (unsigned int i = 0; i < type->components(); i++) {
    633 	 l.type = brw_type_for_base_type(type);
    634 	 r.type = brw_type_for_base_type(type);
    635 
    636 	 if (predicated || !l.equals(r)) {
    637 	    fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
    638 	    inst->predicated = predicated;
    639 	 }
    640 
    641 	 l.reg_offset++;
    642 	 r.reg_offset++;
    643       }
    644       break;
    645    case GLSL_TYPE_ARRAY:
    646       for (unsigned int i = 0; i < type->length; i++) {
    647 	 emit_assignment_writes(l, r, type->fields.array, predicated);
    648       }
    649       break;
    650 
    651    case GLSL_TYPE_STRUCT:
    652       for (unsigned int i = 0; i < type->length; i++) {
    653 	 emit_assignment_writes(l, r, type->fields.structure[i].type,
    654 				predicated);
    655       }
    656       break;
    657 
    658    case GLSL_TYPE_SAMPLER:
    659       break;
    660 
    661    default:
    662       assert(!"not reached");
    663       break;
    664    }
    665 }
    666 
    667 /* If the RHS processing resulted in an instruction generating a
    668  * temporary value, and it would be easy to rewrite the instruction to
    669  * generate its result right into the LHS instead, do so.  This ends
    670  * up reliably removing instructions where it can be tricky to do so
    671  * later without real UD chain information.
    672  */
    673 bool
    674 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
    675                                    fs_reg dst,
    676                                    fs_reg src,
    677                                    fs_inst *pre_rhs_inst,
    678                                    fs_inst *last_rhs_inst)
    679 {
    680    /* Only attempt if we're doing a direct assignment. */
    681    if (ir->condition ||
    682        !(ir->lhs->type->is_scalar() ||
    683         (ir->lhs->type->is_vector() &&
    684          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
    685       return false;
    686 
    687    /* Make sure the last instruction generated our source reg. */
    688    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
    689 						    last_rhs_inst,
    690 						    src);
    691    if (!modify)
    692       return false;
    693 
    694    /* If last_rhs_inst wrote a different number of components than our LHS,
    695     * we can't safely rewrite it.
    696     */
    697    if (ir->lhs->type->vector_elements != modify->regs_written())
    698       return false;
    699 
    700    /* Success!  Rewrite the instruction. */
    701    modify->dst = dst;
    702 
    703    return true;
    704 }
    705 
    706 void
    707 fs_visitor::visit(ir_assignment *ir)
    708 {
    709    fs_reg l, r;
    710    fs_inst *inst;
    711 
    712    /* FINISHME: arrays on the lhs */
    713    ir->lhs->accept(this);
    714    l = this->result;
    715 
    716    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
    717 
    718    ir->rhs->accept(this);
    719    r = this->result;
    720 
    721    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
    722 
    723    assert(l.file != BAD_FILE);
    724    assert(r.file != BAD_FILE);
    725 
    726    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
    727       return;
    728 
    729    if (ir->condition) {
    730       emit_bool_to_cond_code(ir->condition);
    731    }
    732 
    733    if (ir->lhs->type->is_scalar() ||
    734        ir->lhs->type->is_vector()) {
    735       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
    736 	 if (ir->write_mask & (1 << i)) {
    737 	    inst = emit(BRW_OPCODE_MOV, l, r);
    738 	    if (ir->condition)
    739 	       inst->predicated = true;
    740 	    r.reg_offset++;
    741 	 }
    742 	 l.reg_offset++;
    743       }
    744    } else {
    745       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
    746    }
    747 }
    748 
    749 fs_inst *
    750 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    751 			      fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
    752 {
    753    int mlen;
    754    int base_mrf = 1;
    755    bool simd16 = false;
    756    fs_reg orig_dst;
    757 
    758    /* g0 header. */
    759    mlen = 1;
    760 
    761    if (ir->shadow_comparitor) {
    762       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
    763 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
    764 	 coordinate.reg_offset++;
    765       }
    766       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
    767       mlen += 3;
    768 
    769       if (ir->op == ir_tex) {
    770 	 /* There's no plain shadow compare message, so we use shadow
    771 	  * compare with a bias of 0.0.
    772 	  */
    773 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
    774 	 mlen++;
    775       } else if (ir->op == ir_txb || ir->op == ir_txl) {
    776 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
    777 	 mlen++;
    778       } else {
    779          assert(!"Should not get here.");
    780       }
    781 
    782       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
    783       mlen++;
    784    } else if (ir->op == ir_tex) {
    785       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
    786 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
    787 	 coordinate.reg_offset++;
    788       }
    789       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
    790       mlen += 3;
    791    } else if (ir->op == ir_txd) {
    792       fs_reg &dPdx = lod;
    793 
    794       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
    795 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
    796 	 coordinate.reg_offset++;
    797       }
    798       /* the slots for u and v are always present, but r is optional */
    799       mlen += MAX2(ir->coordinate->type->vector_elements, 2);
    800 
    801       /*  P   = u, v, r
    802        * dPdx = dudx, dvdx, drdx
    803        * dPdy = dudy, dvdy, drdy
    804        *
    805        * 1-arg: Does not exist.
    806        *
    807        * 2-arg: dudx   dvdx   dudy   dvdy
    808        *        dPdx.x dPdx.y dPdy.x dPdy.y
    809        *        m4     m5     m6     m7
    810        *
    811        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
    812        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
    813        *        m5     m6     m7     m8     m9     m10
    814        */
    815       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
    816 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdx);
    817 	 dPdx.reg_offset++;
    818       }
    819       mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
    820 
    821       for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
    822 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdy);
    823 	 dPdy.reg_offset++;
    824       }
    825       mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
    826    } else if (ir->op == ir_txs) {
    827       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
    828       simd16 = true;
    829       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
    830       mlen += 2;
    831    } else {
    832       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
    833        * instructions.  We'll need to do SIMD16 here.
    834        */
    835       simd16 = true;
    836       assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
    837 
    838       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
    839 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
    840 	      coordinate);
    841 	 coordinate.reg_offset++;
    842       }
    843 
    844       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
    845        * be necessary for TXF (ld), but seems wise to do for all messages.
    846        */
    847       for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
    848 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
    849       }
    850 
    851       /* lod/bias appears after u/v/r. */
    852       mlen += 6;
    853 
    854       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, lod.type), lod);
    855       mlen++;
    856 
    857       /* The unused upper half. */
    858       mlen++;
    859    }
    860 
    861    if (simd16) {
    862       /* Now, since we're doing simd16, the return is 2 interleaved
    863        * vec4s where the odd-indexed ones are junk. We'll need to move
    864        * this weirdness around to the expected layout.
    865        */
    866       orig_dst = dst;
    867       const glsl_type *vec_type =
    868 	 glsl_type::get_instance(ir->type->base_type, 4, 1);
    869       dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
    870       dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
    871 			       : BRW_REGISTER_TYPE_F;
    872    }
    873 
    874    fs_inst *inst = NULL;
    875    switch (ir->op) {
    876    case ir_tex:
    877       inst = emit(SHADER_OPCODE_TEX, dst);
    878       break;
    879    case ir_txb:
    880       inst = emit(FS_OPCODE_TXB, dst);
    881       break;
    882    case ir_txl:
    883       inst = emit(SHADER_OPCODE_TXL, dst);
    884       break;
    885    case ir_txd:
    886       inst = emit(SHADER_OPCODE_TXD, dst);
    887       break;
    888    case ir_txs:
    889       inst = emit(SHADER_OPCODE_TXS, dst);
    890       break;
    891    case ir_txf:
    892       inst = emit(SHADER_OPCODE_TXF, dst);
    893       break;
    894    }
    895    inst->base_mrf = base_mrf;
    896    inst->mlen = mlen;
    897    inst->header_present = true;
    898 
    899    if (simd16) {
    900       for (int i = 0; i < 4; i++) {
    901 	 emit(BRW_OPCODE_MOV, orig_dst, dst);
    902 	 orig_dst.reg_offset++;
    903 	 dst.reg_offset += 2;
    904       }
    905    }
    906 
    907    return inst;
    908 }
    909 
    910 /* gen5's sampler has slots for u, v, r, array index, then optional
    911  * parameters like shadow comparitor or LOD bias.  If optional
    912  * parameters aren't present, those base slots are optional and don't
    913  * need to be included in the message.
    914  *
    915  * We don't fill in the unnecessary slots regardless, which may look
    916  * surprising in the disassembly.
    917  */
    918 fs_inst *
    919 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    920 			      fs_reg shadow_c, fs_reg lod, fs_reg lod2)
    921 {
    922    int mlen = 0;
    923    int base_mrf = 2;
    924    int reg_width = c->dispatch_width / 8;
    925    bool header_present = false;
    926    const int vector_elements =
    927       ir->coordinate ? ir->coordinate->type->vector_elements : 0;
    928 
    929    if (ir->offset != NULL && ir->op == ir_txf) {
    930       /* It appears that the ld instruction used for txf does its
    931        * address bounds check before adding in the offset.  To work
    932        * around this, just add the integer offset to the integer texel
    933        * coordinate, and don't put the offset in the header.
    934        */
    935       ir_constant *offset = ir->offset->as_constant();
    936       for (int i = 0; i < vector_elements; i++) {
    937 	 emit(BRW_OPCODE_ADD,
    938 	      fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
    939 	      coordinate,
    940 	      offset->value.i[i]);
    941 	 coordinate.reg_offset++;
    942       }
    943    } else {
    944       if (ir->offset) {
    945 	 /* The offsets set up by the ir_texture visitor are in the
    946 	  * m1 header, so we can't go headerless.
    947 	  */
    948 	 header_present = true;
    949 	 mlen++;
    950 	 base_mrf--;
    951       }
    952 
    953       for (int i = 0; i < vector_elements; i++) {
    954 	 emit(BRW_OPCODE_MOV,
    955 	      fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
    956 	      coordinate);
    957 	 coordinate.reg_offset++;
    958       }
    959    }
    960    mlen += vector_elements * reg_width;
    961 
    962    if (ir->shadow_comparitor) {
    963       mlen = MAX2(mlen, header_present + 4 * reg_width);
    964 
    965       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
    966       mlen += reg_width;
    967    }
    968 
    969    fs_inst *inst = NULL;
    970    switch (ir->op) {
    971    case ir_tex:
    972       inst = emit(SHADER_OPCODE_TEX, dst);
    973       break;
    974    case ir_txb:
    975       mlen = MAX2(mlen, header_present + 4 * reg_width);
    976       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
    977       mlen += reg_width;
    978 
    979       inst = emit(FS_OPCODE_TXB, dst);
    980       break;
    981    case ir_txl:
    982       mlen = MAX2(mlen, header_present + 4 * reg_width);
    983       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
    984       mlen += reg_width;
    985 
    986       inst = emit(SHADER_OPCODE_TXL, dst);
    987       break;
    988    case ir_txd: {
    989       mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
    990 
    991       /**
    992        *  P   =  u,    v,    r
    993        * dPdx = dudx, dvdx, drdx
    994        * dPdy = dudy, dvdy, drdy
    995        *
    996        * Load up these values:
    997        * - dudx   dudy   dvdx   dvdy   drdx   drdy
    998        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
    999        */
   1000       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
   1001 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
   1002 	 lod.reg_offset++;
   1003 	 mlen += reg_width;
   1004 
   1005 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
   1006 	 lod2.reg_offset++;
   1007 	 mlen += reg_width;
   1008       }
   1009 
   1010       inst = emit(SHADER_OPCODE_TXD, dst);
   1011       break;
   1012    }
   1013    case ir_txs:
   1014       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
   1015       mlen += reg_width;
   1016       inst = emit(SHADER_OPCODE_TXS, dst);
   1017       break;
   1018    case ir_txf:
   1019       mlen = header_present + 4 * reg_width;
   1020 
   1021       emit(BRW_OPCODE_MOV,
   1022 	   fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
   1023 	   lod);
   1024       inst = emit(SHADER_OPCODE_TXF, dst);
   1025       break;
   1026    }
   1027    inst->base_mrf = base_mrf;
   1028    inst->mlen = mlen;
   1029    inst->header_present = header_present;
   1030 
   1031    if (mlen > 11) {
   1032       fail("Message length >11 disallowed by hardware\n");
   1033    }
   1034 
   1035    return inst;
   1036 }
   1037 
   1038 fs_inst *
   1039 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
   1040 			      fs_reg shadow_c, fs_reg lod, fs_reg lod2)
   1041 {
   1042    int mlen = 0;
   1043    int base_mrf = 2;
   1044    int reg_width = c->dispatch_width / 8;
   1045    bool header_present = false;
   1046    int offsets[3];
   1047 
   1048    if (ir->offset && ir->op != ir_txf) {
   1049       /* The offsets set up by the ir_texture visitor are in the
   1050        * m1 header, so we can't go headerless.
   1051        */
   1052       header_present = true;
   1053       mlen++;
   1054       base_mrf--;
   1055    }
   1056 
   1057    if (ir->shadow_comparitor) {
   1058       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
   1059       mlen += reg_width;
   1060    }
   1061 
   1062    /* Set up the LOD info */
   1063    switch (ir->op) {
   1064    case ir_tex:
   1065       break;
   1066    case ir_txb:
   1067       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
   1068       mlen += reg_width;
   1069       break;
   1070    case ir_txl:
   1071       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
   1072       mlen += reg_width;
   1073       break;
   1074    case ir_txd: {
   1075       if (c->dispatch_width == 16)
   1076 	 fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
   1077 
   1078       /* Load dPdx and the coordinate together:
   1079        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
   1080        */
   1081       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
   1082 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
   1083 	 coordinate.reg_offset++;
   1084 	 mlen += reg_width;
   1085 
   1086 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
   1087 	 lod.reg_offset++;
   1088 	 mlen += reg_width;
   1089 
   1090 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
   1091 	 lod2.reg_offset++;
   1092 	 mlen += reg_width;
   1093       }
   1094       break;
   1095    }
   1096    case ir_txs:
   1097       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
   1098       mlen += reg_width;
   1099       break;
   1100    case ir_txf:
   1101       /* It appears that the ld instruction used for txf does its
   1102        * address bounds check before adding in the offset.  To work
   1103        * around this, just add the integer offset to the integer texel
   1104        * coordinate, and don't put the offset in the header.
   1105        */
   1106       if (ir->offset) {
   1107 	 ir_constant *offset = ir->offset->as_constant();
   1108 	 offsets[0] = offset->value.i[0];
   1109 	 offsets[1] = offset->value.i[1];
   1110 	 offsets[2] = offset->value.i[2];
   1111       } else {
   1112 	 memset(offsets, 0, sizeof(offsets));
   1113       }
   1114 
   1115       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
   1116       emit(BRW_OPCODE_ADD,
   1117 	   fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[0]);
   1118       coordinate.reg_offset++;
   1119       mlen += reg_width;
   1120 
   1121       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod);
   1122       mlen += reg_width;
   1123 
   1124       for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
   1125 	 emit(BRW_OPCODE_ADD,
   1126 	      fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[i]);
   1127 	 coordinate.reg_offset++;
   1128 	 mlen += reg_width;
   1129       }
   1130       break;
   1131    }
   1132 
   1133    /* Set up the coordinate (except for cases where it was done above) */
   1134    if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf) {
   1135       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
   1136 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
   1137 	 coordinate.reg_offset++;
   1138 	 mlen += reg_width;
   1139       }
   1140    }
   1141 
   1142    /* Generate the SEND */
   1143    fs_inst *inst = NULL;
   1144    switch (ir->op) {
   1145    case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
   1146    case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
   1147    case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
   1148    case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
   1149    case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
   1150    case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
   1151    }
   1152    inst->base_mrf = base_mrf;
   1153    inst->mlen = mlen;
   1154    inst->header_present = header_present;
   1155 
   1156    if (mlen > 11) {
   1157       fail("Message length >11 disallowed by hardware\n");
   1158    }
   1159 
   1160    return inst;
   1161 }
   1162 
   1163 /**
   1164  * Emit code to produce the coordinates for a texture lookup.
   1165  *
   1166  * Returns the fs_reg containing the texture coordinate (as opposed to
   1167  * setting this->result).
   1168  */
   1169 fs_reg
   1170 fs_visitor::emit_texcoord(ir_texture *ir, int sampler, int texunit)
   1171 {
   1172    fs_inst *inst = NULL;
   1173 
   1174    if (!ir->coordinate)
   1175       return fs_reg(); /* Return the default BAD_FILE register. */
   1176 
   1177    ir->coordinate->accept(this);
   1178    fs_reg coordinate = this->result;
   1179 
   1180    bool needs_gl_clamp = true;
   1181 
   1182    fs_reg scale_x, scale_y;
   1183 
   1184    /* The 965 requires the EU to do the normalization of GL rectangle
   1185     * texture coordinates.  We use the program parameter state
   1186     * tracking to get the scaling factor.
   1187     */
   1188    if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT &&
   1189        (intel->gen < 6 ||
   1190 	(intel->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
   1191 			     c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
   1192       struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
   1193       int tokens[STATE_LENGTH] = {
   1194 	 STATE_INTERNAL,
   1195 	 STATE_TEXRECT_SCALE,
   1196 	 texunit,
   1197 	 0,
   1198 	 0
   1199       };
   1200 
   1201       if (c->dispatch_width == 16) {
   1202 	 fail("rectangle scale uniform setup not supported on 16-wide\n");
   1203 	 return fs_reg(this, ir->type);
   1204       }
   1205 
   1206       scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
   1207       scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
   1208 
   1209       GLuint index = _mesa_add_state_reference(params,
   1210 					       (gl_state_index *)tokens);
   1211 
   1212       this->param_index[c->prog_data.nr_params] = index;
   1213       this->param_offset[c->prog_data.nr_params] = 0;
   1214       c->prog_data.nr_params++;
   1215       this->param_index[c->prog_data.nr_params] = index;
   1216       this->param_offset[c->prog_data.nr_params] = 1;
   1217       c->prog_data.nr_params++;
   1218    }
   1219 
   1220    /* The 965 requires the EU to do the normalization of GL rectangle
   1221     * texture coordinates.  We use the program parameter state
   1222     * tracking to get the scaling factor.
   1223     */
   1224    if (intel->gen < 6 &&
   1225        ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
   1226       fs_reg dst = fs_reg(this, ir->coordinate->type);
   1227       fs_reg src = coordinate;
   1228       coordinate = dst;
   1229 
   1230       emit(BRW_OPCODE_MUL, dst, src, scale_x);
   1231       dst.reg_offset++;
   1232       src.reg_offset++;
   1233       emit(BRW_OPCODE_MUL, dst, src, scale_y);
   1234    } else if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
   1235       /* On gen6+, the sampler handles the rectangle coordinates
   1236        * natively, without needing rescaling.  But that means we have
   1237        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
   1238        * not [0, 1] like the default case below.
   1239        */
   1240       needs_gl_clamp = false;
   1241 
   1242       for (int i = 0; i < 2; i++) {
   1243 	 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
   1244 	    fs_reg chan = coordinate;
   1245 	    chan.reg_offset += i;
   1246 
   1247 	    inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
   1248 	    inst->conditional_mod = BRW_CONDITIONAL_G;
   1249 
   1250 	    /* Our parameter comes in as 1.0/width or 1.0/height,
   1251 	     * because that's what people normally want for doing
   1252 	     * texture rectangle handling.  We need width or height
   1253 	     * for clamping, but we don't care enough to make a new
   1254 	     * parameter type, so just invert back.
   1255 	     */
   1256 	    fs_reg limit = fs_reg(this, glsl_type::float_type);
   1257 	    emit(BRW_OPCODE_MOV, limit, i == 0 ? scale_x : scale_y);
   1258 	    emit(SHADER_OPCODE_RCP, limit, limit);
   1259 
   1260 	    inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
   1261 	    inst->conditional_mod = BRW_CONDITIONAL_L;
   1262 	 }
   1263       }
   1264    }
   1265 
   1266    if (ir->coordinate && needs_gl_clamp) {
   1267       for (unsigned int i = 0;
   1268 	   i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
   1269 	 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
   1270 	    fs_reg chan = coordinate;
   1271 	    chan.reg_offset += i;
   1272 
   1273 	    fs_inst *inst = emit(BRW_OPCODE_MOV, chan, chan);
   1274 	    inst->saturate = true;
   1275 	 }
   1276       }
   1277    }
   1278    return coordinate;
   1279 }
   1280 
   1281 void
   1282 fs_visitor::visit(ir_texture *ir)
   1283 {
   1284    fs_inst *inst = NULL;
   1285 
   1286    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &fp->Base);
   1287    int texunit = fp->Base.SamplerUnits[sampler];
   1288 
   1289    /* Should be lowered by do_lower_texture_projection */
   1290    assert(!ir->projector);
   1291 
   1292    /* Generate code to compute all the subexpression trees.  This has to be
   1293     * done before loading any values into MRFs for the sampler message since
   1294     * generating these values may involve SEND messages that need the MRFs.
   1295     */
   1296    fs_reg coordinate = emit_texcoord(ir, sampler, texunit);
   1297 
   1298    fs_reg shadow_comparitor;
   1299    if (ir->shadow_comparitor) {
   1300       ir->shadow_comparitor->accept(this);
   1301       shadow_comparitor = this->result;
   1302    }
   1303 
   1304    fs_reg lod, lod2;
   1305    switch (ir->op) {
   1306    case ir_tex:
   1307       break;
   1308    case ir_txb:
   1309       ir->lod_info.bias->accept(this);
   1310       lod = this->result;
   1311       break;
   1312    case ir_txd:
   1313       ir->lod_info.grad.dPdx->accept(this);
   1314       lod = this->result;
   1315 
   1316       ir->lod_info.grad.dPdy->accept(this);
   1317       lod2 = this->result;
   1318       break;
   1319    case ir_txf:
   1320    case ir_txl:
   1321    case ir_txs:
   1322       ir->lod_info.lod->accept(this);
   1323       lod = this->result;
   1324       break;
   1325    };
   1326 
   1327    /* Writemasking doesn't eliminate channels on SIMD8 texture
   1328     * samples, so don't worry about them.
   1329     */
   1330    fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
   1331 
   1332    if (intel->gen >= 7) {
   1333       inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
   1334                                lod, lod2);
   1335    } else if (intel->gen >= 5) {
   1336       inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
   1337                                lod, lod2);
   1338    } else {
   1339       inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
   1340                                lod, lod2);
   1341    }
   1342 
   1343    /* The header is set up by generate_tex() when necessary. */
   1344    inst->src[0] = reg_undef;
   1345 
   1346    if (ir->offset != NULL && ir->op != ir_txf)
   1347       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
   1348 
   1349    inst->sampler = sampler;
   1350 
   1351    if (ir->shadow_comparitor)
   1352       inst->shadow_compare = true;
   1353 
   1354    swizzle_result(ir, dst, sampler);
   1355 }
   1356 
   1357 /**
   1358  * Swizzle the result of a texture result.  This is necessary for
   1359  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
   1360  */
   1361 void
   1362 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
   1363 {
   1364    this->result = orig_val;
   1365 
   1366    if (ir->op == ir_txs)
   1367       return;
   1368 
   1369    if (ir->type == glsl_type::float_type) {
   1370       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
   1371       assert(ir->sampler->type->sampler_shadow);
   1372    } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
   1373       fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
   1374 
   1375       for (int i = 0; i < 4; i++) {
   1376 	 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
   1377 	 fs_reg l = swizzled_result;
   1378 	 l.reg_offset += i;
   1379 
   1380 	 if (swiz == SWIZZLE_ZERO) {
   1381 	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
   1382 	 } else if (swiz == SWIZZLE_ONE) {
   1383 	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
   1384 	 } else {
   1385 	    fs_reg r = orig_val;
   1386 	    r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
   1387 	    emit(BRW_OPCODE_MOV, l, r);
   1388 	 }
   1389       }
   1390       this->result = swizzled_result;
   1391    }
   1392 }
   1393 
   1394 void
   1395 fs_visitor::visit(ir_swizzle *ir)
   1396 {
   1397    ir->val->accept(this);
   1398    fs_reg val = this->result;
   1399 
   1400    if (ir->type->vector_elements == 1) {
   1401       this->result.reg_offset += ir->mask.x;
   1402       return;
   1403    }
   1404 
   1405    fs_reg result = fs_reg(this, ir->type);
   1406    this->result = result;
   1407 
   1408    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
   1409       fs_reg channel = val;
   1410       int swiz = 0;
   1411 
   1412       switch (i) {
   1413       case 0:
   1414 	 swiz = ir->mask.x;
   1415 	 break;
   1416       case 1:
   1417 	 swiz = ir->mask.y;
   1418 	 break;
   1419       case 2:
   1420 	 swiz = ir->mask.z;
   1421 	 break;
   1422       case 3:
   1423 	 swiz = ir->mask.w;
   1424 	 break;
   1425       }
   1426 
   1427       channel.reg_offset += swiz;
   1428       emit(BRW_OPCODE_MOV, result, channel);
   1429       result.reg_offset++;
   1430    }
   1431 }
   1432 
   1433 void
   1434 fs_visitor::visit(ir_discard *ir)
   1435 {
   1436    assert(ir->condition == NULL); /* FINISHME */
   1437 
   1438    emit(FS_OPCODE_DISCARD);
   1439 }
   1440 
   1441 void
   1442 fs_visitor::visit(ir_constant *ir)
   1443 {
   1444    /* Set this->result to reg at the bottom of the function because some code
   1445     * paths will cause this visitor to be applied to other fields.  This will
   1446     * cause the value stored in this->result to be modified.
   1447     *
   1448     * Make reg constant so that it doesn't get accidentally modified along the
   1449     * way.  Yes, I actually had this problem. :(
   1450     */
   1451    const fs_reg reg(this, ir->type);
   1452    fs_reg dst_reg = reg;
   1453 
   1454    if (ir->type->is_array()) {
   1455       const unsigned size = type_size(ir->type->fields.array);
   1456 
   1457       for (unsigned i = 0; i < ir->type->length; i++) {
   1458 	 ir->array_elements[i]->accept(this);
   1459 	 fs_reg src_reg = this->result;
   1460 
   1461 	 dst_reg.type = src_reg.type;
   1462 	 for (unsigned j = 0; j < size; j++) {
   1463 	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
   1464 	    src_reg.reg_offset++;
   1465 	    dst_reg.reg_offset++;
   1466 	 }
   1467       }
   1468    } else if (ir->type->is_record()) {
   1469       foreach_list(node, &ir->components) {
   1470 	 ir_constant *const field = (ir_constant *) node;
   1471 	 const unsigned size = type_size(field->type);
   1472 
   1473 	 field->accept(this);
   1474 	 fs_reg src_reg = this->result;
   1475 
   1476 	 dst_reg.type = src_reg.type;
   1477 	 for (unsigned j = 0; j < size; j++) {
   1478 	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
   1479 	    src_reg.reg_offset++;
   1480 	    dst_reg.reg_offset++;
   1481 	 }
   1482       }
   1483    } else {
   1484       const unsigned size = type_size(ir->type);
   1485 
   1486       for (unsigned i = 0; i < size; i++) {
   1487 	 switch (ir->type->base_type) {
   1488 	 case GLSL_TYPE_FLOAT:
   1489 	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
   1490 	    break;
   1491 	 case GLSL_TYPE_UINT:
   1492 	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
   1493 	    break;
   1494 	 case GLSL_TYPE_INT:
   1495 	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
   1496 	    break;
   1497 	 case GLSL_TYPE_BOOL:
   1498 	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
   1499 	    break;
   1500 	 default:
   1501 	    assert(!"Non-float/uint/int/bool constant");
   1502 	 }
   1503 	 dst_reg.reg_offset++;
   1504       }
   1505    }
   1506 
   1507    this->result = reg;
   1508 }
   1509 
   1510 void
   1511 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
   1512 {
   1513    ir_expression *expr = ir->as_expression();
   1514 
   1515    if (expr) {
   1516       fs_reg op[2];
   1517       fs_inst *inst;
   1518 
   1519       assert(expr->get_num_operands() <= 2);
   1520       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
   1521 	 assert(expr->operands[i]->type->is_scalar());
   1522 
   1523 	 expr->operands[i]->accept(this);
   1524 	 op[i] = this->result;
   1525 
   1526 	 resolve_ud_negate(&op[i]);
   1527       }
   1528 
   1529       switch (expr->operation) {
   1530       case ir_unop_logic_not:
   1531 	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
   1532 	 inst->conditional_mod = BRW_CONDITIONAL_Z;
   1533 	 break;
   1534 
   1535       case ir_binop_logic_xor:
   1536       case ir_binop_logic_or:
   1537       case ir_binop_logic_and:
   1538 	 goto out;
   1539 
   1540       case ir_unop_f2b:
   1541 	 if (intel->gen >= 6) {
   1542 	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
   1543 	 } else {
   1544 	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
   1545 	 }
   1546 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
   1547 	 break;
   1548 
   1549       case ir_unop_i2b:
   1550 	 if (intel->gen >= 6) {
   1551 	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
   1552 	 } else {
   1553 	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
   1554 	 }
   1555 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
   1556 	 break;
   1557 
   1558       case ir_binop_greater:
   1559       case ir_binop_gequal:
   1560       case ir_binop_less:
   1561       case ir_binop_lequal:
   1562       case ir_binop_equal:
   1563       case ir_binop_all_equal:
   1564       case ir_binop_nequal:
   1565       case ir_binop_any_nequal:
   1566 	 resolve_bool_comparison(expr->operands[0], &op[0]);
   1567 	 resolve_bool_comparison(expr->operands[1], &op[1]);
   1568 
   1569 	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
   1570 	 inst->conditional_mod =
   1571 	    brw_conditional_for_comparison(expr->operation);
   1572 	 break;
   1573 
   1574       default:
   1575 	 assert(!"not reached");
   1576 	 fail("bad cond code\n");
   1577 	 break;
   1578       }
   1579       return;
   1580    }
   1581 
   1582 out:
   1583    ir->accept(this);
   1584 
   1585    fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
   1586    inst->conditional_mod = BRW_CONDITIONAL_NZ;
   1587 }
   1588 
   1589 /**
   1590  * Emit a gen6 IF statement with the comparison folded into the IF
   1591  * instruction.
   1592  */
   1593 void
   1594 fs_visitor::emit_if_gen6(ir_if *ir)
   1595 {
   1596    ir_expression *expr = ir->condition->as_expression();
   1597 
   1598    if (expr) {
   1599       fs_reg op[2];
   1600       fs_inst *inst;
   1601       fs_reg temp;
   1602 
   1603       assert(expr->get_num_operands() <= 2);
   1604       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
   1605 	 assert(expr->operands[i]->type->is_scalar());
   1606 
   1607 	 expr->operands[i]->accept(this);
   1608 	 op[i] = this->result;
   1609       }
   1610 
   1611       switch (expr->operation) {
   1612       case ir_unop_logic_not:
   1613       case ir_binop_logic_xor:
   1614       case ir_binop_logic_or:
   1615       case ir_binop_logic_and:
   1616          /* For operations on bool arguments, only the low bit of the bool is
   1617           * valid, and the others are undefined.  Fall back to the condition
   1618           * code path.
   1619           */
   1620          break;
   1621 
   1622       case ir_unop_f2b:
   1623 	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
   1624 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
   1625 	 return;
   1626 
   1627       case ir_unop_i2b:
   1628 	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
   1629 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
   1630 	 return;
   1631 
   1632       case ir_binop_greater:
   1633       case ir_binop_gequal:
   1634       case ir_binop_less:
   1635       case ir_binop_lequal:
   1636       case ir_binop_equal:
   1637       case ir_binop_all_equal:
   1638       case ir_binop_nequal:
   1639       case ir_binop_any_nequal:
   1640 	 resolve_bool_comparison(expr->operands[0], &op[0]);
   1641 	 resolve_bool_comparison(expr->operands[1], &op[1]);
   1642 
   1643 	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
   1644 	 inst->conditional_mod =
   1645 	    brw_conditional_for_comparison(expr->operation);
   1646 	 return;
   1647       default:
   1648 	 assert(!"not reached");
   1649 	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
   1650 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
   1651 	 fail("bad condition\n");
   1652 	 return;
   1653       }
   1654    }
   1655 
   1656    emit_bool_to_cond_code(ir->condition);
   1657    fs_inst *inst = emit(BRW_OPCODE_IF);
   1658    inst->predicated = true;
   1659 }
   1660 
   1661 void
   1662 fs_visitor::visit(ir_if *ir)
   1663 {
   1664    fs_inst *inst;
   1665 
   1666    if (intel->gen < 6 && c->dispatch_width == 16) {
   1667       fail("Can't support (non-uniform) control flow on 16-wide\n");
   1668    }
   1669 
   1670    /* Don't point the annotation at the if statement, because then it plus
   1671     * the then and else blocks get printed.
   1672     */
   1673    this->base_ir = ir->condition;
   1674 
   1675    if (intel->gen == 6) {
   1676       emit_if_gen6(ir);
   1677    } else {
   1678       emit_bool_to_cond_code(ir->condition);
   1679 
   1680       inst = emit(BRW_OPCODE_IF);
   1681       inst->predicated = true;
   1682    }
   1683 
   1684    foreach_list(node, &ir->then_instructions) {
   1685       ir_instruction *ir = (ir_instruction *)node;
   1686       this->base_ir = ir;
   1687 
   1688       ir->accept(this);
   1689    }
   1690 
   1691    if (!ir->else_instructions.is_empty()) {
   1692       emit(BRW_OPCODE_ELSE);
   1693 
   1694       foreach_list(node, &ir->else_instructions) {
   1695 	 ir_instruction *ir = (ir_instruction *)node;
   1696 	 this->base_ir = ir;
   1697 
   1698 	 ir->accept(this);
   1699       }
   1700    }
   1701 
   1702    emit(BRW_OPCODE_ENDIF);
   1703 }
   1704 
   1705 void
   1706 fs_visitor::visit(ir_loop *ir)
   1707 {
   1708    fs_reg counter = reg_undef;
   1709 
   1710    if (intel->gen < 6 && c->dispatch_width == 16) {
   1711       fail("Can't support (non-uniform) control flow on 16-wide\n");
   1712    }
   1713 
   1714    if (ir->counter) {
   1715       this->base_ir = ir->counter;
   1716       ir->counter->accept(this);
   1717       counter = *(variable_storage(ir->counter));
   1718 
   1719       if (ir->from) {
   1720 	 this->base_ir = ir->from;
   1721 	 ir->from->accept(this);
   1722 
   1723 	 emit(BRW_OPCODE_MOV, counter, this->result);
   1724       }
   1725    }
   1726 
   1727    this->base_ir = NULL;
   1728    emit(BRW_OPCODE_DO);
   1729 
   1730    if (ir->to) {
   1731       this->base_ir = ir->to;
   1732       ir->to->accept(this);
   1733 
   1734       fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
   1735       inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
   1736 
   1737       inst = emit(BRW_OPCODE_BREAK);
   1738       inst->predicated = true;
   1739    }
   1740 
   1741    foreach_list(node, &ir->body_instructions) {
   1742       ir_instruction *ir = (ir_instruction *)node;
   1743 
   1744       this->base_ir = ir;
   1745       ir->accept(this);
   1746    }
   1747 
   1748    if (ir->increment) {
   1749       this->base_ir = ir->increment;
   1750       ir->increment->accept(this);
   1751       emit(BRW_OPCODE_ADD, counter, counter, this->result);
   1752    }
   1753 
   1754    this->base_ir = NULL;
   1755    emit(BRW_OPCODE_WHILE);
   1756 }
   1757 
   1758 void
   1759 fs_visitor::visit(ir_loop_jump *ir)
   1760 {
   1761    switch (ir->mode) {
   1762    case ir_loop_jump::jump_break:
   1763       emit(BRW_OPCODE_BREAK);
   1764       break;
   1765    case ir_loop_jump::jump_continue:
   1766       emit(BRW_OPCODE_CONTINUE);
   1767       break;
   1768    }
   1769 }
   1770 
   1771 void
   1772 fs_visitor::visit(ir_call *ir)
   1773 {
   1774    assert(!"FINISHME");
   1775 }
   1776 
   1777 void
   1778 fs_visitor::visit(ir_return *ir)
   1779 {
   1780    assert(!"FINISHME");
   1781 }
   1782 
   1783 void
   1784 fs_visitor::visit(ir_function *ir)
   1785 {
   1786    /* Ignore function bodies other than main() -- we shouldn't see calls to
   1787     * them since they should all be inlined before we get to ir_to_mesa.
   1788     */
   1789    if (strcmp(ir->name, "main") == 0) {
   1790       const ir_function_signature *sig;
   1791       exec_list empty;
   1792 
   1793       sig = ir->matching_signature(&empty);
   1794 
   1795       assert(sig);
   1796 
   1797       foreach_list(node, &sig->body) {
   1798 	 ir_instruction *ir = (ir_instruction *)node;
   1799 	 this->base_ir = ir;
   1800 
   1801 	 ir->accept(this);
   1802       }
   1803    }
   1804 }
   1805 
   1806 void
   1807 fs_visitor::visit(ir_function_signature *ir)
   1808 {
   1809    assert(!"not reached");
   1810    (void)ir;
   1811 }
   1812 
   1813 fs_inst *
   1814 fs_visitor::emit(fs_inst inst)
   1815 {
   1816    fs_inst *list_inst = new(mem_ctx) fs_inst;
   1817    *list_inst = inst;
   1818 
   1819    if (force_uncompressed_stack > 0)
   1820       list_inst->force_uncompressed = true;
   1821    else if (force_sechalf_stack > 0)
   1822       list_inst->force_sechalf = true;
   1823 
   1824    list_inst->annotation = this->current_annotation;
   1825    list_inst->ir = this->base_ir;
   1826 
   1827    this->instructions.push_tail(list_inst);
   1828 
   1829    return list_inst;
   1830 }
   1831 
   1832 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
   1833 void
   1834 fs_visitor::emit_dummy_fs()
   1835 {
   1836    int reg_width = c->dispatch_width / 8;
   1837 
   1838    /* Everyone's favorite color. */
   1839    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f));
   1840    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f));
   1841    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f));
   1842    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f));
   1843 
   1844    fs_inst *write;
   1845    write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
   1846    write->base_mrf = 2;
   1847    write->mlen = 4 * reg_width;
   1848    write->eot = true;
   1849 }
   1850 
   1851 /* The register location here is relative to the start of the URB
   1852  * data.  It will get adjusted to be a real location before
   1853  * generate_code() time.
   1854  */
   1855 struct brw_reg
   1856 fs_visitor::interp_reg(int location, int channel)
   1857 {
   1858    int regnr = urb_setup[location] * 2 + channel / 2;
   1859    int stride = (channel & 1) * 4;
   1860 
   1861    assert(urb_setup[location] != -1);
   1862 
   1863    return brw_vec1_grf(regnr, stride);
   1864 }
   1865 
   1866 /** Emits the interpolation for the varying inputs. */
   1867 void
   1868 fs_visitor::emit_interpolation_setup_gen4()
   1869 {
   1870    this->current_annotation = "compute pixel centers";
   1871    this->pixel_x = fs_reg(this, glsl_type::uint_type);
   1872    this->pixel_y = fs_reg(this, glsl_type::uint_type);
   1873    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   1874    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   1875 
   1876    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
   1877    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
   1878 
   1879    this->current_annotation = "compute pixel deltas from v0";
   1880    if (brw->has_pln) {
   1881       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
   1882          fs_reg(this, glsl_type::vec2_type);
   1883       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
   1884          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
   1885       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
   1886    } else {
   1887       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
   1888          fs_reg(this, glsl_type::float_type);
   1889       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
   1890          fs_reg(this, glsl_type::float_type);
   1891    }
   1892    emit(BRW_OPCODE_ADD, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
   1893 	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
   1894    emit(BRW_OPCODE_ADD, this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
   1895 	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
   1896 
   1897    this->current_annotation = "compute pos.w and 1/pos.w";
   1898    /* Compute wpos.w.  It's always in our setup, since it's needed to
   1899     * interpolate the other attributes.
   1900     */
   1901    this->wpos_w = fs_reg(this, glsl_type::float_type);
   1902    emit(FS_OPCODE_LINTERP, wpos_w,
   1903         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
   1904         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
   1905 	interp_reg(FRAG_ATTRIB_WPOS, 3));
   1906    /* Compute the pixel 1/W value from wpos.w. */
   1907    this->pixel_w = fs_reg(this, glsl_type::float_type);
   1908    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
   1909    this->current_annotation = NULL;
   1910 }
   1911 
   1912 /** Emits the interpolation for the varying inputs. */
   1913 void
   1914 fs_visitor::emit_interpolation_setup_gen6()
   1915 {
   1916    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
   1917 
   1918    /* If the pixel centers end up used, the setup is the same as for gen4. */
   1919    this->current_annotation = "compute pixel centers";
   1920    fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   1921    fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   1922    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   1923    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   1924    emit(BRW_OPCODE_ADD,
   1925 	int_pixel_x,
   1926 	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
   1927 	fs_reg(brw_imm_v(0x10101010)));
   1928    emit(BRW_OPCODE_ADD,
   1929 	int_pixel_y,
   1930 	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
   1931 	fs_reg(brw_imm_v(0x11001100)));
   1932 
   1933    /* As of gen6, we can no longer mix float and int sources.  We have
   1934     * to turn the integer pixel centers into floats for their actual
   1935     * use.
   1936     */
   1937    this->pixel_x = fs_reg(this, glsl_type::float_type);
   1938    this->pixel_y = fs_reg(this, glsl_type::float_type);
   1939    emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
   1940    emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
   1941 
   1942    this->current_annotation = "compute pos.w";
   1943    this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
   1944    this->wpos_w = fs_reg(this, glsl_type::float_type);
   1945    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
   1946 
   1947    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
   1948       uint8_t reg = c->barycentric_coord_reg[i];
   1949       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
   1950       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
   1951    }
   1952 
   1953    this->current_annotation = NULL;
   1954 }
   1955 
   1956 void
   1957 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
   1958 {
   1959    int reg_width = c->dispatch_width / 8;
   1960    fs_inst *inst;
   1961    fs_reg color = outputs[target];
   1962    fs_reg mrf;
   1963 
   1964    /* If there's no color data to be written, skip it. */
   1965    if (color.file == BAD_FILE)
   1966       return;
   1967 
   1968    color.reg_offset += index;
   1969 
   1970    if (c->dispatch_width == 8 || intel->gen >= 6) {
   1971       /* SIMD8 write looks like:
   1972        * m + 0: r0
   1973        * m + 1: r1
   1974        * m + 2: g0
   1975        * m + 3: g1
   1976        *
   1977        * gen6 SIMD16 DP write looks like:
   1978        * m + 0: r0
   1979        * m + 1: r1
   1980        * m + 2: g0
   1981        * m + 3: g1
   1982        * m + 4: b0
   1983        * m + 5: b1
   1984        * m + 6: a0
   1985        * m + 7: a1
   1986        */
   1987       inst = emit(BRW_OPCODE_MOV,
   1988 		  fs_reg(MRF, first_color_mrf + index * reg_width, color.type),
   1989 		  color);
   1990       inst->saturate = c->key.clamp_fragment_color;
   1991    } else {
   1992       /* pre-gen6 SIMD16 single source DP write looks like:
   1993        * m + 0: r0
   1994        * m + 1: g0
   1995        * m + 2: b0
   1996        * m + 3: a0
   1997        * m + 4: r1
   1998        * m + 5: g1
   1999        * m + 6: b1
   2000        * m + 7: a1
   2001        */
   2002       if (brw->has_compr4) {
   2003 	 /* By setting the high bit of the MRF register number, we
   2004 	  * indicate that we want COMPR4 mode - instead of doing the
   2005 	  * usual destination + 1 for the second half we get
   2006 	  * destination + 4.
   2007 	  */
   2008 	 inst = emit(BRW_OPCODE_MOV,
   2009 		     fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
   2010 			    color.type),
   2011 		     color);
   2012 	 inst->saturate = c->key.clamp_fragment_color;
   2013       } else {
   2014 	 push_force_uncompressed();
   2015 	 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index,
   2016 					    color.type),
   2017 		     color);
   2018 	 inst->saturate = c->key.clamp_fragment_color;
   2019 	 pop_force_uncompressed();
   2020 
   2021 	 push_force_sechalf();
   2022 	 color.sechalf = true;
   2023 	 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4,
   2024 					    color.type),
   2025 		     color);
   2026 	 inst->saturate = c->key.clamp_fragment_color;
   2027 	 pop_force_sechalf();
   2028 	 color.sechalf = false;
   2029       }
   2030    }
   2031 }
   2032 
   2033 void
   2034 fs_visitor::emit_fb_writes()
   2035 {
   2036    this->current_annotation = "FB write header";
   2037    bool header_present = true;
   2038    /* We can potentially have a message length of up to 15, so we have to set
   2039     * base_mrf to either 0 or 1 in order to fit in m0..m15.
   2040     */
   2041    int base_mrf = 1;
   2042    int nr = base_mrf;
   2043    int reg_width = c->dispatch_width / 8;
   2044    bool do_dual_src = this->dual_src_output.file != BAD_FILE;
   2045    bool src0_alpha_to_render_target = false;
   2046 
   2047    if (c->dispatch_width == 16 && do_dual_src) {
   2048       fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
   2049       do_dual_src = false;
   2050    }
   2051 
   2052    /* From the Sandy Bridge PRM, volume 4, page 198:
   2053     *
   2054     *     "Dispatched Pixel Enables. One bit per pixel indicating
   2055     *      which pixels were originally enabled when the thread was
   2056     *      dispatched. This field is only required for the end-of-
   2057     *      thread message and on all dual-source messages."
   2058     */
   2059    if (intel->gen >= 6 &&
   2060        !this->fp->UsesKill &&
   2061        !do_dual_src &&
   2062        c->key.nr_color_regions == 1) {
   2063       header_present = false;
   2064    }
   2065 
   2066    if (header_present) {
   2067       src0_alpha_to_render_target = intel->gen >= 6 &&
   2068 				    !do_dual_src &&
   2069 				    c->key.nr_color_regions > 1 &&
   2070 				    c->key.sample_alpha_to_coverage;
   2071       /* m2, m3 header */
   2072       nr += 2;
   2073    }
   2074 
   2075    if (c->aa_dest_stencil_reg) {
   2076       push_force_uncompressed();
   2077       emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
   2078 	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
   2079       pop_force_uncompressed();
   2080    }
   2081 
   2082    /* Reserve space for color. It'll be filled in per MRT below. */
   2083    int color_mrf = nr;
   2084    nr += 4 * reg_width;
   2085    if (do_dual_src)
   2086       nr += 4;
   2087    if (src0_alpha_to_render_target)
   2088       nr += reg_width;
   2089 
   2090    if (c->source_depth_to_render_target) {
   2091       if (intel->gen == 6 && c->dispatch_width == 16) {
   2092 	 /* For outputting oDepth on gen6, SIMD8 writes have to be
   2093 	  * used.  This would require 8-wide moves of each half to
   2094 	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
   2095 	  * Just bail on doing so for now.
   2096 	  */
   2097 	 fail("Missing support for simd16 depth writes on gen6\n");
   2098       }
   2099 
   2100       if (c->computes_depth) {
   2101 	 /* Hand over gl_FragDepth. */
   2102 	 assert(this->frag_depth);
   2103 	 fs_reg depth = *(variable_storage(this->frag_depth));
   2104 
   2105 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
   2106       } else {
   2107 	 /* Pass through the payload depth. */
   2108 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
   2109 	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   2110       }
   2111       nr += reg_width;
   2112    }
   2113 
   2114    if (c->dest_depth_reg) {
   2115       emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
   2116 	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
   2117       nr += reg_width;
   2118    }
   2119 
   2120    if (do_dual_src) {
   2121       fs_reg src0 = this->outputs[0];
   2122       fs_reg src1 = this->dual_src_output;
   2123 
   2124       this->current_annotation = ralloc_asprintf(this->mem_ctx,
   2125 						 "FB write src0");
   2126       for (int i = 0; i < 4; i++) {
   2127 	 fs_inst *inst = emit(BRW_OPCODE_MOV,
   2128 			      fs_reg(MRF, color_mrf + i, src0.type),
   2129 			      src0);
   2130 	 src0.reg_offset++;
   2131 	 inst->saturate = c->key.clamp_fragment_color;
   2132       }
   2133 
   2134       this->current_annotation = ralloc_asprintf(this->mem_ctx,
   2135 						 "FB write src1");
   2136       for (int i = 0; i < 4; i++) {
   2137 	 fs_inst *inst = emit(BRW_OPCODE_MOV,
   2138 			      fs_reg(MRF, color_mrf + 4 + i, src1.type),
   2139 			      src1);
   2140 	 src1.reg_offset++;
   2141 	 inst->saturate = c->key.clamp_fragment_color;
   2142       }
   2143 
   2144       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
   2145       inst->target = 0;
   2146       inst->base_mrf = base_mrf;
   2147       inst->mlen = nr - base_mrf;
   2148       inst->eot = true;
   2149       inst->header_present = header_present;
   2150 
   2151       c->prog_data.dual_src_blend = true;
   2152       this->current_annotation = NULL;
   2153       return;
   2154    }
   2155 
   2156    for (int target = 0; target < c->key.nr_color_regions; target++) {
   2157       this->current_annotation = ralloc_asprintf(this->mem_ctx,
   2158 						 "FB write target %d",
   2159 						 target);
   2160       /* If src0_alpha_to_render_target is true, include source zero alpha
   2161        * data in RenderTargetWrite message for targets > 0.
   2162        */
   2163       int write_color_mrf = color_mrf;
   2164       if (src0_alpha_to_render_target && target != 0) {
   2165          fs_inst *inst;
   2166          fs_reg color = outputs[0];
   2167          color.reg_offset += 3;
   2168 
   2169          inst = emit(BRW_OPCODE_MOV,
   2170 		     fs_reg(MRF, write_color_mrf, color.type),
   2171 		     color);
   2172          inst->saturate = c->key.clamp_fragment_color;
   2173          write_color_mrf = color_mrf + reg_width;
   2174       }
   2175 
   2176       for (unsigned i = 0; i < this->output_components[target]; i++)
   2177          emit_color_write(target, i, write_color_mrf);
   2178 
   2179       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
   2180       inst->target = target;
   2181       inst->base_mrf = base_mrf;
   2182       if (src0_alpha_to_render_target && target == 0)
   2183          inst->mlen = nr - base_mrf - reg_width;
   2184       else
   2185          inst->mlen = nr - base_mrf;
   2186       if (target == c->key.nr_color_regions - 1)
   2187 	 inst->eot = true;
   2188       inst->header_present = header_present;
   2189    }
   2190 
   2191    if (c->key.nr_color_regions == 0) {
   2192       /* Even if there's no color buffers enabled, we still need to send
   2193        * alpha out the pipeline to our null renderbuffer to support
   2194        * alpha-testing, alpha-to-coverage, and so on.
   2195        */
   2196       emit_color_write(0, 3, color_mrf);
   2197 
   2198       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
   2199       inst->base_mrf = base_mrf;
   2200       inst->mlen = nr - base_mrf;
   2201       inst->eot = true;
   2202       inst->header_present = header_present;
   2203    }
   2204 
   2205    this->current_annotation = NULL;
   2206 }
   2207 
   2208 void
   2209 fs_visitor::resolve_ud_negate(fs_reg *reg)
   2210 {
   2211    if (reg->type != BRW_REGISTER_TYPE_UD ||
   2212        !reg->negate)
   2213       return;
   2214 
   2215    fs_reg temp = fs_reg(this, glsl_type::uint_type);
   2216    emit(BRW_OPCODE_MOV, temp, *reg);
   2217    *reg = temp;
   2218 }
   2219 
   2220 void
   2221 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
   2222 {
   2223    if (rvalue->type != glsl_type::bool_type)
   2224       return;
   2225 
   2226    fs_reg temp = fs_reg(this, glsl_type::bool_type);
   2227    emit(BRW_OPCODE_AND, temp, *reg, fs_reg(1));
   2228    *reg = temp;
   2229 }
   2230 
   2231 fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
   2232                        struct brw_shader *shader)
   2233 {
   2234    this->c = c;
   2235    this->p = &c->func;
   2236    this->brw = p->brw;
   2237    this->fp = (struct gl_fragment_program *)
   2238       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   2239    this->prog = prog;
   2240    this->intel = &brw->intel;
   2241    this->ctx = &intel->ctx;
   2242    this->mem_ctx = ralloc_context(NULL);
   2243    this->shader = shader;
   2244    this->failed = false;
   2245    this->variable_ht = hash_table_ctor(0,
   2246                                        hash_table_pointer_hash,
   2247                                        hash_table_pointer_compare);
   2248 
   2249    /* There's a question that appears to be left open in the spec:
   2250     * How do implicit dst conversions interact with the CMP
   2251     * instruction or conditional mods?  On gen6, the instruction:
   2252     *
   2253     * CMP null<d> src0<f> src1<f>
   2254     *
   2255     * will do src1 - src0 and compare that result as if it was an
   2256     * integer.  On gen4, it will do src1 - src0 as float, convert
   2257     * the result to int, and compare as int.  In between, it
   2258     * appears that it does src1 - src0 and does the compare in the
   2259     * execution type so dst type doesn't matter.
   2260     */
   2261    if (this->intel->gen > 4)
   2262       this->reg_null_cmp = reg_null_d;
   2263    else
   2264       this->reg_null_cmp = reg_null_f;
   2265 
   2266    this->frag_depth = NULL;
   2267    memset(this->outputs, 0, sizeof(this->outputs));
   2268    memset(this->output_components, 0, sizeof(this->output_components));
   2269    this->first_non_payload_grf = 0;
   2270    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
   2271 
   2272    this->current_annotation = NULL;
   2273    this->base_ir = NULL;
   2274 
   2275    this->virtual_grf_sizes = NULL;
   2276    this->virtual_grf_count = 0;
   2277    this->virtual_grf_array_size = 0;
   2278    this->virtual_grf_def = NULL;
   2279    this->virtual_grf_use = NULL;
   2280    this->live_intervals_valid = false;
   2281 
   2282    this->force_uncompressed_stack = 0;
   2283    this->force_sechalf_stack = 0;
   2284 }
   2285 
   2286 fs_visitor::~fs_visitor()
   2287 {
   2288    ralloc_free(this->mem_ctx);
   2289    hash_table_dtor(this->variable_ht);
   2290 }
   2291