      1 /*
      2  * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 /** @file brw_fs.cpp
     25  *
     26  * This file drives the GLSL IR -> LIR translation, contains the
     27  * optimizations on the LIR, and drives the generation of native code
     28  * from the LIR.
     29  */
     30 
     31 extern "C" {
     32 
     33 #include <sys/types.h>
     34 
     35 #include "main/macros.h"
     36 #include "main/shaderobj.h"
     37 #include "main/uniforms.h"
     38 #include "main/fbobject.h"
     39 #include "program/prog_parameter.h"
     40 #include "program/prog_print.h"
     41 #include "program/register_allocate.h"
     42 #include "program/sampler.h"
     43 #include "program/hash_table.h"
     44 #include "brw_context.h"
     45 #include "brw_eu.h"
     46 #include "brw_wm.h"
     47 }
     48 #include "brw_shader.h"
     49 #include "brw_fs.h"
     50 #include "glsl/glsl_types.h"
     51 #include "glsl/ir_print_visitor.h"
     52 
     53 void
     54 fs_inst::init()
     55 {
     56    memset(this, 0, sizeof(*this));
     57    this->opcode = BRW_OPCODE_NOP;
     58    this->conditional_mod = BRW_CONDITIONAL_NONE;
     59 
     60    this->dst = reg_undef;
     61    this->src[0] = reg_undef;
     62    this->src[1] = reg_undef;
     63    this->src[2] = reg_undef;
     64 }
     65 
     66 fs_inst::fs_inst()
     67 {
     68    init();
     69 }
     70 
     71 fs_inst::fs_inst(enum opcode opcode)
     72 {
     73    init();
     74    this->opcode = opcode;
     75 }
     76 
     77 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
     78 {
     79    init();
     80    this->opcode = opcode;
     81    this->dst = dst;
     82 
     83    if (dst.file == GRF)
     84       assert(dst.reg_offset >= 0);
     85 }
     86 
     87 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
     88 {
     89    init();
     90    this->opcode = opcode;
     91    this->dst = dst;
     92    this->src[0] = src0;
     93 
     94    if (dst.file == GRF)
     95       assert(dst.reg_offset >= 0);
     96    if (src[0].file == GRF)
     97       assert(src[0].reg_offset >= 0);
     98 }
     99 
    100 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    101 {
    102    init();
    103    this->opcode = opcode;
    104    this->dst = dst;
    105    this->src[0] = src0;
    106    this->src[1] = src1;
    107 
    108    if (dst.file == GRF)
    109       assert(dst.reg_offset >= 0);
    110    if (src[0].file == GRF)
    111       assert(src[0].reg_offset >= 0);
    112    if (src[1].file == GRF)
    113       assert(src[1].reg_offset >= 0);
    114 }
    115 
    116 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
    117 		 fs_reg src0, fs_reg src1, fs_reg src2)
    118 {
    119    init();
    120    this->opcode = opcode;
    121    this->dst = dst;
    122    this->src[0] = src0;
    123    this->src[1] = src1;
    124    this->src[2] = src2;
    125 
    126    if (dst.file == GRF)
    127       assert(dst.reg_offset >= 0);
    128    if (src[0].file == GRF)
    129       assert(src[0].reg_offset >= 0);
    130    if (src[1].file == GRF)
    131       assert(src[1].reg_offset >= 0);
    132    if (src[2].file == GRF)
    133       assert(src[2].reg_offset >= 0);
    134 }
    135 
    136 bool
    137 fs_inst::equals(fs_inst *inst)
    138 {
    139    return (opcode == inst->opcode &&
    140            dst.equals(inst->dst) &&
    141            src[0].equals(inst->src[0]) &&
    142            src[1].equals(inst->src[1]) &&
    143            src[2].equals(inst->src[2]) &&
    144            saturate == inst->saturate &&
    145            predicated == inst->predicated &&
    146            conditional_mod == inst->conditional_mod &&
    147            mlen == inst->mlen &&
    148            base_mrf == inst->base_mrf &&
    149            sampler == inst->sampler &&
    150            target == inst->target &&
    151            eot == inst->eot &&
    152            header_present == inst->header_present &&
    153            shadow_compare == inst->shadow_compare &&
    154            offset == inst->offset);
    155 }
    156 
    157 int
    158 fs_inst::regs_written()
    159 {
    160    if (is_tex())
    161       return 4;
    162 
    163    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    164     * but we don't currently use them...nor do we have an opcode for them.
    165     */
    166 
    167    return 1;
    168 }
    169 
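/**
 * Returns true if this instruction's destination write covers the given
 * register.
 *
 * A small illustration (register numbers are made up): a texturing
 * instruction writing vgrf5 has regs_written() == 4, so it covers vgrf5 at
 * reg_offsets 0 through 3 and overwrites_reg() returns true for an fs_reg
 * naming any of those offsets.
 */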
    170 bool
    171 fs_inst::overwrites_reg(const fs_reg &reg)
    172 {
    173    return (reg.file == dst.file &&
    174            reg.reg == dst.reg &&
    175            reg.reg_offset >= dst.reg_offset  &&
    176            reg.reg_offset < dst.reg_offset + regs_written());
    177 }
    178 
    179 bool
    180 fs_inst::is_tex()
    181 {
    182    return (opcode == SHADER_OPCODE_TEX ||
    183            opcode == FS_OPCODE_TXB ||
    184            opcode == SHADER_OPCODE_TXD ||
    185            opcode == SHADER_OPCODE_TXF ||
    186            opcode == SHADER_OPCODE_TXL ||
    187            opcode == SHADER_OPCODE_TXS);
    188 }
    189 
    190 bool
    191 fs_inst::is_math()
    192 {
    193    return (opcode == SHADER_OPCODE_RCP ||
    194            opcode == SHADER_OPCODE_RSQ ||
    195            opcode == SHADER_OPCODE_SQRT ||
    196            opcode == SHADER_OPCODE_EXP2 ||
    197            opcode == SHADER_OPCODE_LOG2 ||
    198            opcode == SHADER_OPCODE_SIN ||
    199            opcode == SHADER_OPCODE_COS ||
    200            opcode == SHADER_OPCODE_INT_QUOTIENT ||
    201            opcode == SHADER_OPCODE_INT_REMAINDER ||
    202            opcode == SHADER_OPCODE_POW);
    203 }
    204 
    205 void
    206 fs_reg::init()
    207 {
    208    memset(this, 0, sizeof(*this));
    209    this->smear = -1;
    210 }
    211 
    212 /** Generic unset register constructor. */
    213 fs_reg::fs_reg()
    214 {
    215    init();
    216    this->file = BAD_FILE;
    217 }
    218 
    219 /** Immediate value constructor. */
    220 fs_reg::fs_reg(float f)
    221 {
    222    init();
    223    this->file = IMM;
    224    this->type = BRW_REGISTER_TYPE_F;
    225    this->imm.f = f;
    226 }
    227 
    228 /** Immediate value constructor. */
    229 fs_reg::fs_reg(int32_t i)
    230 {
    231    init();
    232    this->file = IMM;
    233    this->type = BRW_REGISTER_TYPE_D;
    234    this->imm.i = i;
    235 }
    236 
    237 /** Immediate value constructor. */
    238 fs_reg::fs_reg(uint32_t u)
    239 {
    240    init();
    241    this->file = IMM;
    242    this->type = BRW_REGISTER_TYPE_UD;
    243    this->imm.u = u;
    244 }
    245 
    246 /** Fixed brw_reg constructor. */
    247 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
    248 {
    249    init();
    250    this->file = FIXED_HW_REG;
    251    this->fixed_hw_reg = fixed_hw_reg;
    252    this->type = fixed_hw_reg.type;
    253 }
    254 
    255 bool
    256 fs_reg::equals(const fs_reg &r) const
    257 {
    258    return (file == r.file &&
    259            reg == r.reg &&
    260            reg_offset == r.reg_offset &&
    261            type == r.type &&
    262            negate == r.negate &&
    263            abs == r.abs &&
    264            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
    265                   sizeof(fixed_hw_reg)) == 0 &&
    266            smear == r.smear &&
    267            imm.u == r.imm.u);
    268 }
    269 
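/**
 * Returns how many register slots a GLSL type occupies, counted in scalar
 * components; this is what virtual_grf_alloc() is handed when an fs_reg is
 * built for a value of that type.
 *
 * Illustrative values, derived from the cases below: a float is 1, a vec4
 * is 4, a mat3 is 9 (its base type is FLOAT, so components() counts every
 * column), a float[4] array is 4, and a struct of a vec3 and a float is 4.
 */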
    270 int
    271 fs_visitor::type_size(const struct glsl_type *type)
    272 {
    273    unsigned int size, i;
    274 
    275    switch (type->base_type) {
    276    case GLSL_TYPE_UINT:
    277    case GLSL_TYPE_INT:
    278    case GLSL_TYPE_FLOAT:
    279    case GLSL_TYPE_BOOL:
    280       return type->components();
    281    case GLSL_TYPE_ARRAY:
    282       return type_size(type->fields.array) * type->length;
    283    case GLSL_TYPE_STRUCT:
    284       size = 0;
    285       for (i = 0; i < type->length; i++) {
    286 	 size += type_size(type->fields.structure[i].type);
    287       }
    288       return size;
    289    case GLSL_TYPE_SAMPLER:
    290       /* Samplers take up no register space, since they're baked in at
    291        * link time.
    292        */
    293       return 0;
    294    default:
    295       assert(!"not reached");
    296       return 0;
    297    }
    298 }
    299 
    300 void
    301 fs_visitor::fail(const char *format, ...)
    302 {
    303    va_list va;
    304    char *msg;
    305 
    306    if (failed)
    307       return;
    308 
    309    failed = true;
    310 
    311    va_start(va, format);
    312    msg = ralloc_vasprintf(mem_ctx, format, va);
    313    va_end(va);
    314    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
    315 
    316    this->fail_msg = msg;
    317 
    318    if (INTEL_DEBUG & DEBUG_WM) {
    319       fprintf(stderr, "%s",  msg);
    320    }
    321 }
    322 
    323 fs_inst *
    324 fs_visitor::emit(enum opcode opcode)
    325 {
    326    return emit(fs_inst(opcode));
    327 }
    328 
    329 fs_inst *
    330 fs_visitor::emit(enum opcode opcode, fs_reg dst)
    331 {
    332    return emit(fs_inst(opcode, dst));
    333 }
    334 
    335 fs_inst *
    336 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
    337 {
    338    return emit(fs_inst(opcode, dst, src0));
    339 }
    340 
    341 fs_inst *
    342 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    343 {
    344    return emit(fs_inst(opcode, dst, src0, src1));
    345 }
    346 
    347 fs_inst *
    348 fs_visitor::emit(enum opcode opcode, fs_reg dst,
    349                  fs_reg src0, fs_reg src1, fs_reg src2)
    350 {
    351    return emit(fs_inst(opcode, dst, src0, src1, src2));
    352 }
    353 
    354 void
    355 fs_visitor::push_force_uncompressed()
    356 {
    357    force_uncompressed_stack++;
    358 }
    359 
    360 void
    361 fs_visitor::pop_force_uncompressed()
    362 {
    363    force_uncompressed_stack--;
    364    assert(force_uncompressed_stack >= 0);
    365 }
    366 
    367 void
    368 fs_visitor::push_force_sechalf()
    369 {
    370    force_sechalf_stack++;
    371 }
    372 
    373 void
    374 fs_visitor::pop_force_sechalf()
    375 {
    376    force_sechalf_stack--;
    377    assert(force_sechalf_stack >= 0);
    378 }
    379 
    380 /**
    381  * Returns how many MRFs an FS opcode will write over.
    382  *
    383  * Note that this is not the 0 or 1 implied writes in an actual gen
    384  * instruction -- the FS opcodes often generate MOVs in addition.
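 *
 * A worked example of the scaling below: in 16-wide dispatch,
 * c->dispatch_width / 8 == 2, so a single-operand math opcode such as
 * SHADER_OPCODE_SIN counts as 2 MRF writes and a two-operand one such as
 * SHADER_OPCODE_POW counts as 4, while the texturing opcodes always count
 * as 1 regardless of dispatch width.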
    385  */
    386 int
    387 fs_visitor::implied_mrf_writes(fs_inst *inst)
    388 {
    389    if (inst->mlen == 0)
    390       return 0;
    391 
    392    switch (inst->opcode) {
    393    case SHADER_OPCODE_RCP:
    394    case SHADER_OPCODE_RSQ:
    395    case SHADER_OPCODE_SQRT:
    396    case SHADER_OPCODE_EXP2:
    397    case SHADER_OPCODE_LOG2:
    398    case SHADER_OPCODE_SIN:
    399    case SHADER_OPCODE_COS:
    400       return 1 * c->dispatch_width / 8;
    401    case SHADER_OPCODE_POW:
    402    case SHADER_OPCODE_INT_QUOTIENT:
    403    case SHADER_OPCODE_INT_REMAINDER:
    404       return 2 * c->dispatch_width / 8;
    405    case SHADER_OPCODE_TEX:
    406    case FS_OPCODE_TXB:
    407    case SHADER_OPCODE_TXD:
    408    case SHADER_OPCODE_TXF:
    409    case SHADER_OPCODE_TXL:
    410    case SHADER_OPCODE_TXS:
    411       return 1;
    412    case FS_OPCODE_FB_WRITE:
    413       return 2;
    414    case FS_OPCODE_PULL_CONSTANT_LOAD:
    415    case FS_OPCODE_UNSPILL:
    416       return 1;
    417    case FS_OPCODE_SPILL:
    418       return 2;
    419    default:
    420       assert(!"not reached");
    421       return inst->mlen;
    422    }
    423 }
    424 
    425 int
    426 fs_visitor::virtual_grf_alloc(int size)
    427 {
    428    if (virtual_grf_array_size <= virtual_grf_count) {
    429       if (virtual_grf_array_size == 0)
    430 	 virtual_grf_array_size = 16;
    431       else
    432 	 virtual_grf_array_size *= 2;
    433       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
    434 				   virtual_grf_array_size);
    435    }
    436    virtual_grf_sizes[virtual_grf_count] = size;
    437    return virtual_grf_count++;
    438 }
    439 
    440 /** Register file/number constructor, with the default float type. */
    441 fs_reg::fs_reg(enum register_file file, int reg)
    442 {
    443    init();
    444    this->file = file;
    445    this->reg = reg;
    446    this->type = BRW_REGISTER_TYPE_F;
    447 }
    448 
    449 /** Register file/number constructor, with an explicit register type. */
    450 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
    451 {
    452    init();
    453    this->file = file;
    454    this->reg = reg;
    455    this->type = type;
    456 }
    457 
    458 /** Automatic reg constructor. */
    459 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
    460 {
    461    init();
    462 
    463    this->file = GRF;
    464    this->reg = v->virtual_grf_alloc(v->type_size(type));
    465    this->reg_offset = 0;
    466    this->type = brw_type_for_base_type(type);
    467 }
    468 
    469 fs_reg *
    470 fs_visitor::variable_storage(ir_variable *var)
    471 {
    472    return (fs_reg *)hash_table_find(this->variable_ht, var);
    473 }
    474 
    475 void
    476 import_uniforms_callback(const void *key,
    477 			 void *data,
    478 			 void *closure)
    479 {
    480    struct hash_table *dst_ht = (struct hash_table *)closure;
    481    const fs_reg *reg = (const fs_reg *)data;
    482 
    483    if (reg->file != UNIFORM)
    484       return;
    485 
    486    hash_table_insert(dst_ht, data, key);
    487 }
    488 
    489 /* For 16-wide, we need to reuse the uniform setup done for 8-wide dispatch.
    490  * This brings in those uniform definitions.
    491  */
    492 void
    493 fs_visitor::import_uniforms(fs_visitor *v)
    494 {
    495    hash_table_call_foreach(v->variable_ht,
    496 			   import_uniforms_callback,
    497 			   variable_ht);
    498    this->params_remap = v->params_remap;
    499 }
    500 
    501 /* Our support for uniforms is piggy-backed on the struct
    502  * gl_fragment_program, because that's where the values actually
    503  * get stored, rather than in some global gl_shader_program uniform
    504  * store.
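 *
 * A sketch of the traversal below: a mat2 uniform at location loc is
 * handled as two vec2 columns.  The first recursive call appends params
 * (index loc, offsets 0 and 1) and returns one slot, the second does the
 * same at loc + 1, and the matrix as a whole returns 2.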
    505  */
    506 int
    507 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
    508 {
    509    unsigned int offset = 0;
    510 
    511    if (type->is_matrix()) {
    512       const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
    513 							type->vector_elements,
    514 							1);
    515 
    516       for (unsigned int i = 0; i < type->matrix_columns; i++) {
    517 	 offset += setup_uniform_values(loc + offset, column);
    518       }
    519 
    520       return offset;
    521    }
    522 
    523    switch (type->base_type) {
    524    case GLSL_TYPE_FLOAT:
    525    case GLSL_TYPE_UINT:
    526    case GLSL_TYPE_INT:
    527    case GLSL_TYPE_BOOL:
    528       for (unsigned int i = 0; i < type->vector_elements; i++) {
    529 	 unsigned int param = c->prog_data.nr_params++;
    530 
    531 	 assert(param < ARRAY_SIZE(c->prog_data.param));
    532 
    533 	 this->param_index[param] = loc;
    534 	 this->param_offset[param] = i;
    535       }
    536       return 1;
    537 
    538    case GLSL_TYPE_STRUCT:
    539       for (unsigned int i = 0; i < type->length; i++) {
    540 	 offset += setup_uniform_values(loc + offset,
    541 					type->fields.structure[i].type);
    542       }
    543       return offset;
    544 
    545    case GLSL_TYPE_ARRAY:
    546       for (unsigned int i = 0; i < type->length; i++) {
    547 	 offset += setup_uniform_values(loc + offset, type->fields.array);
    548       }
    549       return offset;
    550 
    551    case GLSL_TYPE_SAMPLER:
    552       /* The sampler takes up a slot, but we don't use any values from it. */
    553       return 1;
    554 
    555    default:
    556       assert(!"not reached");
    557       return 0;
    558    }
    559 }
    560 
    561 
    562 /* Our support for builtin uniforms is even scarier than non-builtin.
    563  * It sits on top of the PROG_STATE_VAR parameters that are
    564  * automatically updated from GL context state.
    565  */
    566 void
    567 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
    568 {
    569    const ir_state_slot *const slots = ir->state_slots;
    570    assert(ir->state_slots != NULL);
    571 
    572    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
    573       /* This state reference has already been setup by ir_to_mesa, but we'll
    574        * get the same index back here.
    575        */
    576       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
    577 					    (gl_state_index *)slots[i].tokens);
    578 
    579       /* Add each of the unique swizzles of the element as a parameter.
    580        * This'll end up matching the expected layout of the
    581        * array/matrix/structure we're trying to fill in.
    582        */
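      /* For illustration: a state slot swizzled .xyzw adds four params
       * (offsets 0..3), while a scalar slot swizzled .xxxx adds just one,
       * because the loop below stops at the first repeated component.
       */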
    583       int last_swiz = -1;
    584       for (unsigned int j = 0; j < 4; j++) {
    585 	 int swiz = GET_SWZ(slots[i].swizzle, j);
    586 	 if (swiz == last_swiz)
    587 	    break;
    588 	 last_swiz = swiz;
    589 
    590 	 this->param_index[c->prog_data.nr_params] = index;
    591 	 this->param_offset[c->prog_data.nr_params] = swiz;
    592 	 c->prog_data.nr_params++;
    593       }
    594    }
    595 }
    596 
    597 fs_reg *
    598 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
    599 {
    600    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
    601    fs_reg wpos = *reg;
    602    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
    603 
    604    /* gl_FragCoord.x */
    605    if (ir->pixel_center_integer) {
    606       emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
    607    } else {
    608       emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
    609    }
    610    wpos.reg_offset++;
    611 
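   /* A sketch of the flip handling below, assuming gl_FragCoord's defaults
    * (lower-left origin, half-integer pixel centers): when rendering to a
    * window system framebuffer, render_to_fbo is false, so flip is true,
    * pixel_y is negated, and the ADD computes
    * (drawable_height - 1.0 + 0.5) - pixel_y, turning the hardware's
    * top-to-bottom Y into GL's bottom-up gl_FragCoord.y.
    */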
    612    /* gl_FragCoord.y */
    613    if (!flip && ir->pixel_center_integer) {
    614       emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
    615    } else {
    616       fs_reg pixel_y = this->pixel_y;
    617       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
    618 
    619       if (flip) {
    620 	 pixel_y.negate = true;
    621 	 offset += c->key.drawable_height - 1.0;
    622       }
    623 
    624       emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
    625    }
    626    wpos.reg_offset++;
    627 
    628    /* gl_FragCoord.z */
    629    if (intel->gen >= 6) {
    630       emit(BRW_OPCODE_MOV, wpos,
    631 	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
    632    } else {
    633       emit(FS_OPCODE_LINTERP, wpos,
    634            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
    635            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
    636            interp_reg(FRAG_ATTRIB_WPOS, 2));
    637    }
    638    wpos.reg_offset++;
    639 
    640    /* gl_FragCoord.w: Already set up in emit_interpolation */
    641    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
    642 
    643    return reg;
    644 }
    645 
    646 fs_inst *
    647 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
    648                          glsl_interp_qualifier interpolation_mode,
    649                          bool is_centroid)
    650 {
    651    brw_wm_barycentric_interp_mode barycoord_mode;
    652    if (is_centroid) {
    653       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
    654          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
    655       else
    656          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
    657    } else {
    658       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
    659          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
    660       else
    661          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
    662    }
    663    return emit(FS_OPCODE_LINTERP, attr,
    664                this->delta_x[barycoord_mode],
    665                this->delta_y[barycoord_mode], interp);
    666 }
    667 
    668 fs_reg *
    669 fs_visitor::emit_general_interpolation(ir_variable *ir)
    670 {
    671    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
    672    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
    673    fs_reg attr = *reg;
    674 
    675    unsigned int array_elements;
    676    const glsl_type *type;
    677 
    678    if (ir->type->is_array()) {
    679       array_elements = ir->type->length;
    680       if (array_elements == 0) {
    681 	 fail("dereferenced array '%s' has length 0\n", ir->name);
    682       }
    683       type = ir->type->fields.array;
    684    } else {
    685       array_elements = 1;
    686       type = ir->type;
    687    }
    688 
    689    glsl_interp_qualifier interpolation_mode =
    690       ir->determine_interpolation_mode(c->key.flat_shade);
    691 
    692    int location = ir->location;
    693    for (unsigned int i = 0; i < array_elements; i++) {
    694       for (unsigned int j = 0; j < type->matrix_columns; j++) {
    695 	 if (urb_setup[location] == -1) {
    696 	    /* If there's no incoming setup data for this slot, don't
    697 	     * emit interpolation for it.
    698 	     */
    699 	    attr.reg_offset += type->vector_elements;
    700 	    location++;
    701 	    continue;
    702 	 }
    703 
    704 	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
    705 	    /* Constant interpolation (flat shading) case. The SF has
    706 	     * handed us defined values in only the constant offset
    707 	     * field of the setup reg.
    708 	     */
    709 	    for (unsigned int k = 0; k < type->vector_elements; k++) {
    710 	       struct brw_reg interp = interp_reg(location, k);
    711 	       interp = suboffset(interp, 3);
    712                interp.type = reg->type;
    713 	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
    714 	       attr.reg_offset++;
    715 	    }
    716 	 } else {
    717 	    /* Smooth/noperspective interpolation case. */
    718 	    for (unsigned int k = 0; k < type->vector_elements; k++) {
    719 	       /* FINISHME: At some point we probably want to push
    720 		* this farther by giving similar treatment to the
    721 		* other potentially constant components of the
    722 		* attribute, as well as making brw_vs_constval.c
    723 		* handle varyings other than gl_TexCoord.
    724 		*/
    725 	       if (location >= FRAG_ATTRIB_TEX0 &&
    726 		   location <= FRAG_ATTRIB_TEX7 &&
    727 		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
    728 		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
    729 	       } else {
    730 		  struct brw_reg interp = interp_reg(location, k);
    731                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
    732                                ir->centroid);
    733                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
    734                      /* Get the pixel/sample mask into f0 so that we know
    735                       * which pixels are lit.  Then, for each channel that is
    736                       * unlit, replace the centroid data with non-centroid
    737                       * data.
    738                       */
    739                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
    740                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
    741                                                   interpolation_mode, false);
    742                      inst->predicated = true;
    743                      inst->predicate_inverse = true;
    744                   }
    745 		  if (intel->gen < 6) {
    746 		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
    747 		  }
    748 	       }
    749 	       attr.reg_offset++;
    750 	    }
    751 
    752 	 }
    753 	 location++;
    754       }
    755    }
    756 
    757    return reg;
    758 }
    759 
    760 fs_reg *
    761 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
    762 {
    763    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
    764 
    765    /* The frontfacing comes in as a bit in the thread payload. */
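   /* The gen6+ sequence below computes the complement of bit 15 of g0.0:
    * the arithmetic shift right by 15 brings that bit down to bit 0, NOT
    * inverts it, and AND with 1 masks off the rest, leaving 1 for
    * front-facing and 0 for back-facing pixels (so the payload bit itself
    * is set for back-facing primitives).
    */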
    766    if (intel->gen >= 6) {
    767       emit(BRW_OPCODE_ASR, *reg,
    768 	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
    769 	   fs_reg(15));
    770       emit(BRW_OPCODE_NOT, *reg, *reg);
    771       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
    772    } else {
    773       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    774       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
    775        * us front face
    776        */
    777       fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
    778 			   fs_reg(r1_6ud),
    779 			   fs_reg(1u << 31));
    780       inst->conditional_mod = BRW_CONDITIONAL_L;
    781       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
    782    }
    783 
    784    return reg;
    785 }
    786 
    787 fs_inst *
    788 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
    789 {
    790    switch (opcode) {
    791    case SHADER_OPCODE_RCP:
    792    case SHADER_OPCODE_RSQ:
    793    case SHADER_OPCODE_SQRT:
    794    case SHADER_OPCODE_EXP2:
    795    case SHADER_OPCODE_LOG2:
    796    case SHADER_OPCODE_SIN:
    797    case SHADER_OPCODE_COS:
    798       break;
    799    default:
    800       assert(!"not reached: bad math opcode");
    801       return NULL;
    802    }
    803 
    804    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    805     * might be able to do better by doing execsize = 1 math and then
    806     * expanding that result out, but we would need to be careful with
    807     * masking.
    808     *
    809     * Gen 6 hardware ignores source modifiers (negate and abs) on math
    810     * instructions, so we also move to a temp to set those up.
    811     */
    812    if (intel->gen == 6 && (src.file == UNIFORM ||
    813 			   src.abs ||
    814 			   src.negate)) {
    815       fs_reg expanded = fs_reg(this, glsl_type::float_type);
    816       emit(BRW_OPCODE_MOV, expanded, src);
    817       src = expanded;
    818    }
    819 
    820    fs_inst *inst = emit(opcode, dst, src);
    821 
    822    if (intel->gen < 6) {
    823       inst->base_mrf = 2;
    824       inst->mlen = c->dispatch_width / 8;
    825    }
    826 
    827    return inst;
    828 }
    829 
    830 fs_inst *
    831 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    832 {
    833    int base_mrf = 2;
    834    fs_inst *inst;
    835 
    836    switch (opcode) {
    837    case SHADER_OPCODE_POW:
    838    case SHADER_OPCODE_INT_QUOTIENT:
    839    case SHADER_OPCODE_INT_REMAINDER:
    840       break;
    841    default:
    842       assert(!"not reached: unsupported binary math opcode.");
    843       return NULL;
    844    }
    845 
    846    if (intel->gen >= 7) {
    847       inst = emit(opcode, dst, src0, src1);
    848    } else if (intel->gen == 6) {
    849       /* Can't do hstride == 0 args to gen6 math, so expand it out.
    850        *
    851        * The hardware ignores source modifiers (negate and abs) on math
    852        * instructions, so we also move to a temp to set those up.
    853        */
    854       if (src0.file == UNIFORM || src0.abs || src0.negate) {
    855 	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
    856 	 expanded.type = src0.type;
    857 	 emit(BRW_OPCODE_MOV, expanded, src0);
    858 	 src0 = expanded;
    859       }
    860 
    861       if (src1.file == UNIFORM || src1.abs || src1.negate) {
    862 	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
    863 	 expanded.type = src1.type;
    864 	 emit(BRW_OPCODE_MOV, expanded, src1);
    865 	 src1 = expanded;
    866       }
    867 
    868       inst = emit(opcode, dst, src0, src1);
    869    } else {
    870       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    871        * "Message Payload":
    872        *
    873        * "Operand0[7].  For the INT DIV functions, this operand is the
    874        *  denominator."
    875        *  ...
    876        * "Operand1[7].  For the INT DIV functions, this operand is the
    877        *  numerator."
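       *
       * In other words, for gen4/5 integer division the numerator goes in
       * the second message register (the MOV to base_mrf + 1 below) and the
       * denominator is the instruction's direct source, which is why src0
       * and src1 are swapped relative to POW.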
    878        */
    879       bool is_int_div = opcode != SHADER_OPCODE_POW;
    880       fs_reg &op0 = is_int_div ? src1 : src0;
    881       fs_reg &op1 = is_int_div ? src0 : src1;
    882 
    883       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
    884       inst = emit(opcode, dst, op0, reg_null_f);
    885 
    886       inst->base_mrf = base_mrf;
    887       inst->mlen = 2 * c->dispatch_width / 8;
    888    }
    889    return inst;
    890 }
    891 
    892 /**
    893  * To be called after the last _mesa_add_state_reference() call, to
    894  * set up prog_data.param[] for assign_curb_setup() and
    895  * setup_pull_constants().
    896  */
    897 void
    898 fs_visitor::setup_paramvalues_refs()
    899 {
    900    if (c->dispatch_width != 8)
    901       return;
    902 
    903    /* Set up the pointers to ParamValues now that that array is finalized. */
    904    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
    905       c->prog_data.param[i] =
    906 	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
    907 	 this->param_offset[i];
    908    }
    909 }
    910 
    911 void
    912 fs_visitor::assign_curb_setup()
    913 {
    914    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
    915    if (c->dispatch_width == 8) {
    916       c->prog_data.first_curbe_grf = c->nr_payload_regs;
    917    } else {
    918       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
    919    }
    920 
    921    /* Map the offsets in the UNIFORM file to fixed HW regs. */
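   /* A worked example of the mapping (the payload size is illustrative):
    * with c->nr_payload_regs == 2, a UNIFORM source whose constant_nr works
    * out to 10 becomes the scalar region g3.2 -- 10 / 8 selects the second
    * constant register after the payload and 10 % 8 selects subregister 2.
    */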
    922    foreach_list(node, &this->instructions) {
    923       fs_inst *inst = (fs_inst *)node;
    924 
    925       for (unsigned int i = 0; i < 3; i++) {
    926 	 if (inst->src[i].file == UNIFORM) {
    927 	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
    928 	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
    929 						  constant_nr / 8,
    930 						  constant_nr % 8);
    931 
    932 	    inst->src[i].file = FIXED_HW_REG;
    933 	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
    934 	 }
    935       }
    936    }
    937 }
    938 
    939 void
    940 fs_visitor::calculate_urb_setup()
    941 {
    942    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
    943       urb_setup[i] = -1;
    944    }
    945 
    946    int urb_next = 0;
    947    /* Figure out where each of the incoming setup attributes lands. */
    948    if (intel->gen >= 6) {
    949       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
    950 	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
    951 	    urb_setup[i] = urb_next++;
    952 	 }
    953       }
    954    } else {
    955       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
    956       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
    957          /* Point size is packed into the header, not as a general attribute */
    958          if (i == VERT_RESULT_PSIZ)
    959             continue;
    960 
    961 	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
    962 	    int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
    963 
    964 	    /* The back color slot is skipped when the front color is
    965 	     * also written to.  In addition, some slots can be
    966 	     * written in the vertex shader and not read in the
    967 	     * fragment shader.  So the register number must always be
    968 	     * incremented, mapped or not.
    969 	     */
    970 	    if (fp_index >= 0)
    971 	       urb_setup[fp_index] = urb_next;
    972             urb_next++;
    973 	 }
    974       }
    975 
    976       /*
    977        * It's an FS-only attribute, and we did interpolation for this attribute
    978        * in the SF thread. So count it here, too.
    979        *
    980        * See compile_sf_prog() for more info.
    981        */
    982       if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
    983          urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
    984    }
    985 
    986    /* Each attribute is 4 setup channels, each of which is half a reg. */
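   /* For example, a gen6+ shader reading only gl_Color and gl_TexCoord[0]
    * gets urb_setup[FRAG_ATTRIB_COL0] = 0 and urb_setup[FRAG_ATTRIB_TEX0] = 1
    * above, so the read length below comes out as 2 * 2 = 4 registers.
    */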
    987    c->prog_data.urb_read_length = urb_next * 2;
    988 }
    989 
    990 void
    991 fs_visitor::assign_urb_setup()
    992 {
    993    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
    994 
    995    /* Offset all the urb_setup[] index by the actual position of the
    996     * setup regs, now that the location of the constants has been chosen.
    997     */
    998    foreach_list(node, &this->instructions) {
    999       fs_inst *inst = (fs_inst *)node;
   1000 
   1001       if (inst->opcode == FS_OPCODE_LINTERP) {
   1002 	 assert(inst->src[2].file == FIXED_HW_REG);
   1003 	 inst->src[2].fixed_hw_reg.nr += urb_start;
   1004       }
   1005 
   1006       if (inst->opcode == FS_OPCODE_CINTERP) {
   1007 	 assert(inst->src[0].file == FIXED_HW_REG);
   1008 	 inst->src[0].fixed_hw_reg.nr += urb_start;
   1009       }
   1010    }
   1011 
   1012    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
   1013 }
   1014 
   1015 /**
   1016  * Split large virtual GRFs into separate components if we can.
   1017  *
    1018  * This mostly duplicates what brw_fs_vector_splitting does,
   1019  * but that's really conservative because it's afraid of doing
   1020  * splitting that doesn't result in real progress after the rest of
   1021  * the optimization phases, which would cause infinite looping in
   1022  * optimization.  We can do it once here, safely.  This also has the
   1023  * opportunity to split interpolated values, or maybe even uniforms,
   1024  * which we don't have at the IR level.
   1025  *
   1026  * We want to split, because virtual GRFs are what we register
   1027  * allocate and spill (due to contiguousness requirements for some
   1028  * instructions), and they're what we naturally generate in the
   1029  * codegen process, but most virtual GRFs don't actually need to be
   1030  * contiguous sets of GRFs.  If we split, we'll end up with reduced
   1031  * live intervals and better dead code elimination and coalescing.
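 *
 * A sketch of the effect (register numbers are made up): a size-4 virtual
 * GRF vgrf3 that is never written by a multi-register instruction keeps
 * reg_offset 0 under its old number (with its size trimmed to 1), gets
 * three freshly allocated size-1 GRFs for offsets 1 through 3, and every
 * reference at reg_offset N > 0 is rewritten to the (N - 1)'th new register
 * at reg_offset 0.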
   1032  */
   1033 void
   1034 fs_visitor::split_virtual_grfs()
   1035 {
   1036    int num_vars = this->virtual_grf_count;
   1037    bool split_grf[num_vars];
   1038    int new_virtual_grf[num_vars];
   1039 
   1040    /* Try to split anything > 0 sized. */
   1041    for (int i = 0; i < num_vars; i++) {
   1042       if (this->virtual_grf_sizes[i] != 1)
   1043 	 split_grf[i] = true;
   1044       else
   1045 	 split_grf[i] = false;
   1046    }
   1047 
   1048    if (brw->has_pln &&
   1049        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
   1050       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
   1051        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
   1052        * Gen6, that was the only supported interpolation mode, and since Gen6,
   1053        * delta_x and delta_y are in fixed hardware registers.
   1054        */
   1055       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
   1056          false;
   1057    }
   1058 
   1059    foreach_list(node, &this->instructions) {
   1060       fs_inst *inst = (fs_inst *)node;
   1061 
   1062       /* If there's a SEND message that requires contiguous destination
   1063        * registers, no splitting is allowed.
   1064        */
   1065       if (inst->regs_written() > 1) {
   1066 	 split_grf[inst->dst.reg] = false;
   1067       }
   1068    }
   1069 
   1070    /* Allocate new space for split regs.  Note that the virtual
   1071     * numbers will be contiguous.
   1072     */
   1073    for (int i = 0; i < num_vars; i++) {
   1074       if (split_grf[i]) {
   1075 	 new_virtual_grf[i] = virtual_grf_alloc(1);
   1076 	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
   1077 	    int reg = virtual_grf_alloc(1);
   1078 	    assert(reg == new_virtual_grf[i] + j - 1);
   1079 	    (void) reg;
   1080 	 }
   1081 	 this->virtual_grf_sizes[i] = 1;
   1082       }
   1083    }
   1084 
   1085    foreach_list(node, &this->instructions) {
   1086       fs_inst *inst = (fs_inst *)node;
   1087 
   1088       if (inst->dst.file == GRF &&
   1089 	  split_grf[inst->dst.reg] &&
   1090 	  inst->dst.reg_offset != 0) {
   1091 	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
   1092 			  inst->dst.reg_offset - 1);
   1093 	 inst->dst.reg_offset = 0;
   1094       }
   1095       for (int i = 0; i < 3; i++) {
   1096 	 if (inst->src[i].file == GRF &&
   1097 	     split_grf[inst->src[i].reg] &&
   1098 	     inst->src[i].reg_offset != 0) {
   1099 	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
   1100 				inst->src[i].reg_offset - 1);
   1101 	    inst->src[i].reg_offset = 0;
   1102 	 }
   1103       }
   1104    }
   1105    this->live_intervals_valid = false;
   1106 }
   1107 
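/**
 * Removes push constant params that no instruction references any more and
 * renumbers the survivors.
 *
 * Illustration (a sketch): if only params 0, 3 and 7 are still used,
 * params_remap maps them to 0, 1 and 2, prog_data.param[] is compacted to
 * those three entries, and every UNIFORM source in the shader is renumbered
 * to match.  The 16-wide compile reuses the remap table built by the 8-wide
 * pass.
 */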
   1108 bool
   1109 fs_visitor::remove_dead_constants()
   1110 {
   1111    if (c->dispatch_width == 8) {
   1112       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
   1113 
   1114       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
   1115 	 this->params_remap[i] = -1;
   1116 
   1117       /* Find which params are still in use. */
   1118       foreach_list(node, &this->instructions) {
   1119 	 fs_inst *inst = (fs_inst *)node;
   1120 
   1121 	 for (int i = 0; i < 3; i++) {
   1122 	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
   1123 
   1124 	    if (inst->src[i].file != UNIFORM)
   1125 	       continue;
   1126 
   1127 	    assert(constant_nr < (int)c->prog_data.nr_params);
   1128 
   1129 	    /* For now, set this to non-negative.  We'll give it the
   1130 	     * actual new number in a moment, in order to keep the
   1131 	     * register numbers nicely ordered.
   1132 	     */
   1133 	    this->params_remap[constant_nr] = 0;
   1134 	 }
   1135       }
   1136 
   1137       /* Figure out what the new numbers for the params will be.  At some
   1138        * point when we're doing uniform array access, we're going to want
   1139        * to keep the distinction between .reg and .reg_offset, but for
   1140        * now we don't care.
   1141        */
   1142       unsigned int new_nr_params = 0;
   1143       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
   1144 	 if (this->params_remap[i] != -1) {
   1145 	    this->params_remap[i] = new_nr_params++;
   1146 	 }
   1147       }
   1148 
   1149       /* Update the list of params to be uploaded to match our new numbering. */
   1150       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
   1151 	 int remapped = this->params_remap[i];
   1152 
   1153 	 if (remapped == -1)
   1154 	    continue;
   1155 
   1156 	 /* We've already done setup_paramvalues_refs() so no need to worry
   1157 	  * about param_index and param_offset.
   1158 	  */
   1159 	 c->prog_data.param[remapped] = c->prog_data.param[i];
   1160       }
   1161 
   1162       c->prog_data.nr_params = new_nr_params;
   1163    } else {
   1164       /* This should have been generated in the 8-wide pass already. */
   1165       assert(this->params_remap);
   1166    }
   1167 
   1168    /* Now do the renumbering of the shader to remove unused params. */
   1169    foreach_list(node, &this->instructions) {
   1170       fs_inst *inst = (fs_inst *)node;
   1171 
   1172       for (int i = 0; i < 3; i++) {
   1173 	 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
   1174 
   1175 	 if (inst->src[i].file != UNIFORM)
   1176 	    continue;
   1177 
   1178 	 assert(this->params_remap[constant_nr] != -1);
   1179 	 inst->src[i].reg = this->params_remap[constant_nr];
   1180 	 inst->src[i].reg_offset = 0;
   1181       }
   1182    }
   1183 
   1184    return true;
   1185 }
   1186 
   1187 /**
   1188  * Choose accesses from the UNIFORM file to demote to using the pull
   1189  * constant buffer.
   1190  *
   1191  * We allow a fragment shader to have more than the specified minimum
   1192  * maximum number of fragment shader uniform components (64).  If
   1193  * there are too many of these, they'd fill up all of register space.
   1194  * So, this will push some of them out to the pull constant buffer and
   1195  * update the program to load them.
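 *
 * A worked example with made-up numbers: with 200 float params and the
 * 128-component push limit below, params 128..199 are demoted.  A use of
 * uniform 130 then becomes a PULL_CONSTANT_LOAD from byte offset
 * ((130 - 128) * 4) & ~15 == 0 with smear == (130 - 128) & 3 == 2, i.e. the
 * third component of the fetched block.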
   1196  */
   1197 void
   1198 fs_visitor::setup_pull_constants()
   1199 {
   1200    /* Only allow 16 registers (128 uniform components) as push constants. */
   1201    unsigned int max_uniform_components = 16 * 8;
   1202    if (c->prog_data.nr_params <= max_uniform_components)
   1203       return;
   1204 
   1205    if (c->dispatch_width == 16) {
   1206       fail("Pull constants not supported in 16-wide\n");
   1207       return;
   1208    }
   1209 
   1210    /* Just demote the end of the list.  We could probably do better
   1211     * here, demoting things that are rarely used in the program first.
   1212     */
   1213    int pull_uniform_base = max_uniform_components;
   1214    int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
   1215 
   1216    foreach_list(node, &this->instructions) {
   1217       fs_inst *inst = (fs_inst *)node;
   1218 
   1219       for (int i = 0; i < 3; i++) {
   1220 	 if (inst->src[i].file != UNIFORM)
   1221 	    continue;
   1222 
   1223 	 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
   1224 	 if (uniform_nr < pull_uniform_base)
   1225 	    continue;
   1226 
   1227 	 fs_reg dst = fs_reg(this, glsl_type::float_type);
   1228 	 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
   1229 	 fs_reg offset = fs_reg((unsigned)(((uniform_nr -
   1230 					     pull_uniform_base) * 4) & ~15));
   1231 	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
   1232 					      dst, index, offset);
   1233 	 pull->ir = inst->ir;
   1234 	 pull->annotation = inst->annotation;
   1235 	 pull->base_mrf = 14;
   1236 	 pull->mlen = 1;
   1237 
   1238 	 inst->insert_before(pull);
   1239 
   1240 	 inst->src[i].file = GRF;
   1241 	 inst->src[i].reg = dst.reg;
   1242 	 inst->src[i].reg_offset = 0;
   1243 	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
   1244       }
   1245    }
   1246 
   1247    for (int i = 0; i < pull_uniform_count; i++) {
   1248       c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   1249    }
   1250    c->prog_data.nr_params -= pull_uniform_count;
   1251    c->prog_data.nr_pull_params = pull_uniform_count;
   1252 }
   1253 
   1254 /**
   1255  * Attempts to move immediate constants into the immediate
   1256  * constant slot of following instructions.
   1257  *
    1258  * Immediate constants are a bit tricky -- they have to be in the last
    1259  * operand slot, you can't do abs/negate on them, and so on.
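 *
 * A sketch of the transform (register numbers are made up): given
 * "mov vgrf6, 2.0f" followed by "add vgrf7, vgrf6, vgrf4", the immediate
 * can't live in src0 of the ADD, so the pass commutes the operands and
 * rewrites it to "add vgrf7, vgrf4, 2.0f", after which the MOV may become
 * dead.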
   1260  */
   1261 
   1262 bool
   1263 fs_visitor::propagate_constants()
   1264 {
   1265    bool progress = false;
   1266 
   1267    calculate_live_intervals();
   1268 
   1269    foreach_list(node, &this->instructions) {
   1270       fs_inst *inst = (fs_inst *)node;
   1271 
   1272       if (inst->opcode != BRW_OPCODE_MOV ||
   1273 	  inst->predicated ||
   1274 	  inst->dst.file != GRF || inst->src[0].file != IMM ||
   1275 	  inst->dst.type != inst->src[0].type ||
   1276 	  (c->dispatch_width == 16 &&
   1277 	   (inst->force_uncompressed || inst->force_sechalf)))
   1278 	 continue;
   1279 
   1280       /* Don't bother with cases where we should have had the
   1281        * operation on the constant folded in GLSL already.
   1282        */
   1283       if (inst->saturate)
   1284 	 continue;
   1285 
   1286       /* Found a move of a constant to a GRF.  Find anything else using the GRF
   1287        * before it's written, and replace it with the constant if we can.
   1288        */
   1289       for (fs_inst *scan_inst = (fs_inst *)inst->next;
   1290 	   !scan_inst->is_tail_sentinel();
   1291 	   scan_inst = (fs_inst *)scan_inst->next) {
   1292 	 if (scan_inst->opcode == BRW_OPCODE_DO ||
   1293 	     scan_inst->opcode == BRW_OPCODE_WHILE ||
   1294 	     scan_inst->opcode == BRW_OPCODE_ELSE ||
   1295 	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
   1296 	    break;
   1297 	 }
   1298 
   1299 	 for (int i = 2; i >= 0; i--) {
   1300 	    if (scan_inst->src[i].file != GRF ||
   1301 		scan_inst->src[i].reg != inst->dst.reg ||
   1302 		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
   1303 	       continue;
   1304 
   1305 	    /* Don't bother with cases where we should have had the
   1306 	     * operation on the constant folded in GLSL already.
   1307 	     */
   1308 	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
   1309 	       continue;
   1310 
   1311 	    switch (scan_inst->opcode) {
   1312 	    case BRW_OPCODE_MOV:
   1313 	       scan_inst->src[i] = inst->src[0];
   1314 	       progress = true;
   1315 	       break;
   1316 
   1317 	    case BRW_OPCODE_MUL:
   1318 	    case BRW_OPCODE_ADD:
   1319 	       if (i == 1) {
   1320 		  scan_inst->src[i] = inst->src[0];
   1321 		  progress = true;
   1322 	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
   1323 		  /* Fit this constant in by commuting the operands.
   1324 		   * Exception: we can't do this for 32-bit integer MUL
   1325 		   * because it's asymmetric.
   1326 		   */
   1327 		  if (scan_inst->opcode == BRW_OPCODE_MUL &&
   1328 		      (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
   1329 		       scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
   1330 		     break;
   1331 		  scan_inst->src[0] = scan_inst->src[1];
   1332 		  scan_inst->src[1] = inst->src[0];
   1333 		  progress = true;
   1334 	       }
   1335 	       break;
   1336 
   1337 	    case BRW_OPCODE_CMP:
   1338 	    case BRW_OPCODE_IF:
   1339 	       if (i == 1) {
   1340 		  scan_inst->src[i] = inst->src[0];
   1341 		  progress = true;
   1342 	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
   1343 		  uint32_t new_cmod;
   1344 
   1345 		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
   1346 		  if (new_cmod != ~0u) {
   1347 		     /* Fit this constant in by swapping the operands and
   1348 		      * flipping the test
   1349 		      */
   1350 		     scan_inst->src[0] = scan_inst->src[1];
   1351 		     scan_inst->src[1] = inst->src[0];
   1352 		     scan_inst->conditional_mod = new_cmod;
   1353 		     progress = true;
   1354 		  }
   1355 	       }
   1356 	       break;
   1357 
   1358 	    case BRW_OPCODE_SEL:
   1359 	       if (i == 1) {
   1360 		  scan_inst->src[i] = inst->src[0];
   1361 		  progress = true;
   1362 	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
   1363 		  scan_inst->src[0] = scan_inst->src[1];
   1364 		  scan_inst->src[1] = inst->src[0];
   1365 
   1366 		  /* If this was predicated, flipping operands means
   1367 		   * we also need to flip the predicate.
   1368 		   */
   1369 		  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
   1370 		     scan_inst->predicate_inverse =
   1371 			!scan_inst->predicate_inverse;
   1372 		  }
   1373 		  progress = true;
   1374 	       }
   1375 	       break;
   1376 
   1377 	    case SHADER_OPCODE_RCP:
   1378 	       /* The hardware doesn't do math on immediate values
   1379 		* (because why are you doing that, seriously?), but
   1380 		* the correct answer is to just constant fold it
   1381 		* anyway.
   1382 		*/
   1383 	       assert(i == 0);
   1384 	       if (inst->src[0].imm.f != 0.0f) {
   1385 		  scan_inst->opcode = BRW_OPCODE_MOV;
   1386 		  scan_inst->src[0] = inst->src[0];
   1387 		  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
   1388 		  progress = true;
   1389 	       }
   1390 	       break;
   1391 
   1392             case FS_OPCODE_PULL_CONSTANT_LOAD:
   1393 	       scan_inst->src[i] = inst->src[0];
   1394 	       progress = true;
   1395 	       break;
   1396 
   1397 	    default:
   1398 	       break;
   1399 	    }
   1400 	 }
   1401 
   1402 	 if (scan_inst->dst.file == GRF &&
   1403              scan_inst->overwrites_reg(inst->dst)) {
   1404 	    break;
   1405 	 }
   1406       }
   1407    }
   1408 
   1409    if (progress)
   1410        this->live_intervals_valid = false;
   1411 
   1412    return progress;
   1413 }
   1414 
   1415 
   1416 /**
    1417  * Applies simple algebraic simplifications to the instruction list.
    1418  *
    1419  * Currently this only handles multiplication by an immediate 1.0,
    1420  * rewriting it into a plain MOV of the other operand so that later
    1421  * passes can copy-propagate or coalesce it away.
   1422  */
   1423 
   1424 bool
   1425 fs_visitor::opt_algebraic()
   1426 {
   1427    bool progress = false;
   1428 
   1429    calculate_live_intervals();
   1430 
   1431    foreach_list(node, &this->instructions) {
   1432       fs_inst *inst = (fs_inst *)node;
   1433 
   1434       switch (inst->opcode) {
   1435       case BRW_OPCODE_MUL:
   1436 	 if (inst->src[1].file != IMM)
   1437 	    continue;
   1438 
   1439 	 /* a * 1.0 = a */
   1440 	 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
   1441 	     inst->src[1].imm.f == 1.0) {
   1442 	    inst->opcode = BRW_OPCODE_MOV;
   1443 	    inst->src[1] = reg_undef;
   1444 	    progress = true;
   1445 	    break;
   1446 	 }
   1447 
   1448 	 break;
   1449       default:
   1450 	 break;
   1451       }
   1452    }
   1453 
   1454    return progress;
   1455 }
   1456 
   1457 /**
    1458  * Must be called after calculate_live_intervals() to remove unused
    1459  * writes to registers -- register allocation will fail otherwise
    1460  * because something defined but never used won't be considered to
    1461  * interfere with other regs.
   1462  */
   1463 bool
   1464 fs_visitor::dead_code_eliminate()
   1465 {
   1466    bool progress = false;
   1467    int pc = 0;
   1468 
   1469    calculate_live_intervals();
   1470 
   1471    foreach_list_safe(node, &this->instructions) {
   1472       fs_inst *inst = (fs_inst *)node;
   1473 
   1474       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
   1475 	 inst->remove();
   1476 	 progress = true;
   1477       }
   1478 
   1479       pc++;
   1480    }
   1481 
   1482    if (progress)
   1483       live_intervals_valid = false;
   1484 
   1485    return progress;
   1486 }
   1487 
   1488 /**
    1489  * Implements a second type of register coalescing: this one checks if
    1490  * the two regs involved in a raw move don't interfere, in which case
    1491  * they can both be stored in the same place and the MOV removed.
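 *
 * For instance (a sketch): given "mov vgrf8+1, vgrf5" where vgrf5 is a
 * size-1 GRF with no source modifiers and its live range does not overlap
 * vgrf8's, every definition and use of vgrf5 in the shader is renamed to
 * vgrf8 at reg_offset 1 and the MOV itself is deleted.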
   1492  */
   1493 bool
   1494 fs_visitor::register_coalesce_2()
   1495 {
   1496    bool progress = false;
   1497 
   1498    calculate_live_intervals();
   1499 
   1500    foreach_list_safe(node, &this->instructions) {
   1501       fs_inst *inst = (fs_inst *)node;
   1502 
   1503       if (inst->opcode != BRW_OPCODE_MOV ||
   1504 	  inst->predicated ||
   1505 	  inst->saturate ||
   1506 	  inst->src[0].file != GRF ||
   1507 	  inst->src[0].negate ||
   1508 	  inst->src[0].abs ||
   1509 	  inst->src[0].smear != -1 ||
   1510 	  inst->dst.file != GRF ||
   1511 	  inst->dst.type != inst->src[0].type ||
   1512 	  virtual_grf_sizes[inst->src[0].reg] != 1 ||
   1513 	  virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
   1514 	 continue;
   1515       }
   1516 
   1517       int reg_from = inst->src[0].reg;
   1518       assert(inst->src[0].reg_offset == 0);
   1519       int reg_to = inst->dst.reg;
   1520       int reg_to_offset = inst->dst.reg_offset;
   1521 
   1522       foreach_list_safe(node, &this->instructions) {
   1523 	 fs_inst *scan_inst = (fs_inst *)node;
   1524 
   1525 	 if (scan_inst->dst.file == GRF &&
   1526 	     scan_inst->dst.reg == reg_from) {
   1527 	    scan_inst->dst.reg = reg_to;
   1528 	    scan_inst->dst.reg_offset = reg_to_offset;
   1529 	 }
   1530 	 for (int i = 0; i < 3; i++) {
   1531 	    if (scan_inst->src[i].file == GRF &&
   1532 		scan_inst->src[i].reg == reg_from) {
   1533 	       scan_inst->src[i].reg = reg_to;
   1534 	       scan_inst->src[i].reg_offset = reg_to_offset;
   1535 	    }
   1536 	 }
   1537       }
   1538 
   1539       inst->remove();
   1540       live_intervals_valid = false;
   1541       progress = true;
   1542       continue;
   1543    }
   1544 
   1545    return progress;
   1546 }
   1547 
   1548 bool
   1549 fs_visitor::register_coalesce()
   1550 {
   1551    bool progress = false;
   1552    int if_depth = 0;
   1553    int loop_depth = 0;
   1554 
   1555    foreach_list_safe(node, &this->instructions) {
   1556       fs_inst *inst = (fs_inst *)node;
   1557 
   1558       /* Make sure that we dominate the instructions we're going to
   1559        * scan for interfering with our coalescing, or we won't have
   1560        * scanned enough to see if anything interferes with our
   1561        * coalescing.  We don't dominate the following instructions if
   1562        * we're in a loop or an if block.
   1563        */
   1564       switch (inst->opcode) {
   1565       case BRW_OPCODE_DO:
   1566 	 loop_depth++;
   1567 	 break;
   1568       case BRW_OPCODE_WHILE:
   1569 	 loop_depth--;
   1570 	 break;
   1571       case BRW_OPCODE_IF:
   1572 	 if_depth++;
   1573 	 break;
   1574       case BRW_OPCODE_ENDIF:
   1575 	 if_depth--;
   1576 	 break;
   1577       default:
   1578 	 break;
   1579       }
   1580       if (loop_depth || if_depth)
   1581 	 continue;
   1582 
   1583       if (inst->opcode != BRW_OPCODE_MOV ||
   1584 	  inst->predicated ||
   1585 	  inst->saturate ||
   1586 	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
   1587 				    inst->src[0].file != UNIFORM)||
   1588 	  inst->dst.type != inst->src[0].type)
   1589 	 continue;
   1590 
   1591       bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
   1592 
   1593       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
   1594        * them: check for no writes to either one until the exit of the
   1595        * program.
   1596        */
   1597       bool interfered = false;
   1598 
   1599       for (fs_inst *scan_inst = (fs_inst *)inst->next;
   1600 	   !scan_inst->is_tail_sentinel();
   1601 	   scan_inst = (fs_inst *)scan_inst->next) {
   1602 	 if (scan_inst->dst.file == GRF) {
   1603 	    if (scan_inst->overwrites_reg(inst->dst) ||
   1604                 scan_inst->overwrites_reg(inst->src[0])) {
   1605 	       interfered = true;
   1606 	       break;
   1607 	    }
   1608 	 }
   1609 
   1610 	 /* The gen6 MATH instruction can't handle source modifiers or
   1611 	  * unusual register regions, so avoid coalescing those for
   1612 	  * now.  We should do something more specific.
   1613 	  */
   1614 	 if (intel->gen >= 6 &&
   1615 	     scan_inst->is_math() &&
   1616 	     (has_source_modifiers || inst->src[0].file == UNIFORM)) {
   1617 	    interfered = true;
   1618 	    break;
   1619 	 }
   1620 
    1621          /* The accumulator result appears to get used for the
    1622           * conditional modifier generation.  When negating a UD
    1623           * value, a 33rd sign bit is generated in the accumulator,
    1624           * so you can no longer check, for example, equality with a
    1625           * 32-bit value.  See piglit fs-op-neg-uint.
    1626           */
   1627 	 if (scan_inst->conditional_mod &&
   1628 	     inst->src[0].negate &&
   1629 	     inst->src[0].type == BRW_REGISTER_TYPE_UD) {
   1630 	    interfered = true;
   1631 	    break;
   1632 	 }
   1633       }
   1634       if (interfered) {
   1635 	 continue;
   1636       }
   1637 
   1638       /* Rewrite the later usage to point at the source of the move to
   1639        * be removed.
   1640        */
   1641       for (fs_inst *scan_inst = inst;
   1642 	   !scan_inst->is_tail_sentinel();
   1643 	   scan_inst = (fs_inst *)scan_inst->next) {
   1644 	 for (int i = 0; i < 3; i++) {
   1645 	    if (scan_inst->src[i].file == GRF &&
   1646 		scan_inst->src[i].reg == inst->dst.reg &&
   1647 		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
   1648 	       fs_reg new_src = inst->src[0];
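                        /* Compose source modifiers: an abs on this use discards
                         * any negate carried by the moved value, and negates
                         * combine by XOR.
                         */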
   1649                if (scan_inst->src[i].abs) {
   1650                   new_src.negate = 0;
   1651                   new_src.abs = 1;
   1652                }
   1653 	       new_src.negate ^= scan_inst->src[i].negate;
   1654 	       scan_inst->src[i] = new_src;
   1655 	    }
   1656 	 }
   1657       }
   1658 
   1659       inst->remove();
   1660       progress = true;
   1661    }
   1662 
   1663    if (progress)
   1664       live_intervals_valid = false;
   1665 
   1666    return progress;
   1667 }
   1668 
   1669 
   1670 bool
   1671 fs_visitor::compute_to_mrf()
   1672 {
   1673    bool progress = false;
   1674    int next_ip = 0;
   1675 
   1676    calculate_live_intervals();
   1677 
   1678    foreach_list_safe(node, &this->instructions) {
   1679       fs_inst *inst = (fs_inst *)node;
   1680 
   1681       int ip = next_ip;
   1682       next_ip++;
   1683 
   1684       if (inst->opcode != BRW_OPCODE_MOV ||
   1685 	  inst->predicated ||
   1686 	  inst->dst.file != MRF || inst->src[0].file != GRF ||
   1687 	  inst->dst.type != inst->src[0].type ||
   1688 	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
   1689 	 continue;
   1690 
   1691       /* Work out which hardware MRF registers are written by this
   1692        * instruction.
   1693        */
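               /* mrf_high is the second MRF touched by a compressed write:
                * mrf_low + 4 with COMPR4 addressing, mrf_low + 1 for an ordinary
                * compressed (16-wide) write, and mrf_low itself when only a
                * single MRF is written.
                */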
   1694       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
   1695       int mrf_high;
   1696       if (inst->dst.reg & BRW_MRF_COMPR4) {
   1697 	 mrf_high = mrf_low + 4;
   1698       } else if (c->dispatch_width == 16 &&
   1699 		 (!inst->force_uncompressed && !inst->force_sechalf)) {
   1700 	 mrf_high = mrf_low + 1;
   1701       } else {
   1702 	 mrf_high = mrf_low;
   1703       }
   1704 
   1705       /* Can't compute-to-MRF this GRF if someone else was going to
   1706        * read it later.
   1707        */
   1708       if (this->virtual_grf_use[inst->src[0].reg] > ip)
   1709 	 continue;
   1710 
   1711       /* Found a move of a GRF to a MRF.  Let's see if we can go
   1712        * rewrite the thing that made this GRF to write into the MRF.
   1713        */
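               /* For example (register numbers here are illustrative only):
                *
                *    add  vgrf3, vgrf1, vgrf2
                *    mov  m4, vgrf3
                *
                * becomes, when nothing else reads or rewrites vgrf3:
                *
                *    add  m4, vgrf1, vgrf2
                */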
   1714       fs_inst *scan_inst;
   1715       for (scan_inst = (fs_inst *)inst->prev;
   1716 	   scan_inst->prev != NULL;
   1717 	   scan_inst = (fs_inst *)scan_inst->prev) {
   1718 	 if (scan_inst->dst.file == GRF &&
   1719 	     scan_inst->dst.reg == inst->src[0].reg) {
    1720             /* Found the last instruction that writes the GRF we want
    1721              * to turn into a compute-to-MRF.
    1722              */
   1723 
   1724             /* SENDs can only write to GRFs, so no compute-to-MRF. */
   1725 	    if (scan_inst->mlen) {
   1726 	       break;
   1727 	    }
   1728 
   1729 	    /* If it's predicated, it (probably) didn't populate all
   1730 	     * the channels.  We might be able to rewrite everything
   1731 	     * that writes that reg, but it would require smarter
   1732 	     * tracking to delay the rewriting until complete success.
   1733 	     */
   1734 	    if (scan_inst->predicated)
   1735 	       break;
   1736 
    1737             /* If its half-of-register controls (force_uncompressed /
    1738              * force_sechalf) don't match those of our MOV, bail for now.
    1739              */
   1740 	    if (scan_inst->force_uncompressed != inst->force_uncompressed ||
   1741 		scan_inst->force_sechalf != inst->force_sechalf) {
   1742 	       break;
   1743 	    }
   1744 
   1745 	    /* SEND instructions can't have MRF as a destination. */
   1746 	    if (scan_inst->mlen)
   1747 	       break;
   1748 
   1749 	    if (intel->gen >= 6) {
   1750 	       /* gen6 math instructions must have the destination be
   1751 		* GRF, so no compute-to-MRF for them.
   1752 		*/
   1753 	       if (scan_inst->is_math()) {
   1754 		  break;
   1755 	       }
   1756 	    }
   1757 
   1758 	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
   1759 	       /* Found the creator of our MRF's source value. */
   1760 	       scan_inst->dst.file = MRF;
   1761 	       scan_inst->dst.reg = inst->dst.reg;
   1762 	       scan_inst->saturate |= inst->saturate;
   1763 	       inst->remove();
   1764 	       progress = true;
   1765 	    }
   1766 	    break;
   1767 	 }
   1768 
    1769          /* We don't handle flow control here.  Most computation of
    1770           * values that end up in MRFs happens shortly before the MRF
    1771           * write anyway.
   1772 	  */
   1773 	 if (scan_inst->opcode == BRW_OPCODE_DO ||
   1774 	     scan_inst->opcode == BRW_OPCODE_WHILE ||
   1775 	     scan_inst->opcode == BRW_OPCODE_ELSE ||
   1776 	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
   1777 	    break;
   1778 	 }
   1779 
    1780          /* MRF registers can't be read, so if any other instruction
    1781           * reads the source GRF we wanted to rewrite, that stops us.
    1782           */
   1783 	 bool interfered = false;
   1784 	 for (int i = 0; i < 3; i++) {
   1785 	    if (scan_inst->src[i].file == GRF &&
   1786 		scan_inst->src[i].reg == inst->src[0].reg &&
   1787 		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
   1788 	       interfered = true;
   1789 	    }
   1790 	 }
   1791 	 if (interfered)
   1792 	    break;
   1793 
   1794 	 if (scan_inst->dst.file == MRF) {
   1795 	    /* If somebody else writes our MRF here, we can't
   1796 	     * compute-to-MRF before that.
   1797 	     */
   1798 	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
   1799 	    int scan_mrf_high;
   1800 
   1801 	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
   1802 	       scan_mrf_high = scan_mrf_low + 4;
   1803 	    } else if (c->dispatch_width == 16 &&
   1804 		       (!scan_inst->force_uncompressed &&
   1805 			!scan_inst->force_sechalf)) {
   1806 	       scan_mrf_high = scan_mrf_low + 1;
   1807 	    } else {
   1808 	       scan_mrf_high = scan_mrf_low;
   1809 	    }
   1810 
   1811 	    if (mrf_low == scan_mrf_low ||
   1812 		mrf_low == scan_mrf_high ||
   1813 		mrf_high == scan_mrf_low ||
   1814 		mrf_high == scan_mrf_high) {
   1815 	       break;
   1816 	    }
   1817 	 }
   1818 
   1819 	 if (scan_inst->mlen > 0) {
   1820 	    /* Found a SEND instruction, which means that there are
   1821 	     * live values in MRFs from base_mrf to base_mrf +
   1822 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
   1823 	     * above it.
   1824 	     */
   1825 	    if (mrf_low >= scan_inst->base_mrf &&
   1826 		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
   1827 	       break;
   1828 	    }
   1829 	    if (mrf_high >= scan_inst->base_mrf &&
   1830 		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
   1831 	       break;
   1832 	    }
   1833 	 }
   1834       }
   1835    }
   1836 
   1837    if (progress)
   1838       live_intervals_valid = false;
   1839 
   1840    return progress;
   1841 }
   1842 
   1843 /**
   1844  * Walks through basic blocks, looking for repeated MRF writes and
   1845  * removing the later ones.
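           *
           * For example (MRF numbers here are illustrative only): if two identical
           * "mov m3, vgrf2" instructions appear with no intervening write to m3 or
           * vgrf2 and no intervening control flow, the second MOV is removed.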
   1846  */
   1847 bool
   1848 fs_visitor::remove_duplicate_mrf_writes()
   1849 {
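            /* Last GRF-to-MRF MOV recorded for each MRF, or NULL once that record
             * has been invalidated by a later write to the MRF or its source, by
             * an implied SEND write, or by control flow.
             */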
   1850    fs_inst *last_mrf_move[16];
   1851    bool progress = false;
   1852 
    1853    /* The MRF tracking below doesn't handle compressed instructions yet. */
   1854    if (c->dispatch_width == 16)
   1855       return false;
   1856 
   1857    memset(last_mrf_move, 0, sizeof(last_mrf_move));
   1858 
   1859    foreach_list_safe(node, &this->instructions) {
   1860       fs_inst *inst = (fs_inst *)node;
   1861 
   1862       switch (inst->opcode) {
   1863       case BRW_OPCODE_DO:
   1864       case BRW_OPCODE_WHILE:
   1865       case BRW_OPCODE_IF:
   1866       case BRW_OPCODE_ELSE:
   1867       case BRW_OPCODE_ENDIF:
   1868 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
   1869 	 continue;
   1870       default:
   1871 	 break;
   1872       }
   1873 
   1874       if (inst->opcode == BRW_OPCODE_MOV &&
   1875 	  inst->dst.file == MRF) {
   1876 	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
   1877 	 if (prev_inst && inst->equals(prev_inst)) {
   1878 	    inst->remove();
   1879 	    progress = true;
   1880 	    continue;
   1881 	 }
   1882       }
   1883 
   1884       /* Clear out the last-write records for MRFs that were overwritten. */
   1885       if (inst->dst.file == MRF) {
   1886 	 last_mrf_move[inst->dst.reg] = NULL;
   1887       }
   1888 
   1889       if (inst->mlen > 0) {
   1890 	 /* Found a SEND instruction, which will include two or fewer
   1891 	  * implied MRF writes.  We could do better here.
   1892 	  */
   1893 	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
   1894 	    last_mrf_move[inst->base_mrf + i] = NULL;
   1895 	 }
   1896       }
   1897 
   1898       /* Clear out any MRF move records whose sources got overwritten. */
   1899       if (inst->dst.file == GRF) {
   1900 	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
   1901 	    if (last_mrf_move[i] &&
   1902 		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
   1903 	       last_mrf_move[i] = NULL;
   1904 	    }
   1905 	 }
   1906       }
   1907 
   1908       if (inst->opcode == BRW_OPCODE_MOV &&
   1909 	  inst->dst.file == MRF &&
   1910 	  inst->src[0].file == GRF &&
   1911 	  !inst->predicated) {
   1912 	 last_mrf_move[inst->dst.reg] = inst;
   1913       }
   1914    }
   1915 
   1916    if (progress)
   1917       live_intervals_valid = false;
   1918 
   1919    return progress;
   1920 }
   1921 
   1922 /**
   1923  * Possibly returns an instruction that set up @param reg.
   1924  *
   1925  * Sometimes we want to take the result of some expression/variable
   1926  * dereference tree and rewrite the instruction generating the result
   1927  * of the tree.  When processing the tree, we know that the
   1928  * instructions generated are all writing temporaries that are dead
   1929  * outside of this tree.  So, if we have some instructions that write
   1930  * a temporary, we're free to point that temp write somewhere else.
   1931  *
    1932  * Note that this doesn't guarantee that the returned instruction wrote
    1933  * only reg -- it might be the size=4 destination of a texture instruction.
   1934  */
   1935 fs_inst *
   1936 fs_visitor::get_instruction_generating_reg(fs_inst *start,
   1937 					   fs_inst *end,
   1938 					   fs_reg reg)
   1939 {
   1940    if (end == start ||
   1941        end->predicated ||
   1942        end->force_uncompressed ||
   1943        end->force_sechalf ||
   1944        !reg.equals(end->dst)) {
   1945       return NULL;
   1946    } else {
   1947       return end;
   1948    }
   1949 }
   1950 
   1951 bool
   1952 fs_visitor::run()
   1953 {
   1954    uint32_t prog_offset_16 = 0;
   1955    uint32_t orig_nr_params = c->prog_data.nr_params;
   1956 
   1957    brw_wm_payload_setup(brw, c);
   1958 
   1959    if (c->dispatch_width == 16) {
    1960       /* Align to a 64-byte boundary. */
   1961       while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
   1962 	 brw_NOP(p);
   1963       }
   1964 
   1965       /* Save off the start of this 16-wide program in case we succeed. */
   1966       prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
   1967 
   1968       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   1969    }
   1970 
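            /* Debug toggle: emit a dummy fragment shader instead of translating
             * the GLSL IR below (normally disabled).
             */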
   1971    if (0) {
   1972       emit_dummy_fs();
   1973    } else {
   1974       calculate_urb_setup();
   1975       if (intel->gen < 6)
   1976 	 emit_interpolation_setup_gen4();
   1977       else
   1978 	 emit_interpolation_setup_gen6();
   1979 
    1980       /* Generate FS IR for main().  (The visitor only descends into
    1981        * functions called "main".)
    1982        */
   1983       foreach_list(node, &*shader->ir) {
   1984 	 ir_instruction *ir = (ir_instruction *)node;
   1985 	 base_ir = ir;
   1986 	 this->result = reg_undef;
   1987 	 ir->accept(this);
   1988       }
   1989       if (failed)
   1990 	 return false;
   1991 
   1992       emit_fb_writes();
   1993 
   1994       split_virtual_grfs();
   1995 
   1996       setup_paramvalues_refs();
   1997       setup_pull_constants();
   1998 
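               /* Run the LIR optimization passes to a fixed point: keep iterating
                * as long as any pass reports progress, since one pass can expose
                * new opportunities for another.
                */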
   1999       bool progress;
   2000       do {
   2001 	 progress = false;
   2002 
   2003 	 progress = remove_duplicate_mrf_writes() || progress;
   2004 
   2005 	 progress = propagate_constants() || progress;
   2006 	 progress = opt_algebraic() || progress;
   2007 	 progress = opt_cse() || progress;
   2008 	 progress = opt_copy_propagate() || progress;
   2009 	 progress = register_coalesce() || progress;
   2010 	 progress = register_coalesce_2() || progress;
   2011 	 progress = compute_to_mrf() || progress;
   2012 	 progress = dead_code_eliminate() || progress;
   2013       } while (progress);
   2014 
   2015       remove_dead_constants();
   2016 
   2017       schedule_instructions();
   2018 
   2019       assign_curb_setup();
   2020       assign_urb_setup();
   2021 
   2022       if (0) {
   2023 	 /* Debug of register spilling: Go spill everything. */
   2024 	 for (int i = 0; i < virtual_grf_count; i++) {
   2025 	    spill_reg(i);
   2026 	 }
   2027       }
   2028 
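               /* Register allocation: the trivial allocator is a debug path; the
                * normal path retries assign_regs() until it succeeds or the
                * compile has been flagged as failed.
                */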
   2029       if (0)
   2030 	 assign_regs_trivial();
   2031       else {
   2032 	 while (!assign_regs()) {
   2033 	    if (failed)
   2034 	       break;
   2035 	 }
   2036       }
   2037    }
   2038    assert(force_uncompressed_stack == 0);
   2039    assert(force_sechalf_stack == 0);
   2040 
   2041    if (failed)
   2042       return false;
   2043 
   2044    generate_code();
   2045 
   2046    if (c->dispatch_width == 8) {
   2047       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   2048    } else {
   2049       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
   2050       c->prog_data.prog_offset_16 = prog_offset_16;
   2051 
   2052       /* Make sure we didn't try to sneak in an extra uniform */
   2053       assert(orig_nr_params == c->prog_data.nr_params);
   2054       (void) orig_nr_params;
   2055    }
   2056 
   2057    return !failed;
   2058 }
   2059 
   2060 bool
   2061 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
   2062 	       struct gl_shader_program *prog)
   2063 {
   2064    struct intel_context *intel = &brw->intel;
   2065    bool start_busy = false;
   2066    float start_time = 0;
   2067 
   2068    if (!prog)
   2069       return false;
   2070 
   2071    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
   2072       start_busy = (intel->batch.last_bo &&
   2073                     drm_intel_bo_busy(intel->batch.last_bo));
   2074       start_time = get_time();
   2075    }
   2076 
   2077    struct brw_shader *shader =
   2078      (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   2079    if (!shader)
   2080       return false;
   2081 
   2082    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   2083       printf("GLSL IR for native fragment shader %d:\n", prog->Name);
   2084       _mesa_print_ir(shader->ir, NULL);
   2085       printf("\n\n");
   2086    }
   2087 
   2088    /* Now the main event: Visit the shader IR and generate our FS IR for it.
   2089     */
   2090    c->dispatch_width = 8;
   2091 
   2092    fs_visitor v(c, prog, shader);
   2093    if (!v.run()) {
   2094       prog->LinkStatus = false;
   2095       ralloc_strcat(&prog->InfoLog, v.fail_msg);
   2096 
   2097       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
   2098 		    v.fail_msg);
   2099 
   2100       return false;
   2101    }
   2102 
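            /* Optionally compile a 16-wide program as well: only on gen5 and
             * later, and only when no pull constants are in use.  If the 16-wide
             * compile fails, we simply keep the 8-wide program.
             */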
   2103    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
   2104       c->dispatch_width = 16;
   2105       fs_visitor v2(c, prog, shader);
   2106       v2.import_uniforms(&v);
   2107       if (!v2.run()) {
   2108          perf_debug("16-wide shader failed to compile, falling back to "
   2109                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
   2110       }
   2111    }
   2112 
   2113    c->prog_data.dispatch_width = 8;
   2114 
   2115    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
   2116       if (shader->compiled_once)
   2117          brw_wm_debug_recompile(brw, prog, &c->key);
   2118       shader->compiled_once = true;
   2119 
   2120       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
   2121          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
   2122                     (get_time() - start_time) * 1000);
   2123       }
   2124    }
   2125 
   2126    return true;
   2127 }
   2128 
   2129 bool
   2130 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
   2131 {
   2132    struct brw_context *brw = brw_context(ctx);
   2133    struct intel_context *intel = &brw->intel;
   2134    struct brw_wm_prog_key key;
   2135 
   2136    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
   2137       return true;
   2138 
   2139    struct gl_fragment_program *fp = (struct gl_fragment_program *)
   2140       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   2141    struct brw_fragment_program *bfp = brw_fragment_program(fp);
   2142    bool program_uses_dfdy = fp->UsesDFdy;
   2143 
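            /* Build a plausible program key for this shader, guessing at the GL
             * state it will eventually be used with; several of the guesses are
             * noted below.
             */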
   2144    memset(&key, 0, sizeof(key));
   2145 
   2146    if (intel->gen < 6) {
   2147       if (fp->UsesKill)
   2148          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
   2149 
   2150       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
   2151          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
   2152 
   2153       /* Just assume depth testing. */
   2154       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
   2155       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   2156    }
   2157 
   2158    if (prog->Name != 0)
   2159       key.proj_attrib_mask = 0xffffffff;
   2160 
   2161    if (intel->gen < 6)
   2162       key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
   2163 
   2164    for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
   2165       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
   2166 	 continue;
   2167 
   2168       if (prog->Name == 0)
   2169          key.proj_attrib_mask |= 1 << i;
   2170 
   2171       if (intel->gen < 6) {
   2172          int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
   2173 
   2174          if (vp_index >= 0)
   2175             key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
   2176       }
   2177    }
   2178 
   2179    key.clamp_fragment_color = true;
   2180 
   2181    for (int i = 0; i < MAX_SAMPLERS; i++) {
   2182       if (fp->Base.ShadowSamplers & (1 << i)) {
   2183          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
   2184          key.tex.swizzles[i] =
   2185             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
   2186       } else {
   2187          /* Color sampler: assume no swizzling. */
   2188          key.tex.swizzles[i] = SWIZZLE_XYZW;
   2189       }
   2190    }
   2191 
   2192    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
   2193       key.drawable_height = ctx->DrawBuffer->Height;
   2194    }
   2195 
   2196    if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
   2197       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   2198    }
   2199 
   2200    key.nr_color_regions = 1;
   2201 
   2202    key.program_string_id = bfp->id;
   2203 
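            /* Save and restore brw->wm.prog_offset and brw->wm.prog_data around
             * do_wm_prog() so that precompiling doesn't disturb the currently
             * bound program's state.
             */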
   2204    uint32_t old_prog_offset = brw->wm.prog_offset;
   2205    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
   2206 
   2207    bool success = do_wm_prog(brw, prog, bfp, &key);
   2208 
   2209    brw->wm.prog_offset = old_prog_offset;
   2210    brw->wm.prog_data = old_prog_data;
   2211 
   2212    return success;
   2213 }
   2214