      1 /*
      2  * Copyright © 2011 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "brw_vec4.h"
     25 extern "C" {
     26 #include "main/macros.h"
     27 #include "program/prog_parameter.h"
     28 #include "program/sampler.h"
     29 }
     30 
     31 namespace brw {
     32 
     33 vec4_instruction::vec4_instruction(vec4_visitor *v,
     34 				   enum opcode opcode, dst_reg dst,
     35 				   src_reg src0, src_reg src1, src_reg src2)
     36 {
     37    this->opcode = opcode;
     38    this->dst = dst;
     39    this->src[0] = src0;
     40    this->src[1] = src1;
     41    this->src[2] = src2;
     42    this->ir = v->base_ir;
     43    this->annotation = v->current_annotation;
     44 }
     45 
     46 vec4_instruction *
     47 vec4_visitor::emit(vec4_instruction *inst)
     48 {
     49    this->instructions.push_tail(inst);
     50 
     51    return inst;
     52 }
     53 
     54 vec4_instruction *
     55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
     56 {
     57    new_inst->ir = inst->ir;
     58    new_inst->annotation = inst->annotation;
     59 
     60    inst->insert_before(new_inst);
     61 
     62    return inst;
     63 }
     64 
     65 vec4_instruction *
     66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
     67 		   src_reg src0, src_reg src1, src_reg src2)
     68 {
     69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
     70 					     src0, src1, src2));
     71 }
     72 
     73 
     74 vec4_instruction *
     75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
     76 {
     77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
     78 }
     79 
     80 vec4_instruction *
     81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
     82 {
     83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
     84 }
     85 
     86 vec4_instruction *
     87 vec4_visitor::emit(enum opcode opcode)
     88 {
     89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
     90 }
     91 
     92 #define ALU1(op)							\
     93    vec4_instruction *							\
     94    vec4_visitor::op(dst_reg dst, src_reg src0)				\
     95    {									\
     96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
     97 					   src0);			\
     98    }
     99 
    100 #define ALU2(op)							\
    101    vec4_instruction *							\
    102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
    103    {									\
    104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
    105 					   src0, src1);			\
    106    }
    107 
    108 ALU1(NOT)
    109 ALU1(MOV)
    110 ALU1(FRC)
    111 ALU1(RNDD)
    112 ALU1(RNDE)
    113 ALU1(RNDZ)
    114 ALU2(ADD)
    115 ALU2(MUL)
    116 ALU2(MACH)
    117 ALU2(AND)
    118 ALU2(OR)
    119 ALU2(XOR)
    120 ALU2(DP3)
    121 ALU2(DP4)
    122 
    123 /** Gen4 predicated IF. */
    124 vec4_instruction *
    125 vec4_visitor::IF(uint32_t predicate)
    126 {
    127    vec4_instruction *inst;
    128 
    129    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
    130    inst->predicate = predicate;
    131 
    132    return inst;
    133 }
    134 
    135 /** Gen6+ IF with embedded comparison. */
    136 vec4_instruction *
    137 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
    138 {
    139    assert(intel->gen >= 6);
    140 
    141    vec4_instruction *inst;
    142 
    143    resolve_ud_negate(&src0);
    144    resolve_ud_negate(&src1);
    145 
    146    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
    147 					src0, src1);
    148    inst->conditional_mod = condition;
    149 
    150    return inst;
    151 }
    152 
    153 /**
    154  * CMP: Sets the low bit of the destination channels with the result
    155  * of the comparison, while the upper bits are undefined, and updates
    156  * the flag register with the packed 16 bits of the result.
    157  */
    158 vec4_instruction *
    159 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
    160 {
    161    vec4_instruction *inst;
    162 
    163    /* original gen4 does type conversion to the destination type
    164     * before comparison, producing garbage results for floating
    165     * point comparisons.
    166     */
    167    if (intel->gen == 4) {
    168       dst.type = src0.type;
    169       if (dst.file == HW_REG)
    170 	 dst.fixed_hw_reg.type = dst.type;
    171    }
    172 
    173    resolve_ud_negate(&src0);
    174    resolve_ud_negate(&src1);
    175 
    176    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
    177    inst->conditional_mod = condition;
    178 
    179    return inst;
    180 }
    181 
    182 vec4_instruction *
    183 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
    184 {
    185    vec4_instruction *inst;
    186 
    187    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
    188 					dst, index);
    189    inst->base_mrf = 14;
    190    inst->mlen = 1;
    191 
    192    return inst;
    193 }
    194 
    195 vec4_instruction *
    196 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
    197 {
    198    vec4_instruction *inst;
    199 
    200    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
    201 					dst, src, index);
    202    inst->base_mrf = 13;
    203    inst->mlen = 2;
    204 
    205    return inst;
    206 }
    207 
    208 void
    209 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
    210 {
    211    static enum opcode dot_opcodes[] = {
    212       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
    213    };
    214 
    215    emit(dot_opcodes[elements - 2], dst, src0, src1);
    216 }
    217 
    218 void
    219 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
    220 {
    221    /* The gen6 math instruction ignores the source modifiers --
    222     * swizzle, abs, negate, and at least some parts of the register
    223     * region description.
    224     *
    225     * While it would seem that this MOV could be avoided at this point
    226     * in the case that the swizzle is matched up with the destination
    227     * writemask, note that uniform packing and register allocation
    228     * could rearrange our swizzle, so let's leave this matter up to
    229     * copy propagation later.
    230     */
    231    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
    232    emit(MOV(dst_reg(temp_src), src));
    233 
    234    if (dst.writemask != WRITEMASK_XYZW) {
    235       /* The gen6 math instruction must be align1, so we can't do
    236        * writemasks.
    237        */
    238       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
    239 
    240       emit(opcode, temp_dst, temp_src);
    241 
    242       emit(MOV(dst, src_reg(temp_dst)));
    243    } else {
    244       emit(opcode, dst, temp_src);
    245    }
    246 }
    247 
    248 void
    249 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
    250 {
    251    vec4_instruction *inst = emit(opcode, dst, src);
    252    inst->base_mrf = 1;
    253    inst->mlen = 1;
    254 }
    255 
    256 void
    257 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
    258 {
    259    switch (opcode) {
    260    case SHADER_OPCODE_RCP:
    261    case SHADER_OPCODE_RSQ:
    262    case SHADER_OPCODE_SQRT:
    263    case SHADER_OPCODE_EXP2:
    264    case SHADER_OPCODE_LOG2:
    265    case SHADER_OPCODE_SIN:
    266    case SHADER_OPCODE_COS:
    267       break;
    268    default:
    269       assert(!"not reached: bad math opcode");
    270       return;
    271    }
    272 
    273    if (intel->gen >= 7) {
    274       emit(opcode, dst, src);
    275    } else if (intel->gen == 6) {
    276       return emit_math1_gen6(opcode, dst, src);
    277    } else {
    278       return emit_math1_gen4(opcode, dst, src);
    279    }
    280 }
    281 
    282 void
    283 vec4_visitor::emit_math2_gen6(enum opcode opcode,
    284 			      dst_reg dst, src_reg src0, src_reg src1)
    285 {
    286    src_reg expanded;
    287 
    288    /* The gen6 math instruction ignores the source modifiers --
    289     * swizzle, abs, negate, and at least some parts of the register
    290     * region description.  Move the sources to temporaries to make it
    291     * generally work.
    292     */
    293 
    294    expanded = src_reg(this, glsl_type::vec4_type);
    295    expanded.type = src0.type;
    296    emit(MOV(dst_reg(expanded), src0));
    297    src0 = expanded;
    298 
    299    expanded = src_reg(this, glsl_type::vec4_type);
    300    expanded.type = src1.type;
    301    emit(MOV(dst_reg(expanded), src1));
    302    src1 = expanded;
    303 
    304    if (dst.writemask != WRITEMASK_XYZW) {
    305       /* The gen6 math instruction must be align1, so we can't do
    306        * writemasks.
    307        */
    308       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
    309       temp_dst.type = dst.type;
    310 
    311       emit(opcode, temp_dst, src0, src1);
    312 
    313       emit(MOV(dst, src_reg(temp_dst)));
    314    } else {
    315       emit(opcode, dst, src0, src1);
    316    }
    317 }
    318 
    319 void
    320 vec4_visitor::emit_math2_gen4(enum opcode opcode,
    321 			      dst_reg dst, src_reg src0, src_reg src1)
    322 {
    323    vec4_instruction *inst = emit(opcode, dst, src0, src1);
    324    inst->base_mrf = 1;
    325    inst->mlen = 2;
    326 }
    327 
    328 void
    329 vec4_visitor::emit_math(enum opcode opcode,
    330 			dst_reg dst, src_reg src0, src_reg src1)
    331 {
    332    switch (opcode) {
    333    case SHADER_OPCODE_POW:
    334    case SHADER_OPCODE_INT_QUOTIENT:
    335    case SHADER_OPCODE_INT_REMAINDER:
    336       break;
    337    default:
    338       assert(!"not reached: unsupported binary math opcode");
    339       return;
    340    }
    341 
    342    if (intel->gen >= 7) {
    343       emit(opcode, dst, src0, src1);
    344    } else if (intel->gen == 6) {
    345       return emit_math2_gen6(opcode, dst, src0, src1);
    346    } else {
    347       return emit_math2_gen4(opcode, dst, src0, src1);
    348    }
    349 }
    350 
    351 void
    352 vec4_visitor::visit_instructions(const exec_list *list)
    353 {
    354    foreach_list(node, list) {
    355       ir_instruction *ir = (ir_instruction *)node;
    356 
    357       base_ir = ir;
    358       ir->accept(this);
    359    }
    360 }
    361 
    362 
    363 static int
    364 type_size(const struct glsl_type *type)
    365 {
    366    unsigned int i;
    367    int size;
    368 
    369    switch (type->base_type) {
    370    case GLSL_TYPE_UINT:
    371    case GLSL_TYPE_INT:
    372    case GLSL_TYPE_FLOAT:
    373    case GLSL_TYPE_BOOL:
    374       if (type->is_matrix()) {
    375 	 return type->matrix_columns;
    376       } else {
    377 	 /* Regardless of size of vector, it gets a vec4. This is bad
    378 	  * packing for things like floats, but otherwise arrays become a
    379 	  * mess.  Hopefully a later pass over the code can pack scalars
    380 	  * down if appropriate.
    381 	  */
    382 	 return 1;
    383       }
    384    case GLSL_TYPE_ARRAY:
    385       assert(type->length > 0);
    386       return type_size(type->fields.array) * type->length;
    387    case GLSL_TYPE_STRUCT:
    388       size = 0;
    389       for (i = 0; i < type->length; i++) {
    390 	 size += type_size(type->fields.structure[i].type);
    391       }
    392       return size;
    393    case GLSL_TYPE_SAMPLER:
    394       /* Samplers take up one slot in UNIFORMS[], but they're baked in
    395        * at link time.
    396        */
    397       return 1;
    398    default:
    399       assert(0);
    400       return 0;
    401    }
    402 }
    403 
    404 int
    405 vec4_visitor::virtual_grf_alloc(int size)
    406 {
    407    if (virtual_grf_array_size <= virtual_grf_count) {
    408       if (virtual_grf_array_size == 0)
    409 	 virtual_grf_array_size = 16;
    410       else
    411 	 virtual_grf_array_size *= 2;
    412       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
    413 				   virtual_grf_array_size);
    414       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
    415 				     virtual_grf_array_size);
    416    }
    417    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
    418    virtual_grf_reg_count += size;
    419    virtual_grf_sizes[virtual_grf_count] = size;
    420    return virtual_grf_count++;
    421 }
    422 
    423 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
    424 {
    425    init();
    426 
    427    this->file = GRF;
    428    this->reg = v->virtual_grf_alloc(type_size(type));
    429 
    430    if (type->is_array() || type->is_record()) {
    431       this->swizzle = BRW_SWIZZLE_NOOP;
    432    } else {
    433       this->swizzle = swizzle_for_size(type->vector_elements);
    434    }
    435 
    436    this->type = brw_type_for_base_type(type);
    437 }
    438 
    439 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    440 {
    441    init();
    442 
    443    this->file = GRF;
    444    this->reg = v->virtual_grf_alloc(type_size(type));
    445 
    446    if (type->is_array() || type->is_record()) {
    447       this->writemask = WRITEMASK_XYZW;
    448    } else {
    449       this->writemask = (1 << type->vector_elements) - 1;
    450    }
    451 
    452    this->type = brw_type_for_base_type(type);
    453 }
    454 
    455 /* Our support for uniforms is piggy-backed on the struct
    456  * gl_vertex_program, because that's where the values actually
    457  * get stored, rather than in some global gl_shader_program uniform
    458  * store.
    459  */
    460 int
    461 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
    462 {
    463    unsigned int offset = 0;
    464    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
    465 
    466    if (type->is_matrix()) {
    467       const glsl_type *column = type->column_type();
    468 
    469       for (unsigned int i = 0; i < type->matrix_columns; i++) {
    470 	 offset += setup_uniform_values(loc + offset, column);
    471       }
    472 
    473       return offset;
    474    }
    475 
    476    switch (type->base_type) {
    477    case GLSL_TYPE_FLOAT:
    478    case GLSL_TYPE_UINT:
    479    case GLSL_TYPE_INT:
    480    case GLSL_TYPE_BOOL:
    481       for (unsigned int i = 0; i < type->vector_elements; i++) {
    482 	 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
    483       }
    484 
    485       /* Set up pad elements to get things aligned to a vec4 boundary. */
    486       for (unsigned int i = type->vector_elements; i < 4; i++) {
    487 	 static float zero = 0;
    488 
    489 	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
    490       }
    491 
    492       /* Track the size of this uniform vector, for future packing of
    493        * uniforms.
    494        */
    495       this->uniform_vector_size[this->uniforms] = type->vector_elements;
    496       this->uniforms++;
    497 
    498       return 1;
    499 
    500    case GLSL_TYPE_STRUCT:
    501       for (unsigned int i = 0; i < type->length; i++) {
    502 	 offset += setup_uniform_values(loc + offset,
    503 					type->fields.structure[i].type);
    504       }
    505       return offset;
    506 
    507    case GLSL_TYPE_ARRAY:
    508       for (unsigned int i = 0; i < type->length; i++) {
    509 	 offset += setup_uniform_values(loc + offset, type->fields.array);
    510       }
    511       return offset;
    512 
    513    case GLSL_TYPE_SAMPLER:
    514       /* The sampler takes up a slot, but we don't use any values from it. */
    515       return 1;
    516 
    517    default:
    518       assert(!"not reached");
    519       return 0;
    520    }
    521 }
    522 
    523 void
    524 vec4_visitor::setup_uniform_clipplane_values()
    525 {
    526    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
    527 
    528    /* Pre-Gen6, we compact clip planes.  For example, if the user
    529     * enables just clip planes 0, 1, and 3, we will enable clip planes
    530     * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
    531     * plane 2.  This simplifies the implementation of the Gen6 clip
    532     * thread.
    533     *
    534     * In Gen6 and later, we don't compact clip planes, because this
    535     * simplifies the implementation of gl_ClipDistance.
    536     */
    537    int compacted_clipplane_index = 0;
    538    for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
    539       if (intel->gen < 6 &&
    540           !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
    541          continue;
    542       }
    543       this->uniform_vector_size[this->uniforms] = 4;
    544       this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
    545       this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
    546       for (int j = 0; j < 4; ++j) {
    547          c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
    548       }
    549       ++compacted_clipplane_index;
    550       ++this->uniforms;
    551    }
    552 }
    553 
    554 /* Our support for builtin uniforms is even scarier than non-builtin.
    555  * It sits on top of the PROG_STATE_VAR parameters that are
    556  * automatically updated from GL context state.
    557  */
    558 void
    559 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
    560 {
    561    const ir_state_slot *const slots = ir->state_slots;
    562    assert(ir->state_slots != NULL);
    563 
    564    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
    565       /* This state reference has already been setup by ir_to_mesa,
    566        * but we'll get the same index back here.  We can reference
    567        * ParameterValues directly, since unlike brw_fs.cpp, we never
    568        * add new state references during compile.
    569        */
    570       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
    571 					    (gl_state_index *)slots[i].tokens);
    572       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
    573 
    574       this->uniform_vector_size[this->uniforms] = 0;
    575       /* Add each of the unique swizzled channels of the element.
    576        * This will end up matching the size of the glsl_type of this field.
    577        */
    578       int last_swiz = -1;
    579       for (unsigned int j = 0; j < 4; j++) {
    580 	 int swiz = GET_SWZ(slots[i].swizzle, j);
    581 	 last_swiz = swiz;
    582 
    583 	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
    584 	 if (swiz <= last_swiz)
    585 	    this->uniform_vector_size[this->uniforms]++;
    586       }
    587       this->uniforms++;
    588    }
    589 }
    590 
    591 dst_reg *
    592 vec4_visitor::variable_storage(ir_variable *var)
    593 {
    594    return (dst_reg *)hash_table_find(this->variable_ht, var);
    595 }
    596 
    597 void
    598 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
    599 {
    600    ir_expression *expr = ir->as_expression();
    601 
    602    *predicate = BRW_PREDICATE_NORMAL;
    603 
    604    if (expr) {
    605       src_reg op[2];
    606       vec4_instruction *inst;
    607 
    608       assert(expr->get_num_operands() <= 2);
    609       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
    610 	 expr->operands[i]->accept(this);
    611 	 op[i] = this->result;
    612 
    613 	 resolve_ud_negate(&op[i]);
    614       }
    615 
    616       switch (expr->operation) {
    617       case ir_unop_logic_not:
    618 	 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
    619 	 inst->conditional_mod = BRW_CONDITIONAL_Z;
    620 	 break;
    621 
    622       case ir_binop_logic_xor:
    623 	 inst = emit(XOR(dst_null_d(), op[0], op[1]));
    624 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
    625 	 break;
    626 
    627       case ir_binop_logic_or:
    628 	 inst = emit(OR(dst_null_d(), op[0], op[1]));
    629 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
    630 	 break;
    631 
    632       case ir_binop_logic_and:
    633 	 inst = emit(AND(dst_null_d(), op[0], op[1]));
    634 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
    635 	 break;
    636 
    637       case ir_unop_f2b:
    638 	 if (intel->gen >= 6) {
    639 	    emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
    640 	 } else {
    641 	    inst = emit(MOV(dst_null_f(), op[0]));
    642 	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
    643 	 }
    644 	 break;
    645 
    646       case ir_unop_i2b:
    647 	 if (intel->gen >= 6) {
    648 	    emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
    649 	 } else {
    650 	    inst = emit(MOV(dst_null_d(), op[0]));
    651 	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
    652 	 }
    653 	 break;
    654 
    655       case ir_binop_all_equal:
    656 	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
    657 	 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
    658 	 break;
    659 
    660       case ir_binop_any_nequal:
    661 	 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
    662 	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
    663 	 break;
    664 
    665       case ir_unop_any:
    666 	 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
    667 	 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
    668 	 break;
    669 
    670       case ir_binop_greater:
    671       case ir_binop_gequal:
    672       case ir_binop_less:
    673       case ir_binop_lequal:
    674       case ir_binop_equal:
    675       case ir_binop_nequal:
    676 	 emit(CMP(dst_null_d(), op[0], op[1],
    677 		  brw_conditional_for_comparison(expr->operation)));
    678 	 break;
    679 
    680       default:
    681 	 assert(!"not reached");
    682 	 break;
    683       }
    684       return;
    685    }
    686 
    687    ir->accept(this);
    688 
    689    resolve_ud_negate(&this->result);
    690 
    691    if (intel->gen >= 6) {
    692       vec4_instruction *inst = emit(AND(dst_null_d(),
    693 					this->result, src_reg(1)));
    694       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    695    } else {
    696       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
    697       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    698    }
    699 }
    700 
    701 /**
    702  * Emit a gen6 IF statement with the comparison folded into the IF
    703  * instruction.
    704  */
    705 void
    706 vec4_visitor::emit_if_gen6(ir_if *ir)
    707 {
    708    ir_expression *expr = ir->condition->as_expression();
    709 
    710    if (expr) {
    711       src_reg op[2];
    712       dst_reg temp;
    713 
    714       assert(expr->get_num_operands() <= 2);
    715       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
    716 	 expr->operands[i]->accept(this);
    717 	 op[i] = this->result;
    718       }
    719 
    720       switch (expr->operation) {
    721       case ir_unop_logic_not:
    722 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
    723 	 return;
    724 
    725       case ir_binop_logic_xor:
    726 	 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
    727 	 return;
    728 
    729       case ir_binop_logic_or:
    730 	 temp = dst_reg(this, glsl_type::bool_type);
    731 	 emit(OR(temp, op[0], op[1]));
    732 	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
    733 	 return;
    734 
    735       case ir_binop_logic_and:
    736 	 temp = dst_reg(this, glsl_type::bool_type);
    737 	 emit(AND(temp, op[0], op[1]));
    738 	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
    739 	 return;
    740 
    741       case ir_unop_f2b:
    742 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
    743 	 return;
    744 
    745       case ir_unop_i2b:
    746 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
    747 	 return;
    748 
    749       case ir_binop_greater:
    750       case ir_binop_gequal:
    751       case ir_binop_less:
    752       case ir_binop_lequal:
    753       case ir_binop_equal:
    754       case ir_binop_nequal:
    755 	 emit(IF(op[0], op[1],
    756 		 brw_conditional_for_comparison(expr->operation)));
    757 	 return;
    758 
    759       case ir_binop_all_equal:
    760 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
    761 	 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
    762 	 return;
    763 
    764       case ir_binop_any_nequal:
    765 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
    766 	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
    767 	 return;
    768 
    769       case ir_unop_any:
    770 	 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
    771 	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
    772 	 return;
    773 
    774       default:
    775 	 assert(!"not reached");
    776 	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
    777 	 return;
    778       }
    779       return;
    780    }
    781 
    782    ir->condition->accept(this);
    783 
    784    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
    785 }
    786 
    787 void
    788 vec4_visitor::visit(ir_variable *ir)
    789 {
    790    dst_reg *reg = NULL;
    791 
    792    if (variable_storage(ir))
    793       return;
    794 
    795    switch (ir->mode) {
    796    case ir_var_in:
    797       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
    798 
    799       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
    800        * come in as floating point conversions of the integer values.
    801        */
    802       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
    803 	 if (!c->key.gl_fixed_input_size[i])
    804 	    continue;
    805 
    806 	 dst_reg dst = *reg;
    807          dst.type = brw_type_for_base_type(ir->type);
    808 	 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
    809 	 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
    810       }
    811       break;
    812 
    813    case ir_var_out:
    814       reg = new(mem_ctx) dst_reg(this, ir->type);
    815 
    816       for (int i = 0; i < type_size(ir->type); i++) {
    817 	 output_reg[ir->location + i] = *reg;
    818 	 output_reg[ir->location + i].reg_offset = i;
    819 	 output_reg[ir->location + i].type =
    820             brw_type_for_base_type(ir->type->get_scalar_type());
    821 	 output_reg_annotation[ir->location + i] = ir->name;
    822       }
    823       break;
    824 
    825    case ir_var_auto:
    826    case ir_var_temporary:
    827       reg = new(mem_ctx) dst_reg(this, ir->type);
    828       break;
    829 
    830    case ir_var_uniform:
    831       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
    832 
    833       /* Thanks to the lower_ubo_reference pass, we will see only
    834        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
    835        * variables, so no need for them to be in variable_ht.
    836        */
    837       if (ir->uniform_block != -1)
    838          return;
    839 
    840       /* Track how big the whole uniform variable is, in case we need to put a
    841        * copy of its data into pull constants for array access.
    842        */
    843       this->uniform_size[this->uniforms] = type_size(ir->type);
    844 
    845       if (!strncmp(ir->name, "gl_", 3)) {
    846 	 setup_builtin_uniform_values(ir);
    847       } else {
    848 	 setup_uniform_values(ir->location, ir->type);
    849       }
    850       break;
    851 
    852    case ir_var_system_value:
    853       /* VertexID is stored by the VF as the last vertex element, but
    854        * we don't represent it with a flag in inputs_read, so we call
    855        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
    856        */
    857       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
    858       prog_data->uses_vertexid = true;
    859 
    860       switch (ir->location) {
    861       case SYSTEM_VALUE_VERTEX_ID:
    862 	 reg->writemask = WRITEMASK_X;
    863 	 break;
    864       case SYSTEM_VALUE_INSTANCE_ID:
    865 	 reg->writemask = WRITEMASK_Y;
    866 	 break;
    867       default:
    868 	 assert(!"not reached");
    869 	 break;
    870       }
    871       break;
    872 
    873    default:
    874       assert(!"not reached");
    875    }
    876 
    877    reg->type = brw_type_for_base_type(ir->type);
    878    hash_table_insert(this->variable_ht, reg, ir);
    879 }
    880 
    881 void
    882 vec4_visitor::visit(ir_loop *ir)
    883 {
    884    dst_reg counter;
    885 
    886    /* We don't want debugging output to print the whole body of the
    887     * loop as the annotation.
    888     */
    889    this->base_ir = NULL;
    890 
    891    if (ir->counter != NULL) {
    892       this->base_ir = ir->counter;
    893       ir->counter->accept(this);
    894       counter = *(variable_storage(ir->counter));
    895 
    896       if (ir->from != NULL) {
    897 	 this->base_ir = ir->from;
    898 	 ir->from->accept(this);
    899 
    900 	 emit(MOV(counter, this->result));
    901       }
    902    }
    903 
    904    emit(BRW_OPCODE_DO);
    905 
    906    if (ir->to) {
    907       this->base_ir = ir->to;
    908       ir->to->accept(this);
    909 
    910       emit(CMP(dst_null_d(), src_reg(counter), this->result,
    911 	       brw_conditional_for_comparison(ir->cmp)));
    912 
    913       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
    914       inst->predicate = BRW_PREDICATE_NORMAL;
    915    }
    916 
    917    visit_instructions(&ir->body_instructions);
    918 
    919 
    920    if (ir->increment) {
    921       this->base_ir = ir->increment;
    922       ir->increment->accept(this);
    923       emit(ADD(counter, src_reg(counter), this->result));
    924    }
    925 
    926    emit(BRW_OPCODE_WHILE);
    927 }
    928 
    929 void
    930 vec4_visitor::visit(ir_loop_jump *ir)
    931 {
    932    switch (ir->mode) {
    933    case ir_loop_jump::jump_break:
    934       emit(BRW_OPCODE_BREAK);
    935       break;
    936    case ir_loop_jump::jump_continue:
    937       emit(BRW_OPCODE_CONTINUE);
    938       break;
    939    }
    940 }
    941 
    942 
    943 void
    944 vec4_visitor::visit(ir_function_signature *ir)
    945 {
    946    assert(0);
    947    (void)ir;
    948 }
    949 
    950 void
    951 vec4_visitor::visit(ir_function *ir)
    952 {
    953    /* Ignore function bodies other than main() -- we shouldn't see calls to
    954     * them since they should all be inlined.
    955     */
    956    if (strcmp(ir->name, "main") == 0) {
    957       const ir_function_signature *sig;
    958       exec_list empty;
    959 
    960       sig = ir->matching_signature(&empty);
    961 
    962       assert(sig);
    963 
    964       visit_instructions(&sig->body);
    965    }
    966 }
    967 
    968 bool
    969 vec4_visitor::try_emit_sat(ir_expression *ir)
    970 {
    971    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
    972    if (!sat_src)
    973       return false;
    974 
    975    sat_src->accept(this);
    976    src_reg src = this->result;
    977 
    978    this->result = src_reg(this, ir->type);
    979    vec4_instruction *inst;
    980    inst = emit(MOV(dst_reg(this->result), src));
    981    inst->saturate = true;
    982 
    983    return true;
    984 }
    985 
    986 void
    987 vec4_visitor::emit_bool_comparison(unsigned int op,
    988 				 dst_reg dst, src_reg src0, src_reg src1)
    989 {
    990    /* original gen4 does destination conversion before comparison. */
    991    if (intel->gen < 5)
    992       dst.type = src0.type;
    993 
    994    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
    995 
    996    dst.type = BRW_REGISTER_TYPE_D;
    997    emit(AND(dst, src_reg(dst), src_reg(0x1)));
    998 }
    999 
   1000 void
   1001 vec4_visitor::visit(ir_expression *ir)
   1002 {
   1003    unsigned int operand;
   1004    src_reg op[Elements(ir->operands)];
   1005    src_reg result_src;
   1006    dst_reg result_dst;
   1007    vec4_instruction *inst;
   1008 
   1009    if (try_emit_sat(ir))
   1010       return;
   1011 
   1012    for (operand = 0; operand < ir->get_num_operands(); operand++) {
   1013       this->result.file = BAD_FILE;
   1014       ir->operands[operand]->accept(this);
   1015       if (this->result.file == BAD_FILE) {
   1016 	 printf("Failed to get tree for expression operand:\n");
   1017 	 ir->operands[operand]->print();
   1018 	 exit(1);
   1019       }
   1020       op[operand] = this->result;
   1021 
   1022       /* Matrix expression operands should have been broken down to vector
   1023        * operations already.
   1024        */
   1025       assert(!ir->operands[operand]->type->is_matrix());
   1026    }
   1027 
   1028    int vector_elements = ir->operands[0]->type->vector_elements;
   1029    if (ir->operands[1]) {
   1030       vector_elements = MAX2(vector_elements,
   1031 			     ir->operands[1]->type->vector_elements);
   1032    }
   1033 
   1034    this->result.file = BAD_FILE;
   1035 
   1036    /* Storage for our result.  Ideally for an assignment we'd be using
   1037     * the actual storage for the result here, instead.
   1038     */
   1039    result_src = src_reg(this, ir->type);
   1040    /* convenience for the emit functions below. */
   1041    result_dst = dst_reg(result_src);
   1042    /* If nothing special happens, this is the result. */
   1043    this->result = result_src;
   1044    /* Limit writes to the channels that will be used by result_src later.
   1045     * This does limit this temp's use as a temporary for multi-instruction
   1046     * sequences.
   1047     */
   1048    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
   1049 
   1050    switch (ir->operation) {
   1051    case ir_unop_logic_not:
   1052       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
   1053        * the ones' complement of the whole register, not just bit 0.
   1054        */
   1055       emit(XOR(result_dst, op[0], src_reg(1)));
   1056       break;
   1057    case ir_unop_neg:
   1058       op[0].negate = !op[0].negate;
   1059       this->result = op[0];
   1060       break;
   1061    case ir_unop_abs:
   1062       op[0].abs = true;
   1063       op[0].negate = false;
   1064       this->result = op[0];
   1065       break;
   1066 
   1067    case ir_unop_sign:
   1068       emit(MOV(result_dst, src_reg(0.0f)));
   1069 
   1070       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
   1071       inst = emit(MOV(result_dst, src_reg(1.0f)));
   1072       inst->predicate = BRW_PREDICATE_NORMAL;
   1073 
   1074       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
   1075       inst = emit(MOV(result_dst, src_reg(-1.0f)));
   1076       inst->predicate = BRW_PREDICATE_NORMAL;
   1077 
   1078       break;
   1079 
   1080    case ir_unop_rcp:
   1081       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
   1082       break;
   1083 
   1084    case ir_unop_exp2:
   1085       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
   1086       break;
   1087    case ir_unop_log2:
   1088       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
   1089       break;
   1090    case ir_unop_exp:
   1091    case ir_unop_log:
   1092       assert(!"not reached: should be handled by ir_explog_to_explog2");
   1093       break;
   1094    case ir_unop_sin:
   1095    case ir_unop_sin_reduced:
   1096       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
   1097       break;
   1098    case ir_unop_cos:
   1099    case ir_unop_cos_reduced:
   1100       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
   1101       break;
   1102 
   1103    case ir_unop_dFdx:
   1104    case ir_unop_dFdy:
   1105       assert(!"derivatives not valid in vertex shader");
   1106       break;
   1107 
   1108    case ir_unop_noise:
   1109       assert(!"not reached: should be handled by lower_noise");
   1110       break;
   1111 
   1112    case ir_binop_add:
   1113       emit(ADD(result_dst, op[0], op[1]));
   1114       break;
   1115    case ir_binop_sub:
   1116       assert(!"not reached: should be handled by ir_sub_to_add_neg");
   1117       break;
   1118 
   1119    case ir_binop_mul:
   1120       if (ir->type->is_integer()) {
   1121 	 /* For integer multiplication, the MUL uses the low 16 bits
   1122 	  * of one of the operands (src0 on gen6, src1 on gen7).  The
   1123 	  * MACH accumulates in the contribution of the upper 16 bits
   1124 	  * of that operand.
   1125 	  *
   1126 	  * FINISHME: Emit just the MUL if we know an operand is small
   1127 	  * enough.
   1128 	  */
   1129 	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
   1130 
   1131 	 emit(MUL(acc, op[0], op[1]));
   1132 	 emit(MACH(dst_null_d(), op[0], op[1]));
   1133 	 emit(MOV(result_dst, src_reg(acc)));
   1134       } else {
   1135 	 emit(MUL(result_dst, op[0], op[1]));
   1136       }
   1137       break;
   1138    case ir_binop_div:
   1139       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
   1140       assert(ir->type->is_integer());
   1141       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
   1142       break;
   1143    case ir_binop_mod:
   1144       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
   1145       assert(ir->type->is_integer());
   1146       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
   1147       break;
   1148 
   1149    case ir_binop_less:
   1150    case ir_binop_greater:
   1151    case ir_binop_lequal:
   1152    case ir_binop_gequal:
   1153    case ir_binop_equal:
   1154    case ir_binop_nequal: {
   1155       emit(CMP(result_dst, op[0], op[1],
   1156 	       brw_conditional_for_comparison(ir->operation)));
   1157       emit(AND(result_dst, result_src, src_reg(0x1)));
   1158       break;
   1159    }
   1160 
   1161    case ir_binop_all_equal:
   1162       /* "==" operator producing a scalar boolean. */
   1163       if (ir->operands[0]->type->is_vector() ||
   1164 	  ir->operands[1]->type->is_vector()) {
   1165 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
   1166 	 emit(MOV(result_dst, src_reg(0)));
   1167 	 inst = emit(MOV(result_dst, src_reg(1)));
   1168 	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
   1169       } else {
   1170 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
   1171 	 emit(AND(result_dst, result_src, src_reg(0x1)));
   1172       }
   1173       break;
   1174    case ir_binop_any_nequal:
   1175       /* "!=" operator producing a scalar boolean. */
   1176       if (ir->operands[0]->type->is_vector() ||
   1177 	  ir->operands[1]->type->is_vector()) {
   1178 	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
   1179 
   1180 	 emit(MOV(result_dst, src_reg(0)));
   1181 	 inst = emit(MOV(result_dst, src_reg(1)));
   1182 	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
   1183       } else {
   1184 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
   1185 	 emit(AND(result_dst, result_src, src_reg(0x1)));
   1186       }
   1187       break;
   1188 
   1189    case ir_unop_any:
   1190       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
   1191       emit(MOV(result_dst, src_reg(0)));
   1192 
   1193       inst = emit(MOV(result_dst, src_reg(1)));
   1194       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
   1195       break;
   1196 
   1197    case ir_binop_logic_xor:
   1198       emit(XOR(result_dst, op[0], op[1]));
   1199       break;
   1200 
   1201    case ir_binop_logic_or:
   1202       emit(OR(result_dst, op[0], op[1]));
   1203       break;
   1204 
   1205    case ir_binop_logic_and:
   1206       emit(AND(result_dst, op[0], op[1]));
   1207       break;
   1208 
   1209    case ir_binop_dot:
   1210       assert(ir->operands[0]->type->is_vector());
   1211       assert(ir->operands[0]->type == ir->operands[1]->type);
   1212       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
   1213       break;
   1214 
   1215    case ir_unop_sqrt:
   1216       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
   1217       break;
   1218    case ir_unop_rsq:
   1219       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
   1220       break;
   1221 
   1222    case ir_unop_bitcast_i2f:
   1223    case ir_unop_bitcast_u2f:
   1224       this->result = op[0];
   1225       this->result.type = BRW_REGISTER_TYPE_F;
   1226       break;
   1227 
   1228    case ir_unop_bitcast_f2i:
   1229       this->result = op[0];
   1230       this->result.type = BRW_REGISTER_TYPE_D;
   1231       break;
   1232 
   1233    case ir_unop_bitcast_f2u:
   1234       this->result = op[0];
   1235       this->result.type = BRW_REGISTER_TYPE_UD;
   1236       break;
   1237 
   1238    case ir_unop_i2f:
   1239    case ir_unop_i2u:
   1240    case ir_unop_u2i:
   1241    case ir_unop_u2f:
   1242    case ir_unop_b2f:
   1243    case ir_unop_b2i:
   1244    case ir_unop_f2i:
   1245    case ir_unop_f2u:
   1246       emit(MOV(result_dst, op[0]));
   1247       break;
   1248    case ir_unop_f2b:
   1249    case ir_unop_i2b: {
   1250       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
   1251       emit(AND(result_dst, result_src, src_reg(1)));
   1252       break;
   1253    }
   1254 
   1255    case ir_unop_trunc:
   1256       emit(RNDZ(result_dst, op[0]));
   1257       break;
   1258    case ir_unop_ceil:
   1259       op[0].negate = !op[0].negate;
   1260       inst = emit(RNDD(result_dst, op[0]));
   1261       this->result.negate = true;
   1262       break;
   1263    case ir_unop_floor:
   1264       inst = emit(RNDD(result_dst, op[0]));
   1265       break;
   1266    case ir_unop_fract:
   1267       inst = emit(FRC(result_dst, op[0]));
   1268       break;
   1269    case ir_unop_round_even:
   1270       emit(RNDE(result_dst, op[0]));
   1271       break;
   1272 
   1273    case ir_binop_min:
   1274       if (intel->gen >= 6) {
   1275 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
   1276 	 inst->conditional_mod = BRW_CONDITIONAL_L;
   1277       } else {
   1278 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
   1279 
   1280 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
   1281 	 inst->predicate = BRW_PREDICATE_NORMAL;
   1282       }
   1283       break;
   1284    case ir_binop_max:
   1285       if (intel->gen >= 6) {
   1286 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
   1287 	 inst->conditional_mod = BRW_CONDITIONAL_G;
   1288       } else {
   1289 	 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
   1290 
   1291 	 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
   1292 	 inst->predicate = BRW_PREDICATE_NORMAL;
   1293       }
   1294       break;
   1295 
   1296    case ir_binop_pow:
   1297       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
   1298       break;
   1299 
   1300    case ir_unop_bit_not:
   1301       inst = emit(NOT(result_dst, op[0]));
   1302       break;
   1303    case ir_binop_bit_and:
   1304       inst = emit(AND(result_dst, op[0], op[1]));
   1305       break;
   1306    case ir_binop_bit_xor:
   1307       inst = emit(XOR(result_dst, op[0], op[1]));
   1308       break;
   1309    case ir_binop_bit_or:
   1310       inst = emit(OR(result_dst, op[0], op[1]));
   1311       break;
   1312 
   1313    case ir_binop_lshift:
   1314       inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
   1315       break;
   1316 
   1317    case ir_binop_rshift:
   1318       if (ir->type->base_type == GLSL_TYPE_INT)
   1319 	 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
   1320       else
   1321 	 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
   1322       break;
   1323 
   1324    case ir_binop_ubo_load: {
   1325       ir_constant *uniform_block = ir->operands[0]->as_constant();
   1326       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
   1327       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
   1328       src_reg offset = op[1];
   1329 
   1330       /* Now, load the vector from that offset. */
   1331       assert(ir->type->is_vector() || ir->type->is_scalar());
   1332 
   1333       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
   1334       packed_consts.type = result.type;
   1335       src_reg surf_index =
   1336          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
   1337       if (const_offset_ir) {
   1338          offset = src_reg(const_offset / 16);
   1339       } else {
   1340          emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
   1341       }
   1342 
   1343       vec4_instruction *pull =
   1344          emit(new(mem_ctx) vec4_instruction(this,
   1345                                             VS_OPCODE_PULL_CONSTANT_LOAD,
   1346                                             dst_reg(packed_consts),
   1347                                             surf_index,
   1348                                             offset));
   1349       pull->base_mrf = 14;
   1350       pull->mlen = 1;
   1351 
   1352       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
   1353       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
   1354                                             const_offset % 16 / 4,
   1355                                             const_offset % 16 / 4,
   1356                                             const_offset % 16 / 4);
   1357 
   1358       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
   1359       if (ir->type->base_type == GLSL_TYPE_BOOL) {
   1360          emit(CMP(result_dst, packed_consts, src_reg(0u),
   1361                   BRW_CONDITIONAL_NZ));
   1362          emit(AND(result_dst, result, src_reg(0x1)));
   1363       } else {
   1364          emit(MOV(result_dst, packed_consts));
   1365       }
   1366       break;
   1367    }
   1368 
   1369    case ir_quadop_vector:
   1370       assert(!"not reached: should be handled by lower_quadop_vector");
   1371       break;
   1372    }
   1373 }
   1374 
   1375 
   1376 void
   1377 vec4_visitor::visit(ir_swizzle *ir)
   1378 {
   1379    src_reg src;
   1380    int i = 0;
   1381    int swizzle[4];
   1382 
   1383    /* Note that this is only swizzles in expressions, not those on the left
   1384     * hand side of an assignment, which do write masking.  See ir_assignment
   1385     * for that.
   1386     */
   1387 
   1388    ir->val->accept(this);
   1389    src = this->result;
   1390    assert(src.file != BAD_FILE);
   1391 
   1392    for (i = 0; i < ir->type->vector_elements; i++) {
   1393       switch (i) {
   1394       case 0:
   1395 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
   1396 	 break;
   1397       case 1:
   1398 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
   1399 	 break;
   1400       case 2:
   1401 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
   1402 	 break;
   1403       case 3:
   1404 	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
   1405 	 break;
   1406       }
   1407    }
   1408    for (; i < 4; i++) {
   1409       /* Replicate the last channel out. */
   1410       swizzle[i] = swizzle[ir->type->vector_elements - 1];
   1411    }
   1412 
   1413    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
   1414 
   1415    this->result = src;
   1416 }
   1417 
   1418 void
   1419 vec4_visitor::visit(ir_dereference_variable *ir)
   1420 {
   1421    const struct glsl_type *type = ir->type;
   1422    dst_reg *reg = variable_storage(ir->var);
   1423 
   1424    if (!reg) {
   1425       fail("Failed to find variable storage for %s\n", ir->var->name);
   1426       this->result = src_reg(brw_null_reg());
   1427       return;
   1428    }
   1429 
   1430    this->result = src_reg(*reg);
   1431 
   1432    /* System values get their swizzle from the dst_reg writemask */
   1433    if (ir->var->mode == ir_var_system_value)
   1434       return;
   1435 
   1436    if (type->is_scalar() || type->is_vector() || type->is_matrix())
   1437       this->result.swizzle = swizzle_for_size(type->vector_elements);
   1438 }
   1439 
   1440 void
   1441 vec4_visitor::visit(ir_dereference_array *ir)
   1442 {
   1443    ir_constant *constant_index;
   1444    src_reg src;
   1445    int element_size = type_size(ir->type);
   1446 
   1447    constant_index = ir->array_index->constant_expression_value();
   1448 
   1449    ir->array->accept(this);
   1450    src = this->result;
   1451 
   1452    if (constant_index) {
   1453       src.reg_offset += constant_index->value.i[0] * element_size;
   1454    } else {
   1455       /* Variable index array dereference.  It eats the "vec4" of the
   1456        * base of the array and an index that offsets the Mesa register
   1457        * index.
   1458        */
   1459       ir->array_index->accept(this);
   1460 
   1461       src_reg index_reg;
   1462 
   1463       if (element_size == 1) {
   1464 	 index_reg = this->result;
   1465       } else {
   1466 	 index_reg = src_reg(this, glsl_type::int_type);
   1467 
   1468 	 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
   1469       }
   1470 
   1471       if (src.reladdr) {
   1472 	 src_reg temp = src_reg(this, glsl_type::int_type);
   1473 
   1474 	 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
   1475 
   1476 	 index_reg = temp;
   1477       }
   1478 
   1479       src.reladdr = ralloc(mem_ctx, src_reg);
   1480       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   1481    }
   1482 
   1483    /* If the type is smaller than a vec4, replicate the last channel out. */
   1484    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
   1485       src.swizzle = swizzle_for_size(ir->type->vector_elements);
   1486    else
   1487       src.swizzle = BRW_SWIZZLE_NOOP;
   1488    src.type = brw_type_for_base_type(ir->type);
   1489 
   1490    this->result = src;
   1491 }
   1492 
   1493 void
   1494 vec4_visitor::visit(ir_dereference_record *ir)
   1495 {
   1496    unsigned int i;
   1497    const glsl_type *struct_type = ir->record->type;
   1498    int offset = 0;
   1499 
   1500    ir->record->accept(this);
   1501 
   1502    for (i = 0; i < struct_type->length; i++) {
   1503       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
   1504 	 break;
   1505       offset += type_size(struct_type->fields.structure[i].type);
   1506    }
   1507 
   1508    /* If the type is smaller than a vec4, replicate the last channel out. */
   1509    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
   1510       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
   1511    else
   1512       this->result.swizzle = BRW_SWIZZLE_NOOP;
   1513    this->result.type = brw_type_for_base_type(ir->type);
   1514 
   1515    this->result.reg_offset += offset;
   1516 }
   1517 
   1518 /**
   1519  * We want to be careful in assignment setup to hit the actual storage
   1520  * instead of potentially using a temporary like we might with the
   1521  * ir_dereference handler.
   1522  */
   1523 static dst_reg
   1524 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
   1525 {
   1526    /* The LHS must be a dereference.  If the LHS is a variable indexed array
   1527     * access of a vector, it must be separated into a series conditional moves
   1528     * access of a vector, it must be separated into a series of conditional moves
   1529     */
   1530    assert(ir->as_dereference());
   1531    ir_dereference_array *deref_array = ir->as_dereference_array();
   1532    if (deref_array) {
   1533       assert(!deref_array->array->type->is_vector());
   1534    }
   1535 
   1536    /* Use the rvalue deref handler for the most part.  We'll ignore
   1537     * swizzles in it and write swizzles using writemask, though.
   1538     */
   1539    ir->accept(v);
   1540    return dst_reg(v->result);
   1541 }
   1542 
   1543 void
   1544 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
   1545 			      const struct glsl_type *type, uint32_t predicate)
   1546 {
   1547    if (type->base_type == GLSL_TYPE_STRUCT) {
   1548       for (unsigned int i = 0; i < type->length; i++) {
   1549 	 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
   1550       }
   1551       return;
   1552    }
   1553 
   1554    if (type->is_array()) {
   1555       for (unsigned int i = 0; i < type->length; i++) {
   1556 	 emit_block_move(dst, src, type->fields.array, predicate);
   1557       }
   1558       return;
   1559    }
   1560 
   1561    if (type->is_matrix()) {
   1562       const struct glsl_type *vec_type;
   1563 
   1564       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
   1565 					 type->vector_elements, 1);
   1566 
   1567       for (int i = 0; i < type->matrix_columns; i++) {
   1568 	 emit_block_move(dst, src, vec_type, predicate);
   1569       }
   1570       return;
   1571    }
   1572 
   1573    assert(type->is_scalar() || type->is_vector());
   1574 
   1575    dst->type = brw_type_for_base_type(type);
   1576    src->type = dst->type;
   1577 
   1578    dst->writemask = (1 << type->vector_elements) - 1;
   1579 
   1580    src->swizzle = swizzle_for_size(type->vector_elements);
   1581 
   1582    vec4_instruction *inst = emit(MOV(*dst, *src));
   1583    inst->predicate = predicate;
   1584 
   1585    dst->reg_offset++;
   1586    src->reg_offset++;
   1587 }
   1588 
   1589 
   1590 /* If the RHS processing resulted in an instruction generating a
   1591  * temporary value, and it would be easy to rewrite the instruction to
   1592  * generate its result right into the LHS instead, do so.  This ends
   1593  * up reliably removing instructions where it can be tricky to do so
   1594  * later without real UD chain information.
   1595  */
   1596 bool
   1597 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
   1598 				     dst_reg dst,
   1599 				     src_reg src,
   1600 				     vec4_instruction *pre_rhs_inst,
   1601 				     vec4_instruction *last_rhs_inst)
   1602 {
   1603    /* This could be supported, but it would take more smarts. */
   1604    if (ir->condition)
   1605       return false;
   1606 
   1607    if (pre_rhs_inst == last_rhs_inst)
   1608       return false; /* No instructions generated to work with. */
   1609 
   1610    /* Make sure the last instruction generated our source reg. */
   1611    if (src.file != GRF ||
   1612        src.file != last_rhs_inst->dst.file ||
   1613        src.reg != last_rhs_inst->dst.reg ||
   1614        src.reg_offset != last_rhs_inst->dst.reg_offset ||
   1615        src.reladdr ||
   1616        src.abs ||
   1617        src.negate ||
   1618        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
   1619       return false;
   1620 
   1621    /* Check that the last instruction fully initialized the channels
   1622     * we want to use, in the order we want to use them.  We could
   1623     * potentially reswizzle the operands of many instructions so that
   1624     * we could handle out of order channels, but don't yet.
   1625     */
   1626 
   1627    for (unsigned i = 0; i < 4; i++) {
   1628       if (dst.writemask & (1 << i)) {
   1629 	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
   1630 	    return false;
   1631 
   1632 	 if (BRW_GET_SWZ(src.swizzle, i) != i)
   1633 	    return false;
   1634       }
   1635    }
   1636 
   1637    /* Success!  Rewrite the instruction. */
   1638    last_rhs_inst->dst.file = dst.file;
   1639    last_rhs_inst->dst.reg = dst.reg;
   1640    last_rhs_inst->dst.reg_offset = dst.reg_offset;
   1641    last_rhs_inst->dst.reladdr = dst.reladdr;
   1642    last_rhs_inst->dst.writemask &= dst.writemask;
   1643 
   1644    return true;
   1645 }
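
        /* Illustration (not part of the original source): for "v.xyz = a + b;"
         * the RHS visit typically ends with something like "ADD tmp, a, b".
         * Rather than following it with "MOV v.xyz, tmp", this pass points the
         * ADD's destination at v and masks it down to .xyz, so the temporary
         * and the extra MOV are never emitted.
         */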
   1646 
   1647 void
   1648 vec4_visitor::visit(ir_assignment *ir)
   1649 {
   1650    dst_reg dst = get_assignment_lhs(ir->lhs, this);
   1651    uint32_t predicate = BRW_PREDICATE_NONE;
   1652 
   1653    if (!ir->lhs->type->is_scalar() &&
   1654        !ir->lhs->type->is_vector()) {
   1655       ir->rhs->accept(this);
   1656       src_reg src = this->result;
   1657 
   1658       if (ir->condition) {
   1659 	 emit_bool_to_cond_code(ir->condition, &predicate);
   1660       }
   1661 
   1662       /* emit_block_move doesn't account for swizzles in the source register.
   1663        * This should be ok, since the source register is a structure or an
   1664        * array, and those can't be swizzled.  But double-check to be sure.
   1665        */
   1666       assert(src.swizzle ==
   1667              (ir->rhs->type->is_matrix()
   1668               ? swizzle_for_size(ir->rhs->type->vector_elements)
   1669               : BRW_SWIZZLE_NOOP));
   1670 
   1671       emit_block_move(&dst, &src, ir->rhs->type, predicate);
   1672       return;
   1673    }
   1674 
   1675    /* Now we're down to just a scalar/vector with writemasks. */
   1676    int i;
   1677 
   1678    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   1679    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
   1680 
   1681    ir->rhs->accept(this);
   1682 
   1683    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
   1684 
   1685    src_reg src = this->result;
   1686 
   1687    int swizzles[4];
   1688    int first_enabled_chan = 0;
   1689    int src_chan = 0;
   1690 
   1691    assert(ir->lhs->type->is_vector() ||
   1692 	  ir->lhs->type->is_scalar());
   1693    dst.writemask = ir->write_mask;
   1694 
   1695    for (int i = 0; i < 4; i++) {
   1696       if (dst.writemask & (1 << i)) {
   1697 	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
   1698 	 break;
   1699       }
   1700    }
   1701 
   1702    /* Swizzle a small RHS vector into the channels being written.
   1703     *
   1704    * glsl ir treats write_mask as dictating how many channels are
   1705    * present on the RHS, while in our instructions we need to make
   1706    * those channels appear in the slots of the vec4 they're written to.
   1707     */
   1708    for (int i = 0; i < 4; i++) {
   1709       if (dst.writemask & (1 << i))
   1710 	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
   1711       else
   1712 	 swizzles[i] = first_enabled_chan;
   1713    }
   1714    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
   1715 			      swizzles[2], swizzles[3]);
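
           /* Worked example (illustrative, not part of the original source):
            * writing a two-component RHS into .yz.  swizzle_for_size(2) gives
            * (x, y, y, y), and first_enabled_chan picks the component feeding
            * the first written channel, so the loop above builds
            * swizzles = { y, x, y, y }: src.x lands in dst.y, src.y lands in
            * dst.z, and the unwritten channels just repeat a valid component.
            */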
   1716 
   1717    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
   1718       return;
   1719    }
   1720 
   1721    if (ir->condition) {
   1722       emit_bool_to_cond_code(ir->condition, &predicate);
   1723    }
   1724 
   1725    for (i = 0; i < type_size(ir->lhs->type); i++) {
   1726       vec4_instruction *inst = emit(MOV(dst, src));
   1727       inst->predicate = predicate;
   1728 
   1729       dst.reg_offset++;
   1730       src.reg_offset++;
   1731    }
   1732 }
   1733 
   1734 void
   1735 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
   1736 {
   1737    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
   1738       foreach_list(node, &ir->components) {
   1739 	 ir_constant *field_value = (ir_constant *)node;
   1740 
   1741 	 emit_constant_values(dst, field_value);
   1742       }
   1743       return;
   1744    }
   1745 
   1746    if (ir->type->is_array()) {
   1747       for (unsigned int i = 0; i < ir->type->length; i++) {
   1748 	 emit_constant_values(dst, ir->array_elements[i]);
   1749       }
   1750       return;
   1751    }
   1752 
   1753    if (ir->type->is_matrix()) {
   1754       for (int i = 0; i < ir->type->matrix_columns; i++) {
   1755 	 float *vec = &ir->value.f[i * ir->type->vector_elements];
   1756 
   1757 	 for (int j = 0; j < ir->type->vector_elements; j++) {
   1758 	    dst->writemask = 1 << j;
   1759 	    dst->type = BRW_REGISTER_TYPE_F;
   1760 
   1761 	    emit(MOV(*dst, src_reg(vec[j])));
   1762 	 }
   1763 	 dst->reg_offset++;
   1764       }
   1765       return;
   1766    }
   1767 
   1768    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
   1769 
   1770    for (int i = 0; i < ir->type->vector_elements; i++) {
   1771       if (!(remaining_writemask & (1 << i)))
   1772 	 continue;
   1773 
   1774       dst->writemask = 1 << i;
   1775       dst->type = brw_type_for_base_type(ir->type);
   1776 
   1777       /* Find other components that match the one we're about to
   1778        * write.  Emits fewer instructions for things like vec4(0.5,
   1779        * 1.5, 1.5, 1.5).
   1780        */
   1781       for (int j = i + 1; j < ir->type->vector_elements; j++) {
   1782 	 if (ir->type->base_type == GLSL_TYPE_BOOL) {
   1783 	    if (ir->value.b[i] == ir->value.b[j])
   1784 	       dst->writemask |= (1 << j);
   1785 	 } else {
   1786 	    /* u, i, and f storage all line up, so no need for a
   1787 	     * switch case for comparing each type.
   1788 	     */
   1789 	    if (ir->value.u[i] == ir->value.u[j])
   1790 	       dst->writemask |= (1 << j);
   1791 	 }
   1792       }
   1793 
   1794       switch (ir->type->base_type) {
   1795       case GLSL_TYPE_FLOAT:
   1796 	 emit(MOV(*dst, src_reg(ir->value.f[i])));
   1797 	 break;
   1798       case GLSL_TYPE_INT:
   1799 	 emit(MOV(*dst, src_reg(ir->value.i[i])));
   1800 	 break;
   1801       case GLSL_TYPE_UINT:
   1802 	 emit(MOV(*dst, src_reg(ir->value.u[i])));
   1803 	 break;
   1804       case GLSL_TYPE_BOOL:
   1805 	 emit(MOV(*dst, src_reg(ir->value.b[i])));
   1806 	 break;
   1807       default:
   1808 	 assert(!"Non-float/uint/int/bool constant");
   1809 	 break;
   1810       }
   1811 
   1812       remaining_writemask &= ~dst->writemask;
   1813    }
   1814    dst->reg_offset++;
   1815 }
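
        /* Worked example (illustrative, not part of the original source): for
         * vec4(0.5, 1.5, 1.5, 1.5) the duplicate scan above lets this emit
         * only two instructions, roughly:
         *
         *    MOV dst.x,   0.5F
         *    MOV dst.yzw, 1.5F
         *
         * instead of four single-channel moves.
         */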
   1816 
   1817 void
   1818 vec4_visitor::visit(ir_constant *ir)
   1819 {
   1820    dst_reg dst = dst_reg(this, ir->type);
   1821    this->result = src_reg(dst);
   1822 
   1823    emit_constant_values(&dst, ir);
   1824 }
   1825 
   1826 void
   1827 vec4_visitor::visit(ir_call *ir)
   1828 {
   1829    assert(!"not reached");
   1830 }
   1831 
   1832 void
   1833 vec4_visitor::visit(ir_texture *ir)
   1834 {
   1835    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
   1836 
   1837    /* Should be lowered by do_lower_texture_projection */
   1838    assert(!ir->projector);
   1839 
   1840    /* Generate code to compute all the subexpression trees.  This has to be
   1841     * done before loading any values into MRFs for the sampler message since
   1842     * generating these values may involve SEND messages that need the MRFs.
   1843     */
   1844    src_reg coordinate;
   1845    if (ir->coordinate) {
   1846       ir->coordinate->accept(this);
   1847       coordinate = this->result;
   1848    }
   1849 
   1850    src_reg shadow_comparitor;
   1851    if (ir->shadow_comparitor) {
   1852       ir->shadow_comparitor->accept(this);
   1853       shadow_comparitor = this->result;
   1854    }
   1855 
   1856    const glsl_type *lod_type;
   1857    src_reg lod, dPdx, dPdy;
   1858    switch (ir->op) {
   1859    case ir_tex:
   1860       lod = src_reg(0.0f);
   1861       lod_type = glsl_type::float_type;
   1862       break;
   1863    case ir_txf:
   1864    case ir_txl:
   1865    case ir_txs:
   1866       ir->lod_info.lod->accept(this);
   1867       lod = this->result;
   1868       lod_type = ir->lod_info.lod->type;
   1869       break;
   1870    case ir_txd:
   1871       ir->lod_info.grad.dPdx->accept(this);
   1872       dPdx = this->result;
   1873 
   1874       ir->lod_info.grad.dPdy->accept(this);
   1875       dPdy = this->result;
   1876 
   1877       lod_type = ir->lod_info.grad.dPdx->type;
   1878       break;
   1879    case ir_txb:
   1880       break;
   1881    }
   1882 
   1883    vec4_instruction *inst = NULL;
   1884    switch (ir->op) {
   1885    case ir_tex:
   1886    case ir_txl:
   1887       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
   1888       break;
   1889    case ir_txd:
   1890       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
   1891       break;
   1892    case ir_txf:
   1893       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
   1894       break;
   1895    case ir_txs:
   1896       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
   1897       break;
   1898    case ir_txb:
   1899       assert(!"TXB is not valid for vertex shaders.");
   1900    }
   1901 
   1902    /* Texel offsets go in the message header; Gen4 also requires headers. */
   1903    inst->header_present = ir->offset || intel->gen < 5;
   1904    inst->base_mrf = 2;
   1905    inst->mlen = inst->header_present + 1; /* always at least one */
   1906    inst->sampler = sampler;
   1907    inst->dst = dst_reg(this, ir->type);
   1908    inst->dst.writemask = WRITEMASK_XYZW;
   1909    inst->shadow_compare = ir->shadow_comparitor != NULL;
   1910 
   1911    if (ir->offset != NULL && ir->op != ir_txf)
   1912       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
   1913 
   1914    /* MRF for the first parameter */
   1915    int param_base = inst->base_mrf + inst->header_present;
   1916 
   1917    if (ir->op == ir_txs) {
   1918       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
   1919       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   1920    } else {
   1921       int i, coord_mask = 0, zero_mask = 0;
   1922       /* Load the coordinate */
   1923       /* FINISHME: gl_clamp_mask and saturate */
   1924       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
   1925 	 coord_mask |= (1 << i);
   1926       for (; i < 4; i++)
   1927 	 zero_mask |= (1 << i);
   1928 
   1929       if (ir->offset && ir->op == ir_txf) {
   1930 	 /* It appears that the ld instruction used for txf does its
   1931 	  * address bounds check before adding in the offset.  To work
   1932 	  * around this, just add the integer offset to the integer
   1933 	  * texel coordinate, and don't put the offset in the header.
   1934 	  */
   1935 	 ir_constant *offset = ir->offset->as_constant();
   1936 	 assert(offset);
   1937 
   1938 	 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
   1939 	    src_reg src = coordinate;
   1940 	    src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
   1941 				       BRW_GET_SWZ(src.swizzle, j),
   1942 				       BRW_GET_SWZ(src.swizzle, j),
   1943 				       BRW_GET_SWZ(src.swizzle, j));
   1944 	    emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
   1945 		     src, offset->value.i[j]));
   1946 	 }
   1947       } else {
   1948 	 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
   1949 		  coordinate));
   1950       }
   1951       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
   1952 	       src_reg(0)));
   1953       /* Load the shadow comparitor */
   1954       if (ir->shadow_comparitor) {
   1955 	 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
   1956 			  WRITEMASK_X),
   1957 		  shadow_comparitor));
   1958 	 inst->mlen++;
   1959       }
   1960 
   1961       /* Load the LOD info */
   1962       if (ir->op == ir_tex || ir->op == ir_txl) {
   1963 	 int mrf, writemask;
   1964 	 if (intel->gen >= 5) {
   1965 	    mrf = param_base + 1;
   1966 	    if (ir->shadow_comparitor) {
   1967 	       writemask = WRITEMASK_Y;
   1968 	       /* mlen already incremented */
   1969 	    } else {
   1970 	       writemask = WRITEMASK_X;
   1971 	       inst->mlen++;
   1972 	    }
   1973 	 } else /* intel->gen == 4 */ {
   1974 	    mrf = param_base;
   1975 	    writemask = WRITEMASK_Z;
   1976 	 }
   1977 	 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
   1978       } else if (ir->op == ir_txf) {
   1979 	 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
   1980 		  lod));
   1981       } else if (ir->op == ir_txd) {
   1982 	 const glsl_type *type = lod_type;
   1983 
   1984 	 if (intel->gen >= 5) {
   1985 	    dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
   1986 	    dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
   1987 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
   1988 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
   1989 	    inst->mlen++;
   1990 
   1991 	    if (ir->type->vector_elements == 3) {
   1992 	       dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
   1993 	       dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
   1994 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
   1995 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
   1996 	       inst->mlen++;
   1997 	    }
   1998 	 } else /* intel->gen == 4 */ {
   1999 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
   2000 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
   2001 	    inst->mlen += 2;
   2002 	 }
   2003       }
   2004    }
   2005 
   2006    emit(inst);
   2007 
   2008    swizzle_result(ir, src_reg(inst->dst), sampler);
   2009 }
   2010 
   2011 void
   2012 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
   2013 {
   2014    int s = c->key.tex.swizzles[sampler];
   2015 
   2016    this->result = src_reg(this, ir->type);
   2017    dst_reg swizzled_result(this->result);
   2018 
   2019    if (ir->op == ir_txs || ir->type == glsl_type::float_type
   2020 			|| s == SWIZZLE_NOOP) {
   2021       emit(MOV(swizzled_result, orig_val));
   2022       return;
   2023    }
   2024 
   2025    int zero_mask = 0, one_mask = 0, copy_mask = 0;
   2026    int swizzle[4];
   2027 
   2028    for (int i = 0; i < 4; i++) {
   2029       switch (GET_SWZ(s, i)) {
   2030       case SWIZZLE_ZERO:
   2031 	 zero_mask |= (1 << i);
   2032 	 break;
   2033       case SWIZZLE_ONE:
   2034 	 one_mask |= (1 << i);
   2035 	 break;
   2036       default:
   2037 	 copy_mask |= (1 << i);
   2038 	 swizzle[i] = GET_SWZ(s, i);
   2039 	 break;
   2040       }
   2041    }
   2042 
   2043    if (copy_mask) {
   2044       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
   2045       swizzled_result.writemask = copy_mask;
   2046       emit(MOV(swizzled_result, orig_val));
   2047    }
   2048 
   2049    if (zero_mask) {
   2050       swizzled_result.writemask = zero_mask;
   2051       emit(MOV(swizzled_result, src_reg(0.0f)));
   2052    }
   2053 
   2054    if (one_mask) {
   2055       swizzled_result.writemask = one_mask;
   2056       emit(MOV(swizzled_result, src_reg(1.0f)));
   2057    }
   2058 }
   2059 
   2060 void
   2061 vec4_visitor::visit(ir_return *ir)
   2062 {
   2063    assert(!"not reached");
   2064 }
   2065 
   2066 void
   2067 vec4_visitor::visit(ir_discard *ir)
   2068 {
   2069    assert(!"not reached");
   2070 }
   2071 
   2072 void
   2073 vec4_visitor::visit(ir_if *ir)
   2074 {
   2075    /* Don't point the annotation at the if statement, because then it plus
   2076     * the then and else blocks get printed.
   2077     */
   2078    this->base_ir = ir->condition;
   2079 
   2080    if (intel->gen == 6) {
   2081       emit_if_gen6(ir);
   2082    } else {
   2083       uint32_t predicate;
   2084       emit_bool_to_cond_code(ir->condition, &predicate);
   2085       emit(IF(predicate));
   2086    }
   2087 
   2088    visit_instructions(&ir->then_instructions);
   2089 
   2090    if (!ir->else_instructions.is_empty()) {
   2091       this->base_ir = ir->condition;
   2092       emit(BRW_OPCODE_ELSE);
   2093 
   2094       visit_instructions(&ir->else_instructions);
   2095    }
   2096 
   2097    this->base_ir = ir->condition;
   2098    emit(BRW_OPCODE_ENDIF);
   2099 }
   2100 
   2101 void
   2102 vec4_visitor::emit_ndc_computation()
   2103 {
   2104    /* Get the position */
   2105    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
   2106 
   2107    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   2108    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   2109    output_reg[BRW_VERT_RESULT_NDC] = ndc;
   2110 
   2111    current_annotation = "NDC";
   2112    dst_reg ndc_w = ndc;
   2113    ndc_w.writemask = WRITEMASK_W;
   2114    src_reg pos_w = pos;
   2115    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   2116    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
   2117 
   2118    dst_reg ndc_xyz = ndc;
   2119    ndc_xyz.writemask = WRITEMASK_XYZ;
   2120 
   2121    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
   2122 }
   2123 
   2124 void
   2125 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
   2126 {
   2127    if (intel->gen < 6 &&
   2128        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
   2129         c->key.userclip_active || brw->has_negative_rhw_bug)) {
   2130       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
   2131       dst_reg header1_w = header1;
   2132       header1_w.writemask = WRITEMASK_W;
   2133       GLuint i;
   2134 
   2135       emit(MOV(header1, 0u));
   2136 
   2137       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
   2138 	 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
   2139 
   2140 	 current_annotation = "Point size";
   2141 	 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
   2142 	 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
   2143       }
   2144 
   2145       current_annotation = "Clipping flags";
   2146       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
   2147 	 vec4_instruction *inst;
   2148 
   2149 	 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
   2150                          src_reg(this->userplane[i])));
   2151 	 inst->conditional_mod = BRW_CONDITIONAL_L;
   2152 
   2153 	 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
   2154 	 inst->predicate = BRW_PREDICATE_NORMAL;
   2155       }
   2156 
   2157       /* i965 clipping workaround:
   2158        * 1) Test for -ve rhw
   2159        * 2) If set,
   2160        *      set ndc = (0,0,0,0)
   2161        *      set ucp[6] = 1
   2162        *
   2163        * Later, clipping will detect ucp[6] and ensure the primitive is
   2164        * clipped against all fixed planes.
   2165        */
   2166       if (brw->has_negative_rhw_bug) {
   2167 #if 0
   2168 	 /* FINISHME */
   2169 	 brw_CMP(p,
   2170 		 vec8(brw_null_reg()),
   2171 		 BRW_CONDITIONAL_L,
   2172 		 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
   2173 		 brw_imm_f(0));
   2174 
   2175 	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
   2176 	 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
   2177 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   2178 #endif
   2179       }
   2180 
   2181       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   2182    } else if (intel->gen < 6) {
   2183       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   2184    } else {
   2185       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
   2186       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
   2187          emit(MOV(brw_writemask(reg, WRITEMASK_W),
   2188                   src_reg(output_reg[VERT_RESULT_PSIZ])));
   2189       }
   2190    }
   2191 }
   2192 
   2193 void
   2194 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
   2195 {
   2196    if (intel->gen < 6) {
   2197       /* Clip distance slots are set aside in gen5, but they are not used.  It
   2198        * is not clear whether we actually need to set aside space for them,
   2199        * but the performance cost is negligible.
   2200        */
   2201       return;
   2202    }
   2203 
   2204    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
   2205     *
   2206     *     "If a linked set of shaders forming the vertex stage contains no
   2207     *     static write to gl_ClipVertex or gl_ClipDistance, but the
   2208     *     application has requested clipping against user clip planes through
   2209     *     the API, then the coordinate written to gl_Position is used for
   2210     *     comparison against the user clip planes."
   2211     *
   2212     * This function is only called if the shader didn't write to
   2213     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
   2214     * if the user wrote to it; otherwise we use gl_Position.
   2215     */
   2216    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
   2217    if (!(c->prog_data.outputs_written
   2218          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
   2219       clip_vertex = VERT_RESULT_HPOS;
   2220    }
   2221 
   2222    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
   2223         ++i) {
   2224       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
   2225                src_reg(output_reg[clip_vertex]),
   2226                src_reg(this->userplane[i + offset])));
   2227    }
   2228 }
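
        /* Illustration (not part of the original source): gl_ClipDistance
         * occupies two VUE slots of four floats each.  A call with offset == 0
         * emits DP4s for user planes 0..3 into the .x..w channels of the
         * CLIP_DIST0 slot, and a second call with offset == 4 covers planes
         * 4..7 in CLIP_DIST1, stopping early when fewer planes are enabled.
         */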
   2229 
   2230 void
   2231 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
   2232 {
   2233    assert (vert_result < VERT_RESULT_MAX);
   2234    reg.type = output_reg[vert_result].type;
   2235    current_annotation = output_reg_annotation[vert_result];
   2236    /* Copy the register, saturating if necessary */
   2237    vec4_instruction *inst = emit(MOV(reg,
   2238                                      src_reg(output_reg[vert_result])));
   2239    if ((vert_result == VERT_RESULT_COL0 ||
   2240         vert_result == VERT_RESULT_COL1 ||
   2241         vert_result == VERT_RESULT_BFC0 ||
   2242         vert_result == VERT_RESULT_BFC1) &&
   2243        c->key.clamp_vertex_color) {
   2244       inst->saturate = true;
   2245    }
   2246 }
   2247 
   2248 void
   2249 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
   2250 {
   2251    struct brw_reg hw_reg = brw_message_reg(mrf);
   2252    dst_reg reg = dst_reg(MRF, mrf);
   2253    reg.type = BRW_REGISTER_TYPE_F;
   2254 
   2255    switch (vert_result) {
   2256    case VERT_RESULT_PSIZ:
   2257       /* PSIZ is always in slot 0, and is coupled with other flags. */
   2258       current_annotation = "indices, point width, clip flags";
   2259       emit_psiz_and_flags(hw_reg);
   2260       break;
   2261    case BRW_VERT_RESULT_NDC:
   2262       current_annotation = "NDC";
   2263       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
   2264       break;
   2265    case BRW_VERT_RESULT_HPOS_DUPLICATE:
   2266    case VERT_RESULT_HPOS:
   2267       current_annotation = "gl_Position";
   2268       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
   2269       break;
   2270    case VERT_RESULT_CLIP_DIST0:
   2271    case VERT_RESULT_CLIP_DIST1:
   2272       if (this->c->key.uses_clip_distance) {
   2273          emit_generic_urb_slot(reg, vert_result);
   2274       } else {
   2275          current_annotation = "user clip distances";
   2276          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
   2277       }
   2278       break;
   2279    case VERT_RESULT_EDGE:
   2280       /* This is present when doing unfilled polygons.  We're supposed to copy
   2281        * the edge flag from the user-provided vertex array
   2282        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
   2283        * of that attribute (starts as 1.0f).  This is then used in clipping to
   2284        * determine which edges should be drawn as wireframe.
   2285        */
   2286       current_annotation = "edge flag";
   2287       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
   2288                                     glsl_type::float_type, WRITEMASK_XYZW))));
   2289       break;
   2290    case BRW_VERT_RESULT_PAD:
   2291       /* No need to write to this slot */
   2292       break;
   2293    default:
   2294       emit_generic_urb_slot(reg, vert_result);
   2295       break;
   2296    }
   2297 }
   2298 
   2299 static int
   2300 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
   2301 {
   2302    struct intel_context *intel = &brw->intel;
   2303 
   2304    if (intel->gen >= 6) {
   2305       /* URB data written (does not include the message header reg) must
   2306        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
   2307        * section 5.4.3.2.2: URB_INTERLEAVED.
   2308        *
   2309        * URB entries are allocated on a multiple of 1024 bits, so an
   2310        * extra 128 bits written here to make the end align to 256 is
   2311        * no problem.
   2312        */
   2313       if ((mlen % 2) != 1)
   2314 	 mlen++;
   2315    }
   2316 
   2317    return mlen;
   2318 }
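
        /* Worked example (illustrative, not part of the original source): mlen
         * counts the header plus the data MRFs, so the data length is even
         * exactly when mlen is odd.  A message of header + 3 data regs
         * (mlen == 4) is bumped to mlen == 5, padding the write out to 4 data
         * registers -- a multiple of 256 bits, as gen6+ requires.
         */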
   2319 
   2320 /**
   2321  * Generates the VUE payload plus the 1 or 2 URB write instructions to
   2322  * complete the VS thread.
   2323  *
   2324  * The VUE layout is documented in Volume 2a.
   2325  */
   2326 void
   2327 vec4_visitor::emit_urb_writes()
   2328 {
   2329    /* MRF 0 is reserved for the debugger, so start with message header
   2330     * in MRF 1.
   2331     */
   2332    int base_mrf = 1;
   2333    int mrf = base_mrf;
   2334    /* In the process of generating our URB write message contents, we
   2335     * may need to unspill a register or load from an array.  Those
   2336     * reads would use MRFs 14-15.
   2337     */
   2338    int max_usable_mrf = 13;
   2339 
   2340    /* The following assertion verifies that max_usable_mrf causes an
   2341     * even-numbered amount of URB write data, which will meet gen6's
   2342     * requirements for length alignment.
   2343     */
   2344    assert ((max_usable_mrf - base_mrf) % 2 == 0);
   2345 
   2346    /* First mrf is the g0-based message header containing URB handles and such,
   2347     * which is implied in VS_OPCODE_URB_WRITE.
   2348     */
   2349    mrf++;
   2350 
   2351    if (intel->gen < 6) {
   2352       emit_ndc_computation();
   2353    }
   2354 
   2355    /* Set up the VUE data for the first URB write */
   2356    int slot;
   2357    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
   2358       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
   2359 
   2360       /* If this was max_usable_mrf, we can't fit anything more into this URB
   2361        * WRITE.
   2362        */
   2363       if (mrf > max_usable_mrf) {
   2364 	 slot++;
   2365 	 break;
   2366       }
   2367    }
   2368 
   2369    current_annotation = "URB write";
   2370    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
   2371    inst->base_mrf = base_mrf;
   2372    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
   2373    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
   2374 
   2375    /* Optional second URB write */
   2376    if (!inst->eot) {
   2377       mrf = base_mrf + 1;
   2378 
   2379       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
   2380 	 assert(mrf < max_usable_mrf);
   2381 
   2382          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
   2383       }
   2384 
   2385       current_annotation = "URB write";
   2386       inst = emit(VS_OPCODE_URB_WRITE);
   2387       inst->base_mrf = base_mrf;
   2388       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
   2389       inst->eot = true;
   2390       /* URB destination offset.  In the previous write, we got MRFs
   2391        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
   2392        * URB row increments, and each of our MRFs is half of one of
   2393        * those, since we're doing interleaved writes.
   2394        */
   2395       inst->offset = (max_usable_mrf - base_mrf) / 2;
   2396    }
   2397 }
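
        /* Illustration (not part of the original source): with base_mrf == 1
         * and max_usable_mrf == 13, the first message carries up to 12 data
         * MRFs (MRFs 2..13).  Interleaved writes pack two MRFs per URB row, so
         * that is 6 rows, which is exactly the (max_usable_mrf - base_mrf) / 2
         * offset applied to the second message above.
         */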
   2398 
   2399 src_reg
   2400 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
   2401 				 src_reg *reladdr, int reg_offset)
   2402 {
   2403    /* Because we store the values to scratch interleaved like our
   2404     * vertex data, we need to scale the vec4 index by 2.
   2405     */
   2406    int message_header_scale = 2;
   2407 
   2408    /* Pre-gen6, the message header uses byte offsets instead of vec4
   2409     * (16-byte) offset units.
   2410     */
   2411    if (intel->gen < 6)
   2412       message_header_scale *= 16;
   2413 
   2414    if (reladdr) {
   2415       src_reg index = src_reg(this, glsl_type::int_type);
   2416 
   2417       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
   2418       emit_before(inst, MUL(dst_reg(index),
   2419 			    index, src_reg(message_header_scale)));
   2420 
   2421       return index;
   2422    } else {
   2423       return src_reg(reg_offset * message_header_scale);
   2424    }
   2425 }
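
        /* Worked example (illustrative, not part of the original source): for
         * the third vec4 of a spilled array (reg_offset == 3), gen6+ sends the
         * offset 6 -- two interleaved halves per vec4 -- while gen4/5 scale by
         * a further 16 and send the byte offset 96.
         */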
   2426 
   2427 src_reg
   2428 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
   2429 				       src_reg *reladdr, int reg_offset)
   2430 {
   2431    if (reladdr) {
   2432       src_reg index = src_reg(this, glsl_type::int_type);
   2433 
   2434       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
   2435 
   2436       /* Pre-gen6, the message header uses byte offsets instead of vec4
   2437        * (16-byte) offset units.
   2438        */
   2439       if (intel->gen < 6) {
   2440 	 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
   2441       }
   2442 
   2443       return index;
   2444    } else {
   2445       int message_header_scale = intel->gen < 6 ? 16 : 1;
   2446       return src_reg(reg_offset * message_header_scale);
   2447    }
   2448 }
   2449 
   2450 /**
   2451  * Emits an instruction before @inst to load the value named by @orig_src
   2452  * from scratch space at @base_offset to @temp.
   2453  *
   2454  * @base_offset is measured in 32-byte units (the size of a register).
   2455  */
   2456 void
   2457 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
   2458 				dst_reg temp, src_reg orig_src,
   2459 				int base_offset)
   2460 {
   2461    int reg_offset = base_offset + orig_src.reg_offset;
   2462    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
   2463 
   2464    emit_before(inst, SCRATCH_READ(temp, index));
   2465 }
   2466 
   2467 /**
   2468  * Emits an instruction after @inst to store the value to be written
   2469  * to @orig_dst to scratch space at @base_offset, from @temp.
   2470  *
   2471  * @base_offset is measured in 32-byte units (the size of a register).
   2472  */
   2473 void
   2474 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
   2475 				 src_reg temp, dst_reg orig_dst,
   2476 				 int base_offset)
   2477 {
   2478    int reg_offset = base_offset + orig_dst.reg_offset;
   2479    src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
   2480 
   2481    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
   2482 				       orig_dst.writemask));
   2483    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
   2484    write->predicate = inst->predicate;
   2485    write->ir = inst->ir;
   2486    write->annotation = inst->annotation;
   2487    inst->insert_after(write);
   2488 }
   2489 
   2490 /**
   2491  * We can't generally support array access in GRF space, because a
   2492  * single instruction's destination can only span 2 contiguous
   2493  * registers.  So, we send all GRF arrays that get variable index
   2494  * access to scratch space.
   2495  */
   2496 void
   2497 vec4_visitor::move_grf_array_access_to_scratch()
   2498 {
   2499    int scratch_loc[this->virtual_grf_count];
   2500 
   2501    for (int i = 0; i < this->virtual_grf_count; i++) {
   2502       scratch_loc[i] = -1;
   2503    }
   2504 
   2505    /* First, calculate the set of virtual GRFs that need to be punted
   2506     * to scratch due to having any array access on them, and where in
   2507     * scratch.
   2508     */
   2509    foreach_list(node, &this->instructions) {
   2510       vec4_instruction *inst = (vec4_instruction *)node;
   2511 
   2512       if (inst->dst.file == GRF && inst->dst.reladdr &&
   2513 	  scratch_loc[inst->dst.reg] == -1) {
   2514 	 scratch_loc[inst->dst.reg] = c->last_scratch;
   2515 	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
   2516       }
   2517 
   2518       for (int i = 0 ; i < 3; i++) {
   2519 	 src_reg *src = &inst->src[i];
   2520 
   2521 	 if (src->file == GRF && src->reladdr &&
   2522 	     scratch_loc[src->reg] == -1) {
   2523 	    scratch_loc[src->reg] = c->last_scratch;
   2524 	    c->last_scratch += this->virtual_grf_sizes[src->reg];
   2525 	 }
   2526       }
   2527    }
   2528 
   2529    /* Now, for anything that will be accessed through scratch, rewrite
   2530     * it to load/store.  Note that this is a _safe list walk, because
   2531     * we may generate a new scratch_write instruction after the one
   2532     * we're processing.
   2533     */
   2534    foreach_list_safe(node, &this->instructions) {
   2535       vec4_instruction *inst = (vec4_instruction *)node;
   2536 
   2537       /* Set up the annotation tracking for new generated instructions. */
   2538       base_ir = inst->ir;
   2539       current_annotation = inst->annotation;
   2540 
   2541       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
   2542 	 src_reg temp = src_reg(this, glsl_type::vec4_type);
   2543 
   2544 	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
   2545 
   2546 	 inst->dst.file = temp.file;
   2547 	 inst->dst.reg = temp.reg;
   2548 	 inst->dst.reg_offset = temp.reg_offset;
   2549 	 inst->dst.reladdr = NULL;
   2550       }
   2551 
   2552       for (int i = 0 ; i < 3; i++) {
   2553 	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
   2554 	    continue;
   2555 
   2556 	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
   2557 
   2558 	 emit_scratch_read(inst, temp, inst->src[i],
   2559 			   scratch_loc[inst->src[i].reg]);
   2560 
   2561 	 inst->src[i].file = temp.file;
   2562 	 inst->src[i].reg = temp.reg;
   2563 	 inst->src[i].reg_offset = temp.reg_offset;
   2564 	 inst->src[i].reladdr = NULL;
   2565       }
   2566    }
   2567 }
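
        /* Illustration (not part of the original source): given
         * "float a[8]; ... a[i] = x;", the store to a[i] has a GRF destination
         * with a reladdr, so the whole array is assigned a scratch slot.  The
         * store is then redirected to a fresh temporary followed by a
         * SCRATCH_WRITE of that temporary at the array's base offset plus the
         * reladdr, and any later a[j] read becomes a SCRATCH_READ into another
         * temporary.
         */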
   2568 
   2569 /**
   2570  * Emits an instruction before @inst to load the value named by @orig_src
   2571  * from the pull constant buffer (surface) at @base_offset to @temp.
   2572  */
   2573 void
   2574 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
   2575 				      dst_reg temp, src_reg orig_src,
   2576 				      int base_offset)
   2577 {
   2578    int reg_offset = base_offset + orig_src.reg_offset;
   2579    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
   2580    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   2581    vec4_instruction *load;
   2582 
   2583    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
   2584 					temp, index, offset);
   2585    load->base_mrf = 14;
   2586    load->mlen = 1;
   2587    emit_before(inst, load);
   2588 }
   2589 
   2590 /**
   2591  * Implements array access of uniforms by inserting a
   2592  * PULL_CONSTANT_LOAD instruction.
   2593  *
   2594  * Unlike temporary GRF array access (where we don't support it due to
   2595  * the difficulty of doing relative addressing on instruction
   2596  * destinations), we could potentially do array access of uniforms
   2597  * that were loaded in GRF space as push constants.  In real-world
   2598  * usage we've seen, though, the arrays being used are always larger
   2599  * than we could load as push constants, so just always move all
   2600  * uniform array access out to a pull constant buffer.
   2601  */
   2602 void
   2603 vec4_visitor::move_uniform_array_access_to_pull_constants()
   2604 {
   2605    int pull_constant_loc[this->uniforms];
   2606 
   2607    for (int i = 0; i < this->uniforms; i++) {
   2608       pull_constant_loc[i] = -1;
   2609    }
   2610 
   2611    /* Walk through and find array access of uniforms.  Put a copy of that
   2612     * uniform in the pull constant buffer.
   2613     *
   2614     * Note that we don't move constant-indexed accesses to arrays.  No
   2615     * testing has been done of the performance impact of this choice.
   2616     */
   2617    foreach_list_safe(node, &this->instructions) {
   2618       vec4_instruction *inst = (vec4_instruction *)node;
   2619 
   2620       for (int i = 0 ; i < 3; i++) {
   2621 	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
   2622 	    continue;
   2623 
   2624 	 int uniform = inst->src[i].reg;
   2625 
   2626 	 /* If this array isn't already present in the pull constant buffer,
   2627 	  * add it.
   2628 	  */
   2629 	 if (pull_constant_loc[uniform] == -1) {
   2630 	    const float **values = &prog_data->param[uniform * 4];
   2631 
   2632 	    pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
   2633 
   2634 	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
   2635 	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
   2636 	    }
   2637 	 }
   2638 
   2639 	 /* Set up the annotation tracking for new generated instructions. */
   2640 	 base_ir = inst->ir;
   2641 	 current_annotation = inst->annotation;
   2642 
   2643 	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
   2644 
   2645 	 emit_pull_constant_load(inst, temp, inst->src[i],
   2646 				 pull_constant_loc[uniform]);
   2647 
   2648 	 inst->src[i].file = temp.file;
   2649 	 inst->src[i].reg = temp.reg;
   2650 	 inst->src[i].reg_offset = temp.reg_offset;
   2651 	 inst->src[i].reladdr = NULL;
   2652       }
   2653    }
   2654 
   2655    /* Now there are no accesses of the UNIFORM file with a reladdr, so
   2656     * no need to track them as larger-than-vec4 objects.  This will be
   2657     * relied on in cutting out unused uniform vectors from push
   2658     * constants.
   2659     */
   2660    split_uniform_registers();
   2661 }
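
        /* Illustration (not part of the original source): for something like
         * "uniform mat4 bones[64];" indexed by a per-vertex value, each
         * variably-indexed source is rewritten to read a temporary filled by a
         * VS_OPCODE_PULL_CONSTANT_LOAD from the copy of the array appended to
         * pull_param above, while constant-indexed uniform reads keep coming
         * from push constants.
         */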
   2662 
   2663 void
   2664 vec4_visitor::resolve_ud_negate(src_reg *reg)
   2665 {
   2666    if (reg->type != BRW_REGISTER_TYPE_UD ||
   2667        !reg->negate)
   2668       return;
   2669 
   2670    src_reg temp = src_reg(this, glsl_type::uvec4_type);
   2671    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   2672    *reg = temp;
   2673 }
   2674 
   2675 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
   2676 			   struct gl_shader_program *prog,
   2677 			   struct brw_shader *shader)
   2678 {
   2679    this->c = c;
   2680    this->p = &c->func;
   2681    this->brw = p->brw;
   2682    this->intel = &brw->intel;
   2683    this->ctx = &intel->ctx;
   2684    this->prog = prog;
   2685    this->shader = shader;
   2686 
   2687    this->mem_ctx = ralloc_context(NULL);
   2688    this->failed = false;
   2689 
   2690    this->base_ir = NULL;
   2691    this->current_annotation = NULL;
   2692 
   2693
   2694    this->vp = (struct gl_vertex_program *)
   2695      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
   2696    this->prog_data = &c->prog_data;
   2697 
   2698    this->variable_ht = hash_table_ctor(0,
   2699 				       hash_table_pointer_hash,
   2700 				       hash_table_pointer_compare);
   2701 
   2702    this->virtual_grf_def = NULL;
   2703    this->virtual_grf_use = NULL;
   2704    this->virtual_grf_sizes = NULL;
   2705    this->virtual_grf_count = 0;
   2706    this->virtual_grf_reg_map = NULL;
   2707    this->virtual_grf_reg_count = 0;
   2708    this->virtual_grf_array_size = 0;
   2709    this->live_intervals_valid = false;
   2710 
   2711    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
   2712 
   2713    this->uniforms = 0;
   2714 }
   2715 
   2716 vec4_visitor::~vec4_visitor()
   2717 {
   2718    ralloc_free(this->mem_ctx);
   2719    hash_table_dtor(this->variable_ht);
   2720 }
   2721 
   2722 
   2723 void
   2724 vec4_visitor::fail(const char *format, ...)
   2725 {
   2726    va_list va;
   2727    char *msg;
   2728 
   2729    if (failed)
   2730       return;
   2731 
   2732    failed = true;
   2733 
   2734    va_start(va, format);
   2735    msg = ralloc_vasprintf(mem_ctx, format, va);
   2736    va_end(va);
   2737    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
   2738 
   2739    this->fail_msg = msg;
   2740 
   2741    if (INTEL_DEBUG & DEBUG_VS) {
   2742       fprintf(stderr, "%s",  msg);
   2743    }
   2744 }
   2745 
   2746 } /* namespace brw */
   2747