Home | History | Annotate | Download | only in i965
      1 /* Copyright © 2011 Intel Corporation
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a
      4  * copy of this software and associated documentation files (the "Software"),
      5  * to deal in the Software without restriction, including without limitation
      6  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      7  * and/or sell copies of the Software, and to permit persons to whom the
      8  * Software is furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice (including the next
     11  * paragraph) shall be included in all copies or substantial portions of the
     12  * Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     20  * IN THE SOFTWARE.
     21  */
     22 
     23 #include "brw_vec4.h"
     24 #include "glsl/ir_print_visitor.h"
     25 
     26 extern "C" {
     27 #include "brw_eu.h"
     28 #include "main/macros.h"
     29 };
     30 
     31 using namespace brw;
     32 
     33 namespace brw {
     34 
     35 int
     36 vec4_visitor::setup_attributes(int payload_reg)
     37 {
     38    int nr_attributes;
     39    int attribute_map[VERT_ATTRIB_MAX + 1];
     40 
     41    nr_attributes = 0;
     42    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
     43       if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
     44 	 attribute_map[i] = payload_reg + nr_attributes;
     45 	 nr_attributes++;
     46       }
     47    }
     48 
     49    /* VertexID is stored by the VF as the last vertex element, but we
     50     * don't represent it with a flag in inputs_read, so we call it
     51     * VERT_ATTRIB_MAX.
     52     */
     53    if (prog_data->uses_vertexid) {
     54       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
     55       nr_attributes++;
     56    }
     57 
     58    foreach_list(node, &this->instructions) {
     59       vec4_instruction *inst = (vec4_instruction *)node;
     60 
     61       /* We have to support ATTR as a destination for GL_FIXED fixup. */
     62       if (inst->dst.file == ATTR) {
     63 	 int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];
     64 
     65 	 struct brw_reg reg = brw_vec8_grf(grf, 0);
     66 	 reg.dw1.bits.writemask = inst->dst.writemask;
     67 
     68 	 inst->dst.file = HW_REG;
     69 	 inst->dst.fixed_hw_reg = reg;
     70       }
     71 
     72       for (int i = 0; i < 3; i++) {
     73 	 if (inst->src[i].file != ATTR)
     74 	    continue;
     75 
     76 	 int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];
     77 
     78 	 struct brw_reg reg = brw_vec8_grf(grf, 0);
     79 	 reg.dw1.bits.swizzle = inst->src[i].swizzle;
     80          reg.type = inst->src[i].type;
     81 	 if (inst->src[i].abs)
     82 	    reg = brw_abs(reg);
     83 	 if (inst->src[i].negate)
     84 	    reg = negate(reg);
     85 
     86 	 inst->src[i].file = HW_REG;
     87 	 inst->src[i].fixed_hw_reg = reg;
     88       }
     89    }
     90 
     91    /* The BSpec says we always have to read at least one thing from
     92     * the VF, and it appears that the hardware wedges otherwise.
     93     */
     94    if (nr_attributes == 0)
     95       nr_attributes = 1;
     96 
     97    prog_data->urb_read_length = (nr_attributes + 1) / 2;
     98 
     99    unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots);
    100 
    101    if (intel->gen == 6)
    102       c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
    103    else
    104       c->prog_data.urb_entry_size = ALIGN(vue_entries, 4) / 4;
    105 
    106    return payload_reg + nr_attributes;
    107 }
    108 
    109 int
    110 vec4_visitor::setup_uniforms(int reg)
    111 {
    112    /* The pre-gen6 VS requires that some push constants get loaded no
    113     * matter what, or the GPU would hang.
    114     */
    115    if (intel->gen < 6 && this->uniforms == 0) {
    116       this->uniform_vector_size[this->uniforms] = 1;
    117 
    118       for (unsigned int i = 0; i < 4; i++) {
    119 	 unsigned int slot = this->uniforms * 4 + i;
    120 	 static float zero = 0.0;
    121 	 c->prog_data.param[slot] = &zero;
    122       }
    123 
    124       this->uniforms++;
    125       reg++;
    126    } else {
    127       reg += ALIGN(uniforms, 2) / 2;
    128    }
    129 
    130    c->prog_data.nr_params = this->uniforms * 4;
    131 
    132    c->prog_data.curb_read_length = reg - 1;
    133    c->prog_data.uses_new_param_layout = true;
    134 
    135    return reg;
    136 }
    137 
    138 void
    139 vec4_visitor::setup_payload(void)
    140 {
    141    int reg = 0;
    142 
    143    /* The payload always contains important data in g0, which contains
    144     * the URB handles that are passed on to the URB write at the end
    145     * of the thread.  So, we always start push constants at g1.
    146     */
    147    reg++;
    148 
    149    reg = setup_uniforms(reg);
    150 
    151    reg = setup_attributes(reg);
    152 
    153    this->first_non_payload_grf = reg;
    154 }
    155 
    156 struct brw_reg
    157 vec4_instruction::get_dst(void)
    158 {
    159    struct brw_reg brw_reg;
    160 
    161    switch (dst.file) {
    162    case GRF:
    163       brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
    164       brw_reg = retype(brw_reg, dst.type);
    165       brw_reg.dw1.bits.writemask = dst.writemask;
    166       break;
    167 
    168    case MRF:
    169       brw_reg = brw_message_reg(dst.reg + dst.reg_offset);
    170       brw_reg = retype(brw_reg, dst.type);
    171       brw_reg.dw1.bits.writemask = dst.writemask;
    172       break;
    173 
    174    case HW_REG:
    175       brw_reg = dst.fixed_hw_reg;
    176       break;
    177 
    178    case BAD_FILE:
    179       brw_reg = brw_null_reg();
    180       break;
    181 
    182    default:
    183       assert(!"not reached");
    184       brw_reg = brw_null_reg();
    185       break;
    186    }
    187    return brw_reg;
    188 }
    189 
    190 struct brw_reg
    191 vec4_instruction::get_src(int i)
    192 {
    193    struct brw_reg brw_reg;
    194 
    195    switch (src[i].file) {
    196    case GRF:
    197       brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
    198       brw_reg = retype(brw_reg, src[i].type);
    199       brw_reg.dw1.bits.swizzle = src[i].swizzle;
    200       if (src[i].abs)
    201 	 brw_reg = brw_abs(brw_reg);
    202       if (src[i].negate)
    203 	 brw_reg = negate(brw_reg);
    204       break;
    205 
    206    case IMM:
    207       switch (src[i].type) {
    208       case BRW_REGISTER_TYPE_F:
    209 	 brw_reg = brw_imm_f(src[i].imm.f);
    210 	 break;
    211       case BRW_REGISTER_TYPE_D:
    212 	 brw_reg = brw_imm_d(src[i].imm.i);
    213 	 break;
    214       case BRW_REGISTER_TYPE_UD:
    215 	 brw_reg = brw_imm_ud(src[i].imm.u);
    216 	 break;
    217       default:
    218 	 assert(!"not reached");
    219 	 brw_reg = brw_null_reg();
    220 	 break;
    221       }
    222       break;
    223 
    224    case UNIFORM:
    225       brw_reg = stride(brw_vec4_grf(1 + (src[i].reg + src[i].reg_offset) / 2,
    226 				    ((src[i].reg + src[i].reg_offset) % 2) * 4),
    227 		       0, 4, 1);
    228       brw_reg = retype(brw_reg, src[i].type);
    229       brw_reg.dw1.bits.swizzle = src[i].swizzle;
    230       if (src[i].abs)
    231 	 brw_reg = brw_abs(brw_reg);
    232       if (src[i].negate)
    233 	 brw_reg = negate(brw_reg);
    234 
    235       /* This should have been moved to pull constants. */
    236       assert(!src[i].reladdr);
    237       break;
    238 
    239    case HW_REG:
    240       brw_reg = src[i].fixed_hw_reg;
    241       break;
    242 
    243    case BAD_FILE:
    244       /* Probably unused. */
    245       brw_reg = brw_null_reg();
    246       break;
    247    case ATTR:
    248    default:
    249       assert(!"not reached");
    250       brw_reg = brw_null_reg();
    251       break;
    252    }
    253 
    254    return brw_reg;
    255 }
    256 
    257 void
    258 vec4_visitor::generate_math1_gen4(vec4_instruction *inst,
    259 				  struct brw_reg dst,
    260 				  struct brw_reg src)
    261 {
    262    brw_math(p,
    263 	    dst,
    264 	    brw_math_function(inst->opcode),
    265 	    inst->base_mrf,
    266 	    src,
    267 	    BRW_MATH_DATA_VECTOR,
    268 	    BRW_MATH_PRECISION_FULL);
    269 }
    270 
    271 static void
    272 check_gen6_math_src_arg(struct brw_reg src)
    273 {
    274    /* Source swizzles are ignored. */
    275    assert(!src.abs);
    276    assert(!src.negate);
    277    assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
    278 }
    279 
    280 void
    281 vec4_visitor::generate_math1_gen6(vec4_instruction *inst,
    282 				  struct brw_reg dst,
    283 				  struct brw_reg src)
    284 {
    285    /* Can't do writemask because math can't be align16. */
    286    assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
    287    check_gen6_math_src_arg(src);
    288 
    289    brw_set_access_mode(p, BRW_ALIGN_1);
    290    brw_math(p,
    291 	    dst,
    292 	    brw_math_function(inst->opcode),
    293 	    inst->base_mrf,
    294 	    src,
    295 	    BRW_MATH_DATA_SCALAR,
    296 	    BRW_MATH_PRECISION_FULL);
    297    brw_set_access_mode(p, BRW_ALIGN_16);
    298 }
    299 
    300 void
    301 vec4_visitor::generate_math2_gen7(vec4_instruction *inst,
    302 				  struct brw_reg dst,
    303 				  struct brw_reg src0,
    304 				  struct brw_reg src1)
    305 {
    306    brw_math2(p,
    307 	     dst,
    308 	     brw_math_function(inst->opcode),
    309 	     src0, src1);
    310 }
    311 
    312 void
    313 vec4_visitor::generate_math2_gen6(vec4_instruction *inst,
    314 				  struct brw_reg dst,
    315 				  struct brw_reg src0,
    316 				  struct brw_reg src1)
    317 {
    318    /* Can't do writemask because math can't be align16. */
    319    assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
    320    /* Source swizzles are ignored. */
    321    check_gen6_math_src_arg(src0);
    322    check_gen6_math_src_arg(src1);
    323 
    324    brw_set_access_mode(p, BRW_ALIGN_1);
    325    brw_math2(p,
    326 	     dst,
    327 	     brw_math_function(inst->opcode),
    328 	     src0, src1);
    329    brw_set_access_mode(p, BRW_ALIGN_16);
    330 }
    331 
    332 void
    333 vec4_visitor::generate_math2_gen4(vec4_instruction *inst,
    334 				  struct brw_reg dst,
    335 				  struct brw_reg src0,
    336 				  struct brw_reg src1)
    337 {
    338    /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    339     * "Message Payload":
    340     *
    341     * "Operand0[7].  For the INT DIV functions, this operand is the
    342     *  denominator."
    343     *  ...
    344     * "Operand1[7].  For the INT DIV functions, this operand is the
    345     *  numerator."
    346     */
    347    bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
    348    struct brw_reg &op0 = is_int_div ? src1 : src0;
    349    struct brw_reg &op1 = is_int_div ? src0 : src1;
    350 
    351    brw_push_insn_state(p);
    352    brw_set_saturate(p, false);
    353    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    354    brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
    355    brw_pop_insn_state(p);
    356 
    357    brw_math(p,
    358 	    dst,
    359 	    brw_math_function(inst->opcode),
    360 	    inst->base_mrf,
    361 	    op0,
    362 	    BRW_MATH_DATA_VECTOR,
    363 	    BRW_MATH_PRECISION_FULL);
    364 }
    365 
    366 void
    367 vec4_visitor::generate_tex(vec4_instruction *inst,
    368 			   struct brw_reg dst,
    369 			   struct brw_reg src)
    370 {
    371    int msg_type = -1;
    372 
    373    if (intel->gen >= 5) {
    374       switch (inst->opcode) {
    375       case SHADER_OPCODE_TEX:
    376       case SHADER_OPCODE_TXL:
    377 	 if (inst->shadow_compare) {
    378 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
    379 	 } else {
    380 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
    381 	 }
    382 	 break;
    383       case SHADER_OPCODE_TXD:
    384          if (inst->shadow_compare) {
    385             /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
    386             assert(intel->is_haswell);
    387             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
    388          } else {
    389             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
    390          }
    391 	 break;
    392       case SHADER_OPCODE_TXF:
    393 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
    394 	 break;
    395       case SHADER_OPCODE_TXS:
    396 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
    397 	 break;
    398       default:
    399 	 assert(!"should not get here: invalid VS texture opcode");
    400 	 break;
    401       }
    402    } else {
    403       switch (inst->opcode) {
    404       case SHADER_OPCODE_TEX:
    405       case SHADER_OPCODE_TXL:
    406 	 if (inst->shadow_compare) {
    407 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
    408 	    assert(inst->mlen == 3);
    409 	 } else {
    410 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
    411 	    assert(inst->mlen == 2);
    412 	 }
    413 	 break;
    414       case SHADER_OPCODE_TXD:
    415 	 /* There is no sample_d_c message; comparisons are done manually. */
    416 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
    417 	 assert(inst->mlen == 4);
    418 	 break;
    419       case SHADER_OPCODE_TXF:
    420 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
    421 	 assert(inst->mlen == 2);
    422 	 break;
    423       case SHADER_OPCODE_TXS:
    424 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
    425 	 assert(inst->mlen == 2);
    426 	 break;
    427       default:
    428 	 assert(!"should not get here: invalid VS texture opcode");
    429 	 break;
    430       }
    431    }
    432 
    433    assert(msg_type != -1);
    434 
    435    /* Load the message header if present.  If there's a texture offset, we need
    436     * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    437     * use an implied move from g0 to the first message register.
    438     */
    439    if (inst->texture_offset) {
    440       /* Explicitly set up the message header by copying g0 to the MRF. */
    441       brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
    442 	         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    443 
    444       /* Then set the offset bits in DWord 2. */
    445       brw_set_access_mode(p, BRW_ALIGN_1);
    446       brw_MOV(p,
    447 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, inst->base_mrf, 2),
    448 		     BRW_REGISTER_TYPE_UD),
    449 	      brw_imm_uw(inst->texture_offset));
    450       brw_set_access_mode(p, BRW_ALIGN_16);
    451    } else if (inst->header_present) {
    452       /* Set up an implied move from g0 to the MRF. */
    453       src = brw_vec8_grf(0, 0);
    454    }
    455 
    456    uint32_t return_format;
    457 
    458    switch (dst.type) {
    459    case BRW_REGISTER_TYPE_D:
    460       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
    461       break;
    462    case BRW_REGISTER_TYPE_UD:
    463       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
    464       break;
    465    default:
    466       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
    467       break;
    468    }
    469 
    470    brw_SAMPLE(p,
    471 	      dst,
    472 	      inst->base_mrf,
    473 	      src,
    474 	      SURF_INDEX_VS_TEXTURE(inst->sampler),
    475 	      inst->sampler,
    476 	      WRITEMASK_XYZW,
    477 	      msg_type,
    478 	      1, /* response length */
    479 	      inst->mlen,
    480 	      inst->header_present,
    481 	      BRW_SAMPLER_SIMD_MODE_SIMD4X2,
    482 	      return_format);
    483 }
    484 
    485 void
    486 vec4_visitor::generate_urb_write(vec4_instruction *inst)
    487 {
    488    brw_urb_WRITE(p,
    489 		 brw_null_reg(), /* dest */
    490 		 inst->base_mrf, /* starting mrf reg nr */
    491 		 brw_vec8_grf(0, 0), /* src */
    492 		 false,		/* allocate */
    493 		 true,		/* used */
    494 		 inst->mlen,
    495 		 0,		/* response len */
    496 		 inst->eot,	/* eot */
    497 		 inst->eot,	/* writes complete */
    498 		 inst->offset,	/* urb destination offset */
    499 		 BRW_URB_SWIZZLE_INTERLEAVE);
    500 }
    501 
    502 void
    503 vec4_visitor::generate_oword_dual_block_offsets(struct brw_reg m1,
    504 						struct brw_reg index)
    505 {
    506    int second_vertex_offset;
    507 
    508    if (intel->gen >= 6)
    509       second_vertex_offset = 1;
    510    else
    511       second_vertex_offset = 16;
    512 
    513    m1 = retype(m1, BRW_REGISTER_TYPE_D);
    514 
    515    /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    516     * M1.4 are used, and the rest are ignored.
    517     */
    518    struct brw_reg m1_0 = suboffset(vec1(m1), 0);
    519    struct brw_reg m1_4 = suboffset(vec1(m1), 4);
    520    struct brw_reg index_0 = suboffset(vec1(index), 0);
    521    struct brw_reg index_4 = suboffset(vec1(index), 4);
    522 
    523    brw_push_insn_state(p);
    524    brw_set_mask_control(p, BRW_MASK_DISABLE);
    525    brw_set_access_mode(p, BRW_ALIGN_1);
    526 
    527    brw_MOV(p, m1_0, index_0);
    528 
    529    brw_set_predicate_inverse(p, true);
    530    if (index.file == BRW_IMMEDIATE_VALUE) {
    531       index_4.dw1.ud += second_vertex_offset;
    532       brw_MOV(p, m1_4, index_4);
    533    } else {
    534       brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
    535    }
    536 
    537    brw_pop_insn_state(p);
    538 }
    539 
    540 void
    541 vec4_visitor::generate_scratch_read(vec4_instruction *inst,
    542 				    struct brw_reg dst,
    543 				    struct brw_reg index)
    544 {
    545    struct brw_reg header = brw_vec8_grf(0, 0);
    546 
    547    gen6_resolve_implied_move(p, &header, inst->base_mrf);
    548 
    549    generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
    550 				     index);
    551 
    552    uint32_t msg_type;
    553 
    554    if (intel->gen >= 6)
    555       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    556    else if (intel->gen == 5 || intel->is_g4x)
    557       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    558    else
    559       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    560 
    561    /* Each of the 8 channel enables is considered for whether each
    562     * dword is written.
    563     */
    564    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
    565    brw_set_dest(p, send, dst);
    566    brw_set_src0(p, send, header);
    567    if (intel->gen < 6)
    568       send->header.destreg__conditionalmod = inst->base_mrf;
    569    brw_set_dp_read_message(p, send,
    570 			   255, /* binding table index: stateless access */
    571 			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
    572 			   msg_type,
    573 			   BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
    574 			   2, /* mlen */
    575 			   1 /* rlen */);
    576 }
    577 
    578 void
    579 vec4_visitor::generate_scratch_write(vec4_instruction *inst,
    580 				     struct brw_reg dst,
    581 				     struct brw_reg src,
    582 				     struct brw_reg index)
    583 {
    584    struct brw_reg header = brw_vec8_grf(0, 0);
    585    bool write_commit;
    586 
    587    /* If the instruction is predicated, we'll predicate the send, not
    588     * the header setup.
    589     */
    590    brw_set_predicate_control(p, false);
    591 
    592    gen6_resolve_implied_move(p, &header, inst->base_mrf);
    593 
    594    generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
    595 				     index);
    596 
    597    brw_MOV(p,
    598 	   retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
    599 	   retype(src, BRW_REGISTER_TYPE_D));
    600 
    601    uint32_t msg_type;
    602 
    603    if (intel->gen >= 7)
    604       msg_type = GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
    605    else if (intel->gen == 6)
    606       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
    607    else
    608       msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
    609 
    610    brw_set_predicate_control(p, inst->predicate);
    611 
    612    /* Pre-gen6, we have to specify write commits to ensure ordering
    613     * between reads and writes within a thread.  Afterwards, that's
    614     * guaranteed and write commits only matter for inter-thread
    615     * synchronization.
    616     */
    617    if (intel->gen >= 6) {
    618       write_commit = false;
    619    } else {
    620       /* The visitor set up our destination register to be g0.  This
    621        * means that when the next read comes along, we will end up
    622        * reading from g0 and causing a block on the write commit.  For
    623        * write-after-read, we are relying on the value of the previous
    624        * read being used (and thus blocking on completion) before our
    625        * write is executed.  This means we have to be careful in
    626        * instruction scheduling to not violate this assumption.
    627        */
    628       write_commit = true;
    629    }
    630 
    631    /* Each of the 8 channel enables is considered for whether each
    632     * dword is written.
    633     */
    634    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
    635    brw_set_dest(p, send, dst);
    636    brw_set_src0(p, send, header);
    637    if (intel->gen < 6)
    638       send->header.destreg__conditionalmod = inst->base_mrf;
    639    brw_set_dp_write_message(p, send,
    640 			    255, /* binding table index: stateless access */
    641 			    BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
    642 			    msg_type,
    643 			    3, /* mlen */
    644 			    true, /* header present */
    645 			    false, /* not a render target write */
    646 			    write_commit, /* rlen */
    647 			    false, /* eot */
    648 			    write_commit);
    649 }
    650 
    651 void
    652 vec4_visitor::generate_pull_constant_load(vec4_instruction *inst,
    653 					  struct brw_reg dst,
    654 					  struct brw_reg index,
    655 					  struct brw_reg offset)
    656 {
    657    assert(index.file == BRW_IMMEDIATE_VALUE &&
    658 	  index.type == BRW_REGISTER_TYPE_UD);
    659    uint32_t surf_index = index.dw1.ud;
    660 
    661    if (intel->gen == 7) {
    662       gen6_resolve_implied_move(p, &offset, inst->base_mrf);
    663       brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
    664       brw_set_dest(p, insn, dst);
    665       brw_set_src0(p, insn, offset);
    666       brw_set_sampler_message(p, insn,
    667                               surf_index,
    668                               0, /* LD message ignores sampler unit */
    669                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
    670                               1, /* rlen */
    671                               1, /* mlen */
    672                               false, /* no header */
    673                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
    674                               0);
    675       return;
    676    }
    677 
    678    struct brw_reg header = brw_vec8_grf(0, 0);
    679 
    680    gen6_resolve_implied_move(p, &header, inst->base_mrf);
    681 
    682    brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),
    683 	   offset);
    684 
    685    uint32_t msg_type;
    686 
    687    if (intel->gen >= 6)
    688       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    689    else if (intel->gen == 5 || intel->is_g4x)
    690       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    691    else
    692       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
    693 
    694    /* Each of the 8 channel enables is considered for whether each
    695     * dword is written.
    696     */
    697    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
    698    brw_set_dest(p, send, dst);
    699    brw_set_src0(p, send, header);
    700    if (intel->gen < 6)
    701       send->header.destreg__conditionalmod = inst->base_mrf;
    702    brw_set_dp_read_message(p, send,
    703 			   surf_index,
    704 			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
    705 			   msg_type,
    706 			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
    707 			   2, /* mlen */
    708 			   1 /* rlen */);
    709 }
    710 
    711 void
    712 vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
    713 				      struct brw_reg dst,
    714 				      struct brw_reg *src)
    715 {
    716    vec4_instruction *inst = (vec4_instruction *)instruction;
    717 
    718    switch (inst->opcode) {
    719    case SHADER_OPCODE_RCP:
    720    case SHADER_OPCODE_RSQ:
    721    case SHADER_OPCODE_SQRT:
    722    case SHADER_OPCODE_EXP2:
    723    case SHADER_OPCODE_LOG2:
    724    case SHADER_OPCODE_SIN:
    725    case SHADER_OPCODE_COS:
    726       if (intel->gen == 6) {
    727 	 generate_math1_gen6(inst, dst, src[0]);
    728       } else {
    729 	 /* Also works for Gen7. */
    730 	 generate_math1_gen4(inst, dst, src[0]);
    731       }
    732       break;
    733 
    734    case SHADER_OPCODE_POW:
    735    case SHADER_OPCODE_INT_QUOTIENT:
    736    case SHADER_OPCODE_INT_REMAINDER:
    737       if (intel->gen >= 7) {
    738 	 generate_math2_gen7(inst, dst, src[0], src[1]);
    739       } else if (intel->gen == 6) {
    740 	 generate_math2_gen6(inst, dst, src[0], src[1]);
    741       } else {
    742 	 generate_math2_gen4(inst, dst, src[0], src[1]);
    743       }
    744       break;
    745 
    746    case SHADER_OPCODE_TEX:
    747    case SHADER_OPCODE_TXD:
    748    case SHADER_OPCODE_TXF:
    749    case SHADER_OPCODE_TXL:
    750    case SHADER_OPCODE_TXS:
    751       generate_tex(inst, dst, src[0]);
    752       break;
    753 
    754    case VS_OPCODE_URB_WRITE:
    755       generate_urb_write(inst);
    756       break;
    757 
    758    case VS_OPCODE_SCRATCH_READ:
    759       generate_scratch_read(inst, dst, src[0]);
    760       break;
    761 
    762    case VS_OPCODE_SCRATCH_WRITE:
    763       generate_scratch_write(inst, dst, src[0], src[1]);
    764       break;
    765 
    766    case VS_OPCODE_PULL_CONSTANT_LOAD:
    767       generate_pull_constant_load(inst, dst, src[0], src[1]);
    768       break;
    769 
    770    default:
    771       if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
    772 	 fail("unsupported opcode in `%s' in VS\n",
    773 	      brw_opcodes[inst->opcode].name);
    774       } else {
    775 	 fail("Unsupported opcode %d in VS", inst->opcode);
    776       }
    777    }
    778 }
    779 
    780 bool
    781 vec4_visitor::run()
    782 {
    783    if (c->key.userclip_active && !c->key.uses_clip_distance)
    784       setup_uniform_clipplane_values();
    785 
    786    /* Generate VS IR for main().  (the visitor only descends into
    787     * functions called "main").
    788     */
    789    visit_instructions(shader->ir);
    790 
    791    emit_urb_writes();
    792 
    793    /* Before any optimization, push array accesses out to scratch
    794     * space where we need them to be.  This pass may allocate new
    795     * virtual GRFs, so we want to do it early.  It also makes sure
    796     * that we have reladdr computations available for CSE, since we'll
    797     * often do repeated subexpressions for those.
    798     */
    799    move_grf_array_access_to_scratch();
    800    move_uniform_array_access_to_pull_constants();
    801    pack_uniform_registers();
    802    move_push_constants_to_pull_constants();
    803 
    804    bool progress;
    805    do {
    806       progress = false;
    807       progress = dead_code_eliminate() || progress;
    808       progress = opt_copy_propagation() || progress;
    809       progress = opt_algebraic() || progress;
    810       progress = opt_compute_to_mrf() || progress;
    811    } while (progress);
    812 
    813 
    814    if (failed)
    815       return false;
    816 
    817    setup_payload();
    818 
    819    if (false) {
    820       /* Debug of register spilling: Go spill everything. */
    821       const int grf_count = virtual_grf_count;
    822       float spill_costs[virtual_grf_count];
    823       bool no_spill[virtual_grf_count];
    824       evaluate_spill_costs(spill_costs, no_spill);
    825       for (int i = 0; i < grf_count; i++) {
    826          if (no_spill[i])
    827             continue;
    828          spill_reg(i);
    829       }
    830    }
    831 
    832    while (!reg_allocate()) {
    833       if (failed)
    834          break;
    835    }
    836 
    837    if (failed)
    838       return false;
    839 
    840    brw_set_access_mode(p, BRW_ALIGN_16);
    841 
    842    generate_code();
    843 
    844    return !failed;
    845 }
    846 
    847 void
    848 vec4_visitor::generate_code()
    849 {
    850    int last_native_inst = 0;
    851    const char *last_annotation_string = NULL;
    852    ir_instruction *last_annotation_ir = NULL;
    853 
    854    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
    855       printf("Native code for vertex shader %d:\n", prog->Name);
    856    }
    857 
    858    foreach_list(node, &this->instructions) {
    859       vec4_instruction *inst = (vec4_instruction *)node;
    860       struct brw_reg src[3], dst;
    861 
    862       if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
    863 	 if (last_annotation_ir != inst->ir) {
    864 	    last_annotation_ir = inst->ir;
    865 	    if (last_annotation_ir) {
    866 	       printf("   ");
    867 	       last_annotation_ir->print();
    868 	       printf("\n");
    869 	    }
    870 	 }
    871 	 if (last_annotation_string != inst->annotation) {
    872 	    last_annotation_string = inst->annotation;
    873 	    if (last_annotation_string)
    874 	       printf("   %s\n", last_annotation_string);
    875 	 }
    876       }
    877 
    878       for (unsigned int i = 0; i < 3; i++) {
    879 	 src[i] = inst->get_src(i);
    880       }
    881       dst = inst->get_dst();
    882 
    883       brw_set_conditionalmod(p, inst->conditional_mod);
    884       brw_set_predicate_control(p, inst->predicate);
    885       brw_set_predicate_inverse(p, inst->predicate_inverse);
    886       brw_set_saturate(p, inst->saturate);
    887 
    888       switch (inst->opcode) {
    889       case BRW_OPCODE_MOV:
    890 	 brw_MOV(p, dst, src[0]);
    891 	 break;
    892       case BRW_OPCODE_ADD:
    893 	 brw_ADD(p, dst, src[0], src[1]);
    894 	 break;
    895       case BRW_OPCODE_MUL:
    896 	 brw_MUL(p, dst, src[0], src[1]);
    897 	 break;
    898       case BRW_OPCODE_MACH:
    899 	 brw_set_acc_write_control(p, 1);
    900 	 brw_MACH(p, dst, src[0], src[1]);
    901 	 brw_set_acc_write_control(p, 0);
    902 	 break;
    903 
    904       case BRW_OPCODE_FRC:
    905 	 brw_FRC(p, dst, src[0]);
    906 	 break;
    907       case BRW_OPCODE_RNDD:
    908 	 brw_RNDD(p, dst, src[0]);
    909 	 break;
    910       case BRW_OPCODE_RNDE:
    911 	 brw_RNDE(p, dst, src[0]);
    912 	 break;
    913       case BRW_OPCODE_RNDZ:
    914 	 brw_RNDZ(p, dst, src[0]);
    915 	 break;
    916 
    917       case BRW_OPCODE_AND:
    918 	 brw_AND(p, dst, src[0], src[1]);
    919 	 break;
    920       case BRW_OPCODE_OR:
    921 	 brw_OR(p, dst, src[0], src[1]);
    922 	 break;
    923       case BRW_OPCODE_XOR:
    924 	 brw_XOR(p, dst, src[0], src[1]);
    925 	 break;
    926       case BRW_OPCODE_NOT:
    927 	 brw_NOT(p, dst, src[0]);
    928 	 break;
    929       case BRW_OPCODE_ASR:
    930 	 brw_ASR(p, dst, src[0], src[1]);
    931 	 break;
    932       case BRW_OPCODE_SHR:
    933 	 brw_SHR(p, dst, src[0], src[1]);
    934 	 break;
    935       case BRW_OPCODE_SHL:
    936 	 brw_SHL(p, dst, src[0], src[1]);
    937 	 break;
    938 
    939       case BRW_OPCODE_CMP:
    940 	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
    941 	 break;
    942       case BRW_OPCODE_SEL:
    943 	 brw_SEL(p, dst, src[0], src[1]);
    944 	 break;
    945 
    946       case BRW_OPCODE_DP4:
    947 	 brw_DP4(p, dst, src[0], src[1]);
    948 	 break;
    949 
    950       case BRW_OPCODE_DP3:
    951 	 brw_DP3(p, dst, src[0], src[1]);
    952 	 break;
    953 
    954       case BRW_OPCODE_DP2:
    955 	 brw_DP2(p, dst, src[0], src[1]);
    956 	 break;
    957 
    958       case BRW_OPCODE_IF:
    959 	 if (inst->src[0].file != BAD_FILE) {
    960 	    /* The instruction has an embedded compare (only allowed on gen6) */
    961 	    assert(intel->gen == 6);
    962 	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
    963 	 } else {
    964 	    struct brw_instruction *brw_inst = brw_IF(p, BRW_EXECUTE_8);
    965 	    brw_inst->header.predicate_control = inst->predicate;
    966 	 }
    967 	 break;
    968 
    969       case BRW_OPCODE_ELSE:
    970 	 brw_ELSE(p);
    971 	 break;
    972       case BRW_OPCODE_ENDIF:
    973 	 brw_ENDIF(p);
    974 	 break;
    975 
    976       case BRW_OPCODE_DO:
    977 	 brw_DO(p, BRW_EXECUTE_8);
    978 	 break;
    979 
    980       case BRW_OPCODE_BREAK:
    981 	 brw_BREAK(p);
    982 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    983 	 break;
    984       case BRW_OPCODE_CONTINUE:
    985 	 /* FINISHME: We need to write the loop instruction support still. */
    986 	 if (intel->gen >= 6)
    987 	    gen6_CONT(p);
    988 	 else
    989 	    brw_CONT(p);
    990 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    991 	 break;
    992 
    993       case BRW_OPCODE_WHILE:
    994 	 brw_WHILE(p);
    995 	 break;
    996 
    997       default:
    998 	 generate_vs_instruction(inst, dst, src);
    999 	 break;
   1000       }
   1001 
   1002       if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
   1003 	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
   1004 	    if (0) {
   1005 	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
   1006 		      ((uint32_t *)&p->store[i])[3],
   1007 		      ((uint32_t *)&p->store[i])[2],
   1008 		      ((uint32_t *)&p->store[i])[1],
   1009 		      ((uint32_t *)&p->store[i])[0]);
   1010 	    }
   1011 	    brw_disasm(stdout, &p->store[i], intel->gen);
   1012 	 }
   1013       }
   1014 
   1015       last_native_inst = p->nr_insn;
   1016    }
   1017 
   1018    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
   1019       printf("\n");
   1020    }
   1021 
   1022    brw_set_uip_jip(p);
   1023 
   1024    /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS
   1025     * emit issues, it doesn't get the jump distances into the output,
   1026     * which is often something we want to debug.  So this is here in
   1027     * case you're doing that.
   1028     */
   1029    if (0) {
   1030       if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
   1031 	 for (unsigned int i = 0; i < p->nr_insn; i++) {
   1032 	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
   1033 		   ((uint32_t *)&p->store[i])[3],
   1034 		   ((uint32_t *)&p->store[i])[2],
   1035 		   ((uint32_t *)&p->store[i])[1],
   1036 		   ((uint32_t *)&p->store[i])[0]);
   1037 	    brw_disasm(stdout, &p->store[i], intel->gen);
   1038 	 }
   1039       }
   1040    }
   1041 }
   1042 
   1043 extern "C" {
   1044 
   1045 bool
   1046 brw_vs_emit(struct gl_shader_program *prog, struct brw_vs_compile *c)
   1047 {
   1048    struct brw_context *brw = c->func.brw;
   1049    struct intel_context *intel = &c->func.brw->intel;
   1050    bool start_busy = false;
   1051    float start_time = 0;
   1052 
   1053    if (!prog)
   1054       return false;
   1055 
   1056    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
   1057       start_busy = (intel->batch.last_bo &&
   1058                     drm_intel_bo_busy(intel->batch.last_bo));
   1059       start_time = get_time();
   1060    }
   1061 
   1062    struct brw_shader *shader =
   1063      (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
   1064    if (!shader)
   1065       return false;
   1066 
   1067    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
   1068       printf("GLSL IR for native vertex shader %d:\n", prog->Name);
   1069       _mesa_print_ir(shader->ir, NULL);
   1070       printf("\n\n");
   1071    }
   1072 
   1073    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
   1074       if (shader->compiled_once) {
   1075          brw_vs_debug_recompile(brw, prog, &c->key);
   1076       }
   1077       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
   1078          perf_debug("VS compile took %.03f ms and stalled the GPU\n",
   1079                     (get_time() - start_time) * 1000);
   1080       }
   1081    }
   1082 
   1083    vec4_visitor v(c, prog, shader);
   1084    if (!v.run()) {
   1085       prog->LinkStatus = false;
   1086       ralloc_strcat(&prog->InfoLog, v.fail_msg);
   1087       return false;
   1088    }
   1089 
   1090    shader->compiled_once = true;
   1091 
   1092    return true;
   1093 }
   1094 
   1095 } /* extern "C" */
   1096 
   1097 } /* namespace brw */
   1098