/* (code-browser navigation header removed) */
/* Copyright © 2011 Intel Corporation
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a
      4  * copy of this software and associated documentation files (the "Software"),
      5  * to deal in the Software without restriction, including without limitation
      6  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      7  * and/or sell copies of the Software, and to permit persons to whom the
      8  * Software is furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice (including the next
     11  * paragraph) shall be included in all copies or substantial portions of the
     12  * Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     20  * IN THE SOFTWARE.
     21  */
     22 
     23 #include "brw_vec4.h"
     24 #include "brw_cfg.h"
     25 #include "brw_eu.h"
     26 #include "common/gen_debug.h"
     27 
     28 using namespace brw;
     29 
     30 static void
     31 generate_math1_gen4(struct brw_codegen *p,
     32                     vec4_instruction *inst,
     33                     struct brw_reg dst,
     34                     struct brw_reg src)
     35 {
     36    gen4_math(p,
     37 	     dst,
     38 	     brw_math_function(inst->opcode),
     39 	     inst->base_mrf,
     40 	     src,
     41 	     BRW_MATH_PRECISION_FULL);
     42 }
     43 
     44 static void
     45 check_gen6_math_src_arg(struct brw_reg src)
     46 {
     47    /* Source swizzles are ignored. */
     48    assert(!src.abs);
     49    assert(!src.negate);
     50    assert(src.swizzle == BRW_SWIZZLE_XYZW);
     51 }
     52 
     53 static void
     54 generate_math_gen6(struct brw_codegen *p,
     55                    vec4_instruction *inst,
     56                    struct brw_reg dst,
     57                    struct brw_reg src0,
     58                    struct brw_reg src1)
     59 {
     60    /* Can't do writemask because math can't be align16. */
     61    assert(dst.writemask == WRITEMASK_XYZW);
     62    /* Source swizzles are ignored. */
     63    check_gen6_math_src_arg(src0);
     64    if (src1.file == BRW_GENERAL_REGISTER_FILE)
     65       check_gen6_math_src_arg(src1);
     66 
     67    brw_set_default_access_mode(p, BRW_ALIGN_1);
     68    gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
     69    brw_set_default_access_mode(p, BRW_ALIGN_16);
     70 }
     71 
/* Emit a two-source math instruction for Gen4/Gen5 using the
 * message-based math unit.  The second operand is staged in
 * base_mrf + 1 before the math message referencing base_mrf is sent.
 */
static void
generate_math2_gen4(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    * "Message Payload":
    *
    * "Operand0[7].  For the INT DIV functions, this operand is the
    *  denominator."
    *  ...
    * "Operand1[7].  For the INT DIV functions, this operand is the
    *  numerator."
    *
    * So INT DIV expects (denominator, numerator) while POW takes its
    * operands in source order; swap via references for the INT DIV case.
    */
   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   struct brw_reg &op0 = is_int_div ? src1 : src0;
   struct brw_reg &op1 = is_int_div ? src0 : src1;

   /* Stage the second operand in the MRF with saturate and predication
    * disabled so the copy is exact and unconditional.
    */
   brw_push_insn_state(p);
   brw_set_default_saturate(p, false);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
   brw_pop_insn_state(p);

   gen4_math(p,
             dst,
             brw_math_function(inst->opcode),
             inst->base_mrf,
             op0,
             BRW_MATH_PRECISION_FULL);
}
    105 
/* Generate code for a vec4 texturing instruction (sample, fetch,
 * resinfo, gather, ...).
 *
 * Selects the hardware sampler message type from the IR opcode and
 * device generation, sets up the message header when one is required
 * (texel offsets, SKL SIMD4x2 mode bit, HS/GS payload quirks), and then
 * emits either a direct SAMPLE message (when both surface and sampler
 * indices are immediates) or an indirect send whose descriptor is built
 * in the address register a0.0.
 */
static void
generate_tex(struct brw_codegen *p,
             struct brw_vue_prog_data *prog_data,
             gl_shader_stage stage,
             vec4_instruction *inst,
             struct brw_reg dst,
             struct brw_reg src,
             struct brw_reg surface_index,
             struct brw_reg sampler_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int msg_type = -1;   /* sampler message type; -1 marks "not chosen yet" */

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   } else {
      /* Gen4 SIMD4x2 messages.  The mlen asserts document the payload
       * sizes the visitor is expected to have produced for each message.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   }

   assert(msg_type != -1);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
         uint32_t dw2 = 0;

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         if (inst->offset)
            /* Set the texel offset bits in DWord 2. */
            dw2 = inst->offset;

         if (devinfo->gen >= 9)
            /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
             * based on bit 22 in the header.
             */
            dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;

         /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
          * so header0.2 is 0 when g0 is copied.  The HS and GS stages do
          * not, so we must set it to 0 to avoid setting undesirable bits
          * in the message header.
          */
         if (dw2 ||
             stage == MESA_SHADER_TESS_CTRL ||
             stage == MESA_SHADER_GEOMETRY) {
            brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
         }

         brw_adjust_sampler_state_pointer(p, header, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   /* Return format requested from the sampler, derived from the
    * destination register type (float unless an integer result is wanted).
    */
   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Gather operations use a separate section of the binding table. */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->base.binding_table.gather_texture_start
         : prog_data->base.binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      /* Both indices known at compile time: emit a plain SAMPLE message.
       * Sampler indices >= 16 are handled via the header (see
       * brw_adjust_sampler_state_pointer above), so the descriptor field
       * only holds the index modulo 16.
       */
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 dst,
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 1, /* response length */
                 inst->mlen,
                 inst->header_size != 0,
                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                 return_format);

      brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
   } else {
      /* Non-constant sampler index. */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* Build "sampler << 8 | surface" in a0.0.  When the two indices
       * are the same register, a single MUL by 0x101 replicates the
       * value into both byte fields (x * 0x101 == x | x << 8).
       */
      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      /* Keep only the low 12 bits used by the message descriptor. */
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      if (inst->base_mrf != -1)
         gen6_resolve_implied_move(p, &src, inst->base_mrf);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              1 /* rlen */,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              return_format);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
    344 
    345 static void
    346 generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
    347 {
    348    brw_urb_WRITE(p,
    349 		 brw_null_reg(), /* dest */
    350 		 inst->base_mrf, /* starting mrf reg nr */
    351 		 brw_vec8_grf(0, 0), /* src */
    352                  inst->urb_write_flags,
    353 		 inst->mlen,
    354 		 0,		/* response len */
    355 		 inst->offset,	/* urb destination offset */
    356 		 BRW_URB_SWIZZLE_INTERLEAVE);
    357 }
    358 
    359 static void
    360 generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
    361 {
    362    struct brw_reg src = brw_message_reg(inst->base_mrf);
    363    brw_urb_WRITE(p,
    364                  brw_null_reg(), /* dest */
    365                  inst->base_mrf, /* starting mrf reg nr */
    366                  src,
    367                  inst->urb_write_flags,
    368                  inst->mlen,
    369                  0,             /* response len */
    370                  inst->offset,  /* urb destination offset */
    371                  BRW_URB_SWIZZLE_INTERLEAVE);
    372 }
    373 
/* Emit a GS URB write that also requests allocation of a new URB handle.
 *
 * The writeback phase (carrying the newly allocated handle) lands in the
 * temporary register given in src[0]; the handle is then copied into
 * DWord 0 of the instruction's destination.
 */
static void
generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);

   /* We pass the temporary passed in src0 as the writeback register */
   brw_urb_WRITE(p,
                 inst->src[0].as_brw_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
                 inst->mlen,
                 1, /* response len */
                 inst->offset,  /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Now put allocated urb handle in dst.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
           get_element_ud(inst->src[0].as_brw_reg(), 0));
   brw_pop_insn_state(p);
}
    398 
    399 static void
    400 generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
    401 {
    402    struct brw_reg src = brw_message_reg(inst->base_mrf);
    403    brw_urb_WRITE(p,
    404                  brw_null_reg(), /* dest */
    405                  inst->base_mrf, /* starting mrf reg nr */
    406                  src,
    407                  BRW_URB_WRITE_EOT | inst->urb_write_flags,
    408                  inst->mlen,
    409                  0,              /* response len */
    410                  0,              /* urb destination offset */
    411                  BRW_URB_SWIZZLE_INTERLEAVE);
    412 }
    413 
/* Compute the per-slot URB write offsets and store them in DWords 3 and
 * 4 of the URB message header in dst.  src0 supplies the slot indices
 * (x components for GS invocations 0 and 1); src1 is an immediate
 * scale factor in 256-bit units.
 */
static void
generate_gs_set_write_offset(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src0,
                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   assert(p->devinfo->gen >= 7 &&
          src1.file == BRW_IMMEDIATE_VALUE &&
          src1.type == BRW_REGISTER_TYPE_UD &&
          src1.ud <= USHRT_MAX);
   if (src0.file == BRW_IMMEDIATE_VALUE) {
      /* Both operands are immediates: fold the multiply at compile time
       * and broadcast the product to both slot-offset DWords.
       */
      brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
              brw_imm_ud(src0.ud * src1.ud));
   } else {
      /* The <8;2,4> region picks up DWords 0 and 4 of src0. */
      brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
              retype(src1, BRW_REGISTER_TYPE_UW));
   }
   brw_pop_insn_state(p);
}
    454 
/* Store the emitted-vertex count where the thread-end URB write expects
 * it.  On Gen8+ the count is simply moved into the MRF following dst;
 * on earlier gens the counts for both interleaved GS invocations are
 * packed as WORDs into DWord 2 of dst (see the detailed comment below).
 */
static void
generate_gs_set_vertex_count(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   if (p->devinfo->gen >= 8) {
      /* Move the vertex count into the second MRF for the EOT write. */
      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
              src);
   } else {
      /* If we think of the src and dst registers as composed of 8 DWORDs each,
       * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
       * them to WORDs, and then pack them into DWORD 2 of dst.
       *
       * It's easier to get the EU to do this if we think of the src and dst
       * registers as composed of 16 WORDS each; then, we want to pick up the
       * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
       * of dst.
       *
       * We can do that by the following EU instruction:
       *
       *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_MOV(p,
              suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
              stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   }
   brw_pop_insn_state(p);
}
    488 
/* Emit a streamed-vertex-buffer (transform feedback) write.
 *
 * Copies the vertex data from src0 into the message payload (dst), sends
 * the SVB write to the binding table entry selected by sol_binding, and,
 * on the final write, requests a write commit and waits for it.
 */
static void
generate_gs_svb_write(struct brw_codegen *p,
                      struct brw_vue_prog_data *prog_data,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg src0,
                      struct brw_reg src1)
{
   int binding = inst->sol_binding;
   bool final_write = inst->sol_final_write;

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_4);
   /* Copy Vertex data into M0.x */
   brw_MOV(p, stride(dst, 4, 4, 1),
           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
   brw_pop_insn_state(p);

   brw_push_insn_state(p);
   /* Send SVB Write */
   brw_svb_write(p,
                 final_write ? src1 : brw_null_reg(), /* dest == src1 */
                 1, /* msg_reg_nr */
                 dst, /* src0 == previous dst */
                 BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */
                 final_write); /* send_commit_msg */

   /* Finally, wait for the write commit to occur so that we can proceed to
    * other things safely.
    *
    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    *
    *   The write commit does not modify the destination register, but
    *   merely clears the dependency associated with the destination
    *   register. Thus, a simple mov instruction using the register as a
    *   source is sufficient to wait for the write commit to occur.
    */
   if (final_write) {
      brw_MOV(p, src1, src1);
   }
   brw_pop_insn_state(p);
}
    531 
    532 static void
    533 generate_gs_svb_set_destination_index(struct brw_codegen *p,
    534                                       vec4_instruction *inst,
    535                                       struct brw_reg dst,
    536                                       struct brw_reg src)
    537 {
    538    int vertex = inst->sol_vertex;
    539    brw_push_insn_state(p);
    540    brw_set_default_access_mode(p, BRW_ALIGN_1);
    541    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    542    brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
    543    brw_pop_insn_state(p);
    544 }
    545 
    546 static void
    547 generate_gs_set_dword_2(struct brw_codegen *p,
    548                         struct brw_reg dst,
    549                         struct brw_reg src)
    550 {
    551    brw_push_insn_state(p);
    552    brw_set_default_access_mode(p, BRW_ALIGN_1);
    553    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    554    brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
    555    brw_pop_insn_state(p);
    556 }
    557 
    558 static void
    559 generate_gs_prepare_channel_masks(struct brw_codegen *p,
    560                                   struct brw_reg dst)
    561 {
    562    /* We want to left shift just DWORD 4 (the x component belonging to the
    563     * second geometry shader invocation) by 4 bits.  So generate the
    564     * instruction:
    565     *
    566     *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
    567     */
    568    dst = suboffset(vec1(dst), 4);
    569    brw_push_insn_state(p);
    570    brw_set_default_access_mode(p, BRW_ALIGN_1);
    571    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    572    brw_SHL(p, dst, dst, brw_imm_ud(4));
    573    brw_pop_insn_state(p);
    574 }
    575 
/* OR together the channel-enable masks for GS invocations 0 and 1
 * (held in DWORDs 0 and 4 of src, as prepared by
 * GS_OPCODE_PREPARE_CHANNEL_MASKS) and store the result in bits 15:8 of
 * DWORD 5 of the URB write header in dst.
 */
static void
generate_gs_set_channel_masks(struct brw_codegen *p,
                              struct brw_reg dst,
                              struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable.  For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message.  For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:0 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
}
    639 
    640 static void
    641 generate_gs_get_instance_id(struct brw_codegen *p,
    642                             struct brw_reg dst)
    643 {
    644    /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    645     * and store into dst.0 & dst.4. So generate the instruction:
    646     *
    647     *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
    648     */
    649    brw_push_insn_state(p);
    650    brw_set_default_access_mode(p, BRW_ALIGN_1);
    651    dst = retype(dst, BRW_REGISTER_TYPE_UD);
    652    struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    653    brw_SHR(p, dst, stride(r0, 1, 4, 0),
    654            brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
    655    brw_pop_insn_state(p);
    656 }
    657 
/* Pack two 16-bit counters into dst.0 for the FF_SYNC message header:
 * src0 goes into bits 31:16 and src1 into bits 15:0.
 *
 * NOTE: src2 is clobbered — it is used as scratch space to mask src1.
 */
static void
generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
                                   struct brw_reg dst,
                                   struct brw_reg src0,
                                   struct brw_reg src1,
                                   struct brw_reg src2)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   /* Save src0 data in 16:31 bits of dst.0 */
   brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
           brw_imm_ud(0xffffu));
   brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
   /* Save src1 data in 0:15 bits of dst.0 */
   brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
           brw_imm_ud(0xffffu));
   brw_OR(p, suboffset(vec1(dst), 0),
          suboffset(vec1(dst), 0),
          suboffset(vec1(src2), 0));
   brw_pop_insn_state(p);
}
    679 
/* Emit the Gen6 GS FF_SYNC message, which allocates the initial URB
 * handle for the thread.  src0 and src1 carry the primitive/SO counters
 * to be placed in the message header; dst receives the allocated handle,
 * which is then copied back into header.0 for subsequent URB writes.
 */
static void
generate_gs_ff_sync(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* This opcode uses an implied MRF register for:
    *  - the header of the ff_sync message. And as such it is expected to be
    *    initialized to r0 before calling here.
    *  - the destination where we will write the allocated URB handle.
    */
   struct brw_reg header =
      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

   /* Overwrite dword 0 of the header (SO vertices to write) and
    * dword 1 (number of primitives written).
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
   brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
   brw_pop_insn_state(p);

   /* Allocate URB handle in dst */
   brw_ff_sync(p,
               dst,
               0,
               header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);

   /* Now put allocated urb handle in header.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));

   /* src1 is not an immediate when we use transform feedback */
   if (src1.file != BRW_IMMEDIATE_VALUE) {
      brw_set_default_exec_size(p, BRW_EXECUTE_4);
      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
   }

   brw_pop_insn_state(p);
}
    728 
    729 static void
    730 generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
    731 {
    732    /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
    733    struct brw_reg src = brw_vec8_grf(0, 0);
    734    brw_push_insn_state(p);
    735    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    736    brw_set_default_access_mode(p, BRW_ALIGN_1);
    737    brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
    738    brw_pop_insn_state(p);
    739 }
    740 
/* Read the "Instance Count" field from the thread payload and turn it into
 * the pair of instance IDs (2i, 2i+1) handled by this SIMD4x2 thread,
 * written to dwords 0 and 4 of dst.
 */
static void
generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;

   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
    *
    * Since we operate in SIMD4x2 mode, we need run half as many threads
    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
    * shift right by one less to accomplish the multiplication by two.
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   /* IVB/BYT place the field one bit lower (22:16) than later gens. */
   const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
   const int shift = ivb ? 16 : 17;

   /* dst.0 = 2 * instance count (shift right by one less than the LSB). */
   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
           brw_imm_ud(shift - 1));
   /* dst.4 = dst.0 + 1, the second half of the SIMD4x2 pair. */
   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));

   brw_pop_insn_state(p);
}
    769 
/* Emit an OWord URB write SEND using the header already built in
 * urb_header.  Sets EOT when the instruction's urb_write_flags request it;
 * otherwise enables per-slot offsets and interleaved swizzling.
 */
static void
generate_tcs_urb_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg urb_header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, urb_header);

   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              inst->mlen /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
   /* Constant part of the URB offset comes from the instruction itself. */
   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
   if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
      brw_inst_set_eot(devinfo, send, 1);
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
      brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   }

   /* what happens to swizzles? */
}
    795 
    796 
static void
generate_tcs_input_urb_offsets(struct brw_codegen *p,
                               struct brw_reg dst,
                               struct brw_reg vertex,
                               struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation.
    * Inputs are a vertex index, and a byte offset from the beginning of
    * the vertex. */

   /* If `vertex` is not an immediate, we clobber a0.0 */

   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);

   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   /* Start from an all-zero header. */
   brw_MOV(p, dst, brw_imm_ud(0));

   /* m0.5 bits 8-15 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* m0.0-0.1: URB handles */
   if (vertex.file == BRW_IMMEDIATE_VALUE) {
      uint32_t vertex_index = vertex.ud;
      /* ICP handles are single dwords starting at g1.0, 8 per register. */
      struct brw_reg index_reg = brw_vec1_grf(
            1 + (vertex_index >> 3), vertex_index & 7);

      brw_MOV(p, vec2(get_element_ud(dst, 0)),
              retype(index_reg, BRW_REGISTER_TYPE_UD));
   } else {
      /* Use indirect addressing.  ICP Handles are DWords (single channels
       * of a register) and start at g1.0.
       *
       * In order to start our region at g1.0, we add 8 to the vertex index,
       * effectively skipping over the 8 channels in g0.0.  This gives us a
       * DWord offset to the ICP Handle.
       *
       * Indirect addressing works in terms of bytes, so we then multiply
       * the DWord offset by 4 (by shifting left by 2).
       */
      struct brw_reg addr = brw_address_reg(0);

      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));

      /* top half: m0.1 = g[1.0 + vertex.4]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
   }

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}
    862 
    863 
static void
generate_tcs_output_urb_offsets(struct brw_codegen *p,
                                struct brw_reg dst,
                                struct brw_reg write_mask,
                                struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);

   /* The channel-enable mask must be a compile-time constant. */
   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
   assert(write_mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   /* Start from an all-zero header. */
   brw_MOV(p, dst, brw_imm_ud(0));

   unsigned mask = write_mask.ud;

   /* m0.5 bits 15:12 and 11:8 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));

   /* HS patch URB handle is delivered in r0.0 */
   struct brw_reg urb_handle = brw_vec1_grf(0, 0);

   /* m0.0-0.1: URB handles */
   brw_MOV(p, vec2(get_element_ud(dst, 0)),
           retype(urb_handle, BRW_REGISTER_TYPE_UD));

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}
    900 
/* Build the URB read message header for TES input loads in dst: a zeroed
 * header, all eight channel enables set, and the patch URB handle (from
 * g1.3) replicated into m0.0/m0.1.
 */
static void
generate_tes_create_input_read_header(struct brw_codegen *p,
                                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Initialize the register to 0 */
   brw_MOV(p, dst, brw_imm_ud(0));

   /* Enable all the channels in m0.5 bits 15:8 */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1.  For safety,
    * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
    */
   brw_AND(p, vec2(get_element_ud(dst, 0)),
           retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(0x1fff));
   brw_pop_insn_state(p);
}
    923 
/* Copy an existing URB message header into dst and patch in a per-channel
 * (indirect) URB offset in m0.3-0.4.
 */
static void
generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
                                     struct brw_reg dst,
                                     struct brw_reg header,
                                     struct brw_reg offset)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, dst, header);
   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}
    940 
/* Emit an interleaved OWord URB read: one message register of header, one
 * register of response written to dst.  The constant part of the URB
 * offset comes from inst->offset; per-slot offsets are enabled so the
 * header can supply the variable part.
 */
static void
generate_vec4_urb_read(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);

   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              1 /* mlen */, 1 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);

   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
}
    965 
/* Release the pair of input (ICP) URB handles for `vertex` (which must be
 * an immediate) by issuing a zero-response URB read with the "complete"
 * bit set.  is_unpaired.ud selects BRW_URB_SWIZZLE_NONE instead of the
 * interleaved swizzle.  `header` is a scratch GRF used to build the
 * message header.
 */
static void
generate_tcs_release_input(struct brw_codegen *p,
                           struct brw_reg header,
                           struct brw_reg vertex,
                           struct brw_reg is_unpaired)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(vertex.file == BRW_IMMEDIATE_VALUE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD);

   /* m0.0-0.1: URB handles */
   struct brw_reg urb_handles =
      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
             BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
   brw_pop_insn_state(p);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, header);
   brw_set_message_descriptor(p, send, BRW_SFID_URB,
                              1 /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   /* "complete" marks the handles as done, releasing them. */
   brw_inst_set_urb_complete(devinfo, send, 1);
   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
                                    BRW_URB_SWIZZLE_NONE :
                                    BRW_URB_SWIZZLE_INTERLEAVE);
}
   1001 
/* Terminate a TCS thread with a final EOT URB write: the header gets r0's
 * URB handle and a single-channel (X) write mask, and the following
 * message register carries one dword of zero data.
 */
static void
generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg header = brw_message_reg(inst->base_mrf);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   /* m0.5 bits 15:8 are channel enables; enable only X. */
   brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
   /* m0.0: URB handle, delivered in r0.0. */
   brw_MOV(p, get_element_ud(header, 0),
           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   /* One register of zero payload data. */
   brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
   brw_pop_insn_state(p);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 header,
                 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
                 BRW_URB_WRITE_USE_CHANNEL_MASKS,
                 inst->mlen,
                 0,              /* response len */
                 0,              /* urb destination offset */
                 0);
}
   1028 
   1029 static void
   1030 generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
   1031 {
   1032    brw_push_insn_state(p);
   1033    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1034    brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
   1035    brw_pop_insn_state(p);
   1036 }
   1037 
   1038 static void
   1039 generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
   1040 {
   1041    brw_push_insn_state(p);
   1042    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1043    brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   1044    brw_pop_insn_state(p);
   1045 }
   1046 
/* Build a barrier message header in dst: zero it, move the hardware-
 * assigned Barrier ID from r0.2 up into bits 27:24, and OR in the barrier
 * count (the number of TCS instances) and the enable bit.
 */
static void
generate_tcs_create_barrier_header(struct brw_codegen *p,
                                   struct brw_vue_prog_data *prog_data,
                                   struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
   struct brw_reg m0_2 = get_element_ud(dst, 2);
   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Zero the message header */
   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));

   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
   brw_AND(p, m0_2,
           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));

   /* Shift it up to bits 27:24. */
   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));

   /* Set the Barrier Count and the enable bit */
   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));

   brw_pop_insn_state(p);
}
   1077 
/* Fill in M1 of an OWord dual-block message: M1.0 gets the first block
 * offset (index.0) and M1.4 gets the second (index.4 plus a device-
 * dependent delta).
 */
static void
generate_oword_dual_block_offsets(struct brw_codegen *p,
                                  struct brw_reg m1,
                                  struct brw_reg index)
{
   int second_vertex_offset;

   /* NOTE(review): the second-block delta is 1 on gen6+ but 16 before —
    * presumably a row vs. byte addressing difference; confirm in the PRM.
    */
   if (p->devinfo->gen >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Fold the delta into the immediate rather than emitting an ADD. */
      index_4.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
   } else {
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   brw_pop_insn_state(p);
}
   1115 
   1116 static void
   1117 generate_unpack_flags(struct brw_codegen *p,
   1118                       struct brw_reg dst)
   1119 {
   1120    brw_push_insn_state(p);
   1121    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1122    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1123 
   1124    struct brw_reg flags = brw_flag_reg(0, 0);
   1125    struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   1126    struct brw_reg dst_4 = suboffset(vec1(dst), 4);
   1127 
   1128    brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   1129    brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   1130    brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));
   1131 
   1132    brw_pop_insn_state(p);
   1133 }
   1134 
/* Read one register from the thread's scratch space into dst via an OWord
 * dual-block read: a header derived from r0 plus a message register of
 * block offsets computed from `index`.
 */
static void
generate_scratch_read(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
				     index);

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   const unsigned target_cache =
      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      /* NOTE(review): pre-gen6 stores base_mrf in the cond-modifier field —
       * presumably the shared destreg/conditionalmod encoding; confirm.
       */
      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
                           brw_scratch_surface_idx(p),
			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
			   msg_type, target_cache,
			   2, /* mlen */
                           true, /* header_present */
			   1 /* rlen */);
}
   1179 
/* Write one register of data (src) to the thread's scratch space via an
 * OWord dual-block write: header from r0, block offsets from `index`,
 * and one register of payload.  Honors the instruction's predicate on
 * the SEND itself (not on the header setup).
 */
static void
generate_scratch_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg src,
                       struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_default_predicate_control(p, false);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
				     index);

   /* Payload: the data to write, in the register after the offsets. */
   brw_MOV(p,
	   retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
	   retype(src, BRW_REGISTER_TYPE_D));

   uint32_t msg_type;

   if (devinfo->gen >= 7)
      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   /* Re-apply the instruction's predicate for the SEND itself. */
   brw_set_default_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (devinfo->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_write_message(p, send,
                            brw_scratch_surface_idx(p),
			    BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
			    msg_type,
                            target_cache,
			    3, /* mlen */
			    true, /* header present */
			    false, /* not a render target write */
			    write_commit, /* rlen */
			    false, /* eot */
			    write_commit);
}
   1259 
/* Load a vec4 from the pull-constant buffer `index` (an immediate surface
 * index) at byte offset `offset` into dst, via an OWord dual-block read.
 * On gen6+ the message takes an OWord (16-byte) offset, so the byte
 * offset is divided by 16; earlier gens pass the offset through as-is.
 */
static void
generate_pull_constant_load(struct brw_codegen *p,
                            struct brw_vue_prog_data *prog_data,
                            vec4_instruction *inst,
                            struct brw_reg dst,
                            struct brw_reg index,
                            struct brw_reg offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
	  index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   if (devinfo->gen >= 6) {
      if (offset.file == BRW_IMMEDIATE_VALUE) {
         /* Byte offset -> OWord offset, folded at compile time. */
         brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 brw_imm_d(offset.ud >> 4));
      } else {
         /* Byte offset -> OWord offset, computed at runtime. */
         brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 offset, brw_imm_d(4));
      }
   } else {
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                        BRW_REGISTER_TYPE_D),
              offset);
   }

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_dp_read_message(p, send,
			   surf_index,
			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
			   msg_type,
                           target_cache,
			   2, /* mlen */
                           true, /* header_present */
			   1 /* rlen */);
}
   1322 
/* Query the size of the buffer backing surface `surf_index` (an immediate)
 * by issuing a sampler "resinfo" message; the result is written to dst as
 * 32-bit signed integers.
 */
static void
generate_get_buffer_size(struct brw_codegen *p,
                         struct brw_vue_prog_data *prog_data,
                         vec4_instruction *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg surf_index)
{
   assert(p->devinfo->gen >= 7);
   assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              1, /* response length */
              inst->mlen,
              inst->header_size > 0,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   /* Record the surface use so state upload sets up the binding table. */
   brw_mark_surface_used(&prog_data->base, surf_index.ud);
}
   1350 
/* Gen7+ pull-constant load using the sampler LD message.  When the surface
 * index is an immediate, the message descriptor is static; otherwise the
 * index is masked into a0.0 and an indirect send is used.
 */
static void
generate_pull_constant_load_gen7(struct brw_codegen *p,
                                 struct brw_vue_prog_data *prog_data,
                                 vec4_instruction *inst,
                                 struct brw_reg dst,
                                 struct brw_reg surf_index,
                                 struct brw_reg offset)
{
   assert(surf_index.type == BRW_REGISTER_TYPE_UD);

   if (surf_index.file == BRW_IMMEDIATE_VALUE) {

      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, dst);
      brw_set_src0(p, insn, offset);
      brw_set_sampler_message(p, insn,
                              surf_index.ud,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(&prog_data->base, surf_index.ud);

   } else {

      /* Non-constant surface index: build the descriptor at runtime. */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1 /* rlen */,
                              inst->mlen,
                              inst->header_size != 0,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
   }
}
   1409 
/* Build a gen9 message header in dst: a copy of r0 with the SIMD4x2 mode
 * extension (GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2) written into
 * dword 2.
 */
static void
generate_set_simd4x2_header_gen9(struct brw_codegen *p,
                                 vec4_instruction *inst,
                                 struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Copy the whole of r0 into the header register. */
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 2),
           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));

   brw_pop_insn_state(p);
}
   1427 
/* Move a vec4 from reg[indirect] into dst.  With an immediate indirect the
 * byte offset is folded directly into the source region; otherwise a0 is
 * loaded with per-channel byte offsets and a VxH indirect source is used.
 * `length` is accepted for the opcode's signature but unused here.
 */
static void
generate_mov_indirect(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst, struct brw_reg reg,
                      struct brw_reg indirect, struct brw_reg length)
{
   assert(indirect.type == BRW_REGISTER_TYPE_UD);
   assert(p->devinfo->gen >= 6);

   /* Base byte offset of `reg`; subnr counts half-register (16B) steps. */
   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);

   /* This instruction acts in align1 mode */
   assert(dst.writemask == WRITEMASK_XYZW);

   if (indirect.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect.ud;

      /* Decompose the total byte offset back into nr/subnr/swizzle. */
      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
      unsigned shift = (imm_byte_offset / 4) % 4;
      reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

      brw_MOV(p, dst, reg);
   } else {
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      struct brw_reg addr = vec8(brw_address_reg(0));

      /* We need to move the indirect value into the address register.  In
       * order to make things make some sense, we want to respect at least the
       * X component of the swizzle.  In order to do that, we need to convert
       * the subnr (probably 0) to an align1 subnr and add in the swizzle.
       */
      assert(brw_is_single_value_swizzle(indirect.swizzle));
      indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));

      /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
       * the indirect and splat it out to all four channels of the given half
       * of a0.
       */
      indirect.subnr *= 2;
      indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
      brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));

      /* Now we need to incorporate the swizzle from the source register */
      if (reg.swizzle != BRW_SWIZZLE_XXXX) {
         /* Per-channel dword deltas (swz * 4 bytes), replicated into both
          * halves of a UV immediate.
          */
         uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
                            BRW_GET_SWZ(reg.swizzle, 1) << 6 |
                            BRW_GET_SWZ(reg.swizzle, 2) << 10 |
                            BRW_GET_SWZ(reg.swizzle, 3) << 14;
         uv_swiz |= uv_swiz << 16;

         brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
      }

      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));

      brw_pop_insn_state(p);
   }
}
   1490 
static void
generate_code(struct brw_codegen *p,
              const struct brw_compiler *compiler,
              void *log_data,
              const nir_shader *nir,
              struct brw_vue_prog_data *prog_data,
              const struct cfg_t *cfg)
{
   /* Walk the vec4 IR control-flow graph and lower every instruction to
    * native Gen EU code via the brw_eu emitter "p".  Also performs
    * post-emission fixups (conditional mod / dependency-control hints),
    * jump-target patching, instruction validation and compaction, and
    * optional disassembly dumping / statistics logging.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
   bool debug_flag = INTEL_DEBUG &
      intel_debug_flag_for_shader_stage(nir->info.stage);
   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag))
         disasm_annotate(disasm_info, inst, p->next_insn_offset);

      /* Lower IR operands to hardware register descriptions. */
      for (unsigned int i = 0; i < 3; i++) {
         src[i] = inst->src[i].as_brw_reg();
      }
      dst = inst->dst.as_brw_reg();

      /* Load the per-instruction state into the emitter's defaults so the
       * brw_* helpers below inherit it.
       */
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      assert(inst->group % inst->exec_size == 0);
      assert(inst->group % 4 == 0);

      /* There are some instructions where the destination is 64-bit
       * but we retype it to a smaller type. In that case, we cannot
       * double the exec_size.
       */
      const bool is_df = (get_exec_type_size(inst) == 8 ||
                          inst->dst.type == BRW_REGISTER_TYPE_DF) &&
                         inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT &&
                         inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT &&
                         inst->opcode != VEC4_OPCODE_SET_LOW_32BIT &&
                         inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT;

      /* IVB/BYT (gen7, non-Haswell) require doubled exec size for DF ops. */
      unsigned exec_size = inst->exec_size;
      if (devinfo->gen == 7 && !devinfo->is_haswell && is_df)
         exec_size *= 2;

      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      if (!inst->force_writemask_all)
         brw_set_default_group(p, inst->group);

      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      /* Remember where this IR instruction's native code starts, so the
       * post-switch fixup below can find the (single) emitted instruction.
       */
      unsigned pre_emit_nr_insn = p->nr_insn;

      switch (inst->opcode) {
      case VEC4_OPCODE_UNPACK_UNIFORM:
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
         brw_MAD(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DPH:
         brw_DPH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DP4:
         brw_DP4(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DP3:
         brw_DP3(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_DP2:
         brw_DP2(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;

      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         brw_LRP(p, dst, src[0], src[1], src[2]);
         break;

      /* The bit-manipulation instructions below operate on integer types
       * regardless of the IR types, hence the explicit retypes.
       */
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
         if (!inst->src[0].is_null()) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
            brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, BRW_EXECUTE_8);
         break;

      /* BREAK/CONT consume the default predicate themselves; clear it so it
       * doesn't leak onto subsequently emitted instructions.
       */
      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* Unary math: native instruction on gen7+, math message on gen6,
          * legacy MRF-based math message on gen4/5.
          */
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
                      brw_null_reg());
         } else if (devinfo->gen == 6) {
            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
         } else {
            generate_math1_gen4(p, inst, dst, src[0]);
         }
         break;

      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* Binary math, same per-gen split as the unary case above. */
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else if (devinfo->gen == 6) {
            generate_math_gen6(p, inst, dst, src[0], src[1]);
         } else {
            generate_math2_gen4(p, inst, dst, src[0], src[1]);
         }
         break;

      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(p, prog_data, nir->info.stage,
                      inst, dst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_URB_WRITE:
         generate_vs_urb_write(p, inst);
         break;

      /* Scratch accesses come from register spilling; count them for the
       * statistics reported at the end.
       */
      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(p, inst, dst, src[0]);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(p, inst, dst, src[0], src[1]);
         spill_count++;
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
         generate_set_simd4x2_header_gen9(p, inst, dst);
         break;

      case GS_OPCODE_URB_WRITE:
         generate_gs_urb_write(p, inst);
         break;

      case GS_OPCODE_URB_WRITE_ALLOCATE:
         generate_gs_urb_write_allocate(p, inst);
         break;

      case GS_OPCODE_SVB_WRITE:
         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SVB_SET_DST_INDEX:
         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
         break;

      case GS_OPCODE_THREAD_END:
         generate_gs_thread_end(p, inst);
         break;

      case GS_OPCODE_SET_WRITE_OFFSET:
         generate_gs_set_write_offset(p, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SET_VERTEX_COUNT:
         generate_gs_set_vertex_count(p, dst, src[0]);
         break;

      case GS_OPCODE_FF_SYNC:
         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
         break;

      case GS_OPCODE_SET_PRIMITIVE_ID:
         generate_gs_set_primitive_id(p, dst);
         break;

      case GS_OPCODE_SET_DWORD_2:
         generate_gs_set_dword_2(p, dst, src[0]);
         break;

      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
         generate_gs_prepare_channel_masks(p, dst);
         break;

      case GS_OPCODE_SET_CHANNEL_MASKS:
         generate_gs_set_channel_masks(p, dst, src[0]);
         break;

      case GS_OPCODE_GET_INSTANCE_ID:
         generate_gs_get_instance_id(p, dst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         brw_shader_time_add(p, src[0],
                             prog_data->base.binding_table.shader_time_start);
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.shader_time_start);
         break;

      /* Surface messages: src[2] carries an immediate (surface index or
       * atomic op / channel mask, depending on the opcode).
       */
      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                            !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                  src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
                                   src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                          !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
                                src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen,
                                 src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         /* With packed dispatch all channels are known live, so a constant
          * all-ones mask can be used instead of reading the dispatch mask.
          */
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, nir->info.stage,
                                          &prog_data->base) ? brw_imm_ud(~0u) :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
         generate_unpack_flags(p, dst);
         break;

      case VEC4_OPCODE_MOV_BYTES: {
         /* Moves the low byte from each channel, using an Align1 access mode
          * and a <4,1,0> source region.
          */
         assert(src[0].type == BRW_REGISTER_TYPE_UB ||
                src[0].type == BRW_REGISTER_TYPE_B);

         brw_set_default_access_mode(p, BRW_ALIGN_1);
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         brw_MOV(p, dst, src[0]);
         /* Restore the vec4 backend's default access mode. */
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_DOUBLE_TO_F32:
      case VEC4_OPCODE_DOUBLE_TO_D32:
      case VEC4_OPCODE_DOUBLE_TO_U32: {
         /* Down-convert a DF source; the IR keeps the destination typed as
          * 64-bit, so retype it here to the real 32-bit result type.
          */
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 8);

         brw_reg_type dst_type;

         switch (inst->opcode) {
         case VEC4_OPCODE_DOUBLE_TO_F32:
            dst_type = BRW_REGISTER_TYPE_F;
            break;
         case VEC4_OPCODE_DOUBLE_TO_D32:
            dst_type = BRW_REGISTER_TYPE_D;
            break;
         case VEC4_OPCODE_DOUBLE_TO_U32:
            dst_type = BRW_REGISTER_TYPE_UD;
            break;
         default:
            unreachable("Not supported conversion");
         }
         dst = retype(dst, dst_type);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         /* When converting from DF->F, we set destination's stride as 2 as an
          * aligment requirement. But in IVB/BYT, each DF implicitly writes
          * two floats, being the first one the converted value. So we don't
          * need to explicitly set stride 2, but 1.
          */
         struct brw_reg spread_dst;
         if (devinfo->gen == 7 && !devinfo->is_haswell)
            spread_dst = stride(dst, 8, 4, 1);
         else
            spread_dst = stride(dst, 8, 4, 2);

         brw_MOV(p, spread_dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_TO_DOUBLE: {
         /* 32-bit -> DF conversion, done in align1 mode. */
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PICK_LOW_32BIT:
      case VEC4_OPCODE_PICK_HIGH_32BIT: {
         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
          */
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 4);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         dst.hstride = BRW_HORIZONTAL_STRIDE_1;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
            src[0] = suboffset(src[0], 1);
         src[0] = spread(src[0], 2);
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_SET_LOW_32BIT:
      case VEC4_OPCODE_SET_HIGH_32BIT: {
         /* Reads consecutive 32-bit elements from src[0] and writes
          * them to the low/high 32-bit of each 64-bit element in dst.
          */
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
            dst = suboffset(dst, 1);
         dst.hstride = BRW_HORIZONTAL_STRIDE_2;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PACK_BYTES: {
         /* Is effectively:
          *
          *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
          *
          * but destinations' only regioning is horizontal stride, so instead we
          * have to use two instructions:
          *
          *   mov(4) dst<1>:UB     src<4,1,0>:UB
          *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
          *
          * where they pack the four bytes from the low and high four DW.
          */
         assert(_mesa_is_pow_two(dst.writemask) &&
                dst.writemask != 0);
         unsigned offset = __builtin_ctz(dst.writemask);

         dst.type = BRW_REGISTER_TYPE_UB;

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         src[0].type = BRW_REGISTER_TYPE_UB;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         dst.subnr = offset * 4;
         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         /* Dependency-control hints are applied here by hand (rather than by
          * the generic fixup after the switch) because this IR instruction
          * expands to two native MOVs.
          */
         brw_inst_set_no_dd_clear(p->devinfo, insn, true);
         brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);

         src[0].subnr = 16;
         dst.subnr = 16 + offset * 4;
         insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, insn, true);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case TCS_OPCODE_URB_WRITE:
         generate_tcs_urb_write(p, inst, src[0]);
         break;

      case VEC4_OPCODE_URB_READ:
         generate_vec4_urb_read(p, inst, dst, src[0]);
         break;

      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_GET_INSTANCE_ID:
         generate_tcs_get_instance_id(p, dst);
         break;

      case TCS_OPCODE_GET_PRIMITIVE_ID:
         generate_tcs_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_CREATE_BARRIER_HEADER:
         generate_tcs_create_barrier_header(p, prog_data, dst);
         break;

      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
         generate_tes_create_input_read_header(p, dst);
         break;

      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
         break;

      case TES_OPCODE_GET_PRIMITIVE_ID:
         generate_tes_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_SRC0_010_IS_ZERO:
         /* If src_reg had stride like fs_reg, we wouldn't need this. */
         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
         break;

      case TCS_OPCODE_RELEASE_INPUT:
         generate_tcs_release_input(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_THREAD_END:
         generate_tcs_thread_end(p, inst);
         break;

      case SHADER_OPCODE_BARRIER:
         brw_barrier(p, src[0]);
         brw_WAIT(p);
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_DIM:
         /* DIM (DF immediate move) exists only on Haswell. */
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");
      }

      /* Post-emission fixup: conditional mod and dependency-control hints
       * are only valid when the IR instruction expanded to exactly one
       * native instruction, so apply them to that instruction here.
       */
      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
         /* Handled dependency hints in the generator. */

         assert(!inst->conditional_mod);
      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[pre_emit_nr_insn];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   /* Patch the UIP/JIP jump targets now that all instructions are placed. */
   brw_set_uip_jip(p, 0);

   /* end of program sentinel */
   disasm_new_inst_group(disasm_info, p->next_insn_offset);

   /* Validate unconditionally in debug builds; in release builds only run
    * the validator when shader debugging is enabled for this stage.
    */
#ifndef NDEBUG
   bool validated =
#else
   if (unlikely(debug_flag))
#endif
      brw_validate_instructions(devinfo, p->store,
                                0, p->next_insn_offset,
                                disasm_info);

   int before_size = p->next_insn_offset;
   brw_compact_instructions(p, 0, disasm_info);
   int after_size = p->next_insn_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s %s shader %s:\n",
              nir->info.label ? nir->info.label : "unnamed",
              _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);

      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
                      "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, disasm_info);
   }
   ralloc_free(disasm_info);
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, compacted %d to %d bytes.",
                              stage_abbrev, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, before_size, after_size);

}
   2217 
   2218 extern "C" const unsigned *
   2219 brw_vec4_generate_assembly(const struct brw_compiler *compiler,
   2220                            void *log_data,
   2221                            void *mem_ctx,
   2222                            const nir_shader *nir,
   2223                            struct brw_vue_prog_data *prog_data,
   2224                            const struct cfg_t *cfg,
   2225                            unsigned *out_assembly_size)
   2226 {
   2227    struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
   2228    brw_init_codegen(compiler->devinfo, p, mem_ctx);
   2229    brw_set_default_access_mode(p, BRW_ALIGN_16);
   2230 
   2231    generate_code(p, compiler, log_data, nir, prog_data, cfg);
   2232 
   2233    return brw_get_program(p, out_assembly_size);
   2234 }
   2235