      1 /* Copyright © 2011 Intel Corporation
      2  *
      3  * Permission is hereby granted, free of charge, to any person obtaining a
      4  * copy of this software and associated documentation files (the "Software"),
      5  * to deal in the Software without restriction, including without limitation
      6  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      7  * and/or sell copies of the Software, and to permit persons to whom the
      8  * Software is furnished to do so, subject to the following conditions:
      9  *
     10  * The above copyright notice and this permission notice (including the next
     11  * paragraph) shall be included in all copies or substantial portions of the
     12  * Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     20  * IN THE SOFTWARE.
     21  */
     22 
     23 #include "brw_vec4.h"
     24 #include "brw_cfg.h"
     25 #include "brw_eu.h"
     26 #include "brw_program.h"
     27 
     28 using namespace brw;
     29 
     30 static void
     31 generate_math1_gen4(struct brw_codegen *p,
     32                     vec4_instruction *inst,
     33                     struct brw_reg dst,
     34                     struct brw_reg src)
     35 {
     36    gen4_math(p,
     37 	     dst,
     38 	     brw_math_function(inst->opcode),
     39 	     inst->base_mrf,
     40 	     src,
     41 	     BRW_MATH_PRECISION_FULL);
     42 }
     43 
     44 static void
     45 check_gen6_math_src_arg(struct brw_reg src)
     46 {
     47    /* Source swizzles are ignored. */
     48    assert(!src.abs);
     49    assert(!src.negate);
     50    assert(src.swizzle == BRW_SWIZZLE_XYZW);
     51 }
     52 
     53 static void
     54 generate_math_gen6(struct brw_codegen *p,
     55                    vec4_instruction *inst,
     56                    struct brw_reg dst,
     57                    struct brw_reg src0,
     58                    struct brw_reg src1)
     59 {
     60    /* Can't do writemask because math can't be align16. */
     61    assert(dst.writemask == WRITEMASK_XYZW);
     62    /* Source swizzles are ignored. */
     63    check_gen6_math_src_arg(src0);
     64    if (src1.file == BRW_GENERAL_REGISTER_FILE)
     65       check_gen6_math_src_arg(src1);
     66 
     67    brw_set_default_access_mode(p, BRW_ALIGN_1);
     68    gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
     69    brw_set_default_access_mode(p, BRW_ALIGN_16);
     70 }
     71 
     72 static void
     73 generate_math2_gen4(struct brw_codegen *p,
     74                     vec4_instruction *inst,
     75                     struct brw_reg dst,
     76                     struct brw_reg src0,
     77                     struct brw_reg src1)
     78 {
     79    /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
     80     * "Message Payload":
     81     *
     82     * "Operand0[7].  For the INT DIV functions, this operand is the
     83     *  denominator."
     84     *  ...
     85     * "Operand1[7].  For the INT DIV functions, this operand is the
     86     *  numerator."
     87     */
     88    bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
     89    struct brw_reg &op0 = is_int_div ? src1 : src0;
     90    struct brw_reg &op1 = is_int_div ? src0 : src1;
     91 
     92    brw_push_insn_state(p);
     93    brw_set_default_saturate(p, false);
     94    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
     95    brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
     96    brw_pop_insn_state(p);
     97 
     98    gen4_math(p,
     99 	     dst,
    100 	     brw_math_function(inst->opcode),
    101 	     inst->base_mrf,
    102 	     op0,
    103 	     BRW_MATH_PRECISION_FULL);
    104 }
    105 
    106 static void
    107 generate_tex(struct brw_codegen *p,
    108              struct brw_vue_prog_data *prog_data,
    109              gl_shader_stage stage,
    110              vec4_instruction *inst,
    111              struct brw_reg dst,
    112              struct brw_reg src,
    113              struct brw_reg surface_index,
    114              struct brw_reg sampler_index)
    115 {
    116    const struct gen_device_info *devinfo = p->devinfo;
    117    int msg_type = -1;
    118 
    119    if (devinfo->gen >= 5) {
    120       switch (inst->opcode) {
    121       case SHADER_OPCODE_TEX:
    122       case SHADER_OPCODE_TXL:
    123 	 if (inst->shadow_compare) {
    124 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
    125 	 } else {
    126 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
    127 	 }
    128 	 break;
    129       case SHADER_OPCODE_TXD:
    130          if (inst->shadow_compare) {
    131             /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
    132             assert(devinfo->gen >= 8 || devinfo->is_haswell);
    133             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
    134          } else {
    135             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
    136          }
    137 	 break;
    138       case SHADER_OPCODE_TXF:
    139 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
    140 	 break;
    141       case SHADER_OPCODE_TXF_CMS_W:
    142          assert(devinfo->gen >= 9);
    143          msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
    144          break;
    145       case SHADER_OPCODE_TXF_CMS:
    146          if (devinfo->gen >= 7)
    147             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
    148          else
    149             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
    150          break;
    151       case SHADER_OPCODE_TXF_MCS:
    152          assert(devinfo->gen >= 7);
    153          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
    154          break;
    155       case SHADER_OPCODE_TXS:
    156 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
    157 	 break;
    158       case SHADER_OPCODE_TG4:
    159          if (inst->shadow_compare) {
    160             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
    161          } else {
    162             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
    163          }
    164          break;
    165       case SHADER_OPCODE_TG4_OFFSET:
    166          if (inst->shadow_compare) {
    167             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
    168          } else {
    169             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
    170          }
    171          break;
    172       case SHADER_OPCODE_SAMPLEINFO:
    173          msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
    174          break;
    175       default:
    176 	 unreachable("should not get here: invalid vec4 texture opcode");
    177       }
    178    } else {
    179       switch (inst->opcode) {
    180       case SHADER_OPCODE_TEX:
    181       case SHADER_OPCODE_TXL:
    182 	 if (inst->shadow_compare) {
    183 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
    184 	    assert(inst->mlen == 3);
    185 	 } else {
    186 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
    187 	    assert(inst->mlen == 2);
    188 	 }
    189 	 break;
    190       case SHADER_OPCODE_TXD:
    191 	 /* There is no sample_d_c message; comparisons are done manually. */
    192 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
    193 	 assert(inst->mlen == 4);
    194 	 break;
    195       case SHADER_OPCODE_TXF:
    196 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
    197 	 assert(inst->mlen == 2);
    198 	 break;
    199       case SHADER_OPCODE_TXS:
    200 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
    201 	 assert(inst->mlen == 2);
    202 	 break;
    203       default:
    204 	 unreachable("should not get here: invalid vec4 texture opcode");
    205       }
    206    }
    207 
    208    assert(msg_type != -1);
    209 
    210    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
    211 
    212    /* Load the message header if present.  If there's a texture offset, we need
    213     * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    214     * use an implied move from g0 to the first message register.
    215     */
    216    if (inst->header_size != 0) {
    217       if (devinfo->gen < 6 && !inst->offset) {
    218          /* Set up an implied move from g0 to the MRF. */
    219          src = brw_vec8_grf(0, 0);
    220       } else {
    221          struct brw_reg header =
    222             retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
    223          uint32_t dw2 = 0;
    224 
    225          /* Explicitly set up the message header by copying g0 to the MRF. */
    226          brw_push_insn_state(p);
    227          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    228          brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    229 
    230          brw_set_default_access_mode(p, BRW_ALIGN_1);
    231 
    232          if (inst->offset)
    233             /* Set the texel offset bits in DWord 2. */
    234             dw2 = inst->offset;
    235 
    236          if (devinfo->gen >= 9)
    237             /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
    238              * based on bit 22 in the header.
    239              */
    240             dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;
    241 
    242          /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
    243           * so header0.2 is 0 when g0 is copied.  The HS and GS stages do
     244           * not, so we must set it to 0 to avoid setting undesirable bits
    245           * in the message header.
    246           */
    247          if (dw2 ||
    248              stage == MESA_SHADER_TESS_CTRL ||
    249              stage == MESA_SHADER_GEOMETRY) {
    250             brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
    251          }
    252 
    253          brw_adjust_sampler_state_pointer(p, header, sampler_index);
    254          brw_pop_insn_state(p);
    255       }
    256    }
    257 
    258    uint32_t return_format;
    259 
    260    switch (dst.type) {
    261    case BRW_REGISTER_TYPE_D:
    262       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
    263       break;
    264    case BRW_REGISTER_TYPE_UD:
    265       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
    266       break;
    267    default:
    268       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
    269       break;
    270    }
    271 
    272    uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
    273          inst->opcode == SHADER_OPCODE_TG4_OFFSET)
    274          ? prog_data->base.binding_table.gather_texture_start
    275          : prog_data->base.binding_table.texture_start;
    276 
    277    if (surface_index.file == BRW_IMMEDIATE_VALUE &&
    278        sampler_index.file == BRW_IMMEDIATE_VALUE) {
    279       uint32_t surface = surface_index.ud;
    280       uint32_t sampler = sampler_index.ud;
    281 
    282       brw_SAMPLE(p,
    283                  dst,
    284                  inst->base_mrf,
    285                  src,
    286                  surface + base_binding_table_index,
    287                  sampler % 16,
    288                  msg_type,
    289                  1, /* response length */
    290                  inst->mlen,
    291                  inst->header_size != 0,
    292                  BRW_SAMPLER_SIMD_MODE_SIMD4X2,
    293                  return_format);
    294 
    295       brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
    296    } else {
    297       /* Non-constant sampler index. */
    298 
    299       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
    300       struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
    301       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
    302 
    303       brw_push_insn_state(p);
    304       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    305       brw_set_default_access_mode(p, BRW_ALIGN_1);
    306 
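              /* Build the indirect descriptor in a0.0.  The sampler unit takes the
               * binding table index from bits 7:0 and the sampler index from bits
               * 11:8, so when both indices come from the same register a single
               * MUL by 0x101 replicates the value into both byte fields; the AND
               * with 0xfff below trims the result to those descriptor bits.
               */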
    307       if (brw_regs_equal(&surface_reg, &sampler_reg)) {
    308          brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
    309       } else {
    310          if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
    311             brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
    312          } else {
    313             brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
    314             brw_OR(p, addr, addr, surface_reg);
    315          }
    316       }
    317       if (base_binding_table_index)
    318          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
    319       brw_AND(p, addr, addr, brw_imm_ud(0xfff));
    320 
    321       brw_pop_insn_state(p);
    322 
    323       if (inst->base_mrf != -1)
    324          gen6_resolve_implied_move(p, &src, inst->base_mrf);
    325 
    326       /* dst = send(offset, a0.0 | <descriptor>) */
    327       brw_inst *insn = brw_send_indirect_message(
    328          p, BRW_SFID_SAMPLER, dst, src, addr);
    329       brw_set_sampler_message(p, insn,
    330                               0 /* surface */,
    331                               0 /* sampler */,
    332                               msg_type,
    333                               1 /* rlen */,
    334                               inst->mlen /* mlen */,
    335                               inst->header_size != 0 /* header */,
    336                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
    337                               return_format);
    338 
     339       /* The visitor knows more than we do about the surface limit
     340        * required, so it has already done the marking.
    341        */
    342    }
    343 }
    344 
    345 static void
    346 generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
    347 {
    348    brw_urb_WRITE(p,
    349 		 brw_null_reg(), /* dest */
    350 		 inst->base_mrf, /* starting mrf reg nr */
    351 		 brw_vec8_grf(0, 0), /* src */
    352                  inst->urb_write_flags,
    353 		 inst->mlen,
    354 		 0,		/* response len */
    355 		 inst->offset,	/* urb destination offset */
    356 		 BRW_URB_SWIZZLE_INTERLEAVE);
    357 }
    358 
    359 static void
    360 generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
    361 {
    362    struct brw_reg src = brw_message_reg(inst->base_mrf);
    363    brw_urb_WRITE(p,
    364                  brw_null_reg(), /* dest */
    365                  inst->base_mrf, /* starting mrf reg nr */
    366                  src,
    367                  inst->urb_write_flags,
    368                  inst->mlen,
    369                  0,             /* response len */
    370                  inst->offset,  /* urb destination offset */
    371                  BRW_URB_SWIZZLE_INTERLEAVE);
    372 }
    373 
    374 static void
    375 generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
    376 {
    377    struct brw_reg src = brw_message_reg(inst->base_mrf);
    378 
     379    /* Use the temporary passed in src0 as the writeback register. */
    380    brw_urb_WRITE(p,
    381                  inst->src[0].as_brw_reg(), /* dest */
    382                  inst->base_mrf, /* starting mrf reg nr */
    383                  src,
    384                  BRW_URB_WRITE_ALLOCATE_COMPLETE,
    385                  inst->mlen,
    386                  1, /* response len */
    387                  inst->offset,  /* urb destination offset */
    388                  BRW_URB_SWIZZLE_INTERLEAVE);
    389 
    390    /* Now put allocated urb handle in dst.0 */
    391    brw_push_insn_state(p);
    392    brw_set_default_access_mode(p, BRW_ALIGN_1);
    393    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    394    brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
    395            get_element_ud(inst->src[0].as_brw_reg(), 0));
    396    brw_pop_insn_state(p);
    397 }
    398 
    399 static void
    400 generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
    401 {
    402    struct brw_reg src = brw_message_reg(inst->base_mrf);
    403    brw_urb_WRITE(p,
    404                  brw_null_reg(), /* dest */
    405                  inst->base_mrf, /* starting mrf reg nr */
    406                  src,
    407                  BRW_URB_WRITE_EOT | inst->urb_write_flags,
    408                  inst->mlen,
    409                  0,              /* response len */
    410                  0,              /* urb destination offset */
    411                  BRW_URB_SWIZZLE_INTERLEAVE);
    412 }
    413 
    414 static void
    415 generate_gs_set_write_offset(struct brw_codegen *p,
    416                              struct brw_reg dst,
    417                              struct brw_reg src0,
    418                              struct brw_reg src1)
    419 {
    420    /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    421     * Header: M0.3):
    422     *
    423     *     Slot 0 Offset. This field, after adding to the Global Offset field
    424     *     in the message descriptor, specifies the offset (in 256-bit units)
    425     *     from the start of the URB entry, as referenced by URB Handle 0, at
    426     *     which the data will be accessed.
    427     *
    428     * Similar text describes DWORD M0.4, which is slot 1 offset.
    429     *
    430     * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    431     * of the register for geometry shader invocations 0 and 1) by the
    432     * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    433     *
    434     * We can do this with the following EU instruction:
    435     *
    436     *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
    437     */
    438    brw_push_insn_state(p);
    439    brw_set_default_access_mode(p, BRW_ALIGN_1);
    440    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    441    assert(p->devinfo->gen >= 7 &&
    442           src1.file == BRW_IMMEDIATE_VALUE &&
    443           src1.type == BRW_REGISTER_TYPE_UD &&
    444           src1.ud <= USHRT_MAX);
    445    if (src0.file == BRW_IMMEDIATE_VALUE) {
    446       brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
    447               brw_imm_ud(src0.ud * src1.ud));
    448    } else {
    449       brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
    450               retype(src1, BRW_REGISTER_TYPE_UW));
    451    }
    452    brw_pop_insn_state(p);
    453 }
    454 
    455 static void
    456 generate_gs_set_vertex_count(struct brw_codegen *p,
    457                              struct brw_reg dst,
    458                              struct brw_reg src)
    459 {
    460    brw_push_insn_state(p);
    461    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    462 
    463    if (p->devinfo->gen >= 8) {
    464       /* Move the vertex count into the second MRF for the EOT write. */
    465       brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
    466               src);
    467    } else {
    468       /* If we think of the src and dst registers as composed of 8 DWORDs each,
    469        * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
    470        * them to WORDs, and then pack them into DWORD 2 of dst.
    471        *
    472        * It's easier to get the EU to do this if we think of the src and dst
    473        * registers as composed of 16 WORDS each; then, we want to pick up the
    474        * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
    475        * of dst.
    476        *
    477        * We can do that by the following EU instruction:
    478        *
    479        *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
    480        */
    481       brw_set_default_access_mode(p, BRW_ALIGN_1);
    482       brw_MOV(p,
    483               suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
    484               stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
    485    }
    486    brw_pop_insn_state(p);
    487 }
    488 
    489 static void
    490 generate_gs_svb_write(struct brw_codegen *p,
    491                       struct brw_vue_prog_data *prog_data,
    492                       vec4_instruction *inst,
    493                       struct brw_reg dst,
    494                       struct brw_reg src0,
    495                       struct brw_reg src1)
    496 {
    497    int binding = inst->sol_binding;
    498    bool final_write = inst->sol_final_write;
    499 
    500    brw_push_insn_state(p);
    501    brw_set_default_exec_size(p, BRW_EXECUTE_4);
    502    /* Copy Vertex data into M0.x */
    503    brw_MOV(p, stride(dst, 4, 4, 1),
    504            stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
    505    brw_pop_insn_state(p);
    506 
    507    brw_push_insn_state(p);
    508    /* Send SVB Write */
    509    brw_svb_write(p,
    510                  final_write ? src1 : brw_null_reg(), /* dest == src1 */
    511                  1, /* msg_reg_nr */
    512                  dst, /* src0 == previous dst */
    513                  SURF_INDEX_GEN6_SOL_BINDING(binding), /* binding_table_index */
    514                  final_write); /* send_commit_msg */
    515 
    516    /* Finally, wait for the write commit to occur so that we can proceed to
    517     * other things safely.
    518     *
    519     * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    520     *
    521     *   The write commit does not modify the destination register, but
    522     *   merely clears the dependency associated with the destination
    523     *   register. Thus, a simple mov instruction using the register as a
    524     *   source is sufficient to wait for the write commit to occur.
    525     */
    526    if (final_write) {
    527       brw_MOV(p, src1, src1);
    528    }
    529    brw_pop_insn_state(p);
    530 }
    531 
    532 static void
    533 generate_gs_svb_set_destination_index(struct brw_codegen *p,
    534                                       vec4_instruction *inst,
    535                                       struct brw_reg dst,
    536                                       struct brw_reg src)
    537 {
    538    int vertex = inst->sol_vertex;
    539    brw_push_insn_state(p);
    540    brw_set_default_access_mode(p, BRW_ALIGN_1);
    541    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    542    brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
    543    brw_pop_insn_state(p);
    544 }
    545 
    546 static void
    547 generate_gs_set_dword_2(struct brw_codegen *p,
    548                         struct brw_reg dst,
    549                         struct brw_reg src)
    550 {
    551    brw_push_insn_state(p);
    552    brw_set_default_access_mode(p, BRW_ALIGN_1);
    553    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    554    brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
    555    brw_pop_insn_state(p);
    556 }
    557 
    558 static void
    559 generate_gs_prepare_channel_masks(struct brw_codegen *p,
    560                                   struct brw_reg dst)
    561 {
    562    /* We want to left shift just DWORD 4 (the x component belonging to the
    563     * second geometry shader invocation) by 4 bits.  So generate the
    564     * instruction:
    565     *
    566     *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
    567     */
    568    dst = suboffset(vec1(dst), 4);
    569    brw_push_insn_state(p);
    570    brw_set_default_access_mode(p, BRW_ALIGN_1);
    571    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    572    brw_SHL(p, dst, dst, brw_imm_ud(4));
    573    brw_pop_insn_state(p);
    574 }
    575 
    576 static void
    577 generate_gs_set_channel_masks(struct brw_codegen *p,
    578                               struct brw_reg dst,
    579                               struct brw_reg src)
    580 {
    581    /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    582     * Header: M0.5):
    583     *
    584     *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    585     *
    586     *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    587     *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    588     *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    589     *        channel enable to determine the final channel enable.  For the
    590     *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    591     *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    592     *        in the writeback message.  For the URB_WRITE_OWORD &
    593     *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    594     *        indicates that Vertex 1 DATA [3] will be written to the surface.
    595     *
    596     *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
     597     *        1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
    598     *
    599     *     14 Vertex 1 DATA [2] Channel Mask
    600     *     13 Vertex 1 DATA [1] Channel Mask
    601     *     12 Vertex 1 DATA [0] Channel Mask
    602     *     11 Vertex 0 DATA [3] Channel Mask
    603     *     10 Vertex 0 DATA [2] Channel Mask
    604     *      9 Vertex 0 DATA [1] Channel Mask
    605     *      8 Vertex 0 DATA [0] Channel Mask
    606     *
    607     * (This is from a section of the PRM that is agnostic to the particular
    608     * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    609     * geometry shader invocations 0 and 1, respectively).  Since we have the
    610     * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
     611     * and the enable flags for geometry shader invocation 1 in bits 7:4 of
    612     * DWORD 4, we just need to OR them together and store the result in bits
    613     * 15:8 of DWORD 5.
    614     *
    615     * It's easier to get the EU to do this if we think of the src and dst
    616     * registers as composed of 32 bytes each; then, we want to pick up the
    617     * contents of bytes 0 and 16 from src, OR them together, and store them in
    618     * byte 21.
    619     *
    620     * We can do that by the following EU instruction:
    621     *
    622     *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    623     *
    624     * Note: this relies on the source register having zeros in (a) bits 7:4 of
    625     * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    626     * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    627     * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    628     * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    629     * contain valid channel mask values (which are in the range 0x0-0xf).
    630     */
    631    dst = retype(dst, BRW_REGISTER_TYPE_UB);
    632    src = retype(src, BRW_REGISTER_TYPE_UB);
    633    brw_push_insn_state(p);
    634    brw_set_default_access_mode(p, BRW_ALIGN_1);
    635    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    636    brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
    637    brw_pop_insn_state(p);
    638 }
    639 
    640 static void
    641 generate_gs_get_instance_id(struct brw_codegen *p,
    642                             struct brw_reg dst)
    643 {
    644    /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    645     * and store into dst.0 & dst.4. So generate the instruction:
    646     *
    647     *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
    648     */
    649    brw_push_insn_state(p);
    650    brw_set_default_access_mode(p, BRW_ALIGN_1);
    651    dst = retype(dst, BRW_REGISTER_TYPE_UD);
    652    struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    653    brw_SHR(p, dst, stride(r0, 1, 4, 0),
    654            brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
    655    brw_pop_insn_state(p);
    656 }
    657 
    658 static void
    659 generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
    660                                    struct brw_reg dst,
    661                                    struct brw_reg src0,
    662                                    struct brw_reg src1,
    663                                    struct brw_reg src2)
    664 {
    665    brw_push_insn_state(p);
    666    brw_set_default_access_mode(p, BRW_ALIGN_1);
    667    /* Save src0 data in 16:31 bits of dst.0 */
    668    brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
    669            brw_imm_ud(0xffffu));
    670    brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
    671    /* Save src1 data in 0:15 bits of dst.0 */
    672    brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
    673            brw_imm_ud(0xffffu));
    674    brw_OR(p, suboffset(vec1(dst), 0),
    675           suboffset(vec1(dst), 0),
    676           suboffset(vec1(src2), 0));
    677    brw_pop_insn_state(p);
    678 }
    679 
    680 static void
    681 generate_gs_ff_sync(struct brw_codegen *p,
    682                     vec4_instruction *inst,
    683                     struct brw_reg dst,
    684                     struct brw_reg src0,
    685                     struct brw_reg src1)
    686 {
    687    /* This opcode uses an implied MRF register for:
    688     *  - the header of the ff_sync message. And as such it is expected to be
    689     *    initialized to r0 before calling here.
    690     *  - the destination where we will write the allocated URB handle.
    691     */
    692    struct brw_reg header =
    693       retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
    694 
    695    /* Overwrite dword 0 of the header (SO vertices to write) and
    696     * dword 1 (number of primitives written).
    697     */
    698    brw_push_insn_state(p);
    699    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    700    brw_set_default_access_mode(p, BRW_ALIGN_1);
    701    brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
    702    brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
    703    brw_pop_insn_state(p);
    704 
    705    /* Allocate URB handle in dst */
    706    brw_ff_sync(p,
    707                dst,
    708                0,
    709                header,
    710                1, /* allocate */
    711                1, /* response length */
    712                0 /* eot */);
    713 
    714    /* Now put allocated urb handle in header.0 */
    715    brw_push_insn_state(p);
    716    brw_set_default_access_mode(p, BRW_ALIGN_1);
    717    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    718    brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));
    719 
    720    /* src1 is not an immediate when we use transform feedback */
    721    if (src1.file != BRW_IMMEDIATE_VALUE) {
    722       brw_set_default_exec_size(p, BRW_EXECUTE_4);
    723       brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
    724    }
    725 
    726    brw_pop_insn_state(p);
    727 }
    728 
    729 static void
    730 generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
    731 {
    732    /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
    733    struct brw_reg src = brw_vec8_grf(0, 0);
    734    brw_push_insn_state(p);
    735    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    736    brw_set_default_access_mode(p, BRW_ALIGN_1);
    737    brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
    738    brw_pop_insn_state(p);
    739 }
    740 
    741 static void
    742 generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
    743 {
    744    const struct gen_device_info *devinfo = p->devinfo;
    745    const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
    746 
    747    /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
    748     *
     749     * Since we operate in SIMD4x2 mode, we need to run half as many threads
    750     * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
    751     * shift right by one less to accomplish the multiplication by two.
    752     */
    753    dst = retype(dst, BRW_REGISTER_TYPE_UD);
    754    struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    755 
    756    brw_push_insn_state(p);
    757    brw_set_default_access_mode(p, BRW_ALIGN_1);
    758 
    759    const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
    760    const int shift = ivb ? 16 : 17;
    761 
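           /* This thread services instances 2i and 2i+1: the shift by one bit less
            * than the field offset leaves 2i in dst.0, and the ADD puts 2i + 1 in
            * dst.4.
            */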
    762    brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
    763    brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
    764            brw_imm_ud(shift - 1));
    765    brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));
    766 
    767    brw_pop_insn_state(p);
    768 }
    769 
    770 static void
    771 generate_tcs_urb_write(struct brw_codegen *p,
    772                        vec4_instruction *inst,
    773                        struct brw_reg urb_header)
    774 {
    775    const struct gen_device_info *devinfo = p->devinfo;
    776 
    777    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    778    brw_set_dest(p, send, brw_null_reg());
    779    brw_set_src0(p, send, urb_header);
    780 
    781    brw_set_message_descriptor(p, send, BRW_SFID_URB,
    782                               inst->mlen /* mlen */, 0 /* rlen */,
    783                               true /* header */, false /* eot */);
    784    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
    785    brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
    786    if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
    787       brw_inst_set_eot(devinfo, send, 1);
    788    } else {
    789       brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
    790       brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
    791    }
    792 
    793    /* what happens to swizzles? */
    794 }
    795 
    796 
    797 static void
    798 generate_tcs_input_urb_offsets(struct brw_codegen *p,
    799                                struct brw_reg dst,
    800                                struct brw_reg vertex,
    801                                struct brw_reg offset)
    802 {
    803    /* Generates an URB read/write message header for HS/DS operation.
    804     * Inputs are a vertex index, and a byte offset from the beginning of
    805     * the vertex. */
    806 
    807    /* If `vertex` is not an immediate, we clobber a0.0 */
    808 
    809    assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
    810    assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
    811 
    812    assert(dst.file == BRW_GENERAL_REGISTER_FILE);
    813 
    814    brw_push_insn_state(p);
    815    brw_set_default_access_mode(p, BRW_ALIGN_1);
    816    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    817    brw_MOV(p, dst, brw_imm_ud(0));
    818 
    819    /* m0.5 bits 8-15 are channel enables */
    820    brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
    821 
    822    /* m0.0-0.1: URB handles */
    823    if (vertex.file == BRW_IMMEDIATE_VALUE) {
    824       uint32_t vertex_index = vertex.ud;
    825       struct brw_reg index_reg = brw_vec1_grf(
    826             1 + (vertex_index >> 3), vertex_index & 7);
    827 
    828       brw_MOV(p, vec2(get_element_ud(dst, 0)),
    829               retype(index_reg, BRW_REGISTER_TYPE_UD));
    830    } else {
    831       /* Use indirect addressing.  ICP Handles are DWords (single channels
    832        * of a register) and start at g1.0.
    833        *
    834        * In order to start our region at g1.0, we add 8 to the vertex index,
    835        * effectively skipping over the 8 channels in g0.0.  This gives us a
    836        * DWord offset to the ICP Handle.
    837        *
    838        * Indirect addressing works in terms of bytes, so we then multiply
    839        * the DWord offset by 4 (by shifting left by 2).
    840        */
    841       struct brw_reg addr = brw_address_reg(0);
    842 
    843       /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
    844       brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8));
    845       brw_SHL(p, addr, addr, brw_imm_ud(2));
    846       brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
    847 
    848       /* top half: m0.1 = g[1.0 + vertex.4]UD */
    849       brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8));
    850       brw_SHL(p, addr, addr, brw_imm_ud(2));
    851       brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
    852    }
    853 
    854    /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
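           /* An offset in the ARF file (typically the null register) means the
            * caller supplied no indirect offset, so m0.3-0.4 stay zeroed.
            */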
    855    if (offset.file != ARF)
    856       brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
    857 
    858    brw_pop_insn_state(p);
    859 }
    860 
    861 
    862 static void
    863 generate_tcs_output_urb_offsets(struct brw_codegen *p,
    864                                 struct brw_reg dst,
    865                                 struct brw_reg write_mask,
    866                                 struct brw_reg offset)
    867 {
    868    /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
    869    assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
    870 
    871    assert(write_mask.file == BRW_IMMEDIATE_VALUE);
    872    assert(write_mask.type == BRW_REGISTER_TYPE_UD);
    873 
    874    brw_push_insn_state(p);
    875 
    876    brw_set_default_access_mode(p, BRW_ALIGN_1);
    877    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    878    brw_MOV(p, dst, brw_imm_ud(0));
    879 
    880    unsigned mask = write_mask.ud;
    881 
    882    /* m0.5 bits 15:12 and 11:8 are channel enables */
    883    brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
    884 
    885    /* HS patch URB handle is delivered in r0.0 */
    886    struct brw_reg urb_handle = brw_vec1_grf(0, 0);
    887 
    888    /* m0.0-0.1: URB handles */
    889    brw_MOV(p, vec2(get_element_ud(dst, 0)),
    890            retype(urb_handle, BRW_REGISTER_TYPE_UD));
    891 
    892    /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
    893    if (offset.file != ARF)
    894       brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
    895 
    896    brw_pop_insn_state(p);
    897 }
    898 
    899 static void
    900 generate_tes_create_input_read_header(struct brw_codegen *p,
    901                                       struct brw_reg dst)
    902 {
    903    brw_push_insn_state(p);
    904    brw_set_default_access_mode(p, BRW_ALIGN_1);
    905    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    906 
    907    /* Initialize the register to 0 */
    908    brw_MOV(p, dst, brw_imm_ud(0));
    909 
    910    /* Enable all the channels in m0.5 bits 15:8 */
    911    brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
    912 
    913    /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1.  For safety,
    914     * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
    915     */
    916    brw_AND(p, vec2(get_element_ud(dst, 0)),
    917            retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
    918            brw_imm_ud(0x1fff));
    919    brw_pop_insn_state(p);
    920 }
    921 
    922 static void
    923 generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
    924                                      struct brw_reg dst,
    925                                      struct brw_reg header,
    926                                      struct brw_reg offset)
    927 {
    928    brw_push_insn_state(p);
    929    brw_set_default_access_mode(p, BRW_ALIGN_1);
    930    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    931 
    932    brw_MOV(p, dst, header);
    933    /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
    934    brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
    935 
    936    brw_pop_insn_state(p);
    937 }
    938 
    939 static void
    940 generate_vec4_urb_read(struct brw_codegen *p,
    941                        vec4_instruction *inst,
    942                        struct brw_reg dst,
    943                        struct brw_reg header)
    944 {
    945    const struct gen_device_info *devinfo = p->devinfo;
    946 
    947    assert(header.file == BRW_GENERAL_REGISTER_FILE);
    948    assert(header.type == BRW_REGISTER_TYPE_UD);
    949 
    950    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    951    brw_set_dest(p, send, dst);
    952    brw_set_src0(p, send, header);
    953 
    954    brw_set_message_descriptor(p, send, BRW_SFID_URB,
    955                               1 /* mlen */, 1 /* rlen */,
    956                               true /* header */, false /* eot */);
    957    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
    958    brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
    959    brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
    960 
    961    brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
    962 }
    963 
    964 static void
    965 generate_tcs_release_input(struct brw_codegen *p,
    966                            struct brw_reg header,
    967                            struct brw_reg vertex,
    968                            struct brw_reg is_unpaired)
    969 {
    970    const struct gen_device_info *devinfo = p->devinfo;
    971 
    972    assert(vertex.file == BRW_IMMEDIATE_VALUE);
    973    assert(vertex.type == BRW_REGISTER_TYPE_UD);
    974 
    975    /* m0.0-0.1: URB handles */
    976    struct brw_reg urb_handles =
    977       retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
    978              BRW_REGISTER_TYPE_UD);
    979 
    980    brw_push_insn_state(p);
    981    brw_set_default_access_mode(p, BRW_ALIGN_1);
    982    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
    983    brw_MOV(p, header, brw_imm_ud(0));
    984    brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
    985    brw_pop_insn_state(p);
    986 
    987    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    988    brw_set_dest(p, send, brw_null_reg());
    989    brw_set_src0(p, send, header);
    990    brw_set_message_descriptor(p, send, BRW_SFID_URB,
    991                               1 /* mlen */, 0 /* rlen */,
    992                               true /* header */, false /* eot */);
    993    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
    994    brw_inst_set_urb_complete(devinfo, send, 1);
    995    brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
    996                                     BRW_URB_SWIZZLE_NONE :
    997                                     BRW_URB_SWIZZLE_INTERLEAVE);
    998 }
    999 
   1000 static void
   1001 generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
   1002 {
   1003    struct brw_reg header = brw_message_reg(inst->base_mrf);
   1004 
   1005    brw_push_insn_state(p);
   1006    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1007    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1008    brw_MOV(p, header, brw_imm_ud(0));
   1009    brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
   1010    brw_MOV(p, get_element_ud(header, 0),
   1011            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   1012    brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
   1013    brw_pop_insn_state(p);
   1014 
   1015    brw_urb_WRITE(p,
   1016                  brw_null_reg(), /* dest */
   1017                  inst->base_mrf, /* starting mrf reg nr */
   1018                  header,
   1019                  BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
   1020                  BRW_URB_WRITE_USE_CHANNEL_MASKS,
   1021                  inst->mlen,
   1022                  0,              /* response len */
   1023                  0,              /* urb destination offset */
   1024                  0);
   1025 }
   1026 
   1027 static void
   1028 generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
   1029 {
   1030    brw_push_insn_state(p);
   1031    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1032    brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
   1033    brw_pop_insn_state(p);
   1034 }
   1035 
   1036 static void
   1037 generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
   1038 {
   1039    brw_push_insn_state(p);
   1040    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1041    brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   1042    brw_pop_insn_state(p);
   1043 }
   1044 
   1045 static void
   1046 generate_tcs_create_barrier_header(struct brw_codegen *p,
   1047                                    struct brw_vue_prog_data *prog_data,
   1048                                    struct brw_reg dst)
   1049 {
   1050    const struct gen_device_info *devinfo = p->devinfo;
   1051    const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
   1052    struct brw_reg m0_2 = get_element_ud(dst, 2);
   1053    unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
   1054 
   1055    brw_push_insn_state(p);
   1056    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1057    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1058 
   1059    /* Zero the message header */
   1060    brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
   1061 
   1062    /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
   1063    brw_AND(p, m0_2,
   1064            retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
   1065            brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));
   1066 
   1067    /* Shift it up to bits 27:24. */
   1068    brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));
   1069 
   1070    /* Set the Barrier Count and the enable bit */
   1071    brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
   1072 
   1073    brw_pop_insn_state(p);
   1074 }
   1075 
   1076 static void
   1077 generate_oword_dual_block_offsets(struct brw_codegen *p,
   1078                                   struct brw_reg m1,
   1079                                   struct brw_reg index)
   1080 {
   1081    int second_vertex_offset;
   1082 
   1083    if (p->devinfo->gen >= 6)
   1084       second_vertex_offset = 1;
   1085    else
   1086       second_vertex_offset = 16;
   1087 
   1088    m1 = retype(m1, BRW_REGISTER_TYPE_D);
   1089 
   1090    /* Set up M1 (message payload).  Only the block offsets in M1.0 and
   1091     * M1.4 are used, and the rest are ignored.
   1092     */
   1093    struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   1094    struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   1095    struct brw_reg index_0 = suboffset(vec1(index), 0);
   1096    struct brw_reg index_4 = suboffset(vec1(index), 4);
   1097 
   1098    brw_push_insn_state(p);
   1099    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1100    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1101 
   1102    brw_MOV(p, m1_0, index_0);
   1103 
   1104    if (index.file == BRW_IMMEDIATE_VALUE) {
   1105       index_4.ud += second_vertex_offset;
   1106       brw_MOV(p, m1_4, index_4);
   1107    } else {
   1108       brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   1109    }
   1110 
   1111    brw_pop_insn_state(p);
   1112 }
   1113 
   1114 static void
   1115 generate_unpack_flags(struct brw_codegen *p,
   1116                       struct brw_reg dst)
   1117 {
   1118    brw_push_insn_state(p);
   1119    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1120    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1121 
   1122    struct brw_reg flags = brw_flag_reg(0, 0);
   1123    struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   1124    struct brw_reg dst_4 = suboffset(vec1(dst), 4);
   1125 
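           /* dst.0 receives flag bits 3:0 and dst.4 receives flag bits 7:4 shifted
            * back down to 3:0, i.e. the predicate nibbles for the two SIMD4x2 halves.
            */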
   1126    brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   1127    brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   1128    brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));
   1129 
   1130    brw_pop_insn_state(p);
   1131 }
   1132 
   1133 static void
   1134 generate_scratch_read(struct brw_codegen *p,
   1135                       vec4_instruction *inst,
   1136                       struct brw_reg dst,
   1137                       struct brw_reg index)
   1138 {
   1139    const struct gen_device_info *devinfo = p->devinfo;
   1140    struct brw_reg header = brw_vec8_grf(0, 0);
   1141 
   1142    gen6_resolve_implied_move(p, &header, inst->base_mrf);
   1143 
   1144    generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
   1145 				     index);
   1146 
   1147    uint32_t msg_type;
   1148 
   1149    if (devinfo->gen >= 6)
   1150       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   1151    else if (devinfo->gen == 5 || devinfo->is_g4x)
   1152       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   1153    else
   1154       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   1155 
   1156    const unsigned target_cache =
   1157       devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
   1158       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
   1159       BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
   1160 
   1161    /* Each of the 8 channel enables is considered for whether each
   1162     * dword is written.
   1163     */
   1164    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   1165    brw_set_dest(p, send, dst);
   1166    brw_set_src0(p, send, header);
   1167    if (devinfo->gen < 6)
   1168       brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   1169    brw_set_dp_read_message(p, send,
   1170                            brw_scratch_surface_idx(p),
   1171 			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
   1172 			   msg_type, target_cache,
   1173 			   2, /* mlen */
   1174                            true, /* header_present */
   1175 			   1 /* rlen */);
   1176 }
   1177 
   1178 static void
   1179 generate_scratch_write(struct brw_codegen *p,
   1180                        vec4_instruction *inst,
   1181                        struct brw_reg dst,
   1182                        struct brw_reg src,
   1183                        struct brw_reg index)
   1184 {
   1185    const struct gen_device_info *devinfo = p->devinfo;
   1186    const unsigned target_cache =
   1187       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
   1188        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
   1189        BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   1190    struct brw_reg header = brw_vec8_grf(0, 0);
   1191    bool write_commit;
   1192 
   1193    /* If the instruction is predicated, we'll predicate the send, not
   1194     * the header setup.
   1195     */
   1196    brw_set_default_predicate_control(p, false);
   1197 
   1198    gen6_resolve_implied_move(p, &header, inst->base_mrf);
   1199 
   1200    generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
   1201 				     index);
   1202 
   1203    brw_MOV(p,
   1204 	   retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
   1205 	   retype(src, BRW_REGISTER_TYPE_D));
   1206 
   1207    uint32_t msg_type;
   1208 
   1209    if (devinfo->gen >= 7)
   1210       msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   1211    else if (devinfo->gen == 6)
   1212       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   1213    else
   1214       msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   1215 
   1216    brw_set_default_predicate_control(p, inst->predicate);
   1217 
   1218    /* Pre-gen6, we have to specify write commits to ensure ordering
   1219     * between reads and writes within a thread.  Afterwards, that's
   1220     * guaranteed and write commits only matter for inter-thread
   1221     * synchronization.
   1222     */
   1223    if (devinfo->gen >= 6) {
   1224       write_commit = false;
   1225    } else {
   1226       /* The visitor set up our destination register to be g0.  This
   1227        * means that when the next read comes along, we will end up
   1228        * reading from g0 and causing a block on the write commit.  For
   1229        * write-after-read, we are relying on the value of the previous
   1230        * read being used (and thus blocking on completion) before our
   1231        * write is executed.  This means we have to be careful in
   1232        * instruction scheduling to not violate this assumption.
   1233        */
   1234       write_commit = true;
   1235    }
   1236 
   1237    /* Each of the 8 channel enables is considered for whether each
   1238     * dword is written.
   1239     */
   1240    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   1241    brw_set_dest(p, send, dst);
   1242    brw_set_src0(p, send, header);
   1243    if (devinfo->gen < 6)
   1244       brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   1245    brw_set_dp_write_message(p, send,
   1246                             brw_scratch_surface_idx(p),
   1247 			    BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
   1248 			    msg_type,
   1249                             target_cache,
   1250 			    3, /* mlen */
   1251 			    true, /* header present */
   1252 			    false, /* not a render target write */
   1253 			    write_commit, /* rlen */
   1254 			    false, /* eot */
   1255 			    write_commit);
   1256 }
   1257 
   1258 static void
   1259 generate_pull_constant_load(struct brw_codegen *p,
   1260                             struct brw_vue_prog_data *prog_data,
   1261                             vec4_instruction *inst,
   1262                             struct brw_reg dst,
   1263                             struct brw_reg index,
   1264                             struct brw_reg offset)
   1265 {
   1266    const struct gen_device_info *devinfo = p->devinfo;
   1267    const unsigned target_cache =
   1268       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
   1269        BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   1270    assert(index.file == BRW_IMMEDIATE_VALUE &&
   1271 	  index.type == BRW_REGISTER_TYPE_UD);
   1272    uint32_t surf_index = index.ud;
   1273 
   1274    struct brw_reg header = brw_vec8_grf(0, 0);
   1275 
   1276    gen6_resolve_implied_move(p, &header, inst->base_mrf);
   1277 
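           /* Gen6+ expects the constant offset in OWord (16-byte) units, hence the
            * divide by 16; earlier generations appear to take the offset as passed.
            */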
   1278    if (devinfo->gen >= 6) {
   1279       if (offset.file == BRW_IMMEDIATE_VALUE) {
   1280          brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
   1281                            BRW_REGISTER_TYPE_D),
   1282                  brw_imm_d(offset.ud >> 4));
   1283       } else {
   1284          brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
   1285                            BRW_REGISTER_TYPE_D),
   1286                  offset, brw_imm_d(4));
   1287       }
   1288    } else {
   1289       brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
   1290                         BRW_REGISTER_TYPE_D),
   1291               offset);
   1292    }
   1293 
   1294    uint32_t msg_type;
   1295 
   1296    if (devinfo->gen >= 6)
   1297       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   1298    else if (devinfo->gen == 5 || devinfo->is_g4x)
   1299       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   1300    else
   1301       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   1302 
    1303    /* Each of the 8 channel enables determines whether the
    1304     * corresponding dword is written.
    1305     */
   1306    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   1307    brw_set_dest(p, send, dst);
   1308    brw_set_src0(p, send, header);
   1309    if (devinfo->gen < 6)
   1310       brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
    1311    brw_set_dp_read_message(p, send,
    1312                            surf_index,
    1313                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
    1314                            msg_type,
    1315                            target_cache,
    1316                            2, /* mlen */
    1317                            true, /* header_present */
    1318                            1 /* rlen */);
   1319 }
   1320 
   1321 static void
   1322 generate_get_buffer_size(struct brw_codegen *p,
   1323                          struct brw_vue_prog_data *prog_data,
   1324                          vec4_instruction *inst,
   1325                          struct brw_reg dst,
   1326                          struct brw_reg src,
   1327                          struct brw_reg surf_index)
   1328 {
   1329    assert(p->devinfo->gen >= 7);
   1330    assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
   1331           surf_index.file == BRW_IMMEDIATE_VALUE);
   1332 
   1333    brw_SAMPLE(p,
   1334               dst,
   1335               inst->base_mrf,
   1336               src,
   1337               surf_index.ud,
   1338               0,
   1339               GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
   1340               1, /* response length */
   1341               inst->mlen,
   1342               inst->header_size > 0,
   1343               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
   1344               BRW_SAMPLER_RETURN_FORMAT_SINT32);
   1345 
   1346    brw_mark_surface_used(&prog_data->base, surf_index.ud);
   1347 }
   1348 
   1349 static void
   1350 generate_pull_constant_load_gen7(struct brw_codegen *p,
   1351                                  struct brw_vue_prog_data *prog_data,
   1352                                  vec4_instruction *inst,
   1353                                  struct brw_reg dst,
   1354                                  struct brw_reg surf_index,
   1355                                  struct brw_reg offset)
   1356 {
   1357    assert(surf_index.type == BRW_REGISTER_TYPE_UD);
   1358 
   1359    if (surf_index.file == BRW_IMMEDIATE_VALUE) {
   1360 
   1361       brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
   1362       brw_set_dest(p, insn, dst);
   1363       brw_set_src0(p, insn, offset);
   1364       brw_set_sampler_message(p, insn,
   1365                               surf_index.ud,
   1366                               0, /* LD message ignores sampler unit */
   1367                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
   1368                               1, /* rlen */
   1369                               inst->mlen,
   1370                               inst->header_size != 0,
   1371                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
   1372                               0);
   1373 
   1374       brw_mark_surface_used(&prog_data->base, surf_index.ud);
   1375 
   1376    } else {
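               /* The surface index is not known at compile time, so build the
                * message descriptor at run time: AND the index into a0.0 and
                * use an indirect send.
                */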
   1377 
   1378       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
   1379 
   1380       brw_push_insn_state(p);
   1381       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1382       brw_set_default_access_mode(p, BRW_ALIGN_1);
   1383 
   1384       /* a0.0 = surf_index & 0xff */
   1385       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
   1386       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
   1387       brw_set_dest(p, insn_and, addr);
   1388       brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
   1389       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
   1390 
   1391       brw_pop_insn_state(p);
   1392 
   1393       /* dst = send(offset, a0.0 | <descriptor>) */
   1394       brw_inst *insn = brw_send_indirect_message(
   1395          p, BRW_SFID_SAMPLER, dst, offset, addr);
   1396       brw_set_sampler_message(p, insn,
   1397                               0 /* surface */,
   1398                               0 /* sampler */,
   1399                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
   1400                               1 /* rlen */,
   1401                               inst->mlen,
   1402                               inst->header_size != 0,
   1403                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
   1404                               0);
   1405    }
   1406 }
   1407 
   1408 static void
   1409 generate_set_simd4x2_header_gen9(struct brw_codegen *p,
   1410                                  vec4_instruction *inst,
   1411                                  struct brw_reg dst)
   1412 {
   1413    brw_push_insn_state(p);
   1414    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1415 
   1416    brw_set_default_exec_size(p, BRW_EXECUTE_8);
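            /* Start from a copy of g0 as the header, then overwrite dword 2
             * with the Gen9 SIMD4x2 sampler mode extension.
             */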
   1417    brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   1418 
   1419    brw_set_default_access_mode(p, BRW_ALIGN_1);
   1420    brw_MOV(p, get_element_ud(dst, 2),
   1421            brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
   1422 
   1423    brw_pop_insn_state(p);
   1424 }
   1425 
   1426 static void
   1427 generate_mov_indirect(struct brw_codegen *p,
   1428                       vec4_instruction *inst,
   1429                       struct brw_reg dst, struct brw_reg reg,
   1430                       struct brw_reg indirect, struct brw_reg length)
   1431 {
   1432    assert(indirect.type == BRW_REGISTER_TYPE_UD);
   1433    assert(p->devinfo->gen >= 6);
   1434 
   1435    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
   1436 
   1437    /* This instruction acts in align1 mode */
   1438    assert(dst.writemask == WRITEMASK_XYZW);
   1439 
   1440    if (indirect.file == BRW_IMMEDIATE_VALUE) {
   1441       imm_byte_offset += indirect.ud;
   1442 
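               /* Turn the combined byte offset back into a register number, an
                * Align16 subregister half, and a dword offset applied to every
                * swizzle component.
                */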
   1443       reg.nr = imm_byte_offset / REG_SIZE;
   1444       reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
   1445       unsigned shift = (imm_byte_offset / 4) % 4;
   1446       reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
   1447 
   1448       brw_MOV(p, dst, reg);
   1449    } else {
   1450       brw_push_insn_state(p);
   1451       brw_set_default_access_mode(p, BRW_ALIGN_1);
   1452       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1453 
   1454       struct brw_reg addr = vec8(brw_address_reg(0));
   1455 
    1456       /* We need to move the indirect value into the address register.
    1457        * To keep the semantics sensible, we respect at least the X
    1458        * component of the swizzle, which requires converting the subnr
    1459        * (probably 0) to an align1 subnr and adding in the swizzle.
    1460        */
   1461       assert(brw_is_single_value_swizzle(indirect.swizzle));
   1462       indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));
   1463 
   1464       /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
   1465        * the indirect and splat it out to all four channels of the given half
   1466        * of a0.
   1467        */
   1468       indirect.subnr *= 2;
   1469       indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
   1470       brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
   1471 
   1472       /* Now we need to incorporate the swizzle from the source register */
   1473       if (reg.swizzle != BRW_SWIZZLE_XXXX) {
   1474          uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
   1475                             BRW_GET_SWZ(reg.swizzle, 1) << 6 |
   1476                             BRW_GET_SWZ(reg.swizzle, 2) << 10 |
   1477                             BRW_GET_SWZ(reg.swizzle, 3) << 14;
   1478          uv_swiz |= uv_swiz << 16;
   1479 
   1480          brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
   1481       }
   1482 
   1483       brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));
   1484 
   1485       brw_pop_insn_state(p);
   1486    }
   1487 }
   1488 
   1489 static void
   1490 generate_code(struct brw_codegen *p,
   1491               const struct brw_compiler *compiler,
   1492               void *log_data,
   1493               const nir_shader *nir,
   1494               struct brw_vue_prog_data *prog_data,
   1495               const struct cfg_t *cfg)
   1496 {
   1497    const struct gen_device_info *devinfo = p->devinfo;
   1498    const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
   1499    bool debug_flag = INTEL_DEBUG &
   1500       intel_debug_flag_for_shader_stage(nir->stage);
   1501    struct annotation_info annotation;
   1502    memset(&annotation, 0, sizeof(annotation));
   1503    int spill_count = 0, fill_count = 0;
   1504    int loop_count = 0;
   1505 
   1506    foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
   1507       struct brw_reg src[3], dst;
   1508 
   1509       if (unlikely(debug_flag))
   1510          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
   1511 
   1512       for (unsigned int i = 0; i < 3; i++) {
   1513          src[i] = inst->src[i].as_brw_reg();
   1514       }
   1515       dst = inst->dst.as_brw_reg();
   1516 
   1517       brw_set_default_predicate_control(p, inst->predicate);
   1518       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
   1519       brw_set_default_flag_reg(p, 0, inst->flag_subreg);
   1520       brw_set_default_saturate(p, inst->saturate);
   1521       brw_set_default_mask_control(p, inst->force_writemask_all);
   1522       brw_set_default_acc_write_control(p, inst->writes_accumulator);
   1523       brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
   1524 
   1525       assert(inst->group % inst->exec_size == 0);
   1526       assert(inst->group % 8 == 0 ||
   1527              inst->dst.type == BRW_REGISTER_TYPE_DF ||
   1528              inst->src[0].type == BRW_REGISTER_TYPE_DF ||
   1529              inst->src[1].type == BRW_REGISTER_TYPE_DF ||
   1530              inst->src[2].type == BRW_REGISTER_TYPE_DF);
   1531       if (!inst->force_writemask_all)
   1532          brw_set_default_group(p, inst->group);
   1533 
   1534       assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
   1535       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
   1536 
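               /* Remember how many instructions have been emitted so the
                * dependency-control fixup after the switch can verify that this
                * IR instruction expanded to a single EU instruction and patch it.
                */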
   1537       unsigned pre_emit_nr_insn = p->nr_insn;
   1538 
   1539       switch (inst->opcode) {
   1540       case VEC4_OPCODE_UNPACK_UNIFORM:
   1541       case BRW_OPCODE_MOV:
   1542          brw_MOV(p, dst, src[0]);
   1543          break;
   1544       case BRW_OPCODE_ADD:
   1545          brw_ADD(p, dst, src[0], src[1]);
   1546          break;
   1547       case BRW_OPCODE_MUL:
   1548          brw_MUL(p, dst, src[0], src[1]);
   1549          break;
   1550       case BRW_OPCODE_MACH:
   1551          brw_MACH(p, dst, src[0], src[1]);
   1552          break;
   1553 
   1554       case BRW_OPCODE_MAD:
   1555          assert(devinfo->gen >= 6);
   1556          brw_MAD(p, dst, src[0], src[1], src[2]);
   1557          break;
   1558 
   1559       case BRW_OPCODE_FRC:
   1560          brw_FRC(p, dst, src[0]);
   1561          break;
   1562       case BRW_OPCODE_RNDD:
   1563          brw_RNDD(p, dst, src[0]);
   1564          break;
   1565       case BRW_OPCODE_RNDE:
   1566          brw_RNDE(p, dst, src[0]);
   1567          break;
   1568       case BRW_OPCODE_RNDZ:
   1569          brw_RNDZ(p, dst, src[0]);
   1570          break;
   1571 
   1572       case BRW_OPCODE_AND:
   1573          brw_AND(p, dst, src[0], src[1]);
   1574          break;
   1575       case BRW_OPCODE_OR:
   1576          brw_OR(p, dst, src[0], src[1]);
   1577          break;
   1578       case BRW_OPCODE_XOR:
   1579          brw_XOR(p, dst, src[0], src[1]);
   1580          break;
   1581       case BRW_OPCODE_NOT:
   1582          brw_NOT(p, dst, src[0]);
   1583          break;
   1584       case BRW_OPCODE_ASR:
   1585          brw_ASR(p, dst, src[0], src[1]);
   1586          break;
   1587       case BRW_OPCODE_SHR:
   1588          brw_SHR(p, dst, src[0], src[1]);
   1589          break;
   1590       case BRW_OPCODE_SHL:
   1591          brw_SHL(p, dst, src[0], src[1]);
   1592          break;
   1593 
   1594       case BRW_OPCODE_CMP:
   1595          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
   1596          break;
   1597       case BRW_OPCODE_SEL:
   1598          brw_SEL(p, dst, src[0], src[1]);
   1599          break;
   1600 
   1601       case BRW_OPCODE_DPH:
   1602          brw_DPH(p, dst, src[0], src[1]);
   1603          break;
   1604 
   1605       case BRW_OPCODE_DP4:
   1606          brw_DP4(p, dst, src[0], src[1]);
   1607          break;
   1608 
   1609       case BRW_OPCODE_DP3:
   1610          brw_DP3(p, dst, src[0], src[1]);
   1611          break;
   1612 
   1613       case BRW_OPCODE_DP2:
   1614          brw_DP2(p, dst, src[0], src[1]);
   1615          break;
   1616 
   1617       case BRW_OPCODE_F32TO16:
   1618          assert(devinfo->gen >= 7);
   1619          brw_F32TO16(p, dst, src[0]);
   1620          break;
   1621 
   1622       case BRW_OPCODE_F16TO32:
   1623          assert(devinfo->gen >= 7);
   1624          brw_F16TO32(p, dst, src[0]);
   1625          break;
   1626 
   1627       case BRW_OPCODE_LRP:
   1628          assert(devinfo->gen >= 6);
   1629          brw_LRP(p, dst, src[0], src[1], src[2]);
   1630          break;
   1631 
   1632       case BRW_OPCODE_BFREV:
   1633          assert(devinfo->gen >= 7);
   1634          /* BFREV only supports UD type for src and dst. */
   1635          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
   1636                    retype(src[0], BRW_REGISTER_TYPE_UD));
   1637          break;
   1638       case BRW_OPCODE_FBH:
   1639          assert(devinfo->gen >= 7);
   1640          /* FBH only supports UD type for dst. */
   1641          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
   1642          break;
   1643       case BRW_OPCODE_FBL:
   1644          assert(devinfo->gen >= 7);
   1645          /* FBL only supports UD type for dst. */
   1646          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
   1647          break;
   1648       case BRW_OPCODE_LZD:
   1649          brw_LZD(p, dst, src[0]);
   1650          break;
   1651       case BRW_OPCODE_CBIT:
   1652          assert(devinfo->gen >= 7);
   1653          /* CBIT only supports UD type for dst. */
   1654          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
   1655          break;
   1656       case BRW_OPCODE_ADDC:
   1657          assert(devinfo->gen >= 7);
   1658          brw_ADDC(p, dst, src[0], src[1]);
   1659          break;
   1660       case BRW_OPCODE_SUBB:
   1661          assert(devinfo->gen >= 7);
   1662          brw_SUBB(p, dst, src[0], src[1]);
   1663          break;
   1664       case BRW_OPCODE_MAC:
   1665          brw_MAC(p, dst, src[0], src[1]);
   1666          break;
   1667 
   1668       case BRW_OPCODE_BFE:
   1669          assert(devinfo->gen >= 7);
   1670          brw_BFE(p, dst, src[0], src[1], src[2]);
   1671          break;
   1672 
   1673       case BRW_OPCODE_BFI1:
   1674          assert(devinfo->gen >= 7);
   1675          brw_BFI1(p, dst, src[0], src[1]);
   1676          break;
   1677       case BRW_OPCODE_BFI2:
   1678          assert(devinfo->gen >= 7);
   1679          brw_BFI2(p, dst, src[0], src[1], src[2]);
   1680          break;
   1681 
   1682       case BRW_OPCODE_IF:
   1683          if (!inst->src[0].is_null()) {
   1684             /* The instruction has an embedded compare (only allowed on gen6) */
   1685             assert(devinfo->gen == 6);
   1686             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
   1687          } else {
   1688             brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
   1689             brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
   1690          }
   1691          break;
   1692 
   1693       case BRW_OPCODE_ELSE:
   1694          brw_ELSE(p);
   1695          break;
   1696       case BRW_OPCODE_ENDIF:
   1697          brw_ENDIF(p);
   1698          break;
   1699 
   1700       case BRW_OPCODE_DO:
   1701          brw_DO(p, BRW_EXECUTE_8);
   1702          break;
   1703 
   1704       case BRW_OPCODE_BREAK:
   1705          brw_BREAK(p);
   1706          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   1707          break;
   1708       case BRW_OPCODE_CONTINUE:
   1709          brw_CONT(p);
   1710          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   1711          break;
   1712 
   1713       case BRW_OPCODE_WHILE:
   1714          brw_WHILE(p);
   1715          loop_count++;
   1716          break;
   1717 
   1718       case SHADER_OPCODE_RCP:
   1719       case SHADER_OPCODE_RSQ:
   1720       case SHADER_OPCODE_SQRT:
   1721       case SHADER_OPCODE_EXP2:
   1722       case SHADER_OPCODE_LOG2:
   1723       case SHADER_OPCODE_SIN:
   1724       case SHADER_OPCODE_COS:
   1725          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
   1726          if (devinfo->gen >= 7) {
   1727             gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
   1728                       brw_null_reg());
   1729          } else if (devinfo->gen == 6) {
   1730             generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
   1731          } else {
   1732             generate_math1_gen4(p, inst, dst, src[0]);
   1733          }
   1734          break;
   1735 
   1736       case SHADER_OPCODE_POW:
   1737       case SHADER_OPCODE_INT_QUOTIENT:
   1738       case SHADER_OPCODE_INT_REMAINDER:
   1739          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
   1740          if (devinfo->gen >= 7) {
   1741             gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
   1742          } else if (devinfo->gen == 6) {
   1743             generate_math_gen6(p, inst, dst, src[0], src[1]);
   1744          } else {
   1745             generate_math2_gen4(p, inst, dst, src[0], src[1]);
   1746          }
   1747          break;
   1748 
   1749       case SHADER_OPCODE_TEX:
   1750       case SHADER_OPCODE_TXD:
   1751       case SHADER_OPCODE_TXF:
   1752       case SHADER_OPCODE_TXF_CMS:
   1753       case SHADER_OPCODE_TXF_CMS_W:
   1754       case SHADER_OPCODE_TXF_MCS:
   1755       case SHADER_OPCODE_TXL:
   1756       case SHADER_OPCODE_TXS:
   1757       case SHADER_OPCODE_TG4:
   1758       case SHADER_OPCODE_TG4_OFFSET:
   1759       case SHADER_OPCODE_SAMPLEINFO:
   1760          generate_tex(p, prog_data, nir->stage,
   1761                       inst, dst, src[0], src[1], src[2]);
   1762          break;
   1763 
   1764       case VS_OPCODE_URB_WRITE:
   1765          generate_vs_urb_write(p, inst);
   1766          break;
   1767 
   1768       case SHADER_OPCODE_GEN4_SCRATCH_READ:
   1769          generate_scratch_read(p, inst, dst, src[0]);
   1770          fill_count++;
   1771          break;
   1772 
   1773       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
   1774          generate_scratch_write(p, inst, dst, src[0], src[1]);
   1775          spill_count++;
   1776          break;
   1777 
   1778       case VS_OPCODE_PULL_CONSTANT_LOAD:
   1779          generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
   1780          break;
   1781 
   1782       case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   1783          generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
   1784          break;
   1785 
   1786       case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
   1787          generate_set_simd4x2_header_gen9(p, inst, dst);
   1788          break;
   1789 
   1790 
   1791       case VS_OPCODE_GET_BUFFER_SIZE:
   1792          generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
   1793          break;
   1794 
   1795       case GS_OPCODE_URB_WRITE:
   1796          generate_gs_urb_write(p, inst);
   1797          break;
   1798 
   1799       case GS_OPCODE_URB_WRITE_ALLOCATE:
   1800          generate_gs_urb_write_allocate(p, inst);
   1801          break;
   1802 
   1803       case GS_OPCODE_SVB_WRITE:
   1804          generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
   1805          break;
   1806 
   1807       case GS_OPCODE_SVB_SET_DST_INDEX:
   1808          generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
   1809          break;
   1810 
   1811       case GS_OPCODE_THREAD_END:
   1812          generate_gs_thread_end(p, inst);
   1813          break;
   1814 
   1815       case GS_OPCODE_SET_WRITE_OFFSET:
   1816          generate_gs_set_write_offset(p, dst, src[0], src[1]);
   1817          break;
   1818 
   1819       case GS_OPCODE_SET_VERTEX_COUNT:
   1820          generate_gs_set_vertex_count(p, dst, src[0]);
   1821          break;
   1822 
   1823       case GS_OPCODE_FF_SYNC:
   1824          generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
   1825          break;
   1826 
   1827       case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
   1828          generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
   1829          break;
   1830 
   1831       case GS_OPCODE_SET_PRIMITIVE_ID:
   1832          generate_gs_set_primitive_id(p, dst);
   1833          break;
   1834 
   1835       case GS_OPCODE_SET_DWORD_2:
   1836          generate_gs_set_dword_2(p, dst, src[0]);
   1837          break;
   1838 
   1839       case GS_OPCODE_PREPARE_CHANNEL_MASKS:
   1840          generate_gs_prepare_channel_masks(p, dst);
   1841          break;
   1842 
   1843       case GS_OPCODE_SET_CHANNEL_MASKS:
   1844          generate_gs_set_channel_masks(p, dst, src[0]);
   1845          break;
   1846 
   1847       case GS_OPCODE_GET_INSTANCE_ID:
   1848          generate_gs_get_instance_id(p, dst);
   1849          break;
   1850 
   1851       case SHADER_OPCODE_SHADER_TIME_ADD:
   1852          brw_shader_time_add(p, src[0],
   1853                              prog_data->base.binding_table.shader_time_start);
   1854          brw_mark_surface_used(&prog_data->base,
   1855                                prog_data->base.binding_table.shader_time_start);
   1856          break;
   1857 
   1858       case SHADER_OPCODE_UNTYPED_ATOMIC:
   1859          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1860          brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
   1861                             !inst->dst.is_null());
   1862          break;
   1863 
   1864       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   1865          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1866          brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
   1867                                   src[2].ud);
   1868          break;
   1869 
   1870       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   1871          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1872          brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
   1873                                    src[2].ud);
   1874          break;
   1875 
   1876       case SHADER_OPCODE_TYPED_ATOMIC:
   1877          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1878          brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
   1879                           !inst->dst.is_null());
   1880          break;
   1881 
   1882       case SHADER_OPCODE_TYPED_SURFACE_READ:
   1883          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1884          brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
   1885                                 src[2].ud);
   1886          break;
   1887 
   1888       case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   1889          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1890          brw_typed_surface_write(p, src[0], src[1], inst->mlen,
   1891                                  src[2].ud);
   1892          break;
   1893 
   1894       case SHADER_OPCODE_MEMORY_FENCE:
   1895          brw_memory_fence(p, dst);
   1896          break;
   1897 
   1898       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
   1899          const struct brw_reg mask =
   1900             brw_stage_has_packed_dispatch(devinfo, nir->stage,
   1901                                           &prog_data->base) ? brw_imm_ud(~0u) :
   1902             brw_dmask_reg();
   1903          brw_find_live_channel(p, dst, mask);
   1904          break;
   1905       }
   1906 
   1907       case SHADER_OPCODE_BROADCAST:
   1908          assert(inst->force_writemask_all);
   1909          brw_broadcast(p, dst, src[0], src[1]);
   1910          break;
   1911 
   1912       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
   1913          generate_unpack_flags(p, dst);
   1914          break;
   1915 
   1916       case VEC4_OPCODE_MOV_BYTES: {
   1917          /* Moves the low byte from each channel, using an Align1 access mode
   1918           * and a <4,1,0> source region.
   1919           */
   1920          assert(src[0].type == BRW_REGISTER_TYPE_UB ||
   1921                 src[0].type == BRW_REGISTER_TYPE_B);
   1922 
   1923          brw_set_default_access_mode(p, BRW_ALIGN_1);
   1924          src[0].vstride = BRW_VERTICAL_STRIDE_4;
   1925          src[0].width = BRW_WIDTH_1;
   1926          src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
   1927          brw_MOV(p, dst, src[0]);
   1928          brw_set_default_access_mode(p, BRW_ALIGN_16);
   1929          break;
   1930       }
   1931 
   1932       case VEC4_OPCODE_FROM_DOUBLE: {
   1933          assert(type_sz(src[0].type) == 8);
   1934          assert(type_sz(dst.type) == 4);
   1935 
   1936          brw_set_default_access_mode(p, BRW_ALIGN_1);
   1937 
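                  /* Convert with a stride-2 destination so each 32-bit result
                   * lands in every other dword, then pack the results into
                   * consecutive channels with a second MOV.
                   */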
   1938          dst.hstride = BRW_HORIZONTAL_STRIDE_2;
   1939          dst.width = BRW_WIDTH_4;
   1940          src[0].vstride = BRW_VERTICAL_STRIDE_4;
   1941          src[0].width = BRW_WIDTH_4;
   1942          brw_MOV(p, dst, src[0]);
   1943 
   1944          struct brw_reg dst_as_src = dst;
   1945          dst.hstride = BRW_HORIZONTAL_STRIDE_1;
   1946          dst.width = BRW_WIDTH_8;
   1947          brw_MOV(p, dst, dst_as_src);
   1948 
   1949          brw_set_default_access_mode(p, BRW_ALIGN_16);
   1950          break;
   1951       }
   1952 
   1953       case VEC4_OPCODE_TO_DOUBLE: {
   1954          assert(type_sz(src[0].type) == 4);
   1955          assert(type_sz(dst.type) == 8);
   1956 
   1957          brw_set_default_access_mode(p, BRW_ALIGN_1);
   1958 
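                  /* First spread the 32-bit source values across the
                   * destination storage with a stride-2 MOV, then convert them
                   * in place to 64 bits with a second strided MOV.
                   */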
   1959          struct brw_reg tmp = retype(dst, src[0].type);
   1960          tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
   1961          tmp.width = BRW_WIDTH_4;
   1962          src[0].vstride = BRW_VERTICAL_STRIDE_4;
   1963          src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
   1964          src[0].width = BRW_WIDTH_4;
   1965          brw_MOV(p, tmp, src[0]);
   1966 
   1967          tmp.vstride = BRW_VERTICAL_STRIDE_8;
   1968          tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
   1969          tmp.width = BRW_WIDTH_4;
   1970          brw_MOV(p, dst, tmp);
   1971 
   1972          brw_set_default_access_mode(p, BRW_ALIGN_16);
   1973          break;
   1974       }
   1975 
   1976       case VEC4_OPCODE_PICK_LOW_32BIT:
   1977       case VEC4_OPCODE_PICK_HIGH_32BIT: {
    1978          /* Stores the low/high 32 bits of each 64-bit element in src[0] into
    1979           * dst using Align1 mode and an <8,4,2>:UD region on the source.
    1980           */
   1981          assert(type_sz(src[0].type) == 8);
   1982          assert(type_sz(dst.type) == 4);
   1983 
   1984          brw_set_default_access_mode(p, BRW_ALIGN_1);
   1985 
   1986          dst = retype(dst, BRW_REGISTER_TYPE_UD);
   1987          dst.hstride = BRW_HORIZONTAL_STRIDE_1;
   1988 
   1989          src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
   1990          if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
   1991             src[0] = suboffset(src[0], 1);
   1992          src[0].vstride = BRW_VERTICAL_STRIDE_8;
   1993          src[0].width = BRW_WIDTH_4;
   1994          src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
   1995          brw_MOV(p, dst, src[0]);
   1996 
   1997          brw_set_default_access_mode(p, BRW_ALIGN_16);
   1998          break;
   1999       }
   2000 
   2001       case VEC4_OPCODE_SET_LOW_32BIT:
   2002       case VEC4_OPCODE_SET_HIGH_32BIT: {
    2003          /* Reads consecutive 32-bit elements from src[0] and writes
    2004           * them to the low/high 32 bits of each 64-bit element in dst.
    2005           */
   2006          assert(type_sz(src[0].type) == 4);
   2007          assert(type_sz(dst.type) == 8);
   2008 
   2009          brw_set_default_access_mode(p, BRW_ALIGN_1);
   2010 
   2011          dst = retype(dst, BRW_REGISTER_TYPE_UD);
   2012          if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
   2013             dst = suboffset(dst, 1);
   2014          dst.hstride = BRW_HORIZONTAL_STRIDE_2;
   2015 
   2016          src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
   2017          src[0].vstride = BRW_VERTICAL_STRIDE_4;
   2018          src[0].width = BRW_WIDTH_4;
   2019          src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
   2020          brw_MOV(p, dst, src[0]);
   2021 
   2022          brw_set_default_access_mode(p, BRW_ALIGN_16);
   2023          break;
   2024       }
   2025 
   2026       case VEC4_OPCODE_PACK_BYTES: {
    2027          /* Is effectively:
    2028           *
    2029           *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
    2030           *
    2031           * but the only regioning available on a destination is the horizontal
    2032           * stride, so instead we have to use two instructions:
    2033           *
    2034           *   mov(4) dst<1>:UB     src<4,1,0>:UB
    2035           *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
    2036           *
    2037           * which pack the four bytes from the low and the high four DWords.
    2038           */
   2039          assert(_mesa_is_pow_two(dst.writemask) &&
   2040                 dst.writemask != 0);
   2041          unsigned offset = __builtin_ctz(dst.writemask);
   2042 
   2043          dst.type = BRW_REGISTER_TYPE_UB;
   2044 
   2045          brw_set_default_access_mode(p, BRW_ALIGN_1);
   2046 
   2047          src[0].type = BRW_REGISTER_TYPE_UB;
   2048          src[0].vstride = BRW_VERTICAL_STRIDE_4;
   2049          src[0].width = BRW_WIDTH_1;
   2050          src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
   2051          dst.subnr = offset * 4;
   2052          struct brw_inst *insn = brw_MOV(p, dst, src[0]);
   2053          brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
   2054          brw_inst_set_no_dd_clear(p->devinfo, insn, true);
   2055          brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
   2056 
   2057          src[0].subnr = 16;
   2058          dst.subnr = 16 + offset * 4;
   2059          insn = brw_MOV(p, dst, src[0]);
   2060          brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
   2061          brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
   2062          brw_inst_set_no_dd_check(p->devinfo, insn, true);
   2063 
   2064          brw_set_default_access_mode(p, BRW_ALIGN_16);
   2065          break;
   2066       }
   2067 
   2068       case TCS_OPCODE_URB_WRITE:
   2069          generate_tcs_urb_write(p, inst, src[0]);
   2070          break;
   2071 
   2072       case VEC4_OPCODE_URB_READ:
   2073          generate_vec4_urb_read(p, inst, dst, src[0]);
   2074          break;
   2075 
   2076       case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
   2077          generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
   2078          break;
   2079 
   2080       case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
   2081          generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
   2082          break;
   2083 
   2084       case TCS_OPCODE_GET_INSTANCE_ID:
   2085          generate_tcs_get_instance_id(p, dst);
   2086          break;
   2087 
   2088       case TCS_OPCODE_GET_PRIMITIVE_ID:
   2089          generate_tcs_get_primitive_id(p, dst);
   2090          break;
   2091 
   2092       case TCS_OPCODE_CREATE_BARRIER_HEADER:
   2093          generate_tcs_create_barrier_header(p, prog_data, dst);
   2094          break;
   2095 
   2096       case TES_OPCODE_CREATE_INPUT_READ_HEADER:
   2097          generate_tes_create_input_read_header(p, dst);
   2098          break;
   2099 
   2100       case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
   2101          generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
   2102          break;
   2103 
   2104       case TES_OPCODE_GET_PRIMITIVE_ID:
   2105          generate_tes_get_primitive_id(p, dst);
   2106          break;
   2107 
   2108       case TCS_OPCODE_SRC0_010_IS_ZERO:
   2109          /* If src_reg had stride like fs_reg, we wouldn't need this. */
   2110          brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
   2111          break;
   2112 
   2113       case TCS_OPCODE_RELEASE_INPUT:
   2114          generate_tcs_release_input(p, dst, src[0], src[1]);
   2115          break;
   2116 
   2117       case TCS_OPCODE_THREAD_END:
   2118          generate_tcs_thread_end(p, inst);
   2119          break;
   2120 
   2121       case SHADER_OPCODE_BARRIER:
   2122          brw_barrier(p, src[0]);
   2123          brw_WAIT(p);
   2124          break;
   2125 
   2126       case SHADER_OPCODE_MOV_INDIRECT:
   2127          generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
   2128          break;
   2129 
   2130       case BRW_OPCODE_DIM:
   2131          assert(devinfo->is_haswell);
   2132          assert(src[0].type == BRW_REGISTER_TYPE_DF);
   2133          assert(dst.type == BRW_REGISTER_TYPE_DF);
   2134          brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
   2135          break;
   2136 
   2137       default:
   2138          unreachable("Unsupported opcode");
   2139       }
   2140 
   2141       if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
    2142          /* Dependency hints were handled above when emitting this opcode. */
   2143 
   2144          assert(!inst->conditional_mod);
   2145       } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
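                  /* The conditional modifier and dependency-control hints apply
                   * to exactly one EU instruction, so patch them onto the
                   * instruction that was just emitted.
                   */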
   2146          assert(p->nr_insn == pre_emit_nr_insn + 1 ||
   2147                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
   2148                  "emitting more than 1 instruction");
   2149 
   2150          brw_inst *last = &p->store[pre_emit_nr_insn];
   2151 
   2152          if (inst->conditional_mod)
   2153             brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
   2154          brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
   2155          brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
   2156       }
   2157    }
   2158 
   2159    brw_set_uip_jip(p, 0);
   2160    annotation_finalize(&annotation, p->next_insn_offset);
   2161 
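            /* Run the EU validator on the generated code: always in debug
             * builds, and in release builds only when shader debugging is
             * enabled.
             */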
   2162 #ifndef NDEBUG
   2163    bool validated = brw_validate_instructions(p, 0, &annotation);
   2164 #else
   2165    if (unlikely(debug_flag))
   2166       brw_validate_instructions(p, 0, &annotation);
   2167 #endif
   2168 
   2169    int before_size = p->next_insn_offset;
   2170    brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
   2171    int after_size = p->next_insn_offset;
   2172 
   2173    if (unlikely(debug_flag)) {
   2174       fprintf(stderr, "Native code for %s %s shader %s:\n",
   2175               nir->info->label ? nir->info->label : "unnamed",
   2176               _mesa_shader_stage_to_string(nir->stage), nir->info->name);
   2177 
   2178       fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
   2179                       "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
   2180               stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
   2181               spill_count, fill_count, before_size, after_size,
   2182               100.0f * (before_size - after_size) / before_size);
   2183 
   2184       dump_assembly(p->store, annotation.ann_count, annotation.ann,
   2185                     p->devinfo);
   2186       ralloc_free(annotation.mem_ctx);
   2187    }
   2188    assert(validated);
   2189 
   2190    compiler->shader_debug_log(log_data,
   2191                               "%s vec4 shader: %d inst, %d loops, %u cycles, "
   2192                               "%d:%d spills:fills, compacted %d to %d bytes.",
   2193                               stage_abbrev, before_size / 16,
   2194                               loop_count, cfg->cycle_count, spill_count,
   2195                               fill_count, before_size, after_size);
   2196 
   2197 }
   2198 
   2199 extern "C" const unsigned *
   2200 brw_vec4_generate_assembly(const struct brw_compiler *compiler,
   2201                            void *log_data,
   2202                            void *mem_ctx,
   2203                            const nir_shader *nir,
   2204                            struct brw_vue_prog_data *prog_data,
   2205                            const struct cfg_t *cfg,
   2206                            unsigned *out_assembly_size)
   2207 {
   2208    struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
   2209    brw_init_codegen(compiler->devinfo, p, mem_ctx);
   2210    brw_set_default_access_mode(p, BRW_ALIGN_16);
   2211 
   2212    generate_code(p, compiler, log_data, nir, prog_data, cfg);
   2213 
   2214    return brw_get_program(p, out_assembly_size);
   2215 }
   2216