      1 /*
      2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
      3  Intel funded Tungsten Graphics to
      4  develop this 3D driver.
      5 
      6  Permission is hereby granted, free of charge, to any person obtaining
      7  a copy of this software and associated documentation files (the
      8  "Software"), to deal in the Software without restriction, including
      9  without limitation the rights to use, copy, modify, merge, publish,
     10  distribute, sublicense, and/or sell copies of the Software, and to
     11  permit persons to whom the Software is furnished to do so, subject to
     12  the following conditions:
     13 
     14  The above copyright notice and this permission notice (including the
     15  next paragraph) shall be included in all copies or substantial
     16  portions of the Software.
     17 
     18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
     22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26  **********************************************************************/
     27  /*
     28   * Authors:
      29  *   Keith Whitwell <keithw@vmware.com>
     30   */
     31 
     32 
     33 #include "brw_eu_defines.h"
     34 #include "brw_eu.h"
     35 
     36 #include "util/ralloc.h"
     37 
     38 /**
     39  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
     40  * registers, implicitly moving the operand to a message register.
     41  *
     42  * On Sandybridge, this is no longer the case.  This function performs the
     43  * explicit move; it should be called before emitting a SEND instruction.
     44  */
     45 void
     46 gen6_resolve_implied_move(struct brw_codegen *p,
     47 			  struct brw_reg *src,
     48 			  unsigned msg_reg_nr)
     49 {
     50    const struct gen_device_info *devinfo = p->devinfo;
     51    if (devinfo->gen < 6)
     52       return;
     53 
     54    if (src->file == BRW_MESSAGE_REGISTER_FILE)
     55       return;
     56 
     57    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
     58       brw_push_insn_state(p);
     59       brw_set_default_exec_size(p, BRW_EXECUTE_8);
     60       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
     61       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
     62       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
     63 	      retype(*src, BRW_REGISTER_TYPE_UD));
     64       brw_pop_insn_state(p);
     65    }
     66    *src = brw_message_reg(msg_reg_nr);
     67 }
     68 
     69 static void
     70 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
     71 {
     72    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
     73     * "The send with EOT should use register space R112-R127 for <src>. This is
     74     *  to enable loading of a new thread into the same slot while the message
     75     *  with EOT for current thread is pending dispatch."
     76     *
     77     * Since we're pretending to have 16 MRFs anyway, we may as well use the
     78     * registers required for messages with EOT.
     79     */
     80    const struct gen_device_info *devinfo = p->devinfo;
     81    if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
     82       reg->file = BRW_GENERAL_REGISTER_FILE;
     83       reg->nr += GEN7_MRF_HACK_START;
     84    }
     85 }
     86 
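/**
 * Encode the destination register 'dest' into the instruction 'inst'.
 *
 * Handles both direct and register-indirect addressing, the Align1 vs.
 * Align16 subregister/stride/writemask fields, and (when automatic exec
 * sizes are enabled) shrinking the execution size to match a destination
 * narrower than the default.
 */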
     87 void
     88 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
     89 {
     90    const struct gen_device_info *devinfo = p->devinfo;
     91 
     92    if (dest.file == BRW_MESSAGE_REGISTER_FILE)
     93       assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
     94    else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
     95       assert(dest.nr < 128);
     96 
     97    gen7_convert_mrf_to_grf(p, &dest);
     98 
     99    brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
    100    brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
    101 
    102    if (dest.address_mode == BRW_ADDRESS_DIRECT) {
    103       brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
    104 
    105       if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    106          brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
    107 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
    108 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
    109          brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
    110       } else {
    111          brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
    112          brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
    113          if (dest.file == BRW_GENERAL_REGISTER_FILE ||
    114              dest.file == BRW_MESSAGE_REGISTER_FILE) {
    115             assert(dest.writemask != 0);
    116          }
    117 	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
    118 	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
    119 	  *    this to be programmed as "01".
    120 	  */
    121          brw_inst_set_dst_hstride(devinfo, inst, 1);
    122       }
    123    } else {
    124       brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
    125 
     126       /* The address immediate fields are different sizes in align1 vs align16:
     127        */
    128       if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    129          brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
    130                                        dest.indirect_offset);
    131 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
    132 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
    133          brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
    134       } else {
    135          brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
    136                                         dest.indirect_offset);
    137 	 /* even ignored in da16, still need to set as '01' */
    138          brw_inst_set_dst_hstride(devinfo, inst, 1);
    139       }
    140    }
    141 
    142    /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    143     * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    144     * small registers, it can be useful for us to automatically reduce it to
    145     * match the register size.
    146     */
    147    if (p->automatic_exec_sizes) {
    148       /*
     149        * On platforms that support fp64 we can emit instructions with a width
     150        * of 4 that span two SIMD8 registers and require an exec_size of 8 or
     151        * 16.  Such instructions must have their exec sizes set correctly when
     152        * they are emitted; we can't rely on this code to fix them after the
     153        * fact.
    154        */
    155       bool fix_exec_size;
    156       if (devinfo->gen >= 6)
    157          fix_exec_size = dest.width < BRW_EXECUTE_4;
    158       else
    159          fix_exec_size = dest.width < BRW_EXECUTE_8;
    160 
    161       if (fix_exec_size)
    162          brw_inst_set_exec_size(devinfo, inst, dest.width);
    163    }
    164 }
    165 
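/**
 * Encode source operand 0 of 'inst' from the register description 'reg'.
 *
 * Handles immediate values, direct and indirect addressing, Align1
 * regioning, and Align16 swizzles, plus the restrictions SEND/SENDC place
 * on their payload source.
 */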
    166 void
    167 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
    168 {
    169    const struct gen_device_info *devinfo = p->devinfo;
    170 
    171    if (reg.file == BRW_MESSAGE_REGISTER_FILE)
    172       assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
    173    else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
    174       assert(reg.nr < 128);
    175 
    176    gen7_convert_mrf_to_grf(p, &reg);
    177 
    178    if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
    179                              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
    180       /* Any source modifiers or regions will be ignored, since this just
    181        * identifies the MRF/GRF to start reading the message contents from.
    182        * Check for some likely failures.
    183        */
    184       assert(!reg.negate);
    185       assert(!reg.abs);
    186       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
    187    }
    188 
    189    brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
    190    brw_inst_set_src0_abs(devinfo, inst, reg.abs);
    191    brw_inst_set_src0_negate(devinfo, inst, reg.negate);
    192    brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
    193 
    194    if (reg.file == BRW_IMMEDIATE_VALUE) {
    195       if (reg.type == BRW_REGISTER_TYPE_DF ||
    196           brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
    197          brw_inst_set_imm_df(devinfo, inst, reg.df);
    198       else if (reg.type == BRW_REGISTER_TYPE_UQ ||
    199                reg.type == BRW_REGISTER_TYPE_Q)
    200          brw_inst_set_imm_uq(devinfo, inst, reg.u64);
    201       else
    202          brw_inst_set_imm_ud(devinfo, inst, reg.ud);
    203 
    204       if (type_sz(reg.type) < 8) {
    205          brw_inst_set_src1_reg_file(devinfo, inst,
    206                                     BRW_ARCHITECTURE_REGISTER_FILE);
    207          brw_inst_set_src1_reg_hw_type(devinfo, inst,
    208                                        brw_inst_src0_reg_hw_type(devinfo, inst));
    209       }
    210    } else {
    211       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
    212          brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
    213          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    214              brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
    215 	 } else {
    216             brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
    217 	 }
    218       } else {
    219          brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
    220 
    221          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    222             brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
    223 	 } else {
    224             brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
    225 	 }
    226       }
    227 
    228       if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    229 	 if (reg.width == BRW_WIDTH_1 &&
    230              brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
    231             brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
    232             brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
    233             brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
    234 	 } else {
    235             brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
    236             brw_inst_set_src0_width(devinfo, inst, reg.width);
    237             brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
    238 	 }
    239       } else {
    240          brw_inst_set_src0_da16_swiz_x(devinfo, inst,
    241             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
    242          brw_inst_set_src0_da16_swiz_y(devinfo, inst,
    243             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
    244          brw_inst_set_src0_da16_swiz_z(devinfo, inst,
    245             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
    246          brw_inst_set_src0_da16_swiz_w(devinfo, inst,
    247             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
    248 
    249          if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
     250             /* This is an oddity arising from the fact that we use the same
     251              * register descriptions in align_16 as in align_1:
    252              */
    253             brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
    254          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
    255                     reg.type == BRW_REGISTER_TYPE_DF &&
    256                     reg.vstride == BRW_VERTICAL_STRIDE_2) {
    257             /* From SNB PRM:
    258              *
    259              * "For Align16 access mode, only encodings of 0000 and 0011
    260              *  are allowed. Other codes are reserved."
    261              *
    262              * Presumably the DevSNB behavior applies to IVB as well.
    263              */
    264             brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
    265          } else {
    266             brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
    267          }
    268       }
    269    }
    270 }
    271 
    272 
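/**
 * Encode source operand 1 of 'inst' from the register description 'reg'.
 *
 * src1 is more restricted than src0: it cannot be an MRF or the
 * accumulator, indirect addressing is not supported, and only 32-bit
 * immediates are allowed.
 */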
    273 void
    274 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
    275 {
    276    const struct gen_device_info *devinfo = p->devinfo;
    277 
    278    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
    279       assert(reg.nr < 128);
    280 
    281    /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    282     *
    283     *    "Accumulator registers may be accessed explicitly as src0
    284     *    operands only."
    285     */
    286    assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
    287           reg.nr != BRW_ARF_ACCUMULATOR);
    288 
    289    gen7_convert_mrf_to_grf(p, &reg);
    290    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
    291 
    292    brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
    293    brw_inst_set_src1_abs(devinfo, inst, reg.abs);
    294    brw_inst_set_src1_negate(devinfo, inst, reg.negate);
    295 
    296    /* Only src1 can be immediate in two-argument instructions.
    297     */
    298    assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
    299 
    300    if (reg.file == BRW_IMMEDIATE_VALUE) {
    301       /* two-argument instructions can only use 32-bit immediates */
    302       assert(type_sz(reg.type) < 8);
    303       brw_inst_set_imm_ud(devinfo, inst, reg.ud);
    304    } else {
    305       /* This is a hardware restriction, which may or may not be lifted
    306        * in the future:
    307        */
    308       assert (reg.address_mode == BRW_ADDRESS_DIRECT);
    309       /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
    310 
    311       brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
    312       if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    313          brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
    314       } else {
    315          brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
    316       }
    317 
    318       if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    319 	 if (reg.width == BRW_WIDTH_1 &&
    320              brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
    321             brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
    322             brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
    323             brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
    324 	 } else {
    325             brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
    326             brw_inst_set_src1_width(devinfo, inst, reg.width);
    327             brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
    328 	 }
    329       } else {
    330          brw_inst_set_src1_da16_swiz_x(devinfo, inst,
    331             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
    332          brw_inst_set_src1_da16_swiz_y(devinfo, inst,
    333             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
    334          brw_inst_set_src1_da16_swiz_z(devinfo, inst,
    335             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
    336          brw_inst_set_src1_da16_swiz_w(devinfo, inst,
    337             BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
    338 
    339          if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
     340             /* This is an oddity arising from the fact that we use the same
     341              * register descriptions in align_16 as in align_1:
    342              */
    343             brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
    344          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
    345                     reg.type == BRW_REGISTER_TYPE_DF &&
    346                     reg.vstride == BRW_VERTICAL_STRIDE_2) {
    347             /* From SNB PRM:
    348              *
    349              * "For Align16 access mode, only encodings of 0000 and 0011
    350              *  are allowed. Other codes are reserved."
    351              *
    352              * Presumably the DevSNB behavior applies to IVB as well.
    353              */
    354             brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
    355          } else {
    356             brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
    357          }
    358       }
    359    }
    360 }
    361 
    362 /**
    363  * Set the Message Descriptor and Extended Message Descriptor fields
    364  * for SEND messages.
    365  *
    366  * \note This zeroes out the Function Control bits, so it must be called
    367  *       \b before filling out any message-specific data.  Callers can
    368  *       choose not to fill in irrelevant bits; they will be zero.
    369  */
    370 void
    371 brw_set_message_descriptor(struct brw_codegen *p,
    372 			   brw_inst *inst,
    373 			   enum brw_message_target sfid,
    374 			   unsigned msg_length,
    375 			   unsigned response_length,
    376 			   bool header_present,
    377 			   bool end_of_thread)
    378 {
    379    const struct gen_device_info *devinfo = p->devinfo;
    380 
    381    brw_set_src1(p, inst, brw_imm_d(0));
    382 
    383    /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    384     * itself; instead, it will be a MOV/OR into the address register.
    385     *
    386     * In this case, we avoid setting the extended message descriptor bits,
     387     * since they go on the later SEND/SENDC instead; setting them here
     388     * would clobber the conditional-mod bits.
    389     */
    390    unsigned opcode = brw_inst_opcode(devinfo, inst);
    391    if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
    392       brw_inst_set_sfid(devinfo, inst, sfid);
    393    }
    394 
    395    brw_inst_set_mlen(devinfo, inst, msg_length);
    396    brw_inst_set_rlen(devinfo, inst, response_length);
    397    brw_inst_set_eot(devinfo, inst, end_of_thread);
    398 
    399    if (devinfo->gen >= 5) {
    400       brw_inst_set_header_present(devinfo, inst, header_present);
    401    }
    402 }
    403 
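/**
 * Set up the message descriptor for a message to the extended math unit
 * (BRW_SFID_MATH), inferring message and response lengths from the math
 * function.  The instruction's saturate bit is folded into the descriptor
 * and cleared on the instruction itself.
 */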
    404 static void brw_set_math_message( struct brw_codegen *p,
    405 				  brw_inst *inst,
    406 				  unsigned function,
    407 				  unsigned integer_type,
    408 				  bool low_precision,
    409 				  unsigned dataType )
    410 {
    411    const struct gen_device_info *devinfo = p->devinfo;
    412    unsigned msg_length;
    413    unsigned response_length;
    414 
    415    /* Infer message length from the function */
    416    switch (function) {
    417    case BRW_MATH_FUNCTION_POW:
    418    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
    419    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
    420    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
    421       msg_length = 2;
    422       break;
    423    default:
    424       msg_length = 1;
    425       break;
    426    }
    427 
    428    /* Infer response length from the function */
    429    switch (function) {
    430    case BRW_MATH_FUNCTION_SINCOS:
    431    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
    432       response_length = 2;
    433       break;
    434    default:
    435       response_length = 1;
    436       break;
    437    }
    438 
    439 
    440    brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
    441 			      msg_length, response_length, false, false);
    442    brw_inst_set_math_msg_function(devinfo, inst, function);
    443    brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
    444    brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
    445    brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
    446    brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
    447    brw_inst_set_saturate(devinfo, inst, 0);
    448 }
    449 
    450 
    451 static void brw_set_ff_sync_message(struct brw_codegen *p,
    452 				    brw_inst *insn,
    453 				    bool allocate,
    454 				    unsigned response_length,
    455 				    bool end_of_thread)
    456 {
    457    const struct gen_device_info *devinfo = p->devinfo;
    458 
    459    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
    460 			      1, response_length, true, end_of_thread);
    461    brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
    462    brw_inst_set_urb_allocate(devinfo, insn, allocate);
    463    /* The following fields are not used by FF_SYNC: */
    464    brw_inst_set_urb_global_offset(devinfo, insn, 0);
    465    brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
    466    brw_inst_set_urb_used(devinfo, insn, 0);
    467    brw_inst_set_urb_complete(devinfo, insn, 0);
    468 }
    469 
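/**
 * Set up the message descriptor for a URB write, translating the
 * brw_urb_write_flags and swizzle control into the generation-specific
 * descriptor fields.
 */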
    470 static void brw_set_urb_message( struct brw_codegen *p,
    471 				 brw_inst *insn,
    472                                  enum brw_urb_write_flags flags,
    473 				 unsigned msg_length,
    474 				 unsigned response_length,
    475 				 unsigned offset,
    476 				 unsigned swizzle_control )
    477 {
    478    const struct gen_device_info *devinfo = p->devinfo;
    479 
    480    assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
    481    assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
    482    assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
    483 
    484    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
    485 			      msg_length, response_length, true,
    486                               flags & BRW_URB_WRITE_EOT);
    487 
    488    if (flags & BRW_URB_WRITE_OWORD) {
    489       assert(msg_length == 2); /* header + one OWORD of data */
    490       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
    491    } else {
    492       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
    493    }
    494 
    495    brw_inst_set_urb_global_offset(devinfo, insn, offset);
    496    brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
    497 
    498    if (devinfo->gen < 8) {
    499       brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
    500    }
    501 
    502    if (devinfo->gen < 7) {
    503       brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
    504       brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
    505    } else {
    506       brw_inst_set_urb_per_slot_offset(devinfo, insn,
    507          !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
    508    }
    509 }
    510 
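/**
 * Set up the message descriptor for a data port write.  On Gen6+ the
 * caller-supplied target cache selects the SFID; on older parts the
 * legacy dataport write SFID is used.
 */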
    511 void
    512 brw_set_dp_write_message(struct brw_codegen *p,
    513 			 brw_inst *insn,
    514 			 unsigned binding_table_index,
    515 			 unsigned msg_control,
    516 			 unsigned msg_type,
    517                          unsigned target_cache,
    518 			 unsigned msg_length,
    519 			 bool header_present,
    520 			 unsigned last_render_target,
    521 			 unsigned response_length,
    522 			 unsigned end_of_thread,
    523 			 unsigned send_commit_msg)
    524 {
    525    const struct gen_device_info *devinfo = p->devinfo;
    526    const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
    527                           BRW_SFID_DATAPORT_WRITE);
    528 
    529    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
    530 			      header_present, end_of_thread);
    531 
    532    brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
    533    brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
    534    brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
    535    brw_inst_set_rt_last(devinfo, insn, last_render_target);
    536    if (devinfo->gen < 7) {
    537       brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
    538    }
    539 }
    540 
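/**
 * Set up the message descriptor for a data port read.  Mirrors
 * brw_set_dp_write_message(), except that before Gen6 the target cache is
 * encoded in the descriptor rather than used as the SFID.
 */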
    541 void
    542 brw_set_dp_read_message(struct brw_codegen *p,
    543 			brw_inst *insn,
    544 			unsigned binding_table_index,
    545 			unsigned msg_control,
    546 			unsigned msg_type,
    547 			unsigned target_cache,
    548 			unsigned msg_length,
    549                         bool header_present,
    550 			unsigned response_length)
    551 {
    552    const struct gen_device_info *devinfo = p->devinfo;
    553    const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
    554                           BRW_SFID_DATAPORT_READ);
    555 
    556    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
    557 			      header_present, false);
    558 
    559    brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
    560    brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
    561    brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
    562    if (devinfo->gen < 6)
    563       brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
    564 }
    565 
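/**
 * Set up the message descriptor for a sampler message: binding table
 * index, sampler index, message type, and (depending on generation) the
 * SIMD mode or return format.
 */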
    566 void
    567 brw_set_sampler_message(struct brw_codegen *p,
    568                         brw_inst *inst,
    569                         unsigned binding_table_index,
    570                         unsigned sampler,
    571                         unsigned msg_type,
    572                         unsigned response_length,
    573                         unsigned msg_length,
    574                         unsigned header_present,
    575                         unsigned simd_mode,
    576                         unsigned return_format)
    577 {
    578    const struct gen_device_info *devinfo = p->devinfo;
    579 
    580    brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
    581 			      response_length, header_present, false);
    582 
    583    brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
    584    brw_inst_set_sampler(devinfo, inst, sampler);
    585    brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
    586    if (devinfo->gen >= 5) {
    587       brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
    588    } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
    589       brw_inst_set_sampler_return_format(devinfo, inst, return_format);
    590    }
    591 }
    592 
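/**
 * Set up the message descriptor for a Gen7+ scratch block read or write
 * through the data cache.  'num_regs' must be 1, 2, or 4 (or 8 on Gen8+)
 * and is encoded as the block size field of the descriptor.
 */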
    593 static void
    594 gen7_set_dp_scratch_message(struct brw_codegen *p,
    595                             brw_inst *inst,
    596                             bool write,
    597                             bool dword,
    598                             bool invalidate_after_read,
    599                             unsigned num_regs,
    600                             unsigned addr_offset,
    601                             unsigned mlen,
    602                             unsigned rlen,
    603                             bool header_present)
    604 {
    605    const struct gen_device_info *devinfo = p->devinfo;
    606    assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
    607           (devinfo->gen >= 8 && num_regs == 8));
    608    const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
    609                                 num_regs - 1);
    610 
    611    brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
    612                               mlen, rlen, header_present, false);
    613    brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
    614    brw_inst_set_scratch_read_write(devinfo, inst, write);
    615    brw_inst_set_scratch_type(devinfo, inst, dword);
    616    brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
    617    brw_inst_set_scratch_block_size(devinfo, inst, block_size);
    618    brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
    619 }
    620 
    621 #define next_insn brw_next_insn
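/**
 * Allocate the next instruction in the program store (growing the store
 * if necessary), initialize it from the current default instruction
 * state, and set its opcode.
 */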
    622 brw_inst *
    623 brw_next_insn(struct brw_codegen *p, unsigned opcode)
    624 {
    625    const struct gen_device_info *devinfo = p->devinfo;
    626    brw_inst *insn;
    627 
    628    if (p->nr_insn + 1 > p->store_size) {
    629       p->store_size <<= 1;
    630       p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
    631    }
    632 
    633    p->next_insn_offset += 16;
    634    insn = &p->store[p->nr_insn++];
    635    memcpy(insn, p->current, sizeof(*insn));
    636 
    637    brw_inst_set_opcode(devinfo, insn, opcode);
    638    return insn;
    639 }
    640 
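/* Helpers that allocate an instruction and fill in the destination and
 * source operands for one- and two-source ALU instructions.
 */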
    641 static brw_inst *
    642 brw_alu1(struct brw_codegen *p, unsigned opcode,
    643          struct brw_reg dest, struct brw_reg src)
    644 {
    645    brw_inst *insn = next_insn(p, opcode);
    646    brw_set_dest(p, insn, dest);
    647    brw_set_src0(p, insn, src);
    648    return insn;
    649 }
    650 
    651 static brw_inst *
    652 brw_alu2(struct brw_codegen *p, unsigned opcode,
    653          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
    654 {
    655    /* 64-bit immediates are only supported on 1-src instructions */
    656    assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
    657    assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
    658 
    659    brw_inst *insn = next_insn(p, opcode);
    660    brw_set_dest(p, insn, dest);
    661    brw_set_src0(p, insn, src0);
    662    brw_set_src1(p, insn, src1);
    663    return insn;
    664 }
    665 
    666 static int
    667 get_3src_subreg_nr(struct brw_reg reg)
    668 {
    669    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    670     * use 32-bit units (components 0..7).  Since they only support F/D/UD
    671     * types, this doesn't lose any flexibility, but uses fewer bits.
    672     */
    673    return reg.subnr / 4;
    674 }
    675 
    676 static enum gen10_align1_3src_vertical_stride
    677 to_3src_align1_vstride(enum brw_vertical_stride vstride)
    678 {
    679    switch (vstride) {
    680    case BRW_VERTICAL_STRIDE_0:
    681       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
    682    case BRW_VERTICAL_STRIDE_2:
    683       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
    684    case BRW_VERTICAL_STRIDE_4:
    685       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
    686    case BRW_VERTICAL_STRIDE_8:
    687    case BRW_VERTICAL_STRIDE_16:
    688       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
    689    default:
    690       unreachable("invalid vstride");
    691    }
    692 }
    693 
    694 
    695 static enum gen10_align1_3src_src_horizontal_stride
    696 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
    697 {
    698    switch (hstride) {
    699    case BRW_HORIZONTAL_STRIDE_0:
    700       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
    701    case BRW_HORIZONTAL_STRIDE_1:
    702       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
    703    case BRW_HORIZONTAL_STRIDE_2:
    704       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
    705    case BRW_HORIZONTAL_STRIDE_4:
    706       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
    707    default:
    708       unreachable("invalid hstride");
    709    }
    710 }
    711 
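/**
 * Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...), which
 * uses its own compact operand encoding; the Align1 and Align16 forms are
 * filled in separately below.
 */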
    712 static brw_inst *
    713 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
    714          struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
    715 {
    716    const struct gen_device_info *devinfo = p->devinfo;
    717    brw_inst *inst = next_insn(p, opcode);
    718 
    719    gen7_convert_mrf_to_grf(p, &dest);
    720 
    721    assert(dest.nr < 128);
    722    assert(src0.nr < 128);
    723    assert(src1.nr < 128);
    724    assert(src2.nr < 128);
    725    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
    726    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
    727    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
    728    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
    729 
    730    if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
    731       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
    732              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
    733 
    734       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
    735          brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
    736                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
    737          brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
    738       } else {
    739          brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
    740                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
    741          brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
    742       }
    743       brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
    744 
    745       brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
    746 
    747       if (brw_reg_type_is_floating_point(dest.type)) {
    748          brw_inst_set_3src_a1_exec_type(devinfo, inst,
    749                                         BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
    750       } else {
    751          brw_inst_set_3src_a1_exec_type(devinfo, inst,
    752                                         BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
    753       }
    754 
    755       brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
    756       brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
    757       brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
    758       brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
    759 
    760       brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
    761                                         to_3src_align1_vstride(src0.vstride));
    762       brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
    763                                         to_3src_align1_vstride(src1.vstride));
    764       /* no vstride on src2 */
    765 
    766       brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
    767                                         to_3src_align1_hstride(src0.hstride));
    768       brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
    769                                         to_3src_align1_hstride(src1.hstride));
    770       brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
    771                                         to_3src_align1_hstride(src2.hstride));
    772 
    773       brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
    774       brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
    775       brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
    776       brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
    777 
    778       brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
    779       if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
    780          brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
    781       } else {
    782          brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
    783       }
    784       brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
    785       brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
    786 
    787       brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
    788       brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
    789       brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
    790       brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
    791 
    792       assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
    793              src0.file == BRW_IMMEDIATE_VALUE);
    794       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
    795              src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
    796       assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
    797              src2.file == BRW_IMMEDIATE_VALUE);
    798 
    799       brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
    800                                          src0.file == BRW_GENERAL_REGISTER_FILE ?
    801                                          BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
    802                                          BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
    803       brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
    804                                          src1.file == BRW_GENERAL_REGISTER_FILE ?
    805                                          BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
    806                                          BRW_ALIGN1_3SRC_ACCUMULATOR);
    807       brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
    808                                          src2.file == BRW_GENERAL_REGISTER_FILE ?
    809                                          BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
    810                                          BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
    811    } else {
    812       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
    813              dest.file == BRW_MESSAGE_REGISTER_FILE);
    814       assert(dest.type == BRW_REGISTER_TYPE_F  ||
    815              dest.type == BRW_REGISTER_TYPE_DF ||
    816              dest.type == BRW_REGISTER_TYPE_D  ||
    817              dest.type == BRW_REGISTER_TYPE_UD);
    818       if (devinfo->gen == 6) {
    819          brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
    820                                             dest.file == BRW_MESSAGE_REGISTER_FILE);
    821       }
    822       brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
    823       brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
    824       brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
    825 
    826       assert(src0.file == BRW_GENERAL_REGISTER_FILE);
    827       brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
    828       brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
    829       brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
    830       brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
    831       brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
    832       brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
    833                                           src0.vstride == BRW_VERTICAL_STRIDE_0);
    834 
    835       assert(src1.file == BRW_GENERAL_REGISTER_FILE);
    836       brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
    837       brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
    838       brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
    839       brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
    840       brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
    841       brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
    842                                           src1.vstride == BRW_VERTICAL_STRIDE_0);
    843 
    844       assert(src2.file == BRW_GENERAL_REGISTER_FILE);
    845       brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
    846       brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
    847       brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
    848       brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
    849       brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
    850       brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
    851                                           src2.vstride == BRW_VERTICAL_STRIDE_0);
    852 
    853       if (devinfo->gen >= 7) {
    854          /* Set both the source and destination types based on dest.type,
    855           * ignoring the source register types.  The MAD and LRP emitters ensure
    856           * that all four types are float.  The BFE and BFI2 emitters, however,
    857           * may send us mixed D and UD types and want us to ignore that and use
    858           * the destination type.
    859           */
    860          brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
    861          brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
    862       }
    863    }
    864 
    865    return inst;
    866 }
    867 
    868 
    869 /***********************************************************************
    870  * Convenience routines.
    871  */
    872 #define ALU1(OP)					\
    873 brw_inst *brw_##OP(struct brw_codegen *p,		\
    874 	      struct brw_reg dest,			\
    875 	      struct brw_reg src0)   			\
    876 {							\
    877    return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
    878 }
    879 
    880 #define ALU2(OP)					\
    881 brw_inst *brw_##OP(struct brw_codegen *p,		\
    882 	      struct brw_reg dest,			\
    883 	      struct brw_reg src0,			\
    884 	      struct brw_reg src1)   			\
    885 {							\
    886    return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
    887 }
    888 
    889 #define ALU3(OP)					\
    890 brw_inst *brw_##OP(struct brw_codegen *p,		\
    891 	      struct brw_reg dest,			\
    892 	      struct brw_reg src0,			\
    893 	      struct brw_reg src1,			\
    894 	      struct brw_reg src2)   			\
    895 {							\
    896    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
    897 }
    898 
    899 #define ALU3F(OP)                                               \
    900 brw_inst *brw_##OP(struct brw_codegen *p,         \
    901                                  struct brw_reg dest,           \
    902                                  struct brw_reg src0,           \
    903                                  struct brw_reg src1,           \
    904                                  struct brw_reg src2)           \
    905 {                                                               \
    906    assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
    907           dest.type == BRW_REGISTER_TYPE_DF);                   \
    908    if (dest.type == BRW_REGISTER_TYPE_F) {                      \
    909       assert(src0.type == BRW_REGISTER_TYPE_F);                 \
    910       assert(src1.type == BRW_REGISTER_TYPE_F);                 \
    911       assert(src2.type == BRW_REGISTER_TYPE_F);                 \
    912    } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
    913       assert(src0.type == BRW_REGISTER_TYPE_DF);                \
    914       assert(src1.type == BRW_REGISTER_TYPE_DF);                \
    915       assert(src2.type == BRW_REGISTER_TYPE_DF);                \
    916    }                                                            \
    917    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
    918 }
    919 
    920 /* Rounding operations (other than RNDD) require two instructions - the first
    921  * stores a rounded value (possibly the wrong way) in the dest register, but
    922  * also sets a per-channel "increment bit" in the flag register.  A predicated
    923  * add of 1.0 fixes dest to contain the desired result.
    924  *
    925  * Sandybridge and later appear to round correctly without an ADD.
    926  */
    927 #define ROUND(OP)							      \
    928 void brw_##OP(struct brw_codegen *p,					      \
    929 	      struct brw_reg dest,					      \
    930 	      struct brw_reg src)					      \
    931 {									      \
    932    const struct gen_device_info *devinfo = p->devinfo;					      \
    933    brw_inst *rnd, *add;							      \
    934    rnd = next_insn(p, BRW_OPCODE_##OP);					      \
    935    brw_set_dest(p, rnd, dest);						      \
    936    brw_set_src0(p, rnd, src);						      \
    937 									      \
    938    if (devinfo->gen < 6) {							      \
    939       /* turn on round-increments */					      \
    940       brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
    941       add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
    942       brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
    943    }									      \
    944 }
    945 
    946 
    947 ALU2(SEL)
    948 ALU1(NOT)
    949 ALU2(AND)
    950 ALU2(OR)
    951 ALU2(XOR)
    952 ALU2(SHR)
    953 ALU2(SHL)
    954 ALU1(DIM)
    955 ALU2(ASR)
    956 ALU1(FRC)
    957 ALU1(RNDD)
    958 ALU2(MAC)
    959 ALU2(MACH)
    960 ALU1(LZD)
    961 ALU2(DP4)
    962 ALU2(DPH)
    963 ALU2(DP3)
    964 ALU2(DP2)
    965 ALU3F(MAD)
    966 ALU3F(LRP)
    967 ALU1(BFREV)
    968 ALU3(BFE)
    969 ALU2(BFI1)
    970 ALU3(BFI2)
    971 ALU1(FBH)
    972 ALU1(FBL)
    973 ALU1(CBIT)
    974 ALU2(ADDC)
    975 ALU2(SUBB)
    976 
    977 ROUND(RNDZ)
    978 ROUND(RNDE)
    979 
    980 brw_inst *
    981 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
    982 {
    983    const struct gen_device_info *devinfo = p->devinfo;
    984 
    985    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    986     * To avoid the problems that causes, we use a <1,2,0> source region to read
    987     * each element twice.
    988     */
    989    if (devinfo->gen == 7 && !devinfo->is_haswell &&
    990        brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1 &&
    991        dest.type == BRW_REGISTER_TYPE_DF &&
    992        (src0.type == BRW_REGISTER_TYPE_F ||
    993         src0.type == BRW_REGISTER_TYPE_D ||
    994         src0.type == BRW_REGISTER_TYPE_UD) &&
    995        !has_scalar_region(src0)) {
    996       assert(src0.vstride == BRW_VERTICAL_STRIDE_4 &&
    997              src0.width == BRW_WIDTH_4 &&
    998              src0.hstride == BRW_HORIZONTAL_STRIDE_1);
    999 
   1000       src0.vstride = BRW_VERTICAL_STRIDE_1;
   1001       src0.width = BRW_WIDTH_2;
   1002       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   1003    }
   1004 
   1005    return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
   1006 }
   1007 
   1008 brw_inst *
   1009 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
   1010         struct brw_reg src0, struct brw_reg src1)
   1011 {
   1012    /* 6.2.2: add */
   1013    if (src0.type == BRW_REGISTER_TYPE_F ||
   1014        (src0.file == BRW_IMMEDIATE_VALUE &&
   1015 	src0.type == BRW_REGISTER_TYPE_VF)) {
   1016       assert(src1.type != BRW_REGISTER_TYPE_UD);
   1017       assert(src1.type != BRW_REGISTER_TYPE_D);
   1018    }
   1019 
   1020    if (src1.type == BRW_REGISTER_TYPE_F ||
   1021        (src1.file == BRW_IMMEDIATE_VALUE &&
   1022 	src1.type == BRW_REGISTER_TYPE_VF)) {
   1023       assert(src0.type != BRW_REGISTER_TYPE_UD);
   1024       assert(src0.type != BRW_REGISTER_TYPE_D);
   1025    }
   1026 
   1027    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
   1028 }
   1029 
   1030 brw_inst *
   1031 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
   1032         struct brw_reg src0, struct brw_reg src1)
   1033 {
   1034    assert(dest.type == src0.type);
   1035    assert(src0.type == src1.type);
   1036    switch (src0.type) {
   1037    case BRW_REGISTER_TYPE_B:
   1038    case BRW_REGISTER_TYPE_UB:
   1039    case BRW_REGISTER_TYPE_W:
   1040    case BRW_REGISTER_TYPE_UW:
   1041    case BRW_REGISTER_TYPE_D:
   1042    case BRW_REGISTER_TYPE_UD:
   1043       break;
   1044    default:
   1045       unreachable("Bad type for brw_AVG");
   1046    }
   1047 
   1048    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
   1049 }
   1050 
   1051 brw_inst *
   1052 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
   1053         struct brw_reg src0, struct brw_reg src1)
   1054 {
   1055    /* 6.32.38: mul */
   1056    if (src0.type == BRW_REGISTER_TYPE_D ||
   1057        src0.type == BRW_REGISTER_TYPE_UD ||
   1058        src1.type == BRW_REGISTER_TYPE_D ||
   1059        src1.type == BRW_REGISTER_TYPE_UD) {
   1060       assert(dest.type != BRW_REGISTER_TYPE_F);
   1061    }
   1062 
   1063    if (src0.type == BRW_REGISTER_TYPE_F ||
   1064        (src0.file == BRW_IMMEDIATE_VALUE &&
   1065 	src0.type == BRW_REGISTER_TYPE_VF)) {
   1066       assert(src1.type != BRW_REGISTER_TYPE_UD);
   1067       assert(src1.type != BRW_REGISTER_TYPE_D);
   1068    }
   1069 
   1070    if (src1.type == BRW_REGISTER_TYPE_F ||
   1071        (src1.file == BRW_IMMEDIATE_VALUE &&
   1072 	src1.type == BRW_REGISTER_TYPE_VF)) {
   1073       assert(src0.type != BRW_REGISTER_TYPE_UD);
   1074       assert(src0.type != BRW_REGISTER_TYPE_D);
   1075    }
   1076 
   1077    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
   1078 	  src0.nr != BRW_ARF_ACCUMULATOR);
   1079    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
   1080 	  src1.nr != BRW_ARF_ACCUMULATOR);
   1081 
   1082    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
   1083 }
   1084 
   1085 brw_inst *
   1086 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
   1087          struct brw_reg src0, struct brw_reg src1)
   1088 {
   1089    src0.vstride = BRW_VERTICAL_STRIDE_0;
   1090    src0.width = BRW_WIDTH_1;
   1091    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   1092    return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
   1093 }
   1094 
   1095 brw_inst *
   1096 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
   1097         struct brw_reg src0, struct brw_reg src1)
   1098 {
   1099    src0.vstride = BRW_VERTICAL_STRIDE_0;
   1100    src0.width = BRW_WIDTH_1;
   1101    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   1102    src1.vstride = BRW_VERTICAL_STRIDE_8;
   1103    src1.width = BRW_WIDTH_8;
   1104    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   1105    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
   1106 }
   1107 
   1108 brw_inst *
   1109 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
   1110 {
   1111    const struct gen_device_info *devinfo = p->devinfo;
   1112    const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   1113    /* The F32TO16 instruction doesn't support 32-bit destination types in
   1114     * Align1 mode, and neither does the Gen8 implementation in terms of a
   1115     * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
   1116     * an undocumented feature.
   1117     */
   1118    const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
   1119                                  (!align16 || devinfo->gen >= 8));
   1120    brw_inst *inst;
   1121 
   1122    if (align16) {
   1123       assert(dst.type == BRW_REGISTER_TYPE_UD);
   1124    } else {
   1125       assert(dst.type == BRW_REGISTER_TYPE_UD ||
   1126              dst.type == BRW_REGISTER_TYPE_W ||
   1127              dst.type == BRW_REGISTER_TYPE_UW ||
   1128              dst.type == BRW_REGISTER_TYPE_HF);
   1129    }
   1130 
   1131    brw_push_insn_state(p);
   1132 
   1133    if (needs_zero_fill) {
   1134       brw_set_default_access_mode(p, BRW_ALIGN_1);
   1135       dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   1136    }
   1137 
   1138    if (devinfo->gen >= 8) {
   1139       inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   1140    } else {
   1141       assert(devinfo->gen == 7);
   1142       inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   1143    }
   1144 
   1145    if (needs_zero_fill) {
   1146       brw_inst_set_no_dd_clear(devinfo, inst, true);
   1147       inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
   1148       brw_inst_set_no_dd_check(devinfo, inst, true);
   1149    }
   1150 
   1151    brw_pop_insn_state(p);
   1152    return inst;
   1153 }
   1154 
   1155 brw_inst *
   1156 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
   1157 {
   1158    const struct gen_device_info *devinfo = p->devinfo;
   1159    bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   1160 
   1161    if (align16) {
   1162       assert(src.type == BRW_REGISTER_TYPE_UD);
   1163    } else {
   1164       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
   1165        *
   1166        *   Because this instruction does not have a 16-bit floating-point
   1167        *   type, the source data type must be Word (W). The destination type
   1168        *   must be F (Float).
   1169        */
   1170       if (src.type == BRW_REGISTER_TYPE_UD)
   1171          src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
   1172 
   1173       assert(src.type == BRW_REGISTER_TYPE_W ||
   1174              src.type == BRW_REGISTER_TYPE_UW ||
   1175              src.type == BRW_REGISTER_TYPE_HF);
   1176    }
   1177 
   1178    if (devinfo->gen >= 8) {
   1179       return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   1180    } else {
   1181       assert(devinfo->gen == 7);
   1182       return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   1183    }
   1184 }
   1185 
   1186 
   1187 void brw_NOP(struct brw_codegen *p)
   1188 {
   1189    brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   1190    memset(insn, 0, sizeof(*insn));
   1191    brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
   1192 }
   1193 
   1194 
   1195 
   1196 
   1197 
   1198 /***********************************************************************
   1199  * Comparisons, if/else/endif
   1200  */
   1201 
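/**
 * Emit a JMPI (jump indexed) instruction: dest and src0 are the IP
 * register, so the jump is relative to the current instruction pointer by
 * 'index'.  JMPI is always scalar, with masking and compression disabled.
 */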
   1202 brw_inst *
   1203 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
   1204          unsigned predicate_control)
   1205 {
   1206    const struct gen_device_info *devinfo = p->devinfo;
   1207    struct brw_reg ip = brw_ip_reg();
   1208    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
   1209 
   1210    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   1211    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   1212    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   1213    brw_inst_set_pred_control(devinfo, inst, predicate_control);
   1214 
   1215    return inst;
   1216 }
   1217 
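/* The IF/ELSE and loop helpers below record instruction offsets on small,
 * dynamically grown stacks so that later control-flow instructions can
 * find and patch the matching open IF or DO.
 */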
   1218 static void
   1219 push_if_stack(struct brw_codegen *p, brw_inst *inst)
   1220 {
   1221    p->if_stack[p->if_stack_depth] = inst - p->store;
   1222 
   1223    p->if_stack_depth++;
   1224    if (p->if_stack_array_size <= p->if_stack_depth) {
   1225       p->if_stack_array_size *= 2;
   1226       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
   1227 			     p->if_stack_array_size);
   1228    }
   1229 }
   1230 
   1231 static brw_inst *
   1232 pop_if_stack(struct brw_codegen *p)
   1233 {
   1234    p->if_stack_depth--;
   1235    return &p->store[p->if_stack[p->if_stack_depth]];
   1236 }
   1237 
   1238 static void
   1239 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
   1240 {
   1241    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
   1242       p->loop_stack_array_size *= 2;
   1243       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
   1244 			       p->loop_stack_array_size);
   1245       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
   1246 				     p->loop_stack_array_size);
   1247    }
   1248 
   1249    p->loop_stack[p->loop_stack_depth] = inst - p->store;
   1250    p->loop_stack_depth++;
   1251    p->if_depth_in_loop[p->loop_stack_depth] = 0;
   1252 }
   1253 
   1254 static brw_inst *
   1255 get_inner_do_insn(struct brw_codegen *p)
   1256 {
   1257    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
   1258 }
   1259 
   1260 /* EU takes the value from the flag register and pushes it onto some
   1261  * sort of a stack (presumably merging with any flag value already on
   1262  * the stack).  Within an if block, the flags at the top of the stack
    1263  * control execution on each channel of the unit, e.g. on each of the
   1264  * 16 pixel values in our wm programs.
   1265  *
   1266  * When the matching 'else' instruction is reached (presumably by
   1267  * countdown of the instruction count patched in by our ELSE/ENDIF
   1268  * functions), the relevant flags are inverted.
   1269  *
   1270  * When the matching 'endif' instruction is reached, the flags are
   1271  * popped off.  If the stack is now empty, normal execution resumes.
   1272  */
   1273 brw_inst *
   1274 brw_IF(struct brw_codegen *p, unsigned execute_size)
   1275 {
   1276    const struct gen_device_info *devinfo = p->devinfo;
   1277    brw_inst *insn;
   1278 
   1279    insn = next_insn(p, BRW_OPCODE_IF);
   1280 
   1281    /* Override the defaults for this instruction:
   1282     */
   1283    if (devinfo->gen < 6) {
   1284       brw_set_dest(p, insn, brw_ip_reg());
   1285       brw_set_src0(p, insn, brw_ip_reg());
   1286       brw_set_src1(p, insn, brw_imm_d(0x0));
   1287    } else if (devinfo->gen == 6) {
   1288       brw_set_dest(p, insn, brw_imm_w(0));
   1289       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   1290       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1291       brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1292    } else if (devinfo->gen == 7) {
   1293       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1294       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1295       brw_set_src1(p, insn, brw_imm_w(0));
   1296       brw_inst_set_jip(devinfo, insn, 0);
   1297       brw_inst_set_uip(devinfo, insn, 0);
   1298    } else {
   1299       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1300       brw_set_src0(p, insn, brw_imm_d(0));
   1301       brw_inst_set_jip(devinfo, insn, 0);
   1302       brw_inst_set_uip(devinfo, insn, 0);
   1303    }
   1304 
   1305    brw_inst_set_exec_size(devinfo, insn, execute_size);
   1306    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1307    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   1308    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   1309    if (!p->single_program_flow && devinfo->gen < 6)
   1310       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
   1311 
   1312    push_if_stack(p, insn);
   1313    p->if_depth_in_loop[p->loop_stack_depth]++;
   1314    return insn;
   1315 }
   1316 
   1317 /* This function is only used for gen6-style IF instructions with an
   1318  * embedded comparison (conditional modifier).  It is not used on gen7.
   1319  */
   1320 brw_inst *
   1321 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
   1322 	struct brw_reg src0, struct brw_reg src1)
   1323 {
   1324    const struct gen_device_info *devinfo = p->devinfo;
   1325    brw_inst *insn;
   1326 
   1327    insn = next_insn(p, BRW_OPCODE_IF);
   1328 
   1329    brw_set_dest(p, insn, brw_imm_w(0));
   1330    brw_inst_set_exec_size(devinfo, insn,
   1331                           brw_inst_exec_size(devinfo, p->current));
   1332    brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   1333    brw_set_src0(p, insn, src0);
   1334    brw_set_src1(p, insn, src1);
   1335 
   1336    assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   1337    assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   1338    brw_inst_set_cond_modifier(devinfo, insn, conditional);
   1339 
   1340    push_if_stack(p, insn);
   1341    return insn;
   1342 }
   1343 
   1344 /**
   1345  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
   1346  */
   1347 static void
   1348 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
   1349                        brw_inst *if_inst, brw_inst *else_inst)
   1350 {
   1351    const struct gen_device_info *devinfo = p->devinfo;
   1352 
   1353    /* The next instruction (where the ENDIF would be, if it existed) */
   1354    brw_inst *next_inst = &p->store[p->nr_insn];
   1355 
   1356    assert(p->single_program_flow);
   1357    assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   1358    assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   1359    assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
   1360 
   1361    /* Convert IF to an ADD instruction that moves the instruction pointer
   1362     * to the first instruction of the ELSE block.  If there is no ELSE
   1363     * block, point to where ENDIF would be.  Reverse the predicate.
   1364     *
   1365     * There's no need to execute an ENDIF since we don't need to do any
   1366     * stack operations, and if we're currently executing, we just want to
   1367     * continue normally.
   1368     */
   1369    brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   1370    brw_inst_set_pred_inv(devinfo, if_inst, true);
   1371 
   1372    if (else_inst != NULL) {
   1373       /* Convert ELSE to an ADD instruction that points where the ENDIF
   1374        * would be.
   1375        */
   1376       brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
   1377 
   1378       brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
   1379       brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   1380    } else {
   1381       brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   1382    }
   1383 }
   1384 
   1385 /**
   1386  * Patch IF and ELSE instructions with appropriate jump targets.
   1387  */
   1388 static void
   1389 patch_IF_ELSE(struct brw_codegen *p,
   1390               brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
   1391 {
   1392    const struct gen_device_info *devinfo = p->devinfo;
   1393 
   1394    /* We shouldn't be patching IF and ELSE instructions in single program flow
   1395     * mode when gen < 6, because in single program flow mode on those
   1396     * platforms, we convert flow control instructions to conditional ADDs that
   1397     * operate on IP (see brw_ENDIF).
   1398     *
   1399     * However, on Gen6, writing to IP doesn't work in single program flow mode
   1400     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
   1401     * not be updated by non-flow control instructions.").  And on later
   1402     * platforms, there is no significant benefit to converting control flow
   1403     * instructions to conditional ADDs.  So we do patch IF and ELSE
   1404     * instructions in single program flow mode on those platforms.
   1405     */
   1406    if (devinfo->gen < 6)
   1407       assert(!p->single_program_flow);
   1408 
   1409    assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   1410    assert(endif_inst != NULL);
   1411    assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   1412 
   1413    unsigned br = brw_jump_scale(devinfo);
   1414 
   1415    assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   1416    brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
   1417 
   1418    if (else_inst == NULL) {
   1419       /* Patch IF -> ENDIF */
   1420       if (devinfo->gen < 6) {
    1421 	 /* Turn it into an IFF, which performs no mask stack operations and
    1422 	  * jumps past the ENDIF when the condition is all-false.
    1423 	  */
   1424          brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
   1425          brw_inst_set_gen4_jump_count(devinfo, if_inst,
   1426                                       br * (endif_inst - if_inst + 1));
   1427          brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
   1428       } else if (devinfo->gen == 6) {
   1429 	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
   1430          brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
   1431       } else {
   1432          brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
   1433          brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
   1434       }
   1435    } else {
   1436       brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
   1437 
   1438       /* Patch IF -> ELSE */
   1439       if (devinfo->gen < 6) {
   1440          brw_inst_set_gen4_jump_count(devinfo, if_inst,
   1441                                       br * (else_inst - if_inst));
   1442          brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
   1443       } else if (devinfo->gen == 6) {
   1444          brw_inst_set_gen6_jump_count(devinfo, if_inst,
   1445                                       br * (else_inst - if_inst + 1));
   1446       }
   1447 
   1448       /* Patch ELSE -> ENDIF */
   1449       if (devinfo->gen < 6) {
   1450 	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
   1451 	  * matching ENDIF.
   1452 	  */
   1453          brw_inst_set_gen4_jump_count(devinfo, else_inst,
   1454                                       br * (endif_inst - else_inst + 1));
   1455          brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
   1456       } else if (devinfo->gen == 6) {
   1457 	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
   1458          brw_inst_set_gen6_jump_count(devinfo, else_inst,
   1459                                       br * (endif_inst - else_inst));
   1460       } else {
   1461 	 /* The IF instruction's JIP should point just past the ELSE */
   1462          brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
   1463 	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
   1464          brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
   1465          brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
   1466          if (devinfo->gen >= 8) {
   1467             /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
   1468              * should point to ENDIF.
   1469              */
   1470             brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
   1471          }
   1472       }
   1473    }
   1474 }
   1475 
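/**
 * Emit an ELSE instruction.  Its jump targets are left at zero here and are
 * patched later by brw_ENDIF() via patch_IF_ELSE(); the instruction is
 * pushed onto the if-stack so the matching ENDIF can find it.
 */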
   1476 void
   1477 brw_ELSE(struct brw_codegen *p)
   1478 {
   1479    const struct gen_device_info *devinfo = p->devinfo;
   1480    brw_inst *insn;
   1481 
   1482    insn = next_insn(p, BRW_OPCODE_ELSE);
   1483 
   1484    if (devinfo->gen < 6) {
   1485       brw_set_dest(p, insn, brw_ip_reg());
   1486       brw_set_src0(p, insn, brw_ip_reg());
   1487       brw_set_src1(p, insn, brw_imm_d(0x0));
   1488    } else if (devinfo->gen == 6) {
   1489       brw_set_dest(p, insn, brw_imm_w(0));
   1490       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   1491       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1492       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1493    } else if (devinfo->gen == 7) {
   1494       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1495       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1496       brw_set_src1(p, insn, brw_imm_w(0));
   1497       brw_inst_set_jip(devinfo, insn, 0);
   1498       brw_inst_set_uip(devinfo, insn, 0);
   1499    } else {
   1500       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1501       brw_set_src0(p, insn, brw_imm_d(0));
   1502       brw_inst_set_jip(devinfo, insn, 0);
   1503       brw_inst_set_uip(devinfo, insn, 0);
   1504    }
   1505 
   1506    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1507    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   1508    if (!p->single_program_flow && devinfo->gen < 6)
   1509       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
   1510 
   1511    push_if_stack(p, insn);
   1512 }
   1513 
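/**
 * Emit an ENDIF and patch the jump targets of the matching IF (and optional
 * ELSE) popped from the if-stack.  In single-program-flow mode on Gen4/5 no
 * ENDIF is emitted at all; the IF/ELSE are instead rewritten as conditional
 * ADDs to IP by convert_IF_ELSE_to_ADD().
 */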
   1514 void
   1515 brw_ENDIF(struct brw_codegen *p)
   1516 {
   1517    const struct gen_device_info *devinfo = p->devinfo;
   1518    brw_inst *insn = NULL;
   1519    brw_inst *else_inst = NULL;
   1520    brw_inst *if_inst = NULL;
   1521    brw_inst *tmp;
   1522    bool emit_endif = true;
   1523 
   1524    /* In single program flow mode, we can express IF and ELSE instructions
   1525     * equivalently as ADD instructions that operate on IP.  On platforms prior
   1526     * to Gen6, flow control instructions cause an implied thread switch, so
   1527     * this is a significant savings.
   1528     *
   1529     * However, on Gen6, writing to IP doesn't work in single program flow mode
   1530     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
   1531     * not be updated by non-flow control instructions.").  And on later
   1532     * platforms, there is no significant benefit to converting control flow
   1533     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
   1534     * Gen5.
   1535     */
   1536    if (devinfo->gen < 6 && p->single_program_flow)
   1537       emit_endif = false;
   1538 
   1539    /*
   1540     * A single next_insn() may change the base address of instruction store
    1541     * memory (p->store), so call it first before converting an index back
    1542     * into an instruction store pointer.
   1543     */
   1544    if (emit_endif)
   1545       insn = next_insn(p, BRW_OPCODE_ENDIF);
   1546 
   1547    /* Pop the IF and (optional) ELSE instructions from the stack */
   1548    p->if_depth_in_loop[p->loop_stack_depth]--;
   1549    tmp = pop_if_stack(p);
   1550    if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
   1551       else_inst = tmp;
   1552       tmp = pop_if_stack(p);
   1553    }
   1554    if_inst = tmp;
   1555 
   1556    if (!emit_endif) {
   1557       /* ENDIF is useless; don't bother emitting it. */
   1558       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
   1559       return;
   1560    }
   1561 
   1562    if (devinfo->gen < 6) {
   1563       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1564       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1565       brw_set_src1(p, insn, brw_imm_d(0x0));
   1566    } else if (devinfo->gen == 6) {
   1567       brw_set_dest(p, insn, brw_imm_w(0));
   1568       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1569       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1570    } else if (devinfo->gen == 7) {
   1571       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1572       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1573       brw_set_src1(p, insn, brw_imm_w(0));
   1574    } else {
   1575       brw_set_src0(p, insn, brw_imm_d(0));
   1576    }
   1577 
   1578    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1579    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   1580    if (devinfo->gen < 6)
   1581       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
   1582 
   1583    /* Also pop item off the stack in the endif instruction: */
   1584    if (devinfo->gen < 6) {
   1585       brw_inst_set_gen4_jump_count(devinfo, insn, 0);
   1586       brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   1587    } else if (devinfo->gen == 6) {
   1588       brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   1589    } else {
   1590       brw_inst_set_jip(devinfo, insn, 2);
   1591    }
   1592    patch_IF_ELSE(p, if_inst, else_inst, insn);
   1593 }
   1594 
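/**
 * Emit a BREAK.  On pre-gen6 the pop count records how many nested IF blocks
 * inside the current loop must be popped, and the jump target is filled in
 * later by brw_patch_break_cont(); on gen6+ the JIP/UIP are filled in after
 * code generation by brw_set_uip_jip().
 */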
   1595 brw_inst *
   1596 brw_BREAK(struct brw_codegen *p)
   1597 {
   1598    const struct gen_device_info *devinfo = p->devinfo;
   1599    brw_inst *insn;
   1600 
   1601    insn = next_insn(p, BRW_OPCODE_BREAK);
   1602    if (devinfo->gen >= 8) {
   1603       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1604       brw_set_src0(p, insn, brw_imm_d(0x0));
   1605    } else if (devinfo->gen >= 6) {
   1606       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1607       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1608       brw_set_src1(p, insn, brw_imm_d(0x0));
   1609    } else {
   1610       brw_set_dest(p, insn, brw_ip_reg());
   1611       brw_set_src0(p, insn, brw_ip_reg());
   1612       brw_set_src1(p, insn, brw_imm_d(0x0));
   1613       brw_inst_set_gen4_pop_count(devinfo, insn,
   1614                                   p->if_depth_in_loop[p->loop_stack_depth]);
   1615    }
   1616    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1617    brw_inst_set_exec_size(devinfo, insn,
   1618                           brw_inst_exec_size(devinfo, p->current));
   1619 
   1620    return insn;
   1621 }
   1622 
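/**
 * Emit a CONTINUE.  As with brw_BREAK(), the jump target is patched in
 * later: by brw_patch_break_cont() on pre-gen6 and by brw_set_uip_jip() on
 * gen6+.
 */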
   1623 brw_inst *
   1624 brw_CONT(struct brw_codegen *p)
   1625 {
   1626    const struct gen_device_info *devinfo = p->devinfo;
   1627    brw_inst *insn;
   1628 
   1629    insn = next_insn(p, BRW_OPCODE_CONTINUE);
   1630    brw_set_dest(p, insn, brw_ip_reg());
   1631    if (devinfo->gen >= 8) {
   1632       brw_set_src0(p, insn, brw_imm_d(0x0));
   1633    } else {
   1634       brw_set_src0(p, insn, brw_ip_reg());
   1635       brw_set_src1(p, insn, brw_imm_d(0x0));
   1636    }
   1637 
   1638    if (devinfo->gen < 6) {
   1639       brw_inst_set_gen4_pop_count(devinfo, insn,
   1640                                   p->if_depth_in_loop[p->loop_stack_depth]);
   1641    }
   1642    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1643    brw_inst_set_exec_size(devinfo, insn,
   1644                           brw_inst_exec_size(devinfo, p->current));
   1645    return insn;
   1646 }
   1647 
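/**
 * Emit a HALT (gen6+).  UIP and JIP are left at zero here and are filled in
 * after code generation by brw_set_uip_jip().
 */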
   1648 brw_inst *
   1649 gen6_HALT(struct brw_codegen *p)
   1650 {
   1651    const struct gen_device_info *devinfo = p->devinfo;
   1652    brw_inst *insn;
   1653 
   1654    insn = next_insn(p, BRW_OPCODE_HALT);
   1655    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1656    if (devinfo->gen >= 8) {
   1657       brw_set_src0(p, insn, brw_imm_d(0x0));
   1658    } else {
   1659       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1660       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   1661    }
   1662 
   1663    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1664    brw_inst_set_exec_size(devinfo, insn,
   1665                           brw_inst_exec_size(devinfo, p->current));
   1666    return insn;
   1667 }
   1668 
   1669 /* DO/WHILE loop:
   1670  *
   1671  * The DO/WHILE is just an unterminated loop -- break or continue are
   1672  * used for control within the loop.  We have a few ways they can be
   1673  * done.
   1674  *
   1675  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
   1676  * jip and no DO instruction.
   1677  *
   1678  * For non-uniform control flow pre-gen6, there's a DO instruction to
   1679  * push the mask, and a WHILE to jump back, and BREAK to get out and
   1680  * pop the mask.
   1681  *
   1682  * For gen6, there's no more mask stack, so no need for DO.  WHILE
   1683  * just points back to the first instruction of the loop.
   1684  */
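/* A typical emission sequence, as a sketch:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body, containing brw_BREAK(p) / brw_CONT(p) as needed ...
 *    brw_WHILE(p);
 */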
   1685 brw_inst *
   1686 brw_DO(struct brw_codegen *p, unsigned execute_size)
   1687 {
   1688    const struct gen_device_info *devinfo = p->devinfo;
   1689 
   1690    if (devinfo->gen >= 6 || p->single_program_flow) {
   1691       push_loop_stack(p, &p->store[p->nr_insn]);
   1692       return &p->store[p->nr_insn];
   1693    } else {
   1694       brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
   1695 
   1696       push_loop_stack(p, insn);
   1697 
   1698       /* Override the defaults for this instruction:
   1699        */
   1700       brw_set_dest(p, insn, brw_null_reg());
   1701       brw_set_src0(p, insn, brw_null_reg());
   1702       brw_set_src1(p, insn, brw_null_reg());
   1703 
   1704       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1705       brw_inst_set_exec_size(devinfo, insn, execute_size);
   1706       brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
   1707 
   1708       return insn;
   1709    }
   1710 }
   1711 
   1712 /**
   1713  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
   1714  * instruction here.
   1715  *
   1716  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
   1717  * nesting, since it can always just point to the end of the block/current loop.
   1718  */
   1719 static void
   1720 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
   1721 {
   1722    const struct gen_device_info *devinfo = p->devinfo;
   1723    brw_inst *do_inst = get_inner_do_insn(p);
   1724    brw_inst *inst;
   1725    unsigned br = brw_jump_scale(devinfo);
   1726 
   1727    assert(devinfo->gen < 6);
   1728 
   1729    for (inst = while_inst - 1; inst != do_inst; inst--) {
   1730       /* If the jump count is != 0, that means that this instruction has already
   1731        * been patched because it's part of a loop inside of the one we're
   1732        * patching.
   1733        */
   1734       if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
   1735           brw_inst_gen4_jump_count(devinfo, inst) == 0) {
   1736          brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
   1737       } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
   1738                  brw_inst_gen4_jump_count(devinfo, inst) == 0) {
   1739          brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
   1740       }
   1741    }
   1742 }
   1743 
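/**
 * Emit the WHILE that closes a DO/WHILE loop and pop the loop stack.  On
 * gen6+ the backward jump to the DO location is encoded directly.  On
 * pre-gen6, single-program-flow mode emits an ADD to IP instead; otherwise a
 * real WHILE is emitted and any BREAK/CONTINUE instructions in the loop body
 * are patched via brw_patch_break_cont().
 */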
   1744 brw_inst *
   1745 brw_WHILE(struct brw_codegen *p)
   1746 {
   1747    const struct gen_device_info *devinfo = p->devinfo;
   1748    brw_inst *insn, *do_insn;
   1749    unsigned br = brw_jump_scale(devinfo);
   1750 
   1751    if (devinfo->gen >= 6) {
   1752       insn = next_insn(p, BRW_OPCODE_WHILE);
   1753       do_insn = get_inner_do_insn(p);
   1754 
   1755       if (devinfo->gen >= 8) {
   1756          brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1757          brw_set_src0(p, insn, brw_imm_d(0));
   1758          brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
   1759       } else if (devinfo->gen == 7) {
   1760          brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1761          brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1762          brw_set_src1(p, insn, brw_imm_w(0));
   1763          brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
   1764       } else {
   1765          brw_set_dest(p, insn, brw_imm_w(0));
   1766          brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
   1767          brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1768          brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1769       }
   1770 
   1771       brw_inst_set_exec_size(devinfo, insn,
   1772                              brw_inst_exec_size(devinfo, p->current));
   1773 
   1774    } else {
   1775       if (p->single_program_flow) {
   1776 	 insn = next_insn(p, BRW_OPCODE_ADD);
   1777          do_insn = get_inner_do_insn(p);
   1778 
   1779 	 brw_set_dest(p, insn, brw_ip_reg());
   1780 	 brw_set_src0(p, insn, brw_ip_reg());
   1781 	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
   1782          brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   1783       } else {
   1784 	 insn = next_insn(p, BRW_OPCODE_WHILE);
   1785          do_insn = get_inner_do_insn(p);
   1786 
   1787          assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
   1788 
   1789 	 brw_set_dest(p, insn, brw_ip_reg());
   1790 	 brw_set_src0(p, insn, brw_ip_reg());
   1791 	 brw_set_src1(p, insn, brw_imm_d(0));
   1792 
   1793          brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
   1794          brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
   1795          brw_inst_set_gen4_pop_count(devinfo, insn, 0);
   1796 
   1797 	 brw_patch_break_cont(p, insn);
   1798       }
   1799    }
   1800    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   1801 
   1802    p->loop_stack_depth--;
   1803 
   1804    return insn;
   1805 }
   1806 
   1807 /* FORWARD JUMPS:
   1808  */
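/* Patch the JMPI previously emitted at jmp_insn_idx so that it jumps to the
 * current end of the program.  The jump distance is counted in instructions
 * and doubled on gen5 and later.
 */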
   1809 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
   1810 {
   1811    const struct gen_device_info *devinfo = p->devinfo;
   1812    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   1813    unsigned jmpi = 1;
   1814 
   1815    if (devinfo->gen >= 5)
   1816       jmpi = 2;
   1817 
   1818    assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
   1819    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
   1820 
   1821    brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
   1822                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
   1823 }
   1824 
   1825 /* To integrate with the above, it makes sense that the comparison
   1826  * instruction should populate the flag register.  It might be simpler
   1827  * just to use the flag reg for most WM tasks?
   1828  */
   1829 void brw_CMP(struct brw_codegen *p,
   1830 	     struct brw_reg dest,
   1831 	     unsigned conditional,
   1832 	     struct brw_reg src0,
   1833 	     struct brw_reg src1)
   1834 {
   1835    const struct gen_device_info *devinfo = p->devinfo;
   1836    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
   1837 
   1838    brw_inst_set_cond_modifier(devinfo, insn, conditional);
   1839    brw_set_dest(p, insn, dest);
   1840    brw_set_src0(p, insn, src0);
   1841    brw_set_src1(p, insn, src1);
   1842 
   1843    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
   1844     * page says:
   1845     *    "Any CMP instruction with a null destination must use a {switch}."
   1846     *
   1847     * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
   1848     * mentioned on their work-arounds pages.
   1849     */
   1850    if (devinfo->gen == 7) {
   1851       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
   1852           dest.nr == BRW_ARF_NULL) {
   1853          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
   1854       }
   1855    }
   1856 }
   1857 
   1858 /***********************************************************************
   1859  * Helpers for the various SEND message types:
   1860  */
   1861 
   1862 /** Extended math function, float[8].
   1863  */
   1864 void gen4_math(struct brw_codegen *p,
   1865 	       struct brw_reg dest,
   1866 	       unsigned function,
   1867 	       unsigned msg_reg_nr,
   1868 	       struct brw_reg src,
   1869 	       unsigned precision )
   1870 {
   1871    const struct gen_device_info *devinfo = p->devinfo;
   1872    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   1873    unsigned data_type;
   1874    if (has_scalar_region(src)) {
   1875       data_type = BRW_MATH_DATA_SCALAR;
   1876    } else {
   1877       data_type = BRW_MATH_DATA_VECTOR;
   1878    }
   1879 
   1880    assert(devinfo->gen < 6);
   1881 
   1882    /* Example code doesn't set predicate_control for send
   1883     * instructions.
   1884     */
   1885    brw_inst_set_pred_control(devinfo, insn, 0);
   1886    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
   1887 
   1888    brw_set_dest(p, insn, dest);
   1889    brw_set_src0(p, insn, src);
   1890    brw_set_math_message(p,
   1891                         insn,
   1892                         function,
   1893                         src.type == BRW_REGISTER_TYPE_D,
   1894                         precision,
   1895                         data_type);
   1896 }
   1897 
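/**
 * Extended math function (gen6+).  Unlike gen4_math(), this uses the
 * dedicated MATH instruction rather than a SEND message.
 */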
   1898 void gen6_math(struct brw_codegen *p,
   1899 	       struct brw_reg dest,
   1900 	       unsigned function,
   1901 	       struct brw_reg src0,
   1902 	       struct brw_reg src1)
   1903 {
   1904    const struct gen_device_info *devinfo = p->devinfo;
   1905    brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
   1906 
   1907    assert(devinfo->gen >= 6);
   1908 
   1909    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
   1910           (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   1911 
   1912    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   1913    if (devinfo->gen == 6) {
   1914       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
   1915       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   1916    }
   1917 
   1918    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
   1919        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
   1920        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
   1921       assert(src0.type != BRW_REGISTER_TYPE_F);
   1922       assert(src1.type != BRW_REGISTER_TYPE_F);
   1923       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
   1924              (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   1925    } else {
   1926       assert(src0.type == BRW_REGISTER_TYPE_F);
   1927       assert(src1.type == BRW_REGISTER_TYPE_F);
   1928    }
   1929 
   1930    /* Source modifiers are ignored for extended math instructions on Gen6. */
   1931    if (devinfo->gen == 6) {
   1932       assert(!src0.negate);
   1933       assert(!src0.abs);
   1934       assert(!src1.negate);
   1935       assert(!src1.abs);
   1936    }
   1937 
   1938    brw_inst_set_math_function(devinfo, insn, function);
   1939 
   1940    brw_set_dest(p, insn, dest);
   1941    brw_set_src0(p, insn, src0);
   1942    brw_set_src1(p, insn, src1);
   1943 }
   1944 
   1945 /**
   1946  * Return the right surface index to access the thread scratch space using
   1947  * stateless dataport messages.
   1948  */
   1949 unsigned
   1950 brw_scratch_surface_idx(const struct brw_codegen *p)
   1951 {
   1952    /* The scratch space is thread-local so IA coherency is unnecessary. */
   1953    if (p->devinfo->gen >= 8)
   1954       return GEN8_BTI_STATELESS_NON_COHERENT;
   1955    else
   1956       return BRW_BTI_STATELESS;
   1957 }
   1958 
   1959 /**
    1960  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
   1961  * using a constant offset per channel.
   1962  *
   1963  * The offset must be aligned to oword size (16 bytes).  Used for
   1964  * register spilling.
   1965  */
   1966 void brw_oword_block_write_scratch(struct brw_codegen *p,
   1967 				   struct brw_reg mrf,
   1968 				   int num_regs,
   1969 				   unsigned offset)
   1970 {
   1971    const struct gen_device_info *devinfo = p->devinfo;
   1972    const unsigned target_cache =
   1973       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
   1974        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
   1975        BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   1976    uint32_t msg_type;
   1977 
   1978    if (devinfo->gen >= 6)
   1979       offset /= 16;
   1980 
   1981    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   1982 
   1983    const unsigned mlen = 1 + num_regs;
   1984 
   1985    /* Set up the message header.  This is g0, with g0.2 filled with
   1986     * the offset.  We don't want to leave our offset around in g0 or
   1987     * it'll screw up texture samples, so set it up inside the message
   1988     * reg.
   1989     */
   1990    {
   1991       brw_push_insn_state(p);
   1992       brw_set_default_exec_size(p, BRW_EXECUTE_8);
   1993       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1994       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   1995 
   1996       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   1997 
   1998       /* set message header global offset field (reg 0, element 2) */
   1999       brw_set_default_exec_size(p, BRW_EXECUTE_1);
   2000       brw_MOV(p,
   2001 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   2002 				  mrf.nr,
   2003 				  2), BRW_REGISTER_TYPE_UD),
   2004 	      brw_imm_ud(offset));
   2005 
   2006       brw_pop_insn_state(p);
   2007    }
   2008 
   2009    {
   2010       struct brw_reg dest;
   2011       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   2012       int send_commit_msg;
   2013       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
   2014 					 BRW_REGISTER_TYPE_UW);
   2015 
   2016       brw_inst_set_compression(devinfo, insn, false);
   2017 
   2018       if (brw_inst_exec_size(devinfo, insn) >= 16)
   2019 	 src_header = vec16(src_header);
   2020 
   2021       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   2022       if (devinfo->gen < 6)
   2023          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   2024 
   2025       /* Until gen6, writes followed by reads from the same location
   2026        * are not guaranteed to be ordered unless write_commit is set.
   2027        * If set, then a no-op write is issued to the destination
   2028        * register to set a dependency, and a read from the destination
   2029        * can be used to ensure the ordering.
   2030        *
   2031        * For gen6, only writes between different threads need ordering
   2032        * protection.  Our use of DP writes is all about register
   2033        * spilling within a thread.
   2034        */
   2035       if (devinfo->gen >= 6) {
   2036 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   2037 	 send_commit_msg = 0;
   2038       } else {
   2039 	 dest = src_header;
   2040 	 send_commit_msg = 1;
   2041       }
   2042 
   2043       brw_set_dest(p, insn, dest);
   2044       if (devinfo->gen >= 6) {
   2045 	 brw_set_src0(p, insn, mrf);
   2046       } else {
   2047 	 brw_set_src0(p, insn, brw_null_reg());
   2048       }
   2049 
   2050       if (devinfo->gen >= 6)
   2051 	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
   2052       else
   2053 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
   2054 
   2055       brw_set_dp_write_message(p,
   2056 			       insn,
   2057                                brw_scratch_surface_idx(p),
   2058 			       BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
   2059 			       msg_type,
   2060                                target_cache,
   2061 			       mlen,
   2062 			       true, /* header_present */
   2063 			       0, /* not a render target */
   2064 			       send_commit_msg, /* response_length */
   2065 			       0, /* eot */
   2066 			       send_commit_msg);
   2067    }
   2068 }
   2069 
   2070 
   2071 /**
   2072  * Read a block of owords (half a GRF each) from the scratch buffer
   2073  * using a constant index per channel.
   2074  *
   2075  * Offset must be aligned to oword size (16 bytes).  Used for register
   2076  * spilling.
   2077  */
   2078 void
   2079 brw_oword_block_read_scratch(struct brw_codegen *p,
   2080 			     struct brw_reg dest,
   2081 			     struct brw_reg mrf,
   2082 			     int num_regs,
   2083 			     unsigned offset)
   2084 {
   2085    const struct gen_device_info *devinfo = p->devinfo;
   2086 
   2087    if (devinfo->gen >= 6)
   2088       offset /= 16;
   2089 
   2090    if (p->devinfo->gen >= 7) {
   2091       /* On gen 7 and above, we no longer have message registers and we can
   2092        * send from any register we want.  By using the destination register
   2093        * for the message, we guarantee that the implied message write won't
   2094        * accidentally overwrite anything.  This has been a problem because
   2095        * the MRF registers and source for the final FB write are both fixed
   2096        * and may overlap.
   2097        */
   2098       mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   2099    } else {
   2100       mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   2101    }
   2102    dest = retype(dest, BRW_REGISTER_TYPE_UW);
   2103 
   2104    const unsigned rlen = num_regs;
   2105    const unsigned target_cache =
   2106       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
   2107        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
   2108        BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   2109 
   2110    {
   2111       brw_push_insn_state(p);
   2112       brw_set_default_exec_size(p, BRW_EXECUTE_8);
   2113       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   2114       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   2115 
   2116       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   2117 
   2118       /* set message header global offset field (reg 0, element 2) */
   2119       brw_set_default_exec_size(p, BRW_EXECUTE_1);
   2120       brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
   2121 
   2122       brw_pop_insn_state(p);
   2123    }
   2124 
   2125    {
   2126       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   2127 
   2128       assert(brw_inst_pred_control(devinfo, insn) == 0);
   2129       brw_inst_set_compression(devinfo, insn, false);
   2130 
   2131       brw_set_dest(p, insn, dest);	/* UW? */
   2132       if (devinfo->gen >= 6) {
   2133 	 brw_set_src0(p, insn, mrf);
   2134       } else {
   2135 	 brw_set_src0(p, insn, brw_null_reg());
   2136          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   2137       }
   2138 
   2139       brw_set_dp_read_message(p,
   2140 			      insn,
   2141                               brw_scratch_surface_idx(p),
   2142 			      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
   2143 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
   2144 			      target_cache,
   2145 			      1, /* msg_length */
   2146                               true, /* header_present */
   2147 			      rlen);
   2148    }
   2149 }
   2150 
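/**
 * Read a block of registers from the scratch buffer using the gen7+ scratch
 * block read message.  The byte offset is converted to an HWord (register)
 * offset below, and only g0 is sent as the message header.
 */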
   2151 void
   2152 gen7_block_read_scratch(struct brw_codegen *p,
   2153                         struct brw_reg dest,
   2154                         int num_regs,
   2155                         unsigned offset)
   2156 {
   2157    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   2158    assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
   2159 
   2160    brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
   2161 
   2162    /* The HW requires that the header is present; this is to get the g0.5
   2163     * scratch offset.
   2164     */
   2165    brw_set_src0(p, insn, brw_vec8_grf(0, 0));
   2166 
   2167    /* According to the docs, offset is "A 12-bit HWord offset into the memory
   2168     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
   2169     * is 32 bytes, which happens to be the size of a register.
   2170     */
   2171    offset /= REG_SIZE;
   2172    assert(offset < (1 << 12));
   2173 
   2174    gen7_set_dp_scratch_message(p, insn,
   2175                                false, /* scratch read */
   2176                                false, /* OWords */
   2177                                false, /* invalidate after read */
   2178                                num_regs,
   2179                                offset,
   2180                                1,        /* mlen: just g0 */
   2181                                num_regs, /* rlen */
   2182                                true);    /* header present */
   2183 }
   2184 
   2185 /**
   2186  * Read float[4] vectors from the data port constant cache.
   2187  * Location (in buffer) should be a multiple of 16.
   2188  * Used for fetching shader constants.
   2189  */
   2190 void brw_oword_block_read(struct brw_codegen *p,
   2191 			  struct brw_reg dest,
   2192 			  struct brw_reg mrf,
   2193 			  uint32_t offset,
   2194 			  uint32_t bind_table_index)
   2195 {
   2196    const struct gen_device_info *devinfo = p->devinfo;
   2197    const unsigned target_cache =
   2198       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
   2199        BRW_DATAPORT_READ_TARGET_DATA_CACHE);
   2200    const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   2201 
   2202    /* On newer hardware, offset is in units of owords. */
   2203    if (devinfo->gen >= 6)
   2204       offset /= 16;
   2205 
   2206    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   2207 
   2208    brw_push_insn_state(p);
   2209    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   2210    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   2211    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   2212 
   2213    brw_push_insn_state(p);
   2214    brw_set_default_exec_size(p, BRW_EXECUTE_8);
   2215    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   2216 
   2217    /* set message header global offset field (reg 0, element 2) */
   2218    brw_set_default_exec_size(p, BRW_EXECUTE_1);
   2219    brw_MOV(p,
   2220 	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   2221 			       mrf.nr,
   2222 			       2), BRW_REGISTER_TYPE_UD),
   2223 	   brw_imm_ud(offset));
   2224    brw_pop_insn_state(p);
   2225 
   2226    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   2227 
   2228    /* cast dest to a uword[8] vector */
   2229    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
   2230 
   2231    brw_set_dest(p, insn, dest);
   2232    if (devinfo->gen >= 6) {
   2233       brw_set_src0(p, insn, mrf);
   2234    } else {
   2235       brw_set_src0(p, insn, brw_null_reg());
   2236       brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   2237    }
   2238 
   2239    brw_set_dp_read_message(p, insn, bind_table_index,
   2240                            BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
   2241 			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
   2242 			   target_cache,
   2243 			   1, /* msg_length */
   2244                            true, /* header_present */
   2245 			   DIV_ROUND_UP(exec_size, 8)); /* response_length */
   2246 
   2247    brw_pop_insn_state(p);
   2248 }
   2249 
   2250 
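/**
 * Render target write message.  On gen6+ this is emitted as a SENDC with the
 * color payload taken directly from src0; on earlier generations the payload
 * lives in MRFs and the implied header register is used as src0.
 */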
   2251 void brw_fb_WRITE(struct brw_codegen *p,
   2252                   struct brw_reg payload,
   2253                   struct brw_reg implied_header,
   2254                   unsigned msg_control,
   2255                   unsigned binding_table_index,
   2256                   unsigned msg_length,
   2257                   unsigned response_length,
   2258                   bool eot,
   2259                   bool last_render_target,
   2260                   bool header_present)
   2261 {
   2262    const struct gen_device_info *devinfo = p->devinfo;
   2263    const unsigned target_cache =
   2264       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
   2265        BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   2266    brw_inst *insn;
   2267    unsigned msg_type;
   2268    struct brw_reg dest, src0;
   2269 
   2270    if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
   2271       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   2272    else
   2273       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   2274 
   2275    if (devinfo->gen >= 6) {
   2276       insn = next_insn(p, BRW_OPCODE_SENDC);
   2277    } else {
   2278       insn = next_insn(p, BRW_OPCODE_SEND);
   2279    }
   2280    brw_inst_set_compression(devinfo, insn, false);
   2281 
   2282    if (devinfo->gen >= 6) {
   2283       /* headerless version, just submit color payload */
   2284       src0 = payload;
   2285 
   2286       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   2287    } else {
   2288       assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
   2289       brw_inst_set_base_mrf(devinfo, insn, payload.nr);
   2290       src0 = implied_header;
   2291 
   2292       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   2293    }
   2294 
   2295    brw_set_dest(p, insn, dest);
   2296    brw_set_src0(p, insn, src0);
   2297    brw_set_dp_write_message(p,
   2298 			    insn,
   2299 			    binding_table_index,
   2300 			    msg_control,
   2301 			    msg_type,
   2302                             target_cache,
   2303 			    msg_length,
   2304 			    header_present,
   2305 			    last_render_target,
   2306 			    response_length,
   2307 			    eot,
   2308 			    0 /* send_commit_msg */);
   2309 }
   2310 
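/**
 * Render target read message (gen9+).  The execution size of the current
 * instruction state selects the SIMD16/SIMD8 message subtype, the per_sample
 * flag is folded into the message control, and the render target slot group
 * is derived from the current quarter control.
 */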
   2311 brw_inst *
   2312 gen9_fb_READ(struct brw_codegen *p,
   2313              struct brw_reg dst,
   2314              struct brw_reg payload,
   2315              unsigned binding_table_index,
   2316              unsigned msg_length,
   2317              unsigned response_length,
   2318              bool per_sample)
   2319 {
   2320    const struct gen_device_info *devinfo = p->devinfo;
   2321    assert(devinfo->gen >= 9);
   2322    const unsigned msg_subtype =
   2323       brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
   2324    brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
   2325 
   2326    brw_set_dest(p, insn, dst);
   2327    brw_set_src0(p, insn, payload);
   2328    brw_set_dp_read_message(p, insn, binding_table_index,
   2329                            per_sample << 5 | msg_subtype,
   2330                            GEN9_DATAPORT_RC_RENDER_TARGET_READ,
   2331                            GEN6_SFID_DATAPORT_RENDER_CACHE,
   2332                            msg_length, true /* header_present */,
   2333                            response_length);
   2334    brw_inst_set_rt_slot_group(devinfo, insn,
   2335                               brw_inst_qtr_control(devinfo, p->current) / 2);
   2336 
   2337    return insn;
   2338 }
   2339 
   2340 /**
   2341  * Texture sample instruction.
   2342  * Note: the msg_type plus msg_length values determine exactly what kind
   2343  * of sampling operation is performed.  See volume 4, page 161 of docs.
   2344  */
   2345 void brw_SAMPLE(struct brw_codegen *p,
   2346 		struct brw_reg dest,
   2347 		unsigned msg_reg_nr,
   2348 		struct brw_reg src0,
   2349 		unsigned binding_table_index,
   2350 		unsigned sampler,
   2351 		unsigned msg_type,
   2352 		unsigned response_length,
   2353 		unsigned msg_length,
   2354 		unsigned header_present,
   2355 		unsigned simd_mode,
   2356 		unsigned return_format)
   2357 {
   2358    const struct gen_device_info *devinfo = p->devinfo;
   2359    brw_inst *insn;
   2360 
   2361    if (msg_reg_nr != -1)
   2362       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2363 
   2364    insn = next_insn(p, BRW_OPCODE_SEND);
   2365    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
   2366 
   2367    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
   2368     *
   2369     *    "Instruction compression is not allowed for this instruction (that
   2370     *     is, send). The hardware behavior is undefined if this instruction is
   2371     *     set as compressed. However, compress control can be set to "SecHalf"
   2372     *     to affect the EMask generation."
   2373     *
   2374     * No similar wording is found in later PRMs, but there are examples
   2375     * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
   2376     * are allowed in SIMD16 mode and they could not work without SecHalf.  For
   2377     * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
   2378     */
   2379    brw_inst_set_compression(devinfo, insn, false);
   2380 
   2381    if (devinfo->gen < 6)
   2382       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
   2383 
   2384    brw_set_dest(p, insn, dest);
   2385    brw_set_src0(p, insn, src0);
   2386    brw_set_sampler_message(p, insn,
   2387                            binding_table_index,
   2388                            sampler,
   2389                            msg_type,
   2390                            response_length,
   2391                            msg_length,
   2392                            header_present,
   2393                            simd_mode,
   2394                            return_format);
   2395 }
   2396 
   2397 /* Adjust the message header's sampler state pointer to
   2398  * select the correct group of 16 samplers.
   2399  */
   2400 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
   2401                                       struct brw_reg header,
   2402                                       struct brw_reg sampler_index)
   2403 {
   2404    /* The "Sampler Index" field can only store values between 0 and 15.
   2405     * However, we can add an offset to the "Sampler State Pointer"
   2406     * field, effectively selecting a different set of 16 samplers.
   2407     *
   2408     * The "Sampler State Pointer" needs to be aligned to a 32-byte
   2409     * offset, and each sampler state is only 16-bytes, so we can't
   2410     * exclusively use the offset - we have to use both.
   2411     */
   2412 
   2413    const struct gen_device_info *devinfo = p->devinfo;
   2414 
   2415    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
   2416       const int sampler_state_size = 16; /* 16 bytes */
   2417       uint32_t sampler = sampler_index.ud;
   2418 
   2419       if (sampler >= 16) {
   2420          assert(devinfo->is_haswell || devinfo->gen >= 8);
   2421          brw_ADD(p,
   2422                  get_element_ud(header, 3),
   2423                  get_element_ud(brw_vec8_grf(0, 0), 3),
   2424                  brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
   2425       }
   2426    } else {
   2427       /* Non-const sampler array indexing case */
   2428       if (devinfo->gen < 8 && !devinfo->is_haswell) {
   2429          return;
   2430       }
   2431 
   2432       struct brw_reg temp = get_element_ud(header, 3);
   2433 
   2434       brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
   2435       brw_SHL(p, temp, temp, brw_imm_ud(4));
   2436       brw_ADD(p,
   2437               get_element_ud(header, 3),
   2438               get_element_ud(brw_vec8_grf(0, 0), 3),
   2439               temp);
   2440    }
   2441 }
   2442 
   2443 /* All these variables are pretty confusing - we might be better off
   2444  * using bitmasks and macros for this, in the old style.  Or perhaps
   2445  * just having the caller instantiate the fields in dword3 itself.
   2446  */
   2447 void brw_urb_WRITE(struct brw_codegen *p,
   2448 		   struct brw_reg dest,
   2449 		   unsigned msg_reg_nr,
   2450 		   struct brw_reg src0,
   2451                    enum brw_urb_write_flags flags,
   2452 		   unsigned msg_length,
   2453 		   unsigned response_length,
   2454 		   unsigned offset,
   2455 		   unsigned swizzle)
   2456 {
   2457    const struct gen_device_info *devinfo = p->devinfo;
   2458    brw_inst *insn;
   2459 
   2460    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2461 
   2462    if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
   2463       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
   2464       brw_push_insn_state(p);
   2465       brw_set_default_access_mode(p, BRW_ALIGN_1);
   2466       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   2467       brw_set_default_exec_size(p, BRW_EXECUTE_1);
   2468       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
   2469 		       BRW_REGISTER_TYPE_UD),
   2470 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
   2471 		brw_imm_ud(0xff00));
   2472       brw_pop_insn_state(p);
   2473    }
   2474 
   2475    insn = next_insn(p, BRW_OPCODE_SEND);
   2476 
   2477    assert(msg_length < BRW_MAX_MRF(devinfo->gen));
   2478 
   2479    brw_set_dest(p, insn, dest);
   2480    brw_set_src0(p, insn, src0);
   2481    brw_set_src1(p, insn, brw_imm_d(0));
   2482 
   2483    if (devinfo->gen < 6)
   2484       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
   2485 
   2486    brw_set_urb_message(p,
   2487 		       insn,
   2488 		       flags,
   2489 		       msg_length,
   2490 		       response_length,
   2491 		       offset,
   2492 		       swizzle);
   2493 }
   2494 
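/**
 * Emit a SEND with either an immediate or a register ("indirect") message
 * descriptor.  Returns the instruction that callers should continue setting
 * descriptor bits on: the SEND itself for an immediate descriptor, or the OR
 * that loads the descriptor into a0 otherwise.
 */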
   2495 struct brw_inst *
   2496 brw_send_indirect_message(struct brw_codegen *p,
   2497                           unsigned sfid,
   2498                           struct brw_reg dst,
   2499                           struct brw_reg payload,
   2500                           struct brw_reg desc)
   2501 {
   2502    const struct gen_device_info *devinfo = p->devinfo;
   2503    struct brw_inst *send;
   2504    int setup;
   2505 
   2506    dst = retype(dst, BRW_REGISTER_TYPE_UW);
   2507 
   2508    assert(desc.type == BRW_REGISTER_TYPE_UD);
   2509 
   2510    /* We hold on to the setup instruction (the SEND in the direct case, the OR
   2511     * in the indirect case) by its index in the instruction store.  The
   2512     * pointer returned by next_insn() may become invalid if emitting the SEND
   2513     * in the indirect case reallocs the store.
   2514     */
   2515 
   2516    if (desc.file == BRW_IMMEDIATE_VALUE) {
   2517       setup = p->nr_insn;
   2518       send = next_insn(p, BRW_OPCODE_SEND);
   2519       brw_set_src1(p, send, desc);
   2520 
   2521    } else {
   2522       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
   2523 
   2524       brw_push_insn_state(p);
   2525       brw_set_default_access_mode(p, BRW_ALIGN_1);
   2526       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   2527       brw_set_default_exec_size(p, BRW_EXECUTE_1);
   2528       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   2529 
   2530       /* Load the indirect descriptor to an address register using OR so the
   2531        * caller can specify additional descriptor bits with the usual
   2532        * brw_set_*_message() helper functions.
   2533        */
   2534       setup = p->nr_insn;
   2535       brw_OR(p, addr, desc, brw_imm_ud(0));
   2536 
   2537       brw_pop_insn_state(p);
   2538 
   2539       send = next_insn(p, BRW_OPCODE_SEND);
   2540       brw_set_src1(p, send, addr);
   2541    }
   2542 
   2543    if (dst.width < BRW_EXECUTE_8)
   2544       brw_inst_set_exec_size(devinfo, send, dst.width);
   2545 
   2546    brw_set_dest(p, send, dst);
   2547    brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   2548    brw_inst_set_sfid(devinfo, send, sfid);
   2549 
   2550    return &p->store[setup];
   2551 }
   2552 
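/* Like brw_send_indirect_message(), but for surface-access messages whose
 * binding table index may itself live in a register; non-immediate surface
 * indices are masked and loaded into a0 before the SEND is emitted.
 */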
   2553 static struct brw_inst *
   2554 brw_send_indirect_surface_message(struct brw_codegen *p,
   2555                                   unsigned sfid,
   2556                                   struct brw_reg dst,
   2557                                   struct brw_reg payload,
   2558                                   struct brw_reg surface,
   2559                                   unsigned message_len,
   2560                                   unsigned response_len,
   2561                                   bool header_present)
   2562 {
   2563    const struct gen_device_info *devinfo = p->devinfo;
   2564    struct brw_inst *insn;
   2565 
   2566    if (surface.file != BRW_IMMEDIATE_VALUE) {
   2567       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
   2568 
   2569       brw_push_insn_state(p);
   2570       brw_set_default_access_mode(p, BRW_ALIGN_1);
   2571       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   2572       brw_set_default_exec_size(p, BRW_EXECUTE_1);
   2573       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   2574 
   2575       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
   2576        * some surface array is accessed out of bounds.
   2577        */
   2578       insn = brw_AND(p, addr,
   2579                      suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
   2580                                BRW_GET_SWZ(surface.swizzle, 0)),
   2581                      brw_imm_ud(0xff));
   2582 
   2583       brw_pop_insn_state(p);
   2584 
   2585       surface = addr;
   2586    }
   2587 
   2588    insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   2589    brw_inst_set_mlen(devinfo, insn, message_len);
   2590    brw_inst_set_rlen(devinfo, insn, response_len);
   2591    brw_inst_set_header_present(devinfo, insn, header_present);
   2592 
   2593    return insn;
   2594 }
   2595 
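/* Check whether the backward jump of the WHILE at while_offset lands at or
 * before start_offset, i.e. whether that WHILE closes a loop containing the
 * instruction at start_offset rather than a sibling loop.
 */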
   2596 static bool
   2597 while_jumps_before_offset(const struct gen_device_info *devinfo,
   2598                           brw_inst *insn, int while_offset, int start_offset)
   2599 {
   2600    int scale = 16 / brw_jump_scale(devinfo);
   2601    int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
   2602                                : brw_inst_jip(devinfo, insn);
   2603    assert(jip < 0);
   2604    return while_offset + jip * scale <= start_offset;
   2605 }
   2606 
   2607 
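/* Scan forward from start_offset for the end of the current control-flow
 * block: the matching ENDIF, an ELSE or HALT at the same depth, or the WHILE
 * of an enclosing loop.  Returns 0 if no block end is found.
 */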
   2608 static int
   2609 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
   2610 {
   2611    int offset;
   2612    void *store = p->store;
   2613    const struct gen_device_info *devinfo = p->devinfo;
   2614 
   2615    int depth = 0;
   2616 
   2617    for (offset = next_offset(devinfo, store, start_offset);
   2618         offset < p->next_insn_offset;
   2619         offset = next_offset(devinfo, store, offset)) {
   2620       brw_inst *insn = store + offset;
   2621 
   2622       switch (brw_inst_opcode(devinfo, insn)) {
   2623       case BRW_OPCODE_IF:
   2624          depth++;
   2625          break;
   2626       case BRW_OPCODE_ENDIF:
   2627          if (depth == 0)
   2628             return offset;
   2629          depth--;
   2630          break;
   2631       case BRW_OPCODE_WHILE:
   2632          /* If the while doesn't jump before our instruction, it's the end
   2633           * of a sibling do...while loop.  Ignore it.
   2634           */
   2635          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
   2636             continue;
   2637          /* fallthrough */
   2638       case BRW_OPCODE_ELSE:
   2639       case BRW_OPCODE_HALT:
   2640          if (depth == 0)
   2641             return offset;
   2642       }
   2643    }
   2644 
   2645    return 0;
   2646 }
   2647 
   2648 /* There is no DO instruction on gen6, so to find the end of the loop
   2649  * we have to see if the loop is jumping back before our start
   2650  * instruction.
   2651  */
   2652 static int
   2653 brw_find_loop_end(struct brw_codegen *p, int start_offset)
   2654 {
   2655    const struct gen_device_info *devinfo = p->devinfo;
   2656    int offset;
   2657    void *store = p->store;
   2658 
   2659    assert(devinfo->gen >= 6);
   2660 
   2661    /* Always start after the instruction (such as a WHILE) we're trying to fix
   2662     * up.
   2663     */
   2664    for (offset = next_offset(devinfo, store, start_offset);
   2665         offset < p->next_insn_offset;
   2666         offset = next_offset(devinfo, store, offset)) {
   2667       brw_inst *insn = store + offset;
   2668 
   2669       if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
   2670 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
   2671 	    return offset;
   2672       }
   2673    }
   2674    assert(!"not reached");
   2675    return start_offset;
   2676 }
   2677 
   2678 /* After program generation, go back and update the UIP and JIP of
   2679  * BREAK, CONT, and HALT instructions to their correct locations.
   2680  */
   2681 void
   2682 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
   2683 {
   2684    const struct gen_device_info *devinfo = p->devinfo;
   2685    int offset;
   2686    int br = brw_jump_scale(devinfo);
   2687    int scale = 16 / br;
   2688    void *store = p->store;
   2689 
   2690    if (devinfo->gen < 6)
   2691       return;
   2692 
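            /* Jump fields are encoded in units of 'scale' (= 16 / br) bytes,
             * so dividing a byte distance by 'scale' yields the encoded jump
             * value.
             */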
   2693    for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
   2694       brw_inst *insn = store + offset;
   2695       assert(brw_inst_cmpt_control(devinfo, insn) == 0);
   2696 
   2697       int block_end_offset = brw_find_next_block_end(p, offset);
   2698       switch (brw_inst_opcode(devinfo, insn)) {
   2699       case BRW_OPCODE_BREAK:
   2700          assert(block_end_offset != 0);
   2701          brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
   2702 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
   2703          brw_inst_set_uip(devinfo, insn,
   2704 	    (brw_find_loop_end(p, offset) - offset +
   2705              (devinfo->gen == 6 ? 16 : 0)) / scale);
   2706 	 break;
   2707       case BRW_OPCODE_CONTINUE:
   2708          assert(block_end_offset != 0);
   2709          brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
   2710          brw_inst_set_uip(devinfo, insn,
   2711             (brw_find_loop_end(p, offset) - offset) / scale);
   2712 
   2713          assert(brw_inst_uip(devinfo, insn) != 0);
   2714          assert(brw_inst_jip(devinfo, insn) != 0);
   2715 	 break;
   2716 
   2717       case BRW_OPCODE_ENDIF: {
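                  /* With no enclosing block end in sight, jump just past this
                   * instruction: a distance of one instruction, i.e. 'br' jump
                   * units.
                   */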
   2718          int32_t jump = (block_end_offset == 0) ?
   2719                         1 * br : (block_end_offset - offset) / scale;
   2720          if (devinfo->gen >= 7)
   2721             brw_inst_set_jip(devinfo, insn, jump);
   2722          else
   2723             brw_inst_set_gen6_jump_count(devinfo, insn, jump);
   2724 	 break;
   2725       }
   2726 
   2727       case BRW_OPCODE_HALT:
   2728 	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
   2729 	  *
   2730 	  *    "In case of the halt instruction not inside any conditional
   2731 	  *     code block, the value of <JIP> and <UIP> should be the
   2732 	  *     same. In case of the halt instruction inside conditional code
   2733 	  *     block, the <UIP> should be the end of the program, and the
   2734 	  *     <JIP> should be end of the most inner conditional code block."
   2735 	  *
   2736 	  * The uip will have already been set by whoever set up the
   2737 	  * instruction.
   2738 	  */
   2739 	 if (block_end_offset == 0) {
   2740             brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
   2741 	 } else {
   2742             brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
   2743 	 }
   2744          assert(brw_inst_uip(devinfo, insn) != 0);
   2745          assert(brw_inst_jip(devinfo, insn) != 0);
   2746 	 break;
   2747       }
   2748    }
   2749 }
   2750 
   2751 void brw_ff_sync(struct brw_codegen *p,
   2752 		   struct brw_reg dest,
   2753 		   unsigned msg_reg_nr,
   2754 		   struct brw_reg src0,
   2755 		   bool allocate,
   2756 		   unsigned response_length,
   2757 		   bool eot)
   2758 {
   2759    const struct gen_device_info *devinfo = p->devinfo;
   2760    brw_inst *insn;
   2761 
   2762    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2763 
   2764    insn = next_insn(p, BRW_OPCODE_SEND);
   2765    brw_set_dest(p, insn, dest);
   2766    brw_set_src0(p, insn, src0);
   2767    brw_set_src1(p, insn, brw_imm_d(0));
   2768 
   2769    if (devinfo->gen < 6)
   2770       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
   2771 
   2772    brw_set_ff_sync_message(p,
   2773 			   insn,
   2774 			   allocate,
   2775 			   response_length,
   2776 			   eot);
   2777 }
   2778 
   2779 /**
   2780  * Emit the SEND instruction necessary to generate stream output data on Gen6
   2781  * (for transform feedback).
   2782  *
   2783  * If send_commit_msg is true, this is the last piece of stream output data
   2784  * from this thread, so send the data as a committed write.  According to the
   2785  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
   2786  *
   2787  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
   2788  *   writes are complete by sending the final write as a committed write."
   2789  */
   2790 void
   2791 brw_svb_write(struct brw_codegen *p,
   2792               struct brw_reg dest,
   2793               unsigned msg_reg_nr,
   2794               struct brw_reg src0,
   2795               unsigned binding_table_index,
   2796               bool   send_commit_msg)
   2797 {
   2798    const struct gen_device_info *devinfo = p->devinfo;
   2799    const unsigned target_cache =
   2800       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
   2801        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
   2802        BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
   2803    brw_inst *insn;
   2804 
   2805    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2806 
   2807    insn = next_insn(p, BRW_OPCODE_SEND);
   2808    brw_set_dest(p, insn, dest);
   2809    brw_set_src0(p, insn, src0);
   2810    brw_set_src1(p, insn, brw_imm_d(0));
   2811    brw_set_dp_write_message(p, insn,
   2812                             binding_table_index,
   2813                             0, /* msg_control: ignored */
   2814                             GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
   2815                             target_cache,
   2816                             1, /* msg_length */
   2817                             true, /* header_present */
   2818                             0, /* last_render_target: ignored */
   2819                             send_commit_msg, /* response_length */
   2820                             0, /* end_of_thread */
   2821                             send_commit_msg); /* send_commit_msg */
   2822 }
   2823 
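         /* Size in registers of the payload or response of a surface message
          * operating on num_channels components: a single register in SIMD4x2
          * (Align16) mode, two registers per channel in SIMD16, and one
          * register per channel otherwise.
          */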
   2824 static unsigned
   2825 brw_surface_payload_size(struct brw_codegen *p,
   2826                          unsigned num_channels,
   2827                          bool has_simd4x2,
   2828                          bool has_simd16)
   2829 {
   2830    if (has_simd4x2 &&
   2831        brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
   2832       return 1;
   2833    else if (has_simd16 &&
   2834             brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
   2835       return 2 * num_channels;
   2836    else
   2837       return num_channels;
   2838 }
   2839 
   2840 static void
   2841 brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
   2842                                   brw_inst *insn,
   2843                                   unsigned atomic_op,
   2844                                   bool response_expected)
   2845 {
   2846    const struct gen_device_info *devinfo = p->devinfo;
   2847    unsigned msg_control =
   2848       atomic_op | /* Atomic Operation Type: BRW_AOP_* */
   2849       (response_expected ? 1 << 5 : 0); /* Return data expected */
   2850 
   2851    if (devinfo->gen >= 8 || devinfo->is_haswell) {
   2852       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   2853          if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
   2854             msg_control |= 1 << 4; /* SIMD8 mode */
   2855 
   2856          brw_inst_set_dp_msg_type(devinfo, insn,
   2857                                   HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
   2858       } else {
   2859          brw_inst_set_dp_msg_type(devinfo, insn,
   2860             HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
   2861       }
   2862    } else {
   2863       brw_inst_set_dp_msg_type(devinfo, insn,
   2864                                GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
   2865 
   2866       if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
   2867          msg_control |= 1 << 4; /* SIMD8 mode */
   2868    }
   2869 
   2870    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   2871 }
   2872 
   2873 void
   2874 brw_untyped_atomic(struct brw_codegen *p,
   2875                    struct brw_reg dst,
   2876                    struct brw_reg payload,
   2877                    struct brw_reg surface,
   2878                    unsigned atomic_op,
   2879                    unsigned msg_length,
   2880                    bool response_expected)
   2881 {
   2882    const struct gen_device_info *devinfo = p->devinfo;
   2883    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
   2884                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   2885                           GEN7_SFID_DATAPORT_DATA_CACHE);
   2886    const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   2887    /* Mask out unused components -- This is especially important in Align16
   2888     * mode on generations that don't have native support for SIMD4x2 atomics,
   2889     * because unused but enabled components will cause the dataport to perform
   2890     * additional atomic operations on the addresses that happen to be in the
   2891     * uninitialized Y, Z and W coordinates of the payload.
   2892     */
   2893    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
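            /* Atomics return at most one dword per enabled channel, so the
             * response length is computed as if there were 'response_expected'
             * (i.e. zero or one) channels of data.
             */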
   2894    struct brw_inst *insn = brw_send_indirect_surface_message(
   2895       p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
   2896       brw_surface_payload_size(p, response_expected,
   2897                                devinfo->gen >= 8 || devinfo->is_haswell, true),
   2898       align1);
   2899 
   2900    brw_set_dp_untyped_atomic_message(
   2901       p, insn, atomic_op, response_expected);
   2902 }
   2903 
   2904 static void
   2905 brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
   2906                                         struct brw_inst *insn,
   2907                                         unsigned num_channels)
   2908 {
   2909    const struct gen_device_info *devinfo = p->devinfo;
    2910    /* Set mask of 32-bit channels to drop, e.g. num_channels == 2 yields
             * 0xc, dropping the unused Z and W components.
             */
   2911    unsigned msg_control = 0xf & (0xf << num_channels);
   2912 
   2913    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   2914       if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
   2915          msg_control |= 1 << 4; /* SIMD16 mode */
   2916       else
   2917          msg_control |= 2 << 4; /* SIMD8 mode */
   2918    }
   2919 
   2920    brw_inst_set_dp_msg_type(devinfo, insn,
   2921                             (devinfo->gen >= 8 || devinfo->is_haswell ?
   2922                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
   2923                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
   2924    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   2925 }
   2926 
   2927 void
   2928 brw_untyped_surface_read(struct brw_codegen *p,
   2929                          struct brw_reg dst,
   2930                          struct brw_reg payload,
   2931                          struct brw_reg surface,
   2932                          unsigned msg_length,
   2933                          unsigned num_channels)
   2934 {
   2935    const struct gen_device_info *devinfo = p->devinfo;
   2936    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
   2937                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   2938                           GEN7_SFID_DATAPORT_DATA_CACHE);
   2939    struct brw_inst *insn = brw_send_indirect_surface_message(
   2940       p, sfid, dst, payload, surface, msg_length,
   2941       brw_surface_payload_size(p, num_channels, true, true),
   2942       false);
   2943 
   2944    brw_set_dp_untyped_surface_read_message(
   2945       p, insn, num_channels);
   2946 }
   2947 
   2948 static void
   2949 brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
   2950                                          struct brw_inst *insn,
   2951                                          unsigned num_channels)
   2952 {
   2953    const struct gen_device_info *devinfo = p->devinfo;
   2954    /* Set mask of 32-bit channels to drop. */
   2955    unsigned msg_control = 0xf & (0xf << num_channels);
   2956 
   2957    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   2958       if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
   2959          msg_control |= 1 << 4; /* SIMD16 mode */
   2960       else
   2961          msg_control |= 2 << 4; /* SIMD8 mode */
   2962    } else {
   2963       if (devinfo->gen >= 8 || devinfo->is_haswell)
   2964          msg_control |= 0 << 4; /* SIMD4x2 mode */
   2965       else
   2966          msg_control |= 2 << 4; /* SIMD8 mode */
   2967    }
   2968 
   2969    brw_inst_set_dp_msg_type(devinfo, insn,
   2970                             devinfo->gen >= 8 || devinfo->is_haswell ?
   2971                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
   2972                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
   2973    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   2974 }
   2975 
   2976 void
   2977 brw_untyped_surface_write(struct brw_codegen *p,
   2978                           struct brw_reg payload,
   2979                           struct brw_reg surface,
   2980                           unsigned msg_length,
   2981                           unsigned num_channels)
   2982 {
   2983    const struct gen_device_info *devinfo = p->devinfo;
   2984    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
   2985                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   2986                           GEN7_SFID_DATAPORT_DATA_CACHE);
   2987    const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   2988    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   2989    const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
   2990                           WRITEMASK_X : WRITEMASK_XYZW;
   2991    struct brw_inst *insn = brw_send_indirect_surface_message(
   2992       p, sfid, brw_writemask(brw_null_reg(), mask),
   2993       payload, surface, msg_length, 0, align1);
   2994 
   2995    brw_set_dp_untyped_surface_write_message(
   2996       p, insn, num_channels);
   2997 }
   2998 
   2999 static unsigned
   3000 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size)
   3001 {
   3002    switch (bit_size) {
   3003    case 8:
   3004       return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE;
   3005    case 16:
   3006       return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD;
   3007    case 32:
   3008       return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD;
   3009    default:
   3010       unreachable("Unsupported bit_size for byte scattered messages");
   3011    }
   3012 }
   3013 
   3014 
   3015 void
   3016 brw_byte_scattered_read(struct brw_codegen *p,
   3017                         struct brw_reg dst,
   3018                         struct brw_reg payload,
   3019                         struct brw_reg surface,
   3020                         unsigned msg_length,
   3021                         unsigned bit_size)
   3022 {
   3023    const struct gen_device_info *devinfo = p->devinfo;
   3024    assert(devinfo->gen > 7 || devinfo->is_haswell);
   3025    assert(brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   3026    const unsigned sfid =  GEN7_SFID_DATAPORT_DATA_CACHE;
   3027 
   3028    struct brw_inst *insn = brw_send_indirect_surface_message(
   3029       p, sfid, dst, payload, surface, msg_length,
   3030       brw_surface_payload_size(p, 1, true, true),
   3031       false);
   3032 
   3033    unsigned msg_control =
   3034       brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
   3035 
   3036    if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
   3037       msg_control |= 1; /* SIMD16 mode */
   3038    else
   3039       msg_control |= 0; /* SIMD8 mode */
   3040 
   3041    brw_inst_set_dp_msg_type(devinfo, insn,
   3042                             HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
   3043    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   3044 }
   3045 
   3046 void
   3047 brw_byte_scattered_write(struct brw_codegen *p,
   3048                          struct brw_reg payload,
   3049                          struct brw_reg surface,
   3050                          unsigned msg_length,
   3051                          unsigned bit_size)
   3052 {
   3053    const struct gen_device_info *devinfo = p->devinfo;
   3054    assert(devinfo->gen > 7 || devinfo->is_haswell);
   3055    assert(brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   3056    const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   3057 
   3058    struct brw_inst *insn = brw_send_indirect_surface_message(
   3059       p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
   3060       payload, surface, msg_length, 0, true);
   3061 
   3062    unsigned msg_control =
   3063       brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
   3064 
   3065    if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
    3066       msg_control |= 1; /* SIMD16 mode */
    3067    else
    3068       msg_control |= 0; /* SIMD8 mode */
   3069 
   3070    brw_inst_set_dp_msg_type(devinfo, insn,
   3071                             HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
   3072    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   3073 }
   3074 
   3075 static void
   3076 brw_set_dp_typed_atomic_message(struct brw_codegen *p,
   3077                                 struct brw_inst *insn,
   3078                                 unsigned atomic_op,
   3079                                 bool response_expected)
   3080 {
   3081    const struct gen_device_info *devinfo = p->devinfo;
   3082    unsigned msg_control =
   3083       atomic_op | /* Atomic Operation Type: BRW_AOP_* */
   3084       (response_expected ? 1 << 5 : 0); /* Return data expected */
   3085 
   3086    if (devinfo->gen >= 8 || devinfo->is_haswell) {
   3087       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   3088          if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
   3089             msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
   3090 
   3091          brw_inst_set_dp_msg_type(devinfo, insn,
   3092                                   HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
   3093       } else {
   3094          brw_inst_set_dp_msg_type(devinfo, insn,
   3095                                   HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
   3096       }
   3097 
   3098    } else {
   3099       brw_inst_set_dp_msg_type(devinfo, insn,
   3100                                GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
   3101 
   3102       if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
   3103          msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
   3104    }
   3105 
   3106    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   3107 }
   3108 
   3109 void
   3110 brw_typed_atomic(struct brw_codegen *p,
   3111                  struct brw_reg dst,
   3112                  struct brw_reg payload,
   3113                  struct brw_reg surface,
   3114                  unsigned atomic_op,
   3115                  unsigned msg_length,
   3116                  bool response_expected) {
   3117    const struct gen_device_info *devinfo = p->devinfo;
   3118    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
   3119                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   3120                           GEN6_SFID_DATAPORT_RENDER_CACHE);
   3121    const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   3122    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   3123    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   3124    struct brw_inst *insn = brw_send_indirect_surface_message(
   3125       p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
   3126       brw_surface_payload_size(p, response_expected,
   3127                                devinfo->gen >= 8 || devinfo->is_haswell, false),
   3128       true);
   3129 
   3130    brw_set_dp_typed_atomic_message(
   3131       p, insn, atomic_op, response_expected);
   3132 }
   3133 
   3134 static void
   3135 brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
   3136                                       struct brw_inst *insn,
   3137                                       unsigned num_channels)
   3138 {
   3139    const struct gen_device_info *devinfo = p->devinfo;
   3140    /* Set mask of unused channels. */
   3141    unsigned msg_control = 0xf & (0xf << num_channels);
   3142 
   3143    if (devinfo->gen >= 8 || devinfo->is_haswell) {
   3144       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   3145          if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
   3146             msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
   3147          else
   3148             msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
   3149       }
   3150 
   3151       brw_inst_set_dp_msg_type(devinfo, insn,
   3152                                HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
   3153    } else {
   3154       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   3155          if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
   3156             msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
   3157       }
   3158 
   3159       brw_inst_set_dp_msg_type(devinfo, insn,
   3160                                GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
   3161    }
   3162 
   3163    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   3164 }
   3165 
   3166 void
   3167 brw_typed_surface_read(struct brw_codegen *p,
   3168                        struct brw_reg dst,
   3169                        struct brw_reg payload,
   3170                        struct brw_reg surface,
   3171                        unsigned msg_length,
   3172                        unsigned num_channels)
   3173 {
   3174    const struct gen_device_info *devinfo = p->devinfo;
   3175    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
   3176                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   3177                           GEN6_SFID_DATAPORT_RENDER_CACHE);
   3178    struct brw_inst *insn = brw_send_indirect_surface_message(
   3179       p, sfid, dst, payload, surface, msg_length,
   3180       brw_surface_payload_size(p, num_channels,
   3181                                devinfo->gen >= 8 || devinfo->is_haswell, false),
   3182       true);
   3183 
   3184    brw_set_dp_typed_surface_read_message(
   3185       p, insn, num_channels);
   3186 }
   3187 
   3188 static void
   3189 brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
   3190                                        struct brw_inst *insn,
   3191                                        unsigned num_channels)
   3192 {
   3193    const struct gen_device_info *devinfo = p->devinfo;
   3194    /* Set mask of unused channels. */
   3195    unsigned msg_control = 0xf & (0xf << num_channels);
   3196 
   3197    if (devinfo->gen >= 8 || devinfo->is_haswell) {
   3198       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   3199          if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
   3200             msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
   3201          else
   3202             msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
   3203       }
   3204 
   3205       brw_inst_set_dp_msg_type(devinfo, insn,
   3206                                HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
   3207 
   3208    } else {
   3209       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   3210          if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
   3211             msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
   3212       }
   3213 
   3214       brw_inst_set_dp_msg_type(devinfo, insn,
   3215                                GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
   3216    }
   3217 
   3218    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
   3219 }
   3220 
   3221 void
   3222 brw_typed_surface_write(struct brw_codegen *p,
   3223                         struct brw_reg payload,
   3224                         struct brw_reg surface,
   3225                         unsigned msg_length,
   3226                         unsigned num_channels)
   3227 {
   3228    const struct gen_device_info *devinfo = p->devinfo;
   3229    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
   3230                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   3231                           GEN6_SFID_DATAPORT_RENDER_CACHE);
   3232    const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   3233    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   3234    const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
   3235                           WRITEMASK_X : WRITEMASK_XYZW);
   3236    struct brw_inst *insn = brw_send_indirect_surface_message(
   3237       p, sfid, brw_writemask(brw_null_reg(), mask),
   3238       payload, surface, msg_length, 0, true);
   3239 
   3240    brw_set_dp_typed_surface_write_message(
   3241       p, insn, num_channels);
   3242 }
   3243 
   3244 static void
   3245 brw_set_memory_fence_message(struct brw_codegen *p,
   3246                              struct brw_inst *insn,
   3247                              enum brw_message_target sfid,
   3248                              bool commit_enable)
   3249 {
   3250    const struct gen_device_info *devinfo = p->devinfo;
   3251 
   3252    brw_set_message_descriptor(p, insn, sfid,
   3253                               1 /* message length */,
   3254                               (commit_enable ? 1 : 0) /* response length */,
   3255                               true /* header present */,
   3256                               false);
   3257 
   3258    switch (sfid) {
   3259    case GEN6_SFID_DATAPORT_RENDER_CACHE:
   3260       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
   3261       break;
   3262    case GEN7_SFID_DATAPORT_DATA_CACHE:
   3263       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
   3264       break;
   3265    default:
   3266       unreachable("Not reached");
   3267    }
   3268 
   3269    if (commit_enable)
   3270       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
   3271 }
   3272 
   3273 void
   3274 brw_memory_fence(struct brw_codegen *p,
   3275                  struct brw_reg dst)
   3276 {
   3277    const struct gen_device_info *devinfo = p->devinfo;
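            /* IVB also needs the commit return enabled: the stalling MOV
             * emitted below depends on the fences actually writing a response
             * back into dst.
             */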
   3278    const bool commit_enable =
   3279       devinfo->gen >= 10 || /* HSD ES # 1404612949 */
   3280       (devinfo->gen == 7 && !devinfo->is_haswell);
   3281    struct brw_inst *insn;
   3282 
   3283    brw_push_insn_state(p);
   3284    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   3285    brw_set_default_exec_size(p, BRW_EXECUTE_1);
   3286    dst = vec1(dst);
   3287 
    3288    /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
   3289     * message doesn't write anything back.
   3290     */
   3291    insn = next_insn(p, BRW_OPCODE_SEND);
   3292    dst = retype(dst, BRW_REGISTER_TYPE_UW);
   3293    brw_set_dest(p, insn, dst);
   3294    brw_set_src0(p, insn, dst);
   3295    brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
   3296                                 commit_enable);
   3297 
   3298    if (devinfo->gen == 7 && !devinfo->is_haswell) {
   3299       /* IVB does typed surface access through the render cache, so we need to
   3300        * flush it too.  Use a different register so both flushes can be
   3301        * pipelined by the hardware.
   3302        */
   3303       insn = next_insn(p, BRW_OPCODE_SEND);
   3304       brw_set_dest(p, insn, offset(dst, 1));
   3305       brw_set_src0(p, insn, offset(dst, 1));
   3306       brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
   3307                                    commit_enable);
   3308 
   3309       /* Now write the response of the second message into the response of the
   3310        * first to trigger a pipeline stall -- This way future render and data
   3311        * cache messages will be properly ordered with respect to past data and
   3312        * render cache messages.
   3313        */
   3314       brw_MOV(p, dst, offset(dst, 1));
   3315    }
   3316 
   3317    brw_pop_insn_state(p);
   3318 }
   3319 
   3320 void
   3321 brw_pixel_interpolator_query(struct brw_codegen *p,
   3322                              struct brw_reg dest,
   3323                              struct brw_reg mrf,
   3324                              bool noperspective,
   3325                              unsigned mode,
   3326                              struct brw_reg data,
   3327                              unsigned msg_length,
   3328                              unsigned response_length)
   3329 {
   3330    const struct gen_device_info *devinfo = p->devinfo;
   3331    struct brw_inst *insn;
   3332    const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
   3333 
   3334    /* brw_send_indirect_message will automatically use a direct send message
   3335     * if data is actually immediate.
   3336     */
   3337    insn = brw_send_indirect_message(p,
   3338                                     GEN7_SFID_PIXEL_INTERPOLATOR,
   3339                                     dest,
   3340                                     mrf,
   3341                                     vec1(data));
   3342    brw_inst_set_mlen(devinfo, insn, msg_length);
   3343    brw_inst_set_rlen(devinfo, insn, response_length);
   3344 
   3345    brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
   3346    brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   3347    brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   3348    brw_inst_set_pi_message_type(devinfo, insn, mode);
   3349 }
   3350 
   3351 void
   3352 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
   3353                       struct brw_reg mask)
   3354 {
   3355    const struct gen_device_info *devinfo = p->devinfo;
   3356    const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
   3357    const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
   3358    brw_inst *inst;
   3359 
   3360    assert(devinfo->gen >= 7);
   3361    assert(mask.type == BRW_REGISTER_TYPE_UD);
   3362 
   3363    brw_push_insn_state(p);
   3364 
   3365    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
   3366       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   3367 
   3368       if (devinfo->gen >= 8) {
   3369          /* Getting the first active channel index is easy on Gen8: Just find
   3370           * the first bit set in the execution mask.  The register exists on
   3371           * HSW already but it reads back as all ones when the current
   3372           * instruction has execution masking disabled, so it's kind of
   3373           * useless.
   3374           */
   3375          struct brw_reg exec_mask =
   3376             retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
   3377 
   3378          brw_set_default_exec_size(p, BRW_EXECUTE_1);
   3379          if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
   3380             /* Unfortunately, ce0 does not take into account the thread
   3381              * dispatch mask, which may be a problem in cases where it's not
   3382              * tightly packed (i.e. it doesn't have the form '2^n - 1' for
   3383              * some n).  Combine ce0 with the given dispatch (or vector) mask
   3384              * to mask off those channels which were never dispatched by the
   3385              * hardware.
   3386              */
   3387             brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
   3388             brw_AND(p, vec1(dst), exec_mask, vec1(dst));
   3389             exec_mask = vec1(dst);
   3390          }
   3391 
   3392          /* Quarter control has the effect of magically shifting the value of
   3393           * ce0 so you'll get the first active channel relative to the
    3394           * specified quarter control as a result.
   3395           */
   3396          inst = brw_FBL(p, vec1(dst), exec_mask);
   3397       } else {
   3398          const struct brw_reg flag = brw_flag_reg(1, 0);
   3399 
   3400          brw_set_default_exec_size(p, BRW_EXECUTE_1);
   3401          brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   3402 
   3403          /* Run enough instructions returning zero with execution masking and
   3404           * a conditional modifier enabled in order to get the full execution
   3405           * mask in f1.0.  We could use a single 32-wide move here if it
    3406           * weren't for the hardware bug that causes channel enables to
   3407           * be applied incorrectly to the second half of 32-wide instructions
   3408           * on Gen7.
   3409           */
   3410          const unsigned lower_size = MIN2(16, exec_size);
   3411          for (unsigned i = 0; i < exec_size / lower_size; i++) {
   3412             inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
   3413                            brw_imm_uw(0));
   3414             brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
   3415             brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
   3416             brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
   3417             brw_inst_set_flag_reg_nr(devinfo, inst, 1);
   3418             brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
   3419          }
   3420 
   3421          /* Find the first bit set in the exec_size-wide portion of the flag
   3422           * register that was updated by the last sequence of MOV
   3423           * instructions.
   3424           */
   3425          const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
   3426          brw_set_default_exec_size(p, BRW_EXECUTE_1);
   3427          brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
   3428       }
   3429    } else {
   3430       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   3431 
   3432       if (devinfo->gen >= 8 &&
   3433           mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
   3434          /* In SIMD4x2 mode the first active channel index is just the
   3435           * negation of the first bit of the mask register.  Note that ce0
   3436           * doesn't take into account the dispatch mask, so the Gen7 path
   3437           * should be used instead unless you have the guarantee that the
   3438           * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
   3439           * for some n).
   3440           */
   3441          inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
   3442                         negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
   3443                         brw_imm_ud(1));
   3444 
   3445       } else {
    3446          /* Overwrite the destination first without and then with execution
    3447           * masking in order to find out which of the channels is active.
   3448           */
   3449          brw_push_insn_state(p);
   3450          brw_set_default_exec_size(p, BRW_EXECUTE_4);
   3451          brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
   3452                  brw_imm_ud(1));
   3453 
   3454          inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
   3455                         brw_imm_ud(0));
   3456          brw_pop_insn_state(p);
   3457          brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
   3458       }
   3459    }
   3460 
   3461    brw_pop_insn_state(p);
   3462 }
   3463 
   3464 void
   3465 brw_broadcast(struct brw_codegen *p,
   3466               struct brw_reg dst,
   3467               struct brw_reg src,
   3468               struct brw_reg idx)
   3469 {
   3470    const struct gen_device_info *devinfo = p->devinfo;
   3471    const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   3472    brw_inst *inst;
   3473 
   3474    brw_push_insn_state(p);
   3475    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   3476    brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
   3477 
   3478    assert(src.file == BRW_GENERAL_REGISTER_FILE &&
   3479           src.address_mode == BRW_ADDRESS_DIRECT);
   3480    assert(!src.abs && !src.negate);
   3481    assert(src.type == dst.type);
   3482 
   3483    if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
   3484        idx.file == BRW_IMMEDIATE_VALUE) {
   3485       /* Trivial, the source is already uniform or the index is a constant.
   3486        * We will typically not get here if the optimizer is doing its job, but
   3487        * asserting would be mean.
   3488        */
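               /* A <0;1,0> region broadcasts the single element at 'i' in
                * Align1 mode; in Align16 a <0;4,1> region starting at
                * component 4*i replicates the whole vec4.
                */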
   3489       const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
   3490       brw_MOV(p, dst,
   3491               (align1 ? stride(suboffset(src, i), 0, 1, 0) :
   3492                stride(suboffset(src, 4 * i), 0, 4, 1)));
   3493    } else {
   3494       /* From the Haswell PRM section "Register Region Restrictions":
   3495        *
   3496        *    "The lower bits of the AddressImmediate must not overflow to
   3497        *    change the register address.  The lower 5 bits of Address
   3498        *    Immediate when added to lower 5 bits of address register gives
   3499        *    the sub-register offset. The upper bits of Address Immediate
   3500        *    when added to upper bits of address register gives the register
   3501        *    address. Any overflow from sub-register offset is dropped."
   3502        *
   3503        * Fortunately, for broadcast, we never have a sub-register offset so
   3504        * this isn't an issue.
   3505        */
   3506       assert(src.subnr == 0);
   3507 
   3508       if (align1) {
   3509          const struct brw_reg addr =
   3510             retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
   3511          unsigned offset = src.nr * REG_SIZE + src.subnr;
   3512          /* Limit in bytes of the signed indirect addressing immediate. */
   3513          const unsigned limit = 512;
   3514 
   3515          brw_push_insn_state(p);
   3516          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   3517          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   3518 
   3519          /* Take into account the component size and horizontal stride. */
   3520          assert(src.vstride == src.hstride + src.width);
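                  /* The hstride field is log-encoded (for nonzero strides a
                   * value of n selects a stride of 2^(n-1) elements), so the
                   * byte offset of channel 'idx' is
                   * idx << (log2(type size) + hstride - 1).
                   */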
   3521          brw_SHL(p, addr, vec1(idx),
   3522                  brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
   3523                             src.hstride - 1));
   3524 
   3525          /* We can only address up to limit bytes using the indirect
    3526           * addressing immediate; account for the difference if the source
   3527           * register is above this limit.
   3528           */
   3529          if (offset >= limit) {
   3530             brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
   3531             offset = offset % limit;
   3532          }
   3533 
   3534          brw_pop_insn_state(p);
   3535 
   3536          /* Use indirect addressing to fetch the specified component. */
   3537          if (type_sz(src.type) > 4 &&
   3538              (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
   3539             /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
   3540              *
   3541              *    "When source or destination datatype is 64b or operation is
   3542              *    integer DWord multiply, indirect addressing must not be
   3543              *    used."
   3544              *
    3545              * To work around both of these issues, we do two integer MOVs
    3546              * instead of one 64-bit MOV.  Because no double value should ever
   3547              * cross a register boundary, it's safe to use the immediate
   3548              * offset in the indirect here to handle adding 4 bytes to the
   3549              * offset and avoid the extra ADD to the register file.
   3550              */
   3551             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
   3552                        retype(brw_vec1_indirect(addr.subnr, offset),
   3553                               BRW_REGISTER_TYPE_D));
   3554             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
   3555                        retype(brw_vec1_indirect(addr.subnr, offset + 4),
   3556                               BRW_REGISTER_TYPE_D));
   3557          } else {
   3558             brw_MOV(p, dst,
   3559                     retype(brw_vec1_indirect(addr.subnr, offset), src.type));
   3560          }
   3561       } else {
    3562          /* In SIMD4x2 mode the index can be either zero or one; replicate it
   3563           * to all bits of a flag register,
   3564           */
   3565          inst = brw_MOV(p,
   3566                         brw_null_reg(),
   3567                         stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
   3568          brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
   3569          brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
   3570          brw_inst_set_flag_reg_nr(devinfo, inst, 1);
   3571 
   3572          /* and use predicated SEL to pick the right channel. */
   3573          inst = brw_SEL(p, dst,
   3574                         stride(suboffset(src, 4), 4, 4, 1),
   3575                         stride(src, 4, 4, 1));
   3576          brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
   3577          brw_inst_set_flag_reg_nr(devinfo, inst, 1);
   3578       }
   3579    }
   3580 
   3581    brw_pop_insn_state(p);
   3582 }
   3583 
   3584 /**
   3585  * This instruction is generated as a single-channel align1 instruction by
   3586  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
   3587  *
   3588  * We can't use the typed atomic op in the FS because that has the execution
   3589  * mask ANDed with the pixel mask, but we just want to write the one dword for
   3590  * all the pixels.
   3591  *
    3592  * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
   3593  * one u32.  So we use the same untyped atomic write message as the pixel
   3594  * shader.
   3595  *
   3596  * The untyped atomic operation requires a BUFFER surface type with RAW
   3597  * format, and is only accessible through the legacy DATA_CACHE dataport
   3598  * messages.
   3599  */
   3600 void brw_shader_time_add(struct brw_codegen *p,
   3601                          struct brw_reg payload,
   3602                          uint32_t surf_index)
   3603 {
   3604    const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
   3605                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
   3606                           GEN7_SFID_DATAPORT_DATA_CACHE);
   3607    assert(p->devinfo->gen >= 7);
   3608 
   3609    brw_push_insn_state(p);
   3610    brw_set_default_access_mode(p, BRW_ALIGN_1);
   3611    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   3612    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   3613    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   3614 
   3615    /* We use brw_vec1_reg and unmasked because we want to increment the given
   3616     * offset only once.
   3617     */
   3618    brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
   3619                                       BRW_ARF_NULL, 0));
   3620    brw_set_src0(p, send, brw_vec1_reg(payload.file,
   3621                                       payload.nr, 0));
   3622    brw_set_src1(p, send, brw_imm_ud(0));
    3623    brw_set_message_descriptor(p, send, sfid,
                                          2 /* msg_length */,
                                          0 /* response_length */,
                                          false /* header_present */,
                                          false /* end_of_thread */);
   3624    brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   3625    brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
   3626 
   3627    brw_pop_insn_state(p);
   3628 }
   3629 
   3630 
   3631 /**
   3632  * Emit the SEND message for a barrier
   3633  */
   3634 void
   3635 brw_barrier(struct brw_codegen *p, struct brw_reg src)
   3636 {
   3637    const struct gen_device_info *devinfo = p->devinfo;
   3638    struct brw_inst *inst;
   3639 
   3640    assert(devinfo->gen >= 7);
   3641 
   3642    brw_push_insn_state(p);
   3643    brw_set_default_access_mode(p, BRW_ALIGN_1);
   3644    inst = next_insn(p, BRW_OPCODE_SEND);
   3645    brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   3646    brw_set_src0(p, inst, src);
   3647    brw_set_src1(p, inst, brw_null_reg());
   3648 
   3649    brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
   3650                               1 /* msg_length */,
   3651                               0 /* response_length */,
   3652                               false /* header_present */,
   3653                               false /* end_of_thread */);
   3654 
   3655    brw_inst_set_gateway_notify(devinfo, inst, 1);
   3656    brw_inst_set_gateway_subfuncid(devinfo, inst,
   3657                                   BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
   3658 
   3659    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   3660    brw_pop_insn_state(p);
   3661 }
   3662 
   3663 
   3664 /**
   3665  * Emit the wait instruction for a barrier
   3666  */
   3667 void
   3668 brw_WAIT(struct brw_codegen *p)
   3669 {
   3670    const struct gen_device_info *devinfo = p->devinfo;
   3671    struct brw_inst *insn;
   3672 
   3673    struct brw_reg src = brw_notification_reg();
   3674 
   3675    insn = next_insn(p, BRW_OPCODE_WAIT);
   3676    brw_set_dest(p, insn, src);
   3677    brw_set_src0(p, insn, src);
   3678    brw_set_src1(p, insn, brw_null_reg());
   3679 
   3680    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   3681    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   3682 }
   3683 
   3684 /**
    3685  * Changes the floating point rounding mode by updating the control register
    3686  * field defined at bits cr0.0[5:6].  This function supports switching among
    3687  * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
    3688  * Only RTNE and RTZ rounding are enabled in NIR.
   3689  */
   3690 void
   3691 brw_rounding_mode(struct brw_codegen *p,
   3692                   enum brw_rnd_mode mode)
   3693 {
   3694    const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
   3695 
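            /* Clear the rounding-mode bits with an AND and set the new ones
             * with an OR.  The AND can be skipped when the new mode sets every
             * bit of the field (the OR alone yields the right value), and the
             * OR can be skipped when the new mode is RTNE (all zeros), where
             * clearing is enough.
             */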
   3696    if (bits != BRW_CR0_RND_MODE_MASK) {
   3697       brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
   3698                                brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
   3699 
   3700       /* From the Skylake PRM, Volume 7, page 760:
   3701        *  "Implementation Restriction on Register Access: When the control
   3702        *   register is used as an explicit source and/or destination, hardware
   3703        *   does not ensure execution pipeline coherency. Software must set the
   3704        *   thread control field to switch for an instruction that uses
   3705        *   control register as an explicit operand."
   3706        */
   3707       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
    3708    }
   3709 
   3710    if (bits) {
   3711       brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
   3712                               brw_imm_ud(bits));
   3713       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
   3714    }
   3715 }
   3716