      1 /*
      2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
      3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
      4  develop this 3D driver.
      5 
      6  Permission is hereby granted, free of charge, to any person obtaining
      7  a copy of this software and associated documentation files (the
      8  "Software"), to deal in the Software without restriction, including
      9  without limitation the rights to use, copy, modify, merge, publish,
     10  distribute, sublicense, and/or sell copies of the Software, and to
     11  permit persons to whom the Software is furnished to do so, subject to
     12  the following conditions:
     13 
     14  The above copyright notice and this permission notice (including the
     15  next paragraph) shall be included in all copies or substantial
     16  portions of the Software.
     17 
     18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
     22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25 
     26  **********************************************************************/
     27  /*
     28   * Authors:
      29   *   Keith Whitwell <keith@tungstengraphics.com>
     30   */
     31 
     32 
     33 #include "brw_context.h"
     34 #include "brw_defines.h"
     35 #include "brw_eu.h"
     36 
     37 #include "glsl/ralloc.h"
     38 
     39 /***********************************************************************
     40  * Internal helper for constructing instructions
     41  */
     42 
     43 static void guess_execution_size(struct brw_compile *p,
     44 				 struct brw_instruction *insn,
     45 				 struct brw_reg reg)
     46 {
     47    if (reg.width == BRW_WIDTH_8 && p->compressed)
     48       insn->header.execution_size = BRW_EXECUTE_16;
     49    else
     50       insn->header.execution_size = reg.width;	/* note - definitions are compatible */
     51 }
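
/* Illustrative sketch (not driver code): as the comment above notes, the
 * width and execution-size enums share the same encoding, so a SIMD8-wide
 * destination simply becomes EXECUTE_8, unless the program is compressed,
 * in which case the instruction is widened to EXECUTE_16.  Register numbers
 * here are arbitrary.
 */
#if 0
static void example_guess_execution_size(struct brw_compile *p,
                                          struct brw_instruction *insn)
{
   struct brw_reg dst = brw_vec8_grf(2, 0);       /* width == BRW_WIDTH_8 */

   guess_execution_size(p, insn, dst);
   /* p->compressed  -> insn->header.execution_size == BRW_EXECUTE_16
    * !p->compressed -> insn->header.execution_size == BRW_EXECUTE_8
    */
}
#endif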
     52 
     53 
     54 /**
     55  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
     56  * registers, implicitly moving the operand to a message register.
     57  *
     58  * On Sandybridge, this is no longer the case.  This function performs the
     59  * explicit move; it should be called before emitting a SEND instruction.
     60  */
     61 void
     62 gen6_resolve_implied_move(struct brw_compile *p,
     63 			  struct brw_reg *src,
     64 			  GLuint msg_reg_nr)
     65 {
     66    struct intel_context *intel = &p->brw->intel;
     67    if (intel->gen < 6)
     68       return;
     69 
     70    if (src->file == BRW_MESSAGE_REGISTER_FILE)
     71       return;
     72 
     73    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
     74       brw_push_insn_state(p);
     75       brw_set_mask_control(p, BRW_MASK_DISABLE);
     76       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     77       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
     78 	      retype(*src, BRW_REGISTER_TYPE_UD));
     79       brw_pop_insn_state(p);
     80    }
     81    *src = brw_message_reg(msg_reg_nr);
     82 }
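
/* Usage sketch (illustrative only): a SEND-style emit path would call
 * gen6_resolve_implied_move() on its payload source before brw_set_src0(),
 * so that on gen6+ the payload is MOVed into the named message register
 * explicitly, while on earlier gens the call is a no-op and the SEND's
 * implicit move does the work.  The register numbers and the use of
 * BRW_OPCODE_SEND below are placeholders, not taken from this file.
 */
#if 0
static void example_send_payload(struct brw_compile *p)
{
   struct brw_instruction *insn;
   struct brw_reg payload = brw_vec8_grf(2, 0);   /* hypothetical payload */
   GLuint msg_reg_nr = 1;

   gen6_resolve_implied_move(p, &payload, msg_reg_nr);

   insn = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);   /* on gen6+: the message register */
   /* ... a brw_set_*_message() helper then fills in the descriptor ... */
}
#endif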
     83 
     84 static void
     85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
     86 {
     87    /* From the BSpec / ISA Reference / send - [DevIVB+]:
     88     * "The send with EOT should use register space R112-R127 for <src>. This is
     89     *  to enable loading of a new thread into the same slot while the message
     90     *  with EOT for current thread is pending dispatch."
     91     *
     92     * Since we're pretending to have 16 MRFs anyway, we may as well use the
     93     * registers required for messages with EOT.
     94     */
     95    struct intel_context *intel = &p->brw->intel;
     96    if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
     97       reg->file = BRW_GENERAL_REGISTER_FILE;
     98       reg->nr += GEN7_MRF_HACK_START;
     99    }
    100 }
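
/* Illustrative sketch: on gen7 a "message register" coming out of this
 * helper is really a GRF in the R112-R127 window, offset by
 * GEN7_MRF_HACK_START; on earlier gens the register is left untouched.
 */
#if 0
static void example_mrf_to_grf(struct brw_compile *p)
{
   struct brw_reg m4 = brw_message_reg(4);

   gen7_convert_mrf_to_grf(p, &m4);
   /* gen == 7: m4.file == BRW_GENERAL_REGISTER_FILE,
    *           m4.nr   == GEN7_MRF_HACK_START + 4
    * gen <  7: m4 is still BRW_MESSAGE_REGISTER_FILE, nr 4
    */
}
#endif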
    101 
    102 
    103 void
    104 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
    105 	     struct brw_reg dest)
    106 {
    107    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
    108        dest.file != BRW_MESSAGE_REGISTER_FILE)
    109       assert(dest.nr < 128);
    110 
    111    gen7_convert_mrf_to_grf(p, &dest);
    112 
    113    insn->bits1.da1.dest_reg_file = dest.file;
    114    insn->bits1.da1.dest_reg_type = dest.type;
    115    insn->bits1.da1.dest_address_mode = dest.address_mode;
    116 
    117    if (dest.address_mode == BRW_ADDRESS_DIRECT) {
    118       insn->bits1.da1.dest_reg_nr = dest.nr;
    119 
    120       if (insn->header.access_mode == BRW_ALIGN_1) {
    121 	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
    122 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
    123 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
    124 	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
    125       }
    126       else {
    127 	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
    128 	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
     129 	 /* even though ignored in da16, this still needs to be set to '01' */
    130 	 insn->bits1.da16.dest_horiz_stride = 1;
    131       }
    132    }
    133    else {
    134       insn->bits1.ia1.dest_subreg_nr = dest.subnr;
    135 
    136       /* These are different sizes in align1 vs align16:
    137        */
    138       if (insn->header.access_mode == BRW_ALIGN_1) {
    139 	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
    140 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
    141 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
    142 	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
    143       }
    144       else {
    145 	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
     146 	 /* even though ignored in ia16, this still needs to be set to '01' */
    147 	 insn->bits1.ia16.dest_horiz_stride = 1;
    148       }
    149    }
    150 
     151    /* Set the execution size based on dest.width and
     152     * insn->compression_control:
     153     */
    154    guess_execution_size(p, insn, dest);
    155 }
    156 
    157 extern int reg_type_size[];
    158 
    159 static void
    160 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
    161 {
    162    int hstride_for_reg[] = {0, 1, 2, 4};
    163    int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
    164    int width_for_reg[] = {1, 2, 4, 8, 16};
    165    int execsize_for_reg[] = {1, 2, 4, 8, 16};
    166    int width, hstride, vstride, execsize;
    167 
    168    if (reg.file == BRW_IMMEDIATE_VALUE) {
    169       /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
    170        * mean the destination has to be 128-bit aligned and the
    171        * destination horiz stride has to be a word.
    172        */
    173       if (reg.type == BRW_REGISTER_TYPE_V) {
    174 	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
    175 		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
    176       }
    177 
    178       return;
    179    }
    180 
     181    if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
     182        reg.nr == BRW_ARF_NULL)
    183       return;
    184 
    185    assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
    186    hstride = hstride_for_reg[reg.hstride];
    187 
    188    if (reg.vstride == 0xf) {
    189       vstride = -1;
    190    } else {
    191       assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
    192       vstride = vstride_for_reg[reg.vstride];
    193    }
    194 
    195    assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
    196    width = width_for_reg[reg.width];
    197 
    198    assert(insn->header.execution_size >= 0 &&
    199 	  insn->header.execution_size < Elements(execsize_for_reg));
    200    execsize = execsize_for_reg[insn->header.execution_size];
    201 
    202    /* Restrictions from 3.3.10: Register Region Restrictions. */
    203    /* 3. */
    204    assert(execsize >= width);
    205 
    206    /* 4. */
    207    if (execsize == width && hstride != 0) {
    208       assert(vstride == -1 || vstride == width * hstride);
    209    }
    210 
    211    /* 5. */
    212    if (execsize == width && hstride == 0) {
    213       /* no restriction on vstride. */
    214    }
    215 
    216    /* 6. */
    217    if (width == 1) {
    218       assert(hstride == 0);
    219    }
    220 
    221    /* 7. */
    222    if (execsize == 1 && width == 1) {
    223       assert(hstride == 0);
    224       assert(vstride == 0);
    225    }
    226 
    227    /* 8. */
    228    if (vstride == 0 && hstride == 0) {
    229       assert(width == 1);
    230    }
    231 
    232    /* 10. Check destination issues. */
    233 }
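
/* Illustrative sketch of the region rules checked above, in the usual
 * <vstride; width, hstride> notation: a plain SIMD8 GRF source is <8;8,1>,
 * which satisfies rule 3 (execsize >= width) and rule 4
 * (vstride == width * hstride) at an execution size of 8, while a scalar
 * source is <0;1,0> and matches rules 6-8.  The register numbers are
 * arbitrary.
 */
#if 0
static void example_valid_regions(struct brw_instruction *insn)
{
   insn->header.execution_size = BRW_EXECUTE_8;

   validate_reg(insn, brw_vec8_grf(3, 0));   /* <8;8,1>: passes        */
   validate_reg(insn, brw_vec1_grf(3, 0));   /* <0;1,0> scalar: passes */
}
#endif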
    234 
    235 void
    236 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
    237 	     struct brw_reg reg)
    238 {
     239    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
    240       assert(reg.nr < 128);
    241 
    242    gen7_convert_mrf_to_grf(p, &reg);
    243 
    244    validate_reg(insn, reg);
    245 
    246    insn->bits1.da1.src0_reg_file = reg.file;
    247    insn->bits1.da1.src0_reg_type = reg.type;
    248    insn->bits2.da1.src0_abs = reg.abs;
    249    insn->bits2.da1.src0_negate = reg.negate;
    250    insn->bits2.da1.src0_address_mode = reg.address_mode;
    251 
    252    if (reg.file == BRW_IMMEDIATE_VALUE) {
    253       insn->bits3.ud = reg.dw1.ud;
    254 
    255       /* Required to set some fields in src1 as well:
    256        */
    257       insn->bits1.da1.src1_reg_file = 0; /* arf */
    258       insn->bits1.da1.src1_reg_type = reg.type;
    259    }
    260    else
    261    {
    262       if (reg.address_mode == BRW_ADDRESS_DIRECT) {
    263 	 if (insn->header.access_mode == BRW_ALIGN_1) {
    264 	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
    265 	    insn->bits2.da1.src0_reg_nr = reg.nr;
    266 	 }
    267 	 else {
    268 	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
    269 	    insn->bits2.da16.src0_reg_nr = reg.nr;
    270 	 }
    271       }
    272       else {
    273 	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
    274 
    275 	 if (insn->header.access_mode == BRW_ALIGN_1) {
    276 	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
    277 	 }
    278 	 else {
    279 	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
    280 	 }
    281       }
    282 
    283       if (insn->header.access_mode == BRW_ALIGN_1) {
    284 	 if (reg.width == BRW_WIDTH_1 &&
    285 	     insn->header.execution_size == BRW_EXECUTE_1) {
    286 	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
    287 	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
    288 	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
    289 	 }
    290 	 else {
    291 	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
    292 	    insn->bits2.da1.src0_width = reg.width;
    293 	    insn->bits2.da1.src0_vert_stride = reg.vstride;
    294 	 }
    295       }
    296       else {
    297 	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
    298 	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
    299 	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
    300 	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
    301 
    302 	 /* This is an oddity of the fact we're using the same
    303 	  * descriptions for registers in align_16 as align_1:
    304 	  */
    305 	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
    306 	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
    307 	 else
    308 	    insn->bits2.da16.src0_vert_stride = reg.vstride;
    309       }
    310    }
    311 }
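
/* Illustrative sketch of the width-1 special case above: in align1 mode, a
 * width-1 source in an execution-size-1 instruction is always encoded as
 * the scalar region <0;1,0>, whatever strides the incoming brw_reg carries.
 * The register number is arbitrary.
 */
#if 0
static void example_scalar_src(struct brw_compile *p,
                               struct brw_instruction *insn)
{
   insn->header.execution_size = BRW_EXECUTE_1;

   brw_set_src0(p, insn, brw_vec1_grf(4, 0));
   /* src0 is now encoded with vstride 0, width 1, hstride 0 (<0;1,0>). */
}
#endif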
    312 
    313 
    314 void brw_set_src1(struct brw_compile *p,
    315 		  struct brw_instruction *insn,
    316 		  struct brw_reg reg)
    317 {
    318    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
    319 
    320    assert(reg.nr < 128);
    321 
    322    gen7_convert_mrf_to_grf(p, &reg);
    323 
    324    validate_reg(insn, reg);
    325 
    326    insn->bits1.da1.src1_reg_file = reg.file;
    327    insn->bits1.da1.src1_reg_type = reg.type;
    328    insn->bits3.da1.src1_abs = reg.abs;
    329    insn->bits3.da1.src1_negate = reg.negate;
    330 
    331    /* Only src1 can be immediate in two-argument instructions.
    332     */
    333    assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
    334 
    335    if (reg.file == BRW_IMMEDIATE_VALUE) {
    336       insn->bits3.ud = reg.dw1.ud;
    337    }
    338    else {
    339       /* This is a hardware restriction, which may or may not be lifted
    340        * in the future:
    341        */
    342       assert (reg.address_mode == BRW_ADDRESS_DIRECT);
    343       /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
    344 
    345       if (insn->header.access_mode == BRW_ALIGN_1) {
    346 	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
    347 	 insn->bits3.da1.src1_reg_nr = reg.nr;
    348       }
    349       else {
    350 	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
    351 	 insn->bits3.da16.src1_reg_nr = reg.nr;
    352       }
    353 
    354       if (insn->header.access_mode == BRW_ALIGN_1) {
    355 	 if (reg.width == BRW_WIDTH_1 &&
    356 	     insn->header.execution_size == BRW_EXECUTE_1) {
    357 	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
    358 	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
    359 	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
    360 	 }
    361 	 else {
    362 	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
    363 	    insn->bits3.da1.src1_width = reg.width;
    364 	    insn->bits3.da1.src1_vert_stride = reg.vstride;
    365 	 }
    366       }
    367       else {
    368 	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
    369 	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
    370 	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
    371 	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
    372 
    373 	 /* This is an oddity of the fact we're using the same
    374 	  * descriptions for registers in align_16 as align_1:
    375 	  */
    376 	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
    377 	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
    378 	 else
    379 	    insn->bits3.da16.src1_vert_stride = reg.vstride;
    380       }
    381    }
    382 }
    383 
    384 /**
    385  * Set the Message Descriptor and Extended Message Descriptor fields
    386  * for SEND messages.
    387  *
    388  * \note This zeroes out the Function Control bits, so it must be called
    389  *       \b before filling out any message-specific data.  Callers can
    390  *       choose not to fill in irrelevant bits; they will be zero.
    391  */
    392 static void
    393 brw_set_message_descriptor(struct brw_compile *p,
    394 			   struct brw_instruction *inst,
    395 			   enum brw_message_target sfid,
    396 			   unsigned msg_length,
    397 			   unsigned response_length,
    398 			   bool header_present,
    399 			   bool end_of_thread)
    400 {
    401    struct intel_context *intel = &p->brw->intel;
    402 
    403    brw_set_src1(p, inst, brw_imm_d(0));
    404 
    405    if (intel->gen >= 5) {
    406       inst->bits3.generic_gen5.header_present = header_present;
    407       inst->bits3.generic_gen5.response_length = response_length;
    408       inst->bits3.generic_gen5.msg_length = msg_length;
    409       inst->bits3.generic_gen5.end_of_thread = end_of_thread;
    410 
    411       if (intel->gen >= 6) {
    412 	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
    413 	 inst->header.destreg__conditionalmod = sfid;
    414       } else {
    415 	 /* Set Extended Message Descriptor (ex_desc) */
    416 	 inst->bits2.send_gen5.sfid = sfid;
    417 	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
    418       }
    419    } else {
    420       inst->bits3.generic.response_length = response_length;
    421       inst->bits3.generic.msg_length = msg_length;
    422       inst->bits3.generic.msg_target = sfid;
    423       inst->bits3.generic.end_of_thread = end_of_thread;
    424    }
    425 }
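
/* Usage sketch of the ordering the \note above requires: the descriptor is
 * written first (zeroing the function-control bits via
 * brw_set_src1(..., brw_imm_d(0))), and only afterwards are the
 * message-specific fields filled in -- the same pattern brw_set_math_message()
 * and the other helpers below follow.  The lengths here are placeholders.
 */
#if 0
static void example_message_setup(struct brw_compile *p,
                                  struct brw_instruction *insn)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1 /* mlen */, 0 /* rlen */,
                              true /* header */, false /* eot */);
   /* ...only now is it safe to set insn->bits3.urb_gen5.* and friends... */
}
#endif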
    426 
    427 static void brw_set_math_message( struct brw_compile *p,
    428 				  struct brw_instruction *insn,
    429 				  GLuint function,
    430 				  GLuint integer_type,
    431 				  bool low_precision,
    432 				  GLuint dataType )
    433 {
    434    struct brw_context *brw = p->brw;
    435    struct intel_context *intel = &brw->intel;
    436    unsigned msg_length;
    437    unsigned response_length;
    438 
    439    /* Infer message length from the function */
    440    switch (function) {
    441    case BRW_MATH_FUNCTION_POW:
    442    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
    443    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
    444    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
    445       msg_length = 2;
    446       break;
    447    default:
    448       msg_length = 1;
    449       break;
    450    }
    451 
    452    /* Infer response length from the function */
    453    switch (function) {
    454    case BRW_MATH_FUNCTION_SINCOS:
    455    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
    456       response_length = 2;
    457       break;
    458    default:
    459       response_length = 1;
    460       break;
    461    }
    462 
    463 
    464    brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
    465 			      msg_length, response_length, false, false);
    466    if (intel->gen == 5) {
    467       insn->bits3.math_gen5.function = function;
    468       insn->bits3.math_gen5.int_type = integer_type;
    469       insn->bits3.math_gen5.precision = low_precision;
    470       insn->bits3.math_gen5.saturate = insn->header.saturate;
    471       insn->bits3.math_gen5.data_type = dataType;
    472       insn->bits3.math_gen5.snapshot = 0;
    473    } else {
    474       insn->bits3.math.function = function;
    475       insn->bits3.math.int_type = integer_type;
    476       insn->bits3.math.precision = low_precision;
    477       insn->bits3.math.saturate = insn->header.saturate;
    478       insn->bits3.math.data_type = dataType;
    479    }
    480    insn->header.saturate = 0;
    481 }
    482 
    483 
    484 static void brw_set_ff_sync_message(struct brw_compile *p,
    485 				    struct brw_instruction *insn,
    486 				    bool allocate,
    487 				    GLuint response_length,
    488 				    bool end_of_thread)
    489 {
    490    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
    491 			      1, response_length, true, end_of_thread);
    492    insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
    493    insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
    494    insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
    495    insn->bits3.urb_gen5.allocate = allocate;
    496    insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
    497    insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
    498 }
    499 
    500 static void brw_set_urb_message( struct brw_compile *p,
    501 				 struct brw_instruction *insn,
    502 				 bool allocate,
    503 				 bool used,
    504 				 GLuint msg_length,
    505 				 GLuint response_length,
    506 				 bool end_of_thread,
    507 				 bool complete,
    508 				 GLuint offset,
    509 				 GLuint swizzle_control )
    510 {
    511    struct brw_context *brw = p->brw;
    512    struct intel_context *intel = &brw->intel;
    513 
    514    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
    515 			      msg_length, response_length, true, end_of_thread);
    516    if (intel->gen == 7) {
    517       insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
    518       insn->bits3.urb_gen7.offset = offset;
    519       assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
    520       insn->bits3.urb_gen7.swizzle_control = swizzle_control;
    521       /* per_slot_offset = 0 makes it ignore offsets in message header */
    522       insn->bits3.urb_gen7.per_slot_offset = 0;
    523       insn->bits3.urb_gen7.complete = complete;
    524    } else if (intel->gen >= 5) {
    525       insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
    526       insn->bits3.urb_gen5.offset = offset;
    527       insn->bits3.urb_gen5.swizzle_control = swizzle_control;
    528       insn->bits3.urb_gen5.allocate = allocate;
    529       insn->bits3.urb_gen5.used = used;	/* ? */
    530       insn->bits3.urb_gen5.complete = complete;
    531    } else {
    532       insn->bits3.urb.opcode = 0;	/* ? */
    533       insn->bits3.urb.offset = offset;
    534       insn->bits3.urb.swizzle_control = swizzle_control;
    535       insn->bits3.urb.allocate = allocate;
    536       insn->bits3.urb.used = used;	/* ? */
    537       insn->bits3.urb.complete = complete;
    538    }
    539 }
    540 
    541 void
    542 brw_set_dp_write_message(struct brw_compile *p,
    543 			 struct brw_instruction *insn,
    544 			 GLuint binding_table_index,
    545 			 GLuint msg_control,
    546 			 GLuint msg_type,
    547 			 GLuint msg_length,
    548 			 bool header_present,
    549 			 GLuint last_render_target,
    550 			 GLuint response_length,
    551 			 GLuint end_of_thread,
    552 			 GLuint send_commit_msg)
    553 {
    554    struct brw_context *brw = p->brw;
    555    struct intel_context *intel = &brw->intel;
    556    unsigned sfid;
    557 
    558    if (intel->gen >= 7) {
    559       /* Use the Render Cache for RT writes; otherwise use the Data Cache */
    560       if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
    561 	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
    562       else
    563 	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
    564    } else if (intel->gen == 6) {
    565       /* Use the render cache for all write messages. */
    566       sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
    567    } else {
    568       sfid = BRW_SFID_DATAPORT_WRITE;
    569    }
    570 
    571    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
    572 			      header_present, end_of_thread);
    573 
    574    if (intel->gen >= 7) {
    575       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
    576       insn->bits3.gen7_dp.msg_control = msg_control;
    577       insn->bits3.gen7_dp.last_render_target = last_render_target;
    578       insn->bits3.gen7_dp.msg_type = msg_type;
    579    } else if (intel->gen == 6) {
    580       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
    581       insn->bits3.gen6_dp.msg_control = msg_control;
    582       insn->bits3.gen6_dp.last_render_target = last_render_target;
    583       insn->bits3.gen6_dp.msg_type = msg_type;
    584       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
    585    } else if (intel->gen == 5) {
    586       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
    587       insn->bits3.dp_write_gen5.msg_control = msg_control;
    588       insn->bits3.dp_write_gen5.last_render_target = last_render_target;
    589       insn->bits3.dp_write_gen5.msg_type = msg_type;
    590       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
    591    } else {
    592       insn->bits3.dp_write.binding_table_index = binding_table_index;
    593       insn->bits3.dp_write.msg_control = msg_control;
    594       insn->bits3.dp_write.last_render_target = last_render_target;
    595       insn->bits3.dp_write.msg_type = msg_type;
    596       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
    597    }
    598 }
    599 
    600 void
    601 brw_set_dp_read_message(struct brw_compile *p,
    602 			struct brw_instruction *insn,
    603 			GLuint binding_table_index,
    604 			GLuint msg_control,
    605 			GLuint msg_type,
    606 			GLuint target_cache,
    607 			GLuint msg_length,
    608 			GLuint response_length)
    609 {
    610    struct brw_context *brw = p->brw;
    611    struct intel_context *intel = &brw->intel;
    612    unsigned sfid;
    613 
    614    if (intel->gen >= 7) {
    615       sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
    616    } else if (intel->gen == 6) {
    617       if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
    618 	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
    619       else
    620 	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
    621    } else {
    622       sfid = BRW_SFID_DATAPORT_READ;
    623    }
    624 
    625    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
    626 			      true, false);
    627 
    628    if (intel->gen >= 7) {
    629       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
    630       insn->bits3.gen7_dp.msg_control = msg_control;
    631       insn->bits3.gen7_dp.last_render_target = 0;
    632       insn->bits3.gen7_dp.msg_type = msg_type;
    633    } else if (intel->gen == 6) {
    634       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
    635       insn->bits3.gen6_dp.msg_control = msg_control;
    636       insn->bits3.gen6_dp.last_render_target = 0;
    637       insn->bits3.gen6_dp.msg_type = msg_type;
    638       insn->bits3.gen6_dp.send_commit_msg = 0;
    639    } else if (intel->gen == 5) {
    640       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
    641       insn->bits3.dp_read_gen5.msg_control = msg_control;
    642       insn->bits3.dp_read_gen5.msg_type = msg_type;
    643       insn->bits3.dp_read_gen5.target_cache = target_cache;
    644    } else if (intel->is_g4x) {
    645       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
    646       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
    647       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
    648       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
    649    } else {
    650       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
    651       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
    652       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
    653       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
    654    }
    655 }
    656 
    657 void
    658 brw_set_sampler_message(struct brw_compile *p,
    659                         struct brw_instruction *insn,
    660                         GLuint binding_table_index,
    661                         GLuint sampler,
    662                         GLuint msg_type,
    663                         GLuint response_length,
    664                         GLuint msg_length,
    665                         GLuint header_present,
    666                         GLuint simd_mode,
    667                         GLuint return_format)
    668 {
    669    struct brw_context *brw = p->brw;
    670    struct intel_context *intel = &brw->intel;
    671 
    672    brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
    673 			      response_length, header_present, false);
    674 
    675    if (intel->gen >= 7) {
    676       insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
    677       insn->bits3.sampler_gen7.sampler = sampler;
    678       insn->bits3.sampler_gen7.msg_type = msg_type;
    679       insn->bits3.sampler_gen7.simd_mode = simd_mode;
    680    } else if (intel->gen >= 5) {
    681       insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
    682       insn->bits3.sampler_gen5.sampler = sampler;
    683       insn->bits3.sampler_gen5.msg_type = msg_type;
    684       insn->bits3.sampler_gen5.simd_mode = simd_mode;
    685    } else if (intel->is_g4x) {
    686       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
    687       insn->bits3.sampler_g4x.sampler = sampler;
    688       insn->bits3.sampler_g4x.msg_type = msg_type;
    689    } else {
    690       insn->bits3.sampler.binding_table_index = binding_table_index;
    691       insn->bits3.sampler.sampler = sampler;
    692       insn->bits3.sampler.msg_type = msg_type;
    693       insn->bits3.sampler.return_format = return_format;
    694    }
    695 }
    696 
    697 
    698 #define next_insn brw_next_insn
    699 struct brw_instruction *
    700 brw_next_insn(struct brw_compile *p, GLuint opcode)
    701 {
    702    struct brw_instruction *insn;
    703 
    704    if (p->nr_insn + 1 > p->store_size) {
    705       if (0)
     706          printf("increasing the store size to %d\n", p->store_size << 1);
    707       p->store_size <<= 1;
    708       p->store = reralloc(p->mem_ctx, p->store,
    709                           struct brw_instruction, p->store_size);
    710       if (!p->store)
     711          assert(!"realloc eu store memory failed");
    712    }
    713 
    714    insn = &p->store[p->nr_insn++];
    715    memcpy(insn, p->current, sizeof(*insn));
    716 
    717    /* Reset this one-shot flag:
    718     */
    719 
    720    if (p->current->header.destreg__conditionalmod) {
    721       p->current->header.destreg__conditionalmod = 0;
    722       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
    723    }
    724 
    725    insn->header.opcode = opcode;
    726    return insn;
    727 }
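
/* Illustrative sketch: every new instruction starts out as a copy of
 * p->current, so state installed with the brw_set_*_control() helpers
 * (usually bracketed by brw_push_insn_state()/brw_pop_insn_state()) applies
 * to everything emitted while it is in effect, whereas a conditional
 * modifier is a one-shot that brw_next_insn() clears again above.
 */
#if 0
static void example_insn_state(struct brw_compile *p,
                               struct brw_reg dst, struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, dst, src);       /* inherits MASK_DISABLE from p->current */
   brw_pop_insn_state(p);

   brw_MOV(p, dst, src);       /* back to the previous default state */
}
#endif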
    728 
    729 static struct brw_instruction *brw_alu1( struct brw_compile *p,
    730 					 GLuint opcode,
    731 					 struct brw_reg dest,
    732 					 struct brw_reg src )
    733 {
    734    struct brw_instruction *insn = next_insn(p, opcode);
    735    brw_set_dest(p, insn, dest);
    736    brw_set_src0(p, insn, src);
    737    return insn;
    738 }
    739 
    740 static struct brw_instruction *brw_alu2(struct brw_compile *p,
    741 					GLuint opcode,
    742 					struct brw_reg dest,
    743 					struct brw_reg src0,
    744 					struct brw_reg src1 )
    745 {
    746    struct brw_instruction *insn = next_insn(p, opcode);
    747    brw_set_dest(p, insn, dest);
    748    brw_set_src0(p, insn, src0);
    749    brw_set_src1(p, insn, src1);
    750    return insn;
    751 }
    752 
    753 static int
    754 get_3src_subreg_nr(struct brw_reg reg)
    755 {
    756    if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
    757       assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
    758       return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
    759    } else {
    760       return reg.subnr / 4;
    761    }
    762 }
    763 
    764 static struct brw_instruction *brw_alu3(struct brw_compile *p,
    765 					GLuint opcode,
    766 					struct brw_reg dest,
    767 					struct brw_reg src0,
    768 					struct brw_reg src1,
    769 					struct brw_reg src2)
    770 {
    771    struct brw_instruction *insn = next_insn(p, opcode);
    772 
    773    gen7_convert_mrf_to_grf(p, &dest);
    774 
    775    assert(insn->header.access_mode == BRW_ALIGN_16);
    776 
    777    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
    778 	  dest.file == BRW_MESSAGE_REGISTER_FILE);
    779    assert(dest.nr < 128);
    780    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
     781    assert(dest.type == BRW_REGISTER_TYPE_F);
    782    insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
    783    insn->bits1.da3src.dest_reg_nr = dest.nr;
    784    insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
    785    insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
    786    guess_execution_size(p, insn, dest);
    787 
    788    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
    789    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
    790    assert(src0.nr < 128);
    791    assert(src0.type == BRW_REGISTER_TYPE_F);
    792    insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
    793    insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
    794    insn->bits2.da3src.src0_reg_nr = src0.nr;
    795    insn->bits1.da3src.src0_abs = src0.abs;
    796    insn->bits1.da3src.src0_negate = src0.negate;
    797    insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
    798 
    799    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
    800    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
    801    assert(src1.nr < 128);
    802    assert(src1.type == BRW_REGISTER_TYPE_F);
    803    insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
    804    insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
    805    insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
    806    insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
    807    insn->bits3.da3src.src1_reg_nr = src1.nr;
    808    insn->bits1.da3src.src1_abs = src1.abs;
    809    insn->bits1.da3src.src1_negate = src1.negate;
    810 
    811    assert(src2.file == BRW_GENERAL_REGISTER_FILE);
    812    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
    813    assert(src2.nr < 128);
    814    assert(src2.type == BRW_REGISTER_TYPE_F);
    815    insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
    816    insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
    817    insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
    818    insn->bits3.da3src.src2_reg_nr = src2.nr;
    819    insn->bits1.da3src.src2_abs = src2.abs;
    820    insn->bits1.da3src.src2_negate = src2.negate;
    821 
    822    return insn;
    823 }
    824 
    825 
    826 /***********************************************************************
    827  * Convenience routines.
    828  */
    829 #define ALU1(OP)					\
    830 struct brw_instruction *brw_##OP(struct brw_compile *p,	\
    831 	      struct brw_reg dest,			\
    832 	      struct brw_reg src0)   			\
    833 {							\
    834    return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
    835 }
    836 
    837 #define ALU2(OP)					\
    838 struct brw_instruction *brw_##OP(struct brw_compile *p,	\
    839 	      struct brw_reg dest,			\
    840 	      struct brw_reg src0,			\
    841 	      struct brw_reg src1)   			\
    842 {							\
    843    return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
    844 }
    845 
    846 #define ALU3(OP)					\
    847 struct brw_instruction *brw_##OP(struct brw_compile *p,	\
    848 	      struct brw_reg dest,			\
    849 	      struct brw_reg src0,			\
    850 	      struct brw_reg src1,			\
    851 	      struct brw_reg src2)   			\
    852 {							\
    853    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
    854 }
    855 
    856 /* Rounding operations (other than RNDD) require two instructions - the first
    857  * stores a rounded value (possibly the wrong way) in the dest register, but
    858  * also sets a per-channel "increment bit" in the flag register.  A predicated
    859  * add of 1.0 fixes dest to contain the desired result.
    860  *
    861  * Sandybridge and later appear to round correctly without an ADD.
    862  */
    863 #define ROUND(OP)							      \
    864 void brw_##OP(struct brw_compile *p,					      \
    865 	      struct brw_reg dest,					      \
    866 	      struct brw_reg src)					      \
    867 {									      \
    868    struct brw_instruction *rnd, *add;					      \
    869    rnd = next_insn(p, BRW_OPCODE_##OP);					      \
    870    brw_set_dest(p, rnd, dest);						      \
    871    brw_set_src0(p, rnd, src);						      \
    872 									      \
    873    if (p->brw->intel.gen < 6) {						      \
    874       /* turn on round-increments */					      \
    875       rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
    876       add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
    877       add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
    878    }									      \
    879 }
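
/* Illustrative sketch of what ROUND() expands to, e.g. for brw_RNDE() on
 * pre-gen6 hardware: a RNDE that stores the (possibly mis-rounded) result
 * and sets the per-channel round-increment bit, followed by a predicated
 * ADD of 1.0f that fixes up the flagged channels.  On gen6+ only the RNDE
 * is emitted.
 */
#if 0
static void example_round_even(struct brw_compile *p,
                               struct brw_reg dst, struct brw_reg src)
{
   brw_RNDE(p, dst, src);
   /* On gen < 6 this emits two instructions:
    *    rnde (cmod R)  dst, src
    *    (+f0) add      dst, dst, 1.0F
    */
}
#endif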
    880 
    881 
    882 ALU1(MOV)
    883 ALU2(SEL)
    884 ALU1(NOT)
    885 ALU2(AND)
    886 ALU2(OR)
    887 ALU2(XOR)
    888 ALU2(SHR)
    889 ALU2(SHL)
    890 ALU2(RSR)
    891 ALU2(RSL)
    892 ALU2(ASR)
    893 ALU1(FRC)
    894 ALU1(RNDD)
    895 ALU2(MAC)
    896 ALU2(MACH)
    897 ALU1(LZD)
    898 ALU2(DP4)
    899 ALU2(DPH)
    900 ALU2(DP3)
    901 ALU2(DP2)
    902 ALU2(LINE)
    903 ALU2(PLN)
    904 ALU3(MAD)
    905 
    906 ROUND(RNDZ)
    907 ROUND(RNDE)
    908 
    909 
    910 struct brw_instruction *brw_ADD(struct brw_compile *p,
    911 				struct brw_reg dest,
    912 				struct brw_reg src0,
    913 				struct brw_reg src1)
    914 {
    915    /* 6.2.2: add */
    916    if (src0.type == BRW_REGISTER_TYPE_F ||
    917        (src0.file == BRW_IMMEDIATE_VALUE &&
    918 	src0.type == BRW_REGISTER_TYPE_VF)) {
    919       assert(src1.type != BRW_REGISTER_TYPE_UD);
    920       assert(src1.type != BRW_REGISTER_TYPE_D);
    921    }
    922 
    923    if (src1.type == BRW_REGISTER_TYPE_F ||
    924        (src1.file == BRW_IMMEDIATE_VALUE &&
    925 	src1.type == BRW_REGISTER_TYPE_VF)) {
    926       assert(src0.type != BRW_REGISTER_TYPE_UD);
    927       assert(src0.type != BRW_REGISTER_TYPE_D);
    928    }
    929 
    930    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
    931 }
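
/* Illustrative sketch of the type rule asserted above: when either source is
 * float (or a VF immediate), the other source must not be D/UD.  A float GRF
 * plus a float immediate is therefore fine, while float plus UD would trip
 * the asserts.  Register numbers are arbitrary.
 */
#if 0
static void example_add(struct brw_compile *p)
{
   struct brw_reg dst  = retype(brw_vec8_grf(6, 0), BRW_REGISTER_TYPE_F);
   struct brw_reg srcf = retype(brw_vec8_grf(7, 0), BRW_REGISTER_TYPE_F);

   brw_ADD(p, dst, srcf, brw_imm_f(1.0f));   /* F + F immediate: OK */
   /* brw_ADD(p, dst, srcf, brw_imm_ud(1)) would hit the asserts above. */
}
#endif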
    932 
    933 struct brw_instruction *brw_AVG(struct brw_compile *p,
    934                                 struct brw_reg dest,
    935                                 struct brw_reg src0,
    936                                 struct brw_reg src1)
    937 {
    938    assert(dest.type == src0.type);
    939    assert(src0.type == src1.type);
    940    switch (src0.type) {
    941    case BRW_REGISTER_TYPE_B:
    942    case BRW_REGISTER_TYPE_UB:
    943    case BRW_REGISTER_TYPE_W:
    944    case BRW_REGISTER_TYPE_UW:
    945    case BRW_REGISTER_TYPE_D:
    946    case BRW_REGISTER_TYPE_UD:
    947       break;
    948    default:
    949       assert(!"Bad type for brw_AVG");
    950    }
    951 
    952    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
    953 }
    954 
    955 struct brw_instruction *brw_MUL(struct brw_compile *p,
    956 				struct brw_reg dest,
    957 				struct brw_reg src0,
    958 				struct brw_reg src1)
    959 {
    960    /* 6.32.38: mul */
    961    if (src0.type == BRW_REGISTER_TYPE_D ||
    962        src0.type == BRW_REGISTER_TYPE_UD ||
    963        src1.type == BRW_REGISTER_TYPE_D ||
    964        src1.type == BRW_REGISTER_TYPE_UD) {
    965       assert(dest.type != BRW_REGISTER_TYPE_F);
    966    }
    967 
    968    if (src0.type == BRW_REGISTER_TYPE_F ||
    969        (src0.file == BRW_IMMEDIATE_VALUE &&
    970 	src0.type == BRW_REGISTER_TYPE_VF)) {
    971       assert(src1.type != BRW_REGISTER_TYPE_UD);
    972       assert(src1.type != BRW_REGISTER_TYPE_D);
    973    }
    974 
    975    if (src1.type == BRW_REGISTER_TYPE_F ||
    976        (src1.file == BRW_IMMEDIATE_VALUE &&
    977 	src1.type == BRW_REGISTER_TYPE_VF)) {
    978       assert(src0.type != BRW_REGISTER_TYPE_UD);
    979       assert(src0.type != BRW_REGISTER_TYPE_D);
    980    }
    981 
    982    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
    983 	  src0.nr != BRW_ARF_ACCUMULATOR);
    984    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
    985 	  src1.nr != BRW_ARF_ACCUMULATOR);
    986 
    987    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
    988 }
    989 
    990 
    991 void brw_NOP(struct brw_compile *p)
    992 {
    993    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
    994    brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    995    brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    996    brw_set_src1(p, insn, brw_imm_ud(0x0));
    997 }
    998 
    999 
   1000 
   1001 
   1002 
   1003 /***********************************************************************
   1004  * Comparisons, if/else/endif
   1005  */
   1006 
   1007 struct brw_instruction *brw_JMPI(struct brw_compile *p,
   1008                                  struct brw_reg dest,
   1009                                  struct brw_reg src0,
   1010                                  struct brw_reg src1)
   1011 {
   1012    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
   1013 
   1014    insn->header.execution_size = 1;
   1015    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1016    insn->header.mask_control = BRW_MASK_DISABLE;
   1017 
   1018    p->current->header.predicate_control = BRW_PREDICATE_NONE;
   1019 
   1020    return insn;
   1021 }
   1022 
   1023 static void
   1024 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
   1025 {
   1026    p->if_stack[p->if_stack_depth] = inst - p->store;
   1027 
   1028    p->if_stack_depth++;
   1029    if (p->if_stack_array_size <= p->if_stack_depth) {
   1030       p->if_stack_array_size *= 2;
   1031       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
   1032 			     p->if_stack_array_size);
   1033    }
   1034 }
   1035 
   1036 static struct brw_instruction *
   1037 pop_if_stack(struct brw_compile *p)
   1038 {
   1039    p->if_stack_depth--;
   1040    return &p->store[p->if_stack[p->if_stack_depth]];
   1041 }
   1042 
   1043 static void
   1044 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
   1045 {
    1046    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
   1047       p->loop_stack_array_size *= 2;
   1048       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
   1049 			       p->loop_stack_array_size);
   1050       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
   1051 				     p->loop_stack_array_size);
   1052    }
   1053 
   1054    p->loop_stack[p->loop_stack_depth] = inst - p->store;
   1055    p->loop_stack_depth++;
   1056    p->if_depth_in_loop[p->loop_stack_depth] = 0;
   1057 }
   1058 
   1059 static struct brw_instruction *
   1060 get_inner_do_insn(struct brw_compile *p)
   1061 {
   1062    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
   1063 }
   1064 
   1065 /* EU takes the value from the flag register and pushes it onto some
   1066  * sort of a stack (presumably merging with any flag value already on
   1067  * the stack).  Within an if block, the flags at the top of the stack
    1068  * control execution on each channel of the unit, e.g. on each of the
   1069  * 16 pixel values in our wm programs.
   1070  *
   1071  * When the matching 'else' instruction is reached (presumably by
   1072  * countdown of the instruction count patched in by our ELSE/ENDIF
    1073  * functions), the relevant flags are inverted.
   1074  *
   1075  * When the matching 'endif' instruction is reached, the flags are
   1076  * popped off.  If the stack is now empty, normal execution resumes.
   1077  */
   1078 struct brw_instruction *
   1079 brw_IF(struct brw_compile *p, GLuint execute_size)
   1080 {
   1081    struct intel_context *intel = &p->brw->intel;
   1082    struct brw_instruction *insn;
   1083 
   1084    insn = next_insn(p, BRW_OPCODE_IF);
   1085 
   1086    /* Override the defaults for this instruction:
   1087     */
   1088    if (intel->gen < 6) {
   1089       brw_set_dest(p, insn, brw_ip_reg());
   1090       brw_set_src0(p, insn, brw_ip_reg());
   1091       brw_set_src1(p, insn, brw_imm_d(0x0));
   1092    } else if (intel->gen == 6) {
   1093       brw_set_dest(p, insn, brw_imm_w(0));
   1094       insn->bits1.branch_gen6.jump_count = 0;
   1095       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1096       brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1097    } else {
   1098       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1099       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   1100       brw_set_src1(p, insn, brw_imm_ud(0));
   1101       insn->bits3.break_cont.jip = 0;
   1102       insn->bits3.break_cont.uip = 0;
   1103    }
   1104 
   1105    insn->header.execution_size = execute_size;
   1106    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1107    insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   1108    insn->header.mask_control = BRW_MASK_ENABLE;
   1109    if (!p->single_program_flow)
   1110       insn->header.thread_control = BRW_THREAD_SWITCH;
   1111 
   1112    p->current->header.predicate_control = BRW_PREDICATE_NONE;
   1113 
   1114    push_if_stack(p, insn);
   1115    p->if_depth_in_loop[p->loop_stack_depth]++;
   1116    return insn;
   1117 }
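
/* Usage sketch tying the flag-stack description above together (not driver
 * code): a comparison sets f0, brw_IF() consumes the predicate, brw_ELSE()
 * inverts it for the remaining channels, and brw_ENDIF() pops the stack and
 * patches the jump targets.  brw_CMP(), brw_ELSE() and brw_ENDIF() are
 * assumed here as declared in brw_eu.h.
 */
#if 0
static void example_if_else(struct brw_compile *p, struct brw_reg dst,
                            struct brw_reg a, struct brw_reg b)
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, a, b);
   brw_IF(p, BRW_EXECUTE_8);
      brw_MOV(p, dst, a);      /* channels where a >= b */
   brw_ELSE(p);
      brw_MOV(p, dst, b);      /* remaining channels    */
   brw_ENDIF(p);
}
#endif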
   1118 
   1119 /* This function is only used for gen6-style IF instructions with an
   1120  * embedded comparison (conditional modifier).  It is not used on gen7.
   1121  */
   1122 struct brw_instruction *
   1123 gen6_IF(struct brw_compile *p, uint32_t conditional,
   1124 	struct brw_reg src0, struct brw_reg src1)
   1125 {
   1126    struct brw_instruction *insn;
   1127 
   1128    insn = next_insn(p, BRW_OPCODE_IF);
   1129 
   1130    brw_set_dest(p, insn, brw_imm_w(0));
   1131    if (p->compressed) {
   1132       insn->header.execution_size = BRW_EXECUTE_16;
   1133    } else {
   1134       insn->header.execution_size = BRW_EXECUTE_8;
   1135    }
   1136    insn->bits1.branch_gen6.jump_count = 0;
   1137    brw_set_src0(p, insn, src0);
   1138    brw_set_src1(p, insn, src1);
   1139 
   1140    assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   1141    assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   1142    insn->header.destreg__conditionalmod = conditional;
   1143 
   1144    if (!p->single_program_flow)
   1145       insn->header.thread_control = BRW_THREAD_SWITCH;
   1146 
   1147    push_if_stack(p, insn);
   1148    return insn;
   1149 }
   1150 
   1151 /**
   1152  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
   1153  */
   1154 static void
   1155 convert_IF_ELSE_to_ADD(struct brw_compile *p,
   1156 		       struct brw_instruction *if_inst,
   1157 		       struct brw_instruction *else_inst)
   1158 {
   1159    /* The next instruction (where the ENDIF would be, if it existed) */
   1160    struct brw_instruction *next_inst = &p->store[p->nr_insn];
   1161 
   1162    assert(p->single_program_flow);
   1163    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   1164    assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   1165    assert(if_inst->header.execution_size == BRW_EXECUTE_1);
   1166 
   1167    /* Convert IF to an ADD instruction that moves the instruction pointer
   1168     * to the first instruction of the ELSE block.  If there is no ELSE
   1169     * block, point to where ENDIF would be.  Reverse the predicate.
   1170     *
   1171     * There's no need to execute an ENDIF since we don't need to do any
   1172     * stack operations, and if we're currently executing, we just want to
   1173     * continue normally.
   1174     */
   1175    if_inst->header.opcode = BRW_OPCODE_ADD;
   1176    if_inst->header.predicate_inverse = 1;
   1177 
   1178    if (else_inst != NULL) {
   1179       /* Convert ELSE to an ADD instruction that points where the ENDIF
   1180        * would be.
   1181        */
   1182       else_inst->header.opcode = BRW_OPCODE_ADD;
   1183 
   1184       if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
   1185       else_inst->bits3.ud = (next_inst - else_inst) * 16;
   1186    } else {
   1187       if_inst->bits3.ud = (next_inst - if_inst) * 16;
   1188    }
   1189 }
   1190 
   1191 /**
   1192  * Patch IF and ELSE instructions with appropriate jump targets.
   1193  */
   1194 static void
   1195 patch_IF_ELSE(struct brw_compile *p,
   1196 	      struct brw_instruction *if_inst,
   1197 	      struct brw_instruction *else_inst,
   1198 	      struct brw_instruction *endif_inst)
   1199 {
   1200    struct intel_context *intel = &p->brw->intel;
   1201 
   1202    /* We shouldn't be patching IF and ELSE instructions in single program flow
   1203     * mode when gen < 6, because in single program flow mode on those
   1204     * platforms, we convert flow control instructions to conditional ADDs that
   1205     * operate on IP (see brw_ENDIF).
   1206     *
   1207     * However, on Gen6, writing to IP doesn't work in single program flow mode
   1208     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
   1209     * not be updated by non-flow control instructions.").  And on later
   1210     * platforms, there is no significant benefit to converting control flow
   1211     * instructions to conditional ADDs.  So we do patch IF and ELSE
   1212     * instructions in single program flow mode on those platforms.
   1213     */
   1214    if (intel->gen < 6)
   1215       assert(!p->single_program_flow);
   1216 
   1217    assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   1218    assert(endif_inst != NULL);
   1219    assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   1220 
   1221    unsigned br = 1;
    1222    /* The jump count is measured in 64-bit data chunks, so one 128-bit
    1223     * instruction counts as 2 chunks.
    1224     */
   1225    if (intel->gen >= 5)
   1226       br = 2;
   1227 
   1228    assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   1229    endif_inst->header.execution_size = if_inst->header.execution_size;
   1230 
   1231    if (else_inst == NULL) {
   1232       /* Patch IF -> ENDIF */
   1233       if (intel->gen < 6) {
   1234 	 /* Turn it into an IFF, which means no mask stack operations for
   1235 	  * all-false and jumping past the ENDIF.
   1236 	  */
   1237 	 if_inst->header.opcode = BRW_OPCODE_IFF;
   1238 	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
   1239 	 if_inst->bits3.if_else.pop_count = 0;
   1240 	 if_inst->bits3.if_else.pad0 = 0;
   1241       } else if (intel->gen == 6) {
   1242 	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
   1243 	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
   1244       } else {
   1245 	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
   1246 	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
   1247       }
   1248    } else {
   1249       else_inst->header.execution_size = if_inst->header.execution_size;
   1250 
   1251       /* Patch IF -> ELSE */
   1252       if (intel->gen < 6) {
   1253 	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
   1254 	 if_inst->bits3.if_else.pop_count = 0;
   1255 	 if_inst->bits3.if_else.pad0 = 0;
   1256       } else if (intel->gen == 6) {
   1257 	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
   1258       }
   1259 
   1260       /* Patch ELSE -> ENDIF */
   1261       if (intel->gen < 6) {
   1262 	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
   1263 	  * matching ENDIF.
   1264 	  */
   1265 	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
   1266 	 else_inst->bits3.if_else.pop_count = 1;
   1267 	 else_inst->bits3.if_else.pad0 = 0;
   1268       } else if (intel->gen == 6) {
   1269 	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
   1270 	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
   1271       } else {
   1272 	 /* The IF instruction's JIP should point just past the ELSE */
   1273 	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
   1274 	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
   1275 	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
   1276 	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
   1277       }
   1278    }
   1279 }
   1280 
   1281 void
   1282 brw_ELSE(struct brw_compile *p)
   1283 {
   1284    struct intel_context *intel = &p->brw->intel;
   1285    struct brw_instruction *insn;
   1286 
   1287    insn = next_insn(p, BRW_OPCODE_ELSE);
   1288 
   1289    if (intel->gen < 6) {
   1290       brw_set_dest(p, insn, brw_ip_reg());
   1291       brw_set_src0(p, insn, brw_ip_reg());
   1292       brw_set_src1(p, insn, brw_imm_d(0x0));
   1293    } else if (intel->gen == 6) {
   1294       brw_set_dest(p, insn, brw_imm_w(0));
   1295       insn->bits1.branch_gen6.jump_count = 0;
   1296       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1297       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1298    } else {
   1299       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1300       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1301       brw_set_src1(p, insn, brw_imm_ud(0));
   1302       insn->bits3.break_cont.jip = 0;
   1303       insn->bits3.break_cont.uip = 0;
   1304    }
   1305 
   1306    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1307    insn->header.mask_control = BRW_MASK_ENABLE;
   1308    if (!p->single_program_flow)
   1309       insn->header.thread_control = BRW_THREAD_SWITCH;
   1310 
   1311    push_if_stack(p, insn);
   1312 }
   1313 
   1314 void
   1315 brw_ENDIF(struct brw_compile *p)
   1316 {
   1317    struct intel_context *intel = &p->brw->intel;
   1318    struct brw_instruction *insn = NULL;
   1319    struct brw_instruction *else_inst = NULL;
   1320    struct brw_instruction *if_inst = NULL;
   1321    struct brw_instruction *tmp;
   1322    bool emit_endif = true;
   1323 
   1324    /* In single program flow mode, we can express IF and ELSE instructions
   1325     * equivalently as ADD instructions that operate on IP.  On platforms prior
   1326     * to Gen6, flow control instructions cause an implied thread switch, so
   1327     * this is a significant savings.
   1328     *
   1329     * However, on Gen6, writing to IP doesn't work in single program flow mode
   1330     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
   1331     * not be updated by non-flow control instructions.").  And on later
   1332     * platforms, there is no significant benefit to converting control flow
   1333     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
   1334     * Gen5.
   1335     */
   1336    if (intel->gen < 6 && p->single_program_flow)
   1337       emit_endif = false;
   1338 
   1339    /*
    1340     * A single next_insn() may change the base address of the instruction
    1341     * store memory (p->store), so call it first, before turning stored
    1342     * indices back into instruction pointers.
   1343     */
   1344    if (emit_endif)
   1345       insn = next_insn(p, BRW_OPCODE_ENDIF);
   1346 
   1347    /* Pop the IF and (optional) ELSE instructions from the stack */
   1348    p->if_depth_in_loop[p->loop_stack_depth]--;
   1349    tmp = pop_if_stack(p);
   1350    if (tmp->header.opcode == BRW_OPCODE_ELSE) {
   1351       else_inst = tmp;
   1352       tmp = pop_if_stack(p);
   1353    }
   1354    if_inst = tmp;
   1355 
   1356    if (!emit_endif) {
   1357       /* ENDIF is useless; don't bother emitting it. */
   1358       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
   1359       return;
   1360    }
   1361 
   1362    if (intel->gen < 6) {
   1363       brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   1364       brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   1365       brw_set_src1(p, insn, brw_imm_d(0x0));
   1366    } else if (intel->gen == 6) {
   1367       brw_set_dest(p, insn, brw_imm_w(0));
   1368       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1369       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1370    } else {
   1371       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1372       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1373       brw_set_src1(p, insn, brw_imm_ud(0));
   1374    }
   1375 
   1376    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1377    insn->header.mask_control = BRW_MASK_ENABLE;
   1378    insn->header.thread_control = BRW_THREAD_SWITCH;
   1379 
   1380    /* Also pop item off the stack in the endif instruction: */
   1381    if (intel->gen < 6) {
   1382       insn->bits3.if_else.jump_count = 0;
   1383       insn->bits3.if_else.pop_count = 1;
   1384       insn->bits3.if_else.pad0 = 0;
   1385    } else if (intel->gen == 6) {
   1386       insn->bits1.branch_gen6.jump_count = 2;
   1387    } else {
   1388       insn->bits3.break_cont.jip = 2;
   1389    }
   1390    patch_IF_ELSE(p, if_inst, else_inst, insn);
   1391 }
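         /* Usage sketch (illustrative only): the IF/ELSE/ENDIF emitters are used as
          * a bracketed sequence, with brw_IF()/brw_ELSE() assumed from earlier in
          * this file.  brw_ENDIF() pops the matching IF (and optional ELSE) off the
          * if-stack and patches their jump targets.
          *
          *    brw_IF(p, BRW_EXECUTE_8);
          *       ...emit the "then" block...
          *    brw_ELSE(p);
          *       ...emit the "else" block...
          *    brw_ENDIF(p);
          */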
   1392 
   1393 struct brw_instruction *brw_BREAK(struct brw_compile *p)
   1394 {
   1395    struct intel_context *intel = &p->brw->intel;
   1396    struct brw_instruction *insn;
   1397 
   1398    insn = next_insn(p, BRW_OPCODE_BREAK);
   1399    if (intel->gen >= 6) {
   1400       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1401       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1402       brw_set_src1(p, insn, brw_imm_d(0x0));
   1403    } else {
   1404       brw_set_dest(p, insn, brw_ip_reg());
   1405       brw_set_src0(p, insn, brw_ip_reg());
   1406       brw_set_src1(p, insn, brw_imm_d(0x0));
   1407       insn->bits3.if_else.pad0 = 0;
   1408       insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   1409    }
   1410    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1411    insn->header.execution_size = BRW_EXECUTE_8;
   1412 
   1413    return insn;
   1414 }
   1415 
   1416 struct brw_instruction *gen6_CONT(struct brw_compile *p)
   1417 {
   1418    struct brw_instruction *insn;
   1419 
   1420    insn = next_insn(p, BRW_OPCODE_CONTINUE);
    1423    brw_set_dest(p, insn, brw_ip_reg());
    1424    brw_set_src0(p, insn, brw_ip_reg());
   1425    brw_set_src1(p, insn, brw_imm_d(0x0));
   1426 
   1427    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1428    insn->header.execution_size = BRW_EXECUTE_8;
   1429    return insn;
   1430 }
   1431 
   1432 struct brw_instruction *brw_CONT(struct brw_compile *p)
   1433 {
   1434    struct brw_instruction *insn;
   1435    insn = next_insn(p, BRW_OPCODE_CONTINUE);
   1436    brw_set_dest(p, insn, brw_ip_reg());
   1437    brw_set_src0(p, insn, brw_ip_reg());
   1438    brw_set_src1(p, insn, brw_imm_d(0x0));
   1439    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1440    insn->header.execution_size = BRW_EXECUTE_8;
   1441    /* insn->header.mask_control = BRW_MASK_DISABLE; */
   1442    insn->bits3.if_else.pad0 = 0;
   1443    insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   1444    return insn;
   1445 }
   1446 
   1447 /* DO/WHILE loop:
   1448  *
    1449  * The DO/WHILE is just an unterminated loop: BREAK or CONTINUE provide
    1450  * the control flow within the loop.  It can be emitted in a few ways,
    1451  * depending on the generation (see the usage sketch after brw_WHILE()).
   1452  *
   1453  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
   1454  * jip and no DO instruction.
   1455  *
   1456  * For non-uniform control flow pre-gen6, there's a DO instruction to
   1457  * push the mask, and a WHILE to jump back, and BREAK to get out and
   1458  * pop the mask.
   1459  *
   1460  * For gen6, there's no more mask stack, so no need for DO.  WHILE
   1461  * just points back to the first instruction of the loop.
   1462  */
   1463 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
   1464 {
   1465    struct intel_context *intel = &p->brw->intel;
   1466 
   1467    if (intel->gen >= 6 || p->single_program_flow) {
   1468       push_loop_stack(p, &p->store[p->nr_insn]);
   1469       return &p->store[p->nr_insn];
   1470    } else {
   1471       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
   1472 
   1473       push_loop_stack(p, insn);
   1474 
   1475       /* Override the defaults for this instruction:
   1476        */
   1477       brw_set_dest(p, insn, brw_null_reg());
   1478       brw_set_src0(p, insn, brw_null_reg());
   1479       brw_set_src1(p, insn, brw_null_reg());
   1480 
   1481       insn->header.compression_control = BRW_COMPRESSION_NONE;
   1482       insn->header.execution_size = execute_size;
   1483       insn->header.predicate_control = BRW_PREDICATE_NONE;
   1484       /* insn->header.mask_control = BRW_MASK_ENABLE; */
   1485       /* insn->header.mask_control = BRW_MASK_DISABLE; */
   1486 
   1487       return insn;
   1488    }
   1489 }
   1490 
   1491 /**
   1492  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
   1493  * instruction here.
   1494  *
   1495  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
   1496  * nesting, since it can always just point to the end of the block/current loop.
   1497  */
   1498 static void
   1499 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
   1500 {
   1501    struct intel_context *intel = &p->brw->intel;
   1502    struct brw_instruction *do_inst = get_inner_do_insn(p);
   1503    struct brw_instruction *inst;
   1504    int br = (intel->gen == 5) ? 2 : 1;
   1505 
   1506    for (inst = while_inst - 1; inst != do_inst; inst--) {
   1507       /* If the jump count is != 0, that means that this instruction has already
   1508        * been patched because it's part of a loop inside of the one we're
   1509        * patching.
   1510        */
   1511       if (inst->header.opcode == BRW_OPCODE_BREAK &&
   1512 	  inst->bits3.if_else.jump_count == 0) {
   1513 	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
   1514       } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
   1515 		 inst->bits3.if_else.jump_count == 0) {
   1516 	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
   1517       }
   1518    }
   1519 }
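         /* Worked example (illustrative): on gen5, br == 2.  A BREAK sitting three
          * instructions before the WHILE (while_inst - inst == 3) is patched to
          * jump_count = 2 * (3 + 1) = 8, i.e. just past the WHILE, while a CONTINUE
          * in the same spot gets jump_count = 2 * 3 = 6, i.e. the WHILE itself.
          */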
   1520 
   1521 struct brw_instruction *brw_WHILE(struct brw_compile *p)
   1522 {
   1523    struct intel_context *intel = &p->brw->intel;
   1524    struct brw_instruction *insn, *do_insn;
   1525    GLuint br = 1;
   1526 
   1527    if (intel->gen >= 5)
   1528       br = 2;
   1529 
   1530    if (intel->gen >= 7) {
   1531       insn = next_insn(p, BRW_OPCODE_WHILE);
   1532       do_insn = get_inner_do_insn(p);
   1533 
   1534       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1535       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1536       brw_set_src1(p, insn, brw_imm_ud(0));
   1537       insn->bits3.break_cont.jip = br * (do_insn - insn);
   1538 
   1539       insn->header.execution_size = BRW_EXECUTE_8;
   1540    } else if (intel->gen == 6) {
   1541       insn = next_insn(p, BRW_OPCODE_WHILE);
   1542       do_insn = get_inner_do_insn(p);
   1543 
   1544       brw_set_dest(p, insn, brw_imm_w(0));
   1545       insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
   1546       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1547       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   1548 
   1549       insn->header.execution_size = BRW_EXECUTE_8;
   1550    } else {
   1551       if (p->single_program_flow) {
   1552 	 insn = next_insn(p, BRW_OPCODE_ADD);
   1553          do_insn = get_inner_do_insn(p);
   1554 
   1555 	 brw_set_dest(p, insn, brw_ip_reg());
   1556 	 brw_set_src0(p, insn, brw_ip_reg());
   1557 	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
   1558 	 insn->header.execution_size = BRW_EXECUTE_1;
   1559       } else {
   1560 	 insn = next_insn(p, BRW_OPCODE_WHILE);
   1561          do_insn = get_inner_do_insn(p);
   1562 
   1563 	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
   1564 
   1565 	 brw_set_dest(p, insn, brw_ip_reg());
   1566 	 brw_set_src0(p, insn, brw_ip_reg());
   1567 	 brw_set_src1(p, insn, brw_imm_d(0));
   1568 
   1569 	 insn->header.execution_size = do_insn->header.execution_size;
   1570 	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
   1571 	 insn->bits3.if_else.pop_count = 0;
   1572 	 insn->bits3.if_else.pad0 = 0;
   1573 
   1574 	 brw_patch_break_cont(p, insn);
   1575       }
   1576    }
   1577    insn->header.compression_control = BRW_COMPRESSION_NONE;
   1578    p->current->header.predicate_control = BRW_PREDICATE_NONE;
   1579 
   1580    p->loop_stack_depth--;
   1581 
   1582    return insn;
   1583 }
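         /* Usage sketch (illustrative only): how a code generator might emit a loop
          * with this API.  The loop body and the surrounding compile state are
          * assumed to be set up by the caller.
          *
          *    brw_DO(p, BRW_EXECUTE_8);
          *       ...emit the loop body, using brw_CONT(p) to jump back to the top
          *       of the loop or brw_BREAK(p) to leave it...
          *    brw_WHILE(p);
          *
          * Pre-gen6, brw_WHILE() patches the outstanding BREAK/CONT jump counts
          * itself (see brw_patch_break_cont() above); on gen6+ the JIP/UIP fields
          * are instead filled in after code generation by brw_set_uip_jip() below.
          */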
   1584 
   1585 
   1586 /* FORWARD JUMPS:
   1587  */
   1588 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
   1589 {
   1590    struct intel_context *intel = &p->brw->intel;
   1591    struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   1592    GLuint jmpi = 1;
   1593 
   1594    if (intel->gen >= 5)
   1595       jmpi = 2;
   1596 
   1597    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   1598    assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
   1599 
   1600    jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
   1601 }
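         /* Usage sketch (illustrative only, assuming the caller emits the JMPI
          * itself): record the instruction index before emitting the jump, emit the
          * instructions to be skipped, then land the jump at the current end of the
          * program.
          *
          *    int jmp_idx = p->nr_insn;
          *    ...emit the JMPI instruction, which will occupy index jmp_idx...
          *    ...emit the instructions to be jumped over...
          *    brw_land_fwd_jump(p, jmp_idx);
          */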
   1602 
   1603 
   1604 
   1605 /* To integrate with the above, it makes sense that the comparison
   1606  * instruction should populate the flag register.  It might be simpler
   1607  * just to use the flag reg for most WM tasks?
   1608  */
   1609 void brw_CMP(struct brw_compile *p,
   1610 	     struct brw_reg dest,
   1611 	     GLuint conditional,
   1612 	     struct brw_reg src0,
   1613 	     struct brw_reg src1)
   1614 {
   1615    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
   1616 
   1617    insn->header.destreg__conditionalmod = conditional;
   1618    brw_set_dest(p, insn, dest);
   1619    brw_set_src0(p, insn, src0);
   1620    brw_set_src1(p, insn, src1);
   1621 
   1622 /*    guess_execution_size(insn, src0); */
   1623 
   1624 
   1625    /* Make it so that future instructions will use the computed flag
   1626     * value until brw_set_predicate_control_flag_value() is called
   1627     * again.
   1628     */
   1629    if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
   1630        dest.nr == 0) {
   1631       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   1632       p->flag_value = 0xff;
   1633    }
   1634 }
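         /* Usage sketch (illustrative only): comparing into the null register sets
          * the flag register and, per the code above, leaves later instructions
          * predicated on it until the predicate is reset.  src_a, src_b and dst are
          * assumed to be float GRF registers set up by the caller.
          *
          *    brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_F),
          *            BRW_CONDITIONAL_GE, src_a, src_b);
          *    brw_MOV(p, dst, src_a);       executes only where src_a >= src_b
          *    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          */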
   1635 
    1636 /* Issue a WAIT instruction on notification register n1; the host can
    1637    program MMIO to wake the thread back up. */
   1638 void brw_WAIT (struct brw_compile *p)
   1639 {
   1640    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   1641    struct brw_reg src = brw_notification_1_reg();
   1642 
   1643    brw_set_dest(p, insn, src);
   1644    brw_set_src0(p, insn, src);
   1645    brw_set_src1(p, insn, brw_null_reg());
    1646    insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 (a single channel) */
   1647    insn->header.predicate_control = 0;
   1648    insn->header.compression_control = 0;
   1649 }
   1650 
   1651 
   1652 /***********************************************************************
   1653  * Helpers for the various SEND message types:
   1654  */
   1655 
   1656 /** Extended math function, float[8].
   1657  */
   1658 void brw_math( struct brw_compile *p,
   1659 	       struct brw_reg dest,
   1660 	       GLuint function,
   1661 	       GLuint msg_reg_nr,
   1662 	       struct brw_reg src,
   1663 	       GLuint data_type,
   1664 	       GLuint precision )
   1665 {
   1666    struct intel_context *intel = &p->brw->intel;
   1667 
   1668    if (intel->gen >= 6) {
   1669       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
   1670 
   1671       assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   1672       assert(src.file == BRW_GENERAL_REGISTER_FILE);
   1673 
   1674       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   1675       if (intel->gen == 6)
   1676 	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
   1677 
   1678       /* Source modifiers are ignored for extended math instructions on Gen6. */
   1679       if (intel->gen == 6) {
   1680 	 assert(!src.negate);
   1681 	 assert(!src.abs);
   1682       }
   1683 
   1684       if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
   1685 	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
   1686 	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
   1687 	 assert(src.type != BRW_REGISTER_TYPE_F);
   1688       } else {
   1689 	 assert(src.type == BRW_REGISTER_TYPE_F);
   1690       }
   1691 
   1692       /* Math is the same ISA format as other opcodes, except that CondModifier
   1693        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
   1694        */
   1695       insn->header.destreg__conditionalmod = function;
   1696 
   1697       brw_set_dest(p, insn, dest);
   1698       brw_set_src0(p, insn, src);
   1699       brw_set_src1(p, insn, brw_null_reg());
   1700    } else {
   1701       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   1702 
   1703       /* Example code doesn't set predicate_control for send
   1704        * instructions.
   1705        */
   1706       insn->header.predicate_control = 0;
   1707       insn->header.destreg__conditionalmod = msg_reg_nr;
   1708 
   1709       brw_set_dest(p, insn, dest);
   1710       brw_set_src0(p, insn, src);
   1711       brw_set_math_message(p,
   1712 			   insn,
   1713 			   function,
   1714 			   src.type == BRW_REGISTER_TYPE_D,
   1715 			   precision,
   1716 			   data_type);
   1717    }
   1718 }
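         /* Usage sketch (illustrative only): a SIMD8 reciprocal.  On gen6+ this
          * emits a MATH instruction directly; before gen6 it becomes a SEND to the
          * extended math unit through the given message register.  dst and src are
          * assumed to be float GRF registers, and the BRW_MATH_* enums are the
          * usual ones from brw_defines.h.
          *
          *    brw_math(p, dst,
          *             BRW_MATH_FUNCTION_INV,
          *             2,                        message register for pre-gen6
          *             src,
          *             BRW_MATH_DATA_VECTOR,
          *             BRW_MATH_PRECISION_FULL);
          */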
   1719 
   1720 /** Extended math function, float[8].
   1721  */
   1722 void brw_math2(struct brw_compile *p,
   1723 	       struct brw_reg dest,
   1724 	       GLuint function,
   1725 	       struct brw_reg src0,
   1726 	       struct brw_reg src1)
   1727 {
   1728    struct intel_context *intel = &p->brw->intel;
   1729    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
   1730 
   1731    assert(intel->gen >= 6);
   1732    (void) intel;
   1733 
   1734 
   1735    assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   1736    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   1737    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   1738 
   1739    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   1740    if (intel->gen == 6) {
   1741       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
   1742       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   1743    }
   1744 
   1745    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
   1746        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
   1747        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
   1748       assert(src0.type != BRW_REGISTER_TYPE_F);
   1749       assert(src1.type != BRW_REGISTER_TYPE_F);
   1750    } else {
   1751       assert(src0.type == BRW_REGISTER_TYPE_F);
   1752       assert(src1.type == BRW_REGISTER_TYPE_F);
   1753    }
   1754 
   1755    /* Source modifiers are ignored for extended math instructions on Gen6. */
   1756    if (intel->gen == 6) {
   1757       assert(!src0.negate);
   1758       assert(!src0.abs);
   1759       assert(!src1.negate);
   1760       assert(!src1.abs);
   1761    }
   1762 
   1763    /* Math is the same ISA format as other opcodes, except that CondModifier
   1764     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
   1765     */
   1766    insn->header.destreg__conditionalmod = function;
   1767 
   1768    brw_set_dest(p, insn, dest);
   1769    brw_set_src0(p, insn, src0);
   1770    brw_set_src1(p, insn, src1);
   1771 }
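         /* Usage sketch (illustrative only): a gen6+ two-source math operation,
          * here an integer division producing the quotient.  Per the asserts above,
          * the integer-divide functions take non-float sources; num and den are
          * assumed to be signed-integer GRF registers.
          *
          *    brw_math2(p, retype(dst, BRW_REGISTER_TYPE_D),
          *              BRW_MATH_FUNCTION_INT_DIV_QUOTIENT,
          *              num, den);
          */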
   1772 
   1773 /**
   1774  * Extended math function, float[16].
   1775  * Use 2 send instructions.
   1776  */
   1777 void brw_math_16( struct brw_compile *p,
   1778 		  struct brw_reg dest,
   1779 		  GLuint function,
   1780 		  GLuint msg_reg_nr,
   1781 		  struct brw_reg src,
   1782 		  GLuint precision )
   1783 {
   1784    struct intel_context *intel = &p->brw->intel;
   1785    struct brw_instruction *insn;
   1786 
   1787    if (intel->gen >= 6) {
   1788       insn = next_insn(p, BRW_OPCODE_MATH);
   1789 
   1790       /* Math is the same ISA format as other opcodes, except that CondModifier
   1791        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
   1792        */
   1793       insn->header.destreg__conditionalmod = function;
   1794 
   1795       /* Source modifiers are ignored for extended math instructions. */
   1796       assert(!src.negate);
   1797       assert(!src.abs);
   1798 
   1799       brw_set_dest(p, insn, dest);
   1800       brw_set_src0(p, insn, src);
   1801       brw_set_src1(p, insn, brw_null_reg());
   1802       return;
   1803    }
   1804 
   1805    /* First instruction:
   1806     */
   1807    brw_push_insn_state(p);
   1808    brw_set_predicate_control_flag_value(p, 0xff);
   1809    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1810 
   1811    insn = next_insn(p, BRW_OPCODE_SEND);
   1812    insn->header.destreg__conditionalmod = msg_reg_nr;
   1813 
   1814    brw_set_dest(p, insn, dest);
   1815    brw_set_src0(p, insn, src);
   1816    brw_set_math_message(p,
   1817 			insn,
   1818 			function,
   1819 			BRW_MATH_INTEGER_UNSIGNED,
   1820 			precision,
   1821 			BRW_MATH_DATA_VECTOR);
   1822 
   1823    /* Second instruction:
   1824     */
   1825    insn = next_insn(p, BRW_OPCODE_SEND);
   1826    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   1827    insn->header.destreg__conditionalmod = msg_reg_nr+1;
   1828 
   1829    brw_set_dest(p, insn, offset(dest,1));
   1830    brw_set_src0(p, insn, src);
   1831    brw_set_math_message(p,
   1832 			insn,
   1833 			function,
   1834 			BRW_MATH_INTEGER_UNSIGNED,
   1835 			precision,
   1836 			BRW_MATH_DATA_VECTOR);
   1837 
   1838    brw_pop_insn_state(p);
   1839 }
   1840 
   1841 
   1842 /**
    1843  * Write a block of OWORDs (half a GRF each) into the scratch buffer,
   1844  * using a constant offset per channel.
   1845  *
   1846  * The offset must be aligned to oword size (16 bytes).  Used for
   1847  * register spilling.
   1848  */
   1849 void brw_oword_block_write_scratch(struct brw_compile *p,
   1850 				   struct brw_reg mrf,
   1851 				   int num_regs,
   1852 				   GLuint offset)
   1853 {
   1854    struct intel_context *intel = &p->brw->intel;
   1855    uint32_t msg_control, msg_type;
   1856    int mlen;
   1857 
   1858    if (intel->gen >= 6)
   1859       offset /= 16;
   1860 
   1861    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   1862 
   1863    if (num_regs == 1) {
   1864       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
   1865       mlen = 2;
   1866    } else {
   1867       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
   1868       mlen = 3;
   1869    }
   1870 
   1871    /* Set up the message header.  This is g0, with g0.2 filled with
   1872     * the offset.  We don't want to leave our offset around in g0 or
   1873     * it'll screw up texture samples, so set it up inside the message
   1874     * reg.
   1875     */
   1876    {
   1877       brw_push_insn_state(p);
   1878       brw_set_mask_control(p, BRW_MASK_DISABLE);
   1879       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1880 
   1881       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   1882 
   1883       /* set message header global offset field (reg 0, element 2) */
   1884       brw_MOV(p,
   1885 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   1886 				  mrf.nr,
   1887 				  2), BRW_REGISTER_TYPE_UD),
   1888 	      brw_imm_ud(offset));
   1889 
   1890       brw_pop_insn_state(p);
   1891    }
   1892 
   1893    {
   1894       struct brw_reg dest;
   1895       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   1896       int send_commit_msg;
   1897       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
   1898 					 BRW_REGISTER_TYPE_UW);
   1899 
   1900       if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
   1901 	 insn->header.compression_control = BRW_COMPRESSION_NONE;
   1902 	 src_header = vec16(src_header);
   1903       }
   1904       assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   1905       insn->header.destreg__conditionalmod = mrf.nr;
   1906 
   1907       /* Until gen6, writes followed by reads from the same location
   1908        * are not guaranteed to be ordered unless write_commit is set.
   1909        * If set, then a no-op write is issued to the destination
   1910        * register to set a dependency, and a read from the destination
   1911        * can be used to ensure the ordering.
   1912        *
   1913        * For gen6, only writes between different threads need ordering
   1914        * protection.  Our use of DP writes is all about register
   1915        * spilling within a thread.
   1916        */
   1917       if (intel->gen >= 6) {
   1918 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   1919 	 send_commit_msg = 0;
   1920       } else {
   1921 	 dest = src_header;
   1922 	 send_commit_msg = 1;
   1923       }
   1924 
   1925       brw_set_dest(p, insn, dest);
   1926       if (intel->gen >= 6) {
   1927 	 brw_set_src0(p, insn, mrf);
   1928       } else {
   1929 	 brw_set_src0(p, insn, brw_null_reg());
   1930       }
   1931 
   1932       if (intel->gen >= 6)
   1933 	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
   1934       else
   1935 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
   1936 
   1937       brw_set_dp_write_message(p,
   1938 			       insn,
   1939 			       255, /* binding table index (255=stateless) */
   1940 			       msg_control,
   1941 			       msg_type,
   1942 			       mlen,
   1943 			       true, /* header_present */
   1944 			       0, /* not a render target */
   1945 			       send_commit_msg, /* response_length */
   1946 			       0, /* eot */
   1947 			       send_commit_msg);
   1948    }
   1949 }
   1950 
   1951 
   1952 /**
   1953  * Read a block of owords (half a GRF each) from the scratch buffer
   1954  * using a constant index per channel.
   1955  *
   1956  * Offset must be aligned to oword size (16 bytes).  Used for register
   1957  * spilling.
   1958  */
   1959 void
   1960 brw_oword_block_read_scratch(struct brw_compile *p,
   1961 			     struct brw_reg dest,
   1962 			     struct brw_reg mrf,
   1963 			     int num_regs,
   1964 			     GLuint offset)
   1965 {
   1966    struct intel_context *intel = &p->brw->intel;
   1967    uint32_t msg_control;
   1968    int rlen;
   1969 
   1970    if (intel->gen >= 6)
   1971       offset /= 16;
   1972 
   1973    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   1974    dest = retype(dest, BRW_REGISTER_TYPE_UW);
   1975 
   1976    if (num_regs == 1) {
   1977       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
   1978       rlen = 1;
   1979    } else {
   1980       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
   1981       rlen = 2;
   1982    }
   1983 
   1984    {
   1985       brw_push_insn_state(p);
   1986       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   1987       brw_set_mask_control(p, BRW_MASK_DISABLE);
   1988 
   1989       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   1990 
   1991       /* set message header global offset field (reg 0, element 2) */
   1992       brw_MOV(p,
   1993 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   1994 				  mrf.nr,
   1995 				  2), BRW_REGISTER_TYPE_UD),
   1996 	      brw_imm_ud(offset));
   1997 
   1998       brw_pop_insn_state(p);
   1999    }
   2000 
   2001    {
   2002       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   2003 
   2004       assert(insn->header.predicate_control == 0);
   2005       insn->header.compression_control = BRW_COMPRESSION_NONE;
   2006       insn->header.destreg__conditionalmod = mrf.nr;
   2007 
    2008       brw_set_dest(p, insn, dest);	/* dest was retyped to UW above */
   2009       if (intel->gen >= 6) {
   2010 	 brw_set_src0(p, insn, mrf);
   2011       } else {
   2012 	 brw_set_src0(p, insn, brw_null_reg());
   2013       }
   2014 
   2015       brw_set_dp_read_message(p,
   2016 			      insn,
   2017 			      255, /* binding table index (255=stateless) */
   2018 			      msg_control,
   2019 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
   2020 			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
   2021 			      1, /* msg_length */
   2022 			      rlen);
   2023    }
   2024 }
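         /* Usage sketch (illustrative only): spilling one GRF to the scratch buffer
          * and reading it back.  The message header is built in the named MRF and
          * the data payload is expected in the MRF that follows it; value, base_mrf
          * and spill_offset (an offset aligned to oword size, per the comments
          * above) are assumptions made by the caller.
          *
          *    brw_MOV(p, retype(brw_message_reg(base_mrf + 1), BRW_REGISTER_TYPE_UD),
          *            retype(value, BRW_REGISTER_TYPE_UD));
          *    brw_oword_block_write_scratch(p, brw_message_reg(base_mrf), 1,
          *                                  spill_offset);
          *    ...
          *    brw_oword_block_read_scratch(p, value, brw_message_reg(base_mrf), 1,
          *                                 spill_offset);
          */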
   2025 
   2026 /**
   2027  * Read a float[4] vector from the data port Data Cache (const buffer).
   2028  * Location (in buffer) should be a multiple of 16.
   2029  * Used for fetching shader constants.
   2030  */
   2031 void brw_oword_block_read(struct brw_compile *p,
   2032 			  struct brw_reg dest,
   2033 			  struct brw_reg mrf,
   2034 			  uint32_t offset,
   2035 			  uint32_t bind_table_index)
   2036 {
   2037    struct intel_context *intel = &p->brw->intel;
   2038 
    2039    /* On gen6+, the message header takes the offset in owords rather than bytes. */
   2040    if (intel->gen >= 6)
   2041       offset /= 16;
   2042 
   2043    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   2044 
   2045    brw_push_insn_state(p);
   2046    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   2047    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   2048    brw_set_mask_control(p, BRW_MASK_DISABLE);
   2049 
   2050    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   2051 
   2052    /* set message header global offset field (reg 0, element 2) */
   2053    brw_MOV(p,
   2054 	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   2055 			       mrf.nr,
   2056 			       2), BRW_REGISTER_TYPE_UD),
   2057 	   brw_imm_ud(offset));
   2058 
   2059    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   2060    insn->header.destreg__conditionalmod = mrf.nr;
   2061 
   2062    /* cast dest to a uword[8] vector */
   2063    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
   2064 
   2065    brw_set_dest(p, insn, dest);
   2066    if (intel->gen >= 6) {
   2067       brw_set_src0(p, insn, mrf);
   2068    } else {
   2069       brw_set_src0(p, insn, brw_null_reg());
   2070    }
   2071 
   2072    brw_set_dp_read_message(p,
   2073 			   insn,
   2074 			   bind_table_index,
   2075 			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
   2076 			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
   2077 			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
   2078 			   1, /* msg_length */
   2079 			   1); /* response_length (1 reg, 2 owords!) */
   2080 
   2081    brw_pop_insn_state(p);
   2082 }
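         /* Usage sketch (illustrative only): fetching one float[4] constant from
          * the buffer bound at a hypothetical binding-table slot SURF_INDEX_CONST
          * into the low half of dst, using MRF 1 for the message header.
          *
          *    brw_oword_block_read(p, dst, brw_message_reg(1), 0, SURF_INDEX_CONST);
          */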
   2083 
   2084 /**
   2085  * Read a set of dwords from the data port Data Cache (const buffer).
   2086  *
   2087  * Location (in buffer) appears as UD offsets in the register after
   2088  * the provided mrf header reg.
   2089  */
   2090 void brw_dword_scattered_read(struct brw_compile *p,
   2091 			      struct brw_reg dest,
   2092 			      struct brw_reg mrf,
   2093 			      uint32_t bind_table_index)
   2094 {
   2095    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   2096 
   2097    brw_push_insn_state(p);
   2098    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   2099    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   2100    brw_set_mask_control(p, BRW_MASK_DISABLE);
   2101    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   2102    brw_pop_insn_state(p);
   2103 
   2104    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   2105    insn->header.destreg__conditionalmod = mrf.nr;
   2106 
   2107    /* cast dest to a uword[8] vector */
   2108    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
   2109 
   2110    brw_set_dest(p, insn, dest);
   2111    brw_set_src0(p, insn, brw_null_reg());
   2112 
   2113    brw_set_dp_read_message(p,
   2114 			   insn,
   2115 			   bind_table_index,
   2116 			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
   2117 			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
   2118 			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
   2119 			   2, /* msg_length */
   2120 			   1); /* response_length */
   2121 }
   2122 
   2123 
   2124 
   2125 /**
   2126  * Read float[4] constant(s) from VS constant buffer.
   2127  * For relative addressing, two float[4] constants will be read into 'dest'.
   2128  * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
   2129  */
   2130 void brw_dp_READ_4_vs(struct brw_compile *p,
   2131                       struct brw_reg dest,
   2132                       GLuint location,
   2133                       GLuint bind_table_index)
   2134 {
   2135    struct intel_context *intel = &p->brw->intel;
   2136    struct brw_instruction *insn;
   2137    GLuint msg_reg_nr = 1;
   2138 
   2139    if (intel->gen >= 6)
   2140       location /= 16;
   2141 
    2142    /* Set up MRF[1] with the location/offset into the const buffer */
   2143    brw_push_insn_state(p);
   2144    brw_set_access_mode(p, BRW_ALIGN_1);
   2145    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   2146    brw_set_mask_control(p, BRW_MASK_DISABLE);
   2147    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   2148    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
   2149 		     BRW_REGISTER_TYPE_UD),
   2150 	   brw_imm_ud(location));
   2151    brw_pop_insn_state(p);
   2152 
   2153    insn = next_insn(p, BRW_OPCODE_SEND);
   2154 
   2155    insn->header.predicate_control = BRW_PREDICATE_NONE;
   2156    insn->header.compression_control = BRW_COMPRESSION_NONE;
   2157    insn->header.destreg__conditionalmod = msg_reg_nr;
   2158    insn->header.mask_control = BRW_MASK_DISABLE;
   2159 
   2160    brw_set_dest(p, insn, dest);
   2161    if (intel->gen >= 6) {
   2162       brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
   2163    } else {
   2164       brw_set_src0(p, insn, brw_null_reg());
   2165    }
   2166 
   2167    brw_set_dp_read_message(p,
   2168 			   insn,
   2169 			   bind_table_index,
   2170 			   0,
   2171 			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
   2172 			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
   2173 			   1, /* msg_length */
   2174 			   1); /* response_length (1 Oword) */
   2175 }
   2176 
   2177 /**
   2178  * Read a float[4] constant per vertex from VS constant buffer, with
   2179  * relative addressing.
   2180  */
   2181 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
   2182 			       struct brw_reg dest,
   2183 			       struct brw_reg addr_reg,
   2184 			       GLuint offset,
   2185 			       GLuint bind_table_index)
   2186 {
   2187    struct intel_context *intel = &p->brw->intel;
   2188    struct brw_reg src = brw_vec8_grf(0, 0);
   2189    int msg_type;
   2190 
    2191    /* Set up MRF[1] with the offset into the const buffer */
   2192    brw_push_insn_state(p);
   2193    brw_set_access_mode(p, BRW_ALIGN_1);
   2194    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   2195    brw_set_mask_control(p, BRW_MASK_DISABLE);
   2196    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   2197 
   2198    /* M1.0 is block offset 0, M1.4 is block offset 1, all other
   2199     * fields ignored.
   2200     */
   2201    brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
   2202 	   addr_reg, brw_imm_d(offset));
   2203    brw_pop_insn_state(p);
   2204 
   2205    gen6_resolve_implied_move(p, &src, 0);
   2206    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   2207 
   2208    insn->header.predicate_control = BRW_PREDICATE_NONE;
   2209    insn->header.compression_control = BRW_COMPRESSION_NONE;
   2210    insn->header.destreg__conditionalmod = 0;
   2211    insn->header.mask_control = BRW_MASK_DISABLE;
   2212 
   2213    brw_set_dest(p, insn, dest);
   2214    brw_set_src0(p, insn, src);
   2215 
   2216    if (intel->gen >= 6)
   2217       msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   2218    else if (intel->gen == 5 || intel->is_g4x)
   2219       msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   2220    else
   2221       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   2222 
   2223    brw_set_dp_read_message(p,
   2224 			   insn,
   2225 			   bind_table_index,
   2226 			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
   2227 			   msg_type,
   2228 			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
   2229 			   2, /* msg_length */
   2230 			   1); /* response_length */
   2231 }
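         /* Usage sketch (illustrative only): a relatively-addressed constant fetch
          * for the VS.  addr is assumed to already hold the per-channel block
          * offsets described above, and SURF_INDEX_VERT_CONST is a hypothetical
          * binding-table slot for the vertex constant buffer.
          *
          *    brw_dp_READ_4_vs_relative(p, dst, addr, 0, SURF_INDEX_VERT_CONST);
          */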
   2232 
   2233 
   2234 
   2235 void brw_fb_WRITE(struct brw_compile *p,
   2236 		  int dispatch_width,
   2237                   GLuint msg_reg_nr,
   2238                   struct brw_reg src0,
   2239                   GLuint msg_control,
   2240                   GLuint binding_table_index,
   2241                   GLuint msg_length,
   2242                   GLuint response_length,
   2243                   bool eot,
   2244                   bool header_present)
   2245 {
   2246    struct intel_context *intel = &p->brw->intel;
   2247    struct brw_instruction *insn;
   2248    GLuint msg_type;
   2249    struct brw_reg dest;
   2250 
   2251    if (dispatch_width == 16)
   2252       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   2253    else
   2254       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   2255 
   2256    if (intel->gen >= 6) {
   2257       insn = next_insn(p, BRW_OPCODE_SENDC);
   2258    } else {
   2259       insn = next_insn(p, BRW_OPCODE_SEND);
   2260    }
   2261    /* The execution mask is ignored for render target writes. */
   2262    insn->header.predicate_control = 0;
   2263    insn->header.compression_control = BRW_COMPRESSION_NONE;
   2264 
   2265    if (intel->gen >= 6) {
   2266       /* headerless version, just submit color payload */
   2267       src0 = brw_message_reg(msg_reg_nr);
   2268 
   2269       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   2270    } else {
   2271       insn->header.destreg__conditionalmod = msg_reg_nr;
   2272 
   2273       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   2274    }
   2275 
   2276    brw_set_dest(p, insn, dest);
   2277    brw_set_src0(p, insn, src0);
   2278    brw_set_dp_write_message(p,
   2279 			    insn,
   2280 			    binding_table_index,
   2281 			    msg_control,
   2282 			    msg_type,
   2283 			    msg_length,
   2284 			    header_present,
   2285 			    eot, /* last render target write */
   2286 			    response_length,
   2287 			    eot,
   2288 			    0 /* send_commit_msg */);
   2289 }
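         /* Usage sketch (illustrative only): a SIMD16 render-target write at the
          * end of a fragment thread.  The color payload is assumed to already sit
          * in msg_len MRFs starting at base_mrf; RT_WRITE_SIMD16 stands in for the
          * appropriate render-target-write message control and surf for the render
          * target's binding-table index.
          *
          *    brw_fb_WRITE(p, 16, base_mrf, brw_message_reg(base_mrf),
          *                 RT_WRITE_SIMD16, surf,
          *                 msg_len, 0,
          *                 true,              end of thread
          *                 true);             header present
          */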
   2290 
   2291 
   2292 /**
   2293  * Texture sample instruction.
   2294  * Note: the msg_type plus msg_length values determine exactly what kind
   2295  * of sampling operation is performed.  See volume 4, page 161 of docs.
   2296  */
   2297 void brw_SAMPLE(struct brw_compile *p,
   2298 		struct brw_reg dest,
   2299 		GLuint msg_reg_nr,
   2300 		struct brw_reg src0,
   2301 		GLuint binding_table_index,
   2302 		GLuint sampler,
   2303 		GLuint writemask,
   2304 		GLuint msg_type,
   2305 		GLuint response_length,
   2306 		GLuint msg_length,
   2307 		GLuint header_present,
   2308 		GLuint simd_mode,
   2309 		GLuint return_format)
   2310 {
   2311    struct intel_context *intel = &p->brw->intel;
    2312    bool need_stall = false;
   2313 
   2314    if (writemask == 0) {
   2315       /*printf("%s: zero writemask??\n", __FUNCTION__); */
   2316       return;
   2317    }
   2318 
   2319    /* Hardware doesn't do destination dependency checking on send
   2320     * instructions properly.  Add a workaround which generates the
   2321     * dependency by other means.  In practice it seems like this bug
   2322     * only crops up for texture samples, and only where registers are
   2323     * written by the send and then written again later without being
   2324     * read in between.  Luckily for us, we already track that
   2325     * information and use it to modify the writemask for the
   2326     * instruction, so that is a guide for whether a workaround is
   2327     * needed.
   2328     */
   2329    if (writemask != WRITEMASK_XYZW) {
   2330       GLuint dst_offset = 0;
   2331       GLuint i, newmask = 0, len = 0;
   2332 
   2333       for (i = 0; i < 4; i++) {
   2334 	 if (writemask & (1<<i))
   2335 	    break;
   2336 	 dst_offset += 2;
   2337       }
   2338       for (; i < 4; i++) {
   2339 	 if (!(writemask & (1<<i)))
   2340 	    break;
   2341 	 newmask |= 1<<i;
   2342 	 len++;
   2343       }
   2344 
   2345       if (newmask != writemask) {
    2346 	 need_stall = true;
   2347          /* printf("need stall %x %x\n", newmask , writemask); */
   2348       }
   2349       else {
   2350 	 bool dispatch_16 = false;
   2351 
   2352 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
   2353 
   2354 	 guess_execution_size(p, p->current, dest);
   2355 	 if (p->current->header.execution_size == BRW_EXECUTE_16)
   2356 	    dispatch_16 = true;
   2357 
   2358 	 newmask = ~newmask & WRITEMASK_XYZW;
   2359 
   2360 	 brw_push_insn_state(p);
   2361 
   2362 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   2363 	 brw_set_mask_control(p, BRW_MASK_DISABLE);
   2364 
   2365 	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
   2366 		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
   2367   	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
   2368 
   2369 	 brw_pop_insn_state(p);
   2370 
   2371   	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
   2372 	 dest = offset(dest, dst_offset);
   2373 
   2374 	 /* For 16-wide dispatch, masked channels are skipped in the
   2375 	  * response.  For 8-wide, masked channels still take up slots,
   2376 	  * and are just not written to.
   2377 	  */
   2378 	 if (dispatch_16)
   2379 	    response_length = len * 2;
   2380       }
   2381    }
   2382 
   2383    {
   2384       struct brw_instruction *insn;
   2385 
   2386       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2387 
   2388       insn = next_insn(p, BRW_OPCODE_SEND);
   2389       insn->header.predicate_control = 0; /* XXX */
   2390       insn->header.compression_control = BRW_COMPRESSION_NONE;
   2391       if (intel->gen < 6)
   2392 	  insn->header.destreg__conditionalmod = msg_reg_nr;
   2393 
   2394       brw_set_dest(p, insn, dest);
   2395       brw_set_src0(p, insn, src0);
   2396       brw_set_sampler_message(p, insn,
   2397 			      binding_table_index,
   2398 			      sampler,
   2399 			      msg_type,
   2400 			      response_length,
   2401 			      msg_length,
   2402 			      header_present,
   2403 			      simd_mode,
   2404 			      return_format);
   2405    }
   2406 
   2407    if (need_stall) {
   2408       struct brw_reg reg = vec8(offset(dest, response_length-1));
   2409 
   2410       /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
   2411        */
   2412       brw_push_insn_state(p);
   2413       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   2414       brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
   2415 	      retype(reg, BRW_REGISTER_TYPE_UD));
   2416       brw_pop_insn_state(p);
   2417    }
   2418 
   2419 }
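         /* Usage sketch (illustrative only): a SIMD8 texture sample.  The message
          * payload is assumed to already sit in mlen MRFs starting at base_mrf;
          * SAMPLE_MSG, SIMD8_MODE and RETURN_FLOAT32 stand in for the sampler
          * message type, SIMD mode and return format enums, and surf/sampler index
          * the binding table and sampler state table respectively.
          *
          *    brw_SAMPLE(p, dst, base_mrf, brw_message_reg(base_mrf),
          *               surf, sampler,
          *               WRITEMASK_XYZW,
          *               SAMPLE_MSG,
          *               rlen, mlen,
          *               1,                 header present
          *               SIMD8_MODE,
          *               RETURN_FLOAT32);
          */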
   2420 
   2421 /* All these variables are pretty confusing - we might be better off
   2422  * using bitmasks and macros for this, in the old style.  Or perhaps
   2423  * just having the caller instantiate the fields in dword3 itself.
   2424  */
   2425 void brw_urb_WRITE(struct brw_compile *p,
   2426 		   struct brw_reg dest,
   2427 		   GLuint msg_reg_nr,
   2428 		   struct brw_reg src0,
   2429 		   bool allocate,
   2430 		   bool used,
   2431 		   GLuint msg_length,
   2432 		   GLuint response_length,
   2433 		   bool eot,
   2434 		   bool writes_complete,
   2435 		   GLuint offset,
   2436 		   GLuint swizzle)
   2437 {
   2438    struct intel_context *intel = &p->brw->intel;
   2439    struct brw_instruction *insn;
   2440 
   2441    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2442 
   2443    if (intel->gen == 7) {
   2444       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
   2445       brw_push_insn_state(p);
   2446       brw_set_access_mode(p, BRW_ALIGN_1);
   2447       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
   2448 		       BRW_REGISTER_TYPE_UD),
   2449 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
   2450 		brw_imm_ud(0xff00));
   2451       brw_pop_insn_state(p);
   2452    }
   2453 
   2454    insn = next_insn(p, BRW_OPCODE_SEND);
   2455 
   2456    assert(msg_length < BRW_MAX_MRF);
   2457 
   2458    brw_set_dest(p, insn, dest);
   2459    brw_set_src0(p, insn, src0);
   2460    brw_set_src1(p, insn, brw_imm_d(0));
   2461 
   2462    if (intel->gen < 6)
   2463       insn->header.destreg__conditionalmod = msg_reg_nr;
   2464 
   2465    brw_set_urb_message(p,
   2466 		       insn,
   2467 		       allocate,
   2468 		       used,
   2469 		       msg_length,
   2470 		       response_length,
   2471 		       eot,
   2472 		       writes_complete,
   2473 		       offset,
   2474 		       swizzle);
   2475 }
   2476 
   2477 static int
   2478 brw_find_next_block_end(struct brw_compile *p, int start)
   2479 {
   2480    int ip;
   2481 
   2482    for (ip = start + 1; ip < p->nr_insn; ip++) {
   2483       struct brw_instruction *insn = &p->store[ip];
   2484 
   2485       switch (insn->header.opcode) {
   2486       case BRW_OPCODE_ENDIF:
   2487       case BRW_OPCODE_ELSE:
   2488       case BRW_OPCODE_WHILE:
   2489 	 return ip;
   2490       }
   2491    }
   2492    assert(!"not reached");
   2493    return start + 1;
   2494 }
   2495 
   2496 /* There is no DO instruction on gen6, so to find the end of the loop
   2497  * we have to see if the loop is jumping back before our start
   2498  * instruction.
   2499  */
   2500 static int
   2501 brw_find_loop_end(struct brw_compile *p, int start)
   2502 {
   2503    struct intel_context *intel = &p->brw->intel;
   2504    int ip;
   2505    int br = 2;
   2506 
   2507    for (ip = start + 1; ip < p->nr_insn; ip++) {
   2508       struct brw_instruction *insn = &p->store[ip];
   2509 
   2510       if (insn->header.opcode == BRW_OPCODE_WHILE) {
   2511 	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
   2512 				   : insn->bits3.break_cont.jip;
   2513 	 if (ip + jip / br <= start)
   2514 	    return ip;
   2515       }
   2516    }
   2517    assert(!"not reached");
   2518    return start + 1;
   2519 }
   2520 
   2521 /* After program generation, go back and update the UIP and JIP of
   2522  * BREAK and CONT instructions to their correct locations.
   2523  */
   2524 void
   2525 brw_set_uip_jip(struct brw_compile *p)
   2526 {
   2527    struct intel_context *intel = &p->brw->intel;
   2528    int ip;
   2529    int br = 2;
   2530 
   2531    if (intel->gen < 6)
   2532       return;
   2533 
   2534    for (ip = 0; ip < p->nr_insn; ip++) {
   2535       struct brw_instruction *insn = &p->store[ip];
   2536 
   2537       switch (insn->header.opcode) {
   2538       case BRW_OPCODE_BREAK:
   2539 	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
   2540 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
   2541 	 insn->bits3.break_cont.uip =
   2542 	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
   2543 	 break;
   2544       case BRW_OPCODE_CONTINUE:
   2545 	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
   2546 	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
   2547 
   2548 	 assert(insn->bits3.break_cont.uip != 0);
   2549 	 assert(insn->bits3.break_cont.jip != 0);
   2550 	 break;
   2551       }
   2552    }
   2553 }
   2554 
   2555 void brw_ff_sync(struct brw_compile *p,
   2556 		   struct brw_reg dest,
   2557 		   GLuint msg_reg_nr,
   2558 		   struct brw_reg src0,
   2559 		   bool allocate,
   2560 		   GLuint response_length,
   2561 		   bool eot)
   2562 {
   2563    struct intel_context *intel = &p->brw->intel;
   2564    struct brw_instruction *insn;
   2565 
   2566    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2567 
   2568    insn = next_insn(p, BRW_OPCODE_SEND);
   2569    brw_set_dest(p, insn, dest);
   2570    brw_set_src0(p, insn, src0);
   2571    brw_set_src1(p, insn, brw_imm_d(0));
   2572 
   2573    if (intel->gen < 6)
   2574       insn->header.destreg__conditionalmod = msg_reg_nr;
   2575 
   2576    brw_set_ff_sync_message(p,
   2577 			   insn,
   2578 			   allocate,
   2579 			   response_length,
   2580 			   eot);
   2581 }
   2582 
   2583 /**
   2584  * Emit the SEND instruction necessary to generate stream output data on Gen6
   2585  * (for transform feedback).
   2586  *
   2587  * If send_commit_msg is true, this is the last piece of stream output data
   2588  * from this thread, so send the data as a committed write.  According to the
   2589  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
   2590  *
   2591  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
   2592  *   writes are complete by sending the final write as a committed write."
   2593  */
   2594 void
   2595 brw_svb_write(struct brw_compile *p,
   2596               struct brw_reg dest,
   2597               GLuint msg_reg_nr,
   2598               struct brw_reg src0,
   2599               GLuint binding_table_index,
   2600               bool   send_commit_msg)
   2601 {
   2602    struct brw_instruction *insn;
   2603 
   2604    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
   2605 
   2606    insn = next_insn(p, BRW_OPCODE_SEND);
   2607    brw_set_dest(p, insn, dest);
   2608    brw_set_src0(p, insn, src0);
   2609    brw_set_src1(p, insn, brw_imm_d(0));
   2610    brw_set_dp_write_message(p, insn,
   2611                             binding_table_index,
   2612                             0, /* msg_control: ignored */
   2613                             GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
   2614                             1, /* msg_length */
   2615                             true, /* header_present */
   2616                             0, /* last_render_target: ignored */
   2617                             send_commit_msg, /* response_length */
   2618                             0, /* end_of_thread */
   2619                             send_commit_msg); /* send_commit_msg */
   2620 }
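         /* Usage sketch (illustrative only): emitting one streamed vertex buffer
          * write for gen6 transform feedback.  Staging the message payload and
          * choosing dst/base_mrf are left to the caller, and SURF_INDEX_SOL stands
          * in for the stream-output binding-table slot.  Only the thread's final
          * write passes send_commit_msg = true, matching the PRM requirement
          * quoted above.
          *
          *    brw_svb_write(p, dst, base_mrf, brw_message_reg(base_mrf),
          *                  SURF_INDEX_SOL, is_final_write);
          */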
   2621