      1 /*
      2  * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "compiler/glsl/ir.h"
     25 #include "brw_fs.h"
     26 #include "brw_fs_surface_builder.h"
     27 #include "brw_nir.h"
     28 #include "brw_program.h"
     29 
     30 using namespace brw;
     31 using namespace brw::surface_access;
     32 
     33 void
     34 fs_visitor::emit_nir_code()
     35 {
     36    /* emit the arrays used for inputs and outputs - load/store intrinsics will
     37     * be converted to reads/writes of these arrays
     38     */
     39    nir_setup_outputs();
     40    nir_setup_uniforms();
     41    nir_emit_system_values();
     42 
     43    /* get the main function and emit it */
     44    nir_foreach_function(function, nir) {
     45       assert(strcmp(function->name, "main") == 0);
     46       assert(function->impl);
     47       nir_emit_impl(function->impl);
     48    }
     49 }
     50 
     51 void
     52 fs_visitor::nir_setup_outputs()
     53 {
     54    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
     55       return;
     56 
     57    nir_foreach_variable(var, &nir->outputs) {
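              /* Compact variables (e.g. clip/cull distance arrays) are scalar
               * arrays packed into vec4 slots, so they only need
               * DIV_ROUND_UP(length, 4) vec4s; other variables use their full
               * vec4 size.
               */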
     58       const unsigned vec4s =
     59          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
     60                            : type_size_vec4(var->type);
     61       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
     62       for (unsigned i = 0; i < vec4s; i++) {
     63          if (outputs[var->data.driver_location + i].file == BAD_FILE)
     64             outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
     65       }
     66    }
     67 }
     68 
     69 void
     70 fs_visitor::nir_setup_uniforms()
     71 {
     72    if (dispatch_width != min_dispatch_width)
     73       return;
     74 
     75    uniforms = nir->num_uniforms / 4;
     76 }
     77 
     78 static bool
     79 emit_system_values_block(nir_block *block, fs_visitor *v)
     80 {
     81    fs_reg *reg;
     82 
     83    nir_foreach_instr(instr, block) {
     84       if (instr->type != nir_instr_type_intrinsic)
     85          continue;
     86 
     87       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
     88       switch (intrin->intrinsic) {
     89       case nir_intrinsic_load_vertex_id:
     90          unreachable("should be lowered by lower_vertex_id().");
     91 
     92       case nir_intrinsic_load_vertex_id_zero_base:
     93          assert(v->stage == MESA_SHADER_VERTEX);
     94          reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
     95          if (reg->file == BAD_FILE)
     96             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
     97          break;
     98 
     99       case nir_intrinsic_load_base_vertex:
    100          assert(v->stage == MESA_SHADER_VERTEX);
    101          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
    102          if (reg->file == BAD_FILE)
    103             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
    104          break;
    105 
    106       case nir_intrinsic_load_instance_id:
    107          assert(v->stage == MESA_SHADER_VERTEX);
    108          reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
    109          if (reg->file == BAD_FILE)
    110             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
    111          break;
    112 
    113       case nir_intrinsic_load_base_instance:
    114          assert(v->stage == MESA_SHADER_VERTEX);
    115          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
    116          if (reg->file == BAD_FILE)
    117             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
    118          break;
    119 
    120       case nir_intrinsic_load_draw_id:
    121          assert(v->stage == MESA_SHADER_VERTEX);
    122          reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
    123          if (reg->file == BAD_FILE)
    124             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
    125          break;
    126 
    127       case nir_intrinsic_load_invocation_id:
    128          if (v->stage == MESA_SHADER_TESS_CTRL)
    129             break;
    130          assert(v->stage == MESA_SHADER_GEOMETRY);
    131          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
    132          if (reg->file == BAD_FILE) {
    133             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
    134             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
    135             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
    136             abld.SHR(iid, g1, brw_imm_ud(27u));
    137             *reg = iid;
    138          }
    139          break;
    140 
    141       case nir_intrinsic_load_sample_pos:
    142          assert(v->stage == MESA_SHADER_FRAGMENT);
    143          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
    144          if (reg->file == BAD_FILE)
    145             *reg = *v->emit_samplepos_setup();
    146          break;
    147 
    148       case nir_intrinsic_load_sample_id:
    149          assert(v->stage == MESA_SHADER_FRAGMENT);
    150          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
    151          if (reg->file == BAD_FILE)
    152             *reg = *v->emit_sampleid_setup();
    153          break;
    154 
    155       case nir_intrinsic_load_sample_mask_in:
    156          assert(v->stage == MESA_SHADER_FRAGMENT);
    157          assert(v->devinfo->gen >= 7);
    158          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
    159          if (reg->file == BAD_FILE)
    160             *reg = *v->emit_samplemaskin_setup();
    161          break;
    162 
    163       case nir_intrinsic_load_work_group_id:
    164          assert(v->stage == MESA_SHADER_COMPUTE);
    165          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
    166          if (reg->file == BAD_FILE)
    167             *reg = *v->emit_cs_work_group_id_setup();
    168          break;
    169 
    170       case nir_intrinsic_load_helper_invocation:
    171          assert(v->stage == MESA_SHADER_FRAGMENT);
    172          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
    173          if (reg->file == BAD_FILE) {
    174             const fs_builder abld =
    175                v->bld.annotate("gl_HelperInvocation", NULL);
    176 
    177             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
    178              * pixel mask is in g1.7 of the thread payload.
    179              *
    180              * We move the per-channel pixel enable bit to the low bit of each
    181              * channel by shifting the byte containing the pixel mask by the
    182              * vector immediate 0x76543210UV.
    183              *
    184              * The region of <1,8,0> reads only 1 byte (the pixel masks for
    185              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
    186              * masks for subspans 2 and 3) in SIMD16.
    187              */
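                    /* For example, channel 3 shifts the mask byte right by 3,
                     * leaving that channel's pixel-enable bit in bit 0 of its
                     * result.
                     */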
    188             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
    189             abld.SHR(shifted,
    190                      stride(byte_offset(retype(brw_vec1_grf(1, 0),
    191                                                BRW_REGISTER_TYPE_UB), 28),
    192                             1, 8, 0),
    193                      brw_imm_v(0x76543210));
    194 
    195             /* A set bit in the pixel mask means the channel is enabled, but
    196              * that is the opposite of gl_HelperInvocation so we need to invert
    197              * the mask.
    198              *
    199              * The negate source-modifier bit of logical instructions on Gen8+
    200              * performs 1's complement negation, so we can use that instead of
    201              * a NOT instruction.
    202              */
    203             fs_reg inverted = negate(shifted);
    204             if (v->devinfo->gen < 8) {
    205                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
    206                abld.NOT(inverted, shifted);
    207             }
    208 
    209             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
    210              * with 1 and negating.
    211              */
    212             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
    213             abld.AND(anded, inverted, brw_imm_uw(1));
    214 
    215             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
    216             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
    217             *reg = dst;
    218          }
    219          break;
    220 
    221       default:
    222          break;
    223       }
    224    }
    225 
    226    return true;
    227 }
    228 
    229 void
    230 fs_visitor::nir_emit_system_values()
    231 {
    232    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
    233    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
    234       nir_system_values[i] = fs_reg();
    235    }
    236 
    237    nir_foreach_function(function, nir) {
    238       assert(strcmp(function->name, "main") == 0);
    239       assert(function->impl);
    240       nir_foreach_block(block, function->impl) {
    241          emit_system_values_block(block, this);
    242       }
    243    }
    244 }
    245 
    246 void
    247 fs_visitor::nir_emit_impl(nir_function_impl *impl)
    248 {
    249    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
    250    for (unsigned i = 0; i < impl->reg_alloc; i++) {
    251       nir_locals[i] = fs_reg();
    252    }
    253 
    254    foreach_list_typed(nir_register, reg, node, &impl->registers) {
    255       unsigned array_elems =
    256          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
    257       unsigned size = array_elems * reg->num_components;
    258       const brw_reg_type reg_type =
    259          reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
    260       nir_locals[reg->index] = bld.vgrf(reg_type, size);
    261    }
    262 
    263    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
    264                              impl->ssa_alloc);
    265 
    266    nir_emit_cf_list(&impl->body);
    267 }
    268 
    269 void
    270 fs_visitor::nir_emit_cf_list(exec_list *list)
    271 {
    272    exec_list_validate(list);
    273    foreach_list_typed(nir_cf_node, node, node, list) {
    274       switch (node->type) {
    275       case nir_cf_node_if:
    276          nir_emit_if(nir_cf_node_as_if(node));
    277          break;
    278 
    279       case nir_cf_node_loop:
    280          nir_emit_loop(nir_cf_node_as_loop(node));
    281          break;
    282 
    283       case nir_cf_node_block:
    284          nir_emit_block(nir_cf_node_as_block(node));
    285          break;
    286 
    287       default:
    288          unreachable("Invalid CFG node block");
    289       }
    290    }
    291 }
    292 
    293 void
    294 fs_visitor::nir_emit_if(nir_if *if_stmt)
    295 {
    296    /* first, put the condition into f0 */
    297    fs_inst *inst = bld.MOV(bld.null_reg_d(),
    298                             retype(get_nir_src(if_stmt->condition),
    299                                    BRW_REGISTER_TYPE_D));
    300    inst->conditional_mod = BRW_CONDITIONAL_NZ;
    301 
    302    bld.IF(BRW_PREDICATE_NORMAL);
    303 
    304    nir_emit_cf_list(&if_stmt->then_list);
    305 
    306    /* note: if the else is empty, dead CF elimination will remove it */
    307    bld.emit(BRW_OPCODE_ELSE);
    308 
    309    nir_emit_cf_list(&if_stmt->else_list);
    310 
    311    bld.emit(BRW_OPCODE_ENDIF);
    312 }
    313 
    314 void
    315 fs_visitor::nir_emit_loop(nir_loop *loop)
    316 {
    317    bld.emit(BRW_OPCODE_DO);
    318 
    319    nir_emit_cf_list(&loop->body);
    320 
    321    bld.emit(BRW_OPCODE_WHILE);
    322 }
    323 
    324 void
    325 fs_visitor::nir_emit_block(nir_block *block)
    326 {
    327    nir_foreach_instr(instr, block) {
    328       nir_emit_instr(instr);
    329    }
    330 }
    331 
    332 void
    333 fs_visitor::nir_emit_instr(nir_instr *instr)
    334 {
    335    const fs_builder abld = bld.annotate(NULL, instr);
    336 
    337    switch (instr->type) {
    338    case nir_instr_type_alu:
    339       nir_emit_alu(abld, nir_instr_as_alu(instr));
    340       break;
    341 
    342    case nir_instr_type_intrinsic:
    343       switch (stage) {
    344       case MESA_SHADER_VERTEX:
    345          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
    346          break;
    347       case MESA_SHADER_TESS_CTRL:
    348          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
    349          break;
    350       case MESA_SHADER_TESS_EVAL:
    351          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
    352          break;
    353       case MESA_SHADER_GEOMETRY:
    354          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
    355          break;
    356       case MESA_SHADER_FRAGMENT:
    357          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
    358          break;
    359       case MESA_SHADER_COMPUTE:
    360          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
    361          break;
    362       default:
    363          unreachable("unsupported shader stage");
    364       }
    365       break;
    366 
    367    case nir_instr_type_tex:
    368       nir_emit_texture(abld, nir_instr_as_tex(instr));
    369       break;
    370 
    371    case nir_instr_type_load_const:
    372       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
    373       break;
    374 
    375    case nir_instr_type_ssa_undef:
    376       /* We create a new VGRF for undefs on every use (by handling
    377        * them in get_nir_src()), rather than for each definition.
    378        * This helps register coalescing eliminate MOVs from undef.
    379        */
    380       break;
    381 
    382    case nir_instr_type_jump:
    383       nir_emit_jump(abld, nir_instr_as_jump(instr));
    384       break;
    385 
    386    default:
    387       unreachable("unknown instruction type");
    388    }
    389 }
    390 
    391 /**
    392  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
    393  * match instr.
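         *
         * For example, u2f(extract_u8(x, 2)) can then be emitted as a single
         * MOV that reads byte 2 of x directly instead of extracting it first.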
    394  */
    395 bool
    396 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
    397                                       const fs_reg &result)
    398 {
    399    if (!instr->src[0].src.is_ssa ||
    400        !instr->src[0].src.ssa->parent_instr)
    401       return false;
    402 
    403    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
    404       return false;
    405 
    406    nir_alu_instr *src0 =
    407       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
    408 
    409    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
    410        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
    411       return false;
    412 
    413    nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
    414    assert(element != NULL);
    415 
    416    /* Element type to extract. */
    417    const brw_reg_type type = brw_int_type(
    418       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
    419       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
    420 
    421    fs_reg op0 = get_nir_src(src0->src[0].src);
    422    op0.type = brw_type_for_nir_type(
    423       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
    424                      nir_src_bit_size(src0->src[0].src)));
    425    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
    426 
    427    set_saturate(instr->dest.saturate,
    428                 bld.MOV(result, subscript(op0, type, element->u32[0])));
    429    return true;
    430 }
    431 
    432 bool
    433 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
    434                                          const fs_reg &result)
    435 {
    436    if (!instr->src[0].src.is_ssa ||
    437        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
    438       return false;
    439 
    440    nir_intrinsic_instr *src0 =
    441       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
    442 
    443    if (src0->intrinsic != nir_intrinsic_load_front_face)
    444       return false;
    445 
    446    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
    447    if (!value1 || fabsf(value1->f32[0]) != 1.0f)
    448       return false;
    449 
    450    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
    451    if (!value2 || fabsf(value2->f32[0]) != 1.0f)
    452       return false;
    453 
    454    fs_reg tmp = vgrf(glsl_type::int_type);
    455 
    456    if (devinfo->gen >= 6) {
    457       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
    458       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
    459 
    460       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
    461        *
    462        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
    463        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
    464        *
    465        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
    466        *
    467        * This negation looks like it's safe in practice, because bits 0:4 will
    468        * surely be TRIANGLES
    469        */
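              /* The OR writes 0x3f80 (or 0xbf80 if the sign bit of g0.0 is
               * set) into the high word of tmp, and the AND keeps only the
               * sign and the bits of 1.0f, leaving exactly 0x3f800000 (1.0f)
               * or 0xbf800000 (-1.0f).
               */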
    470 
    471       if (value1->f32[0] == -1.0f) {
    472          g0.negate = true;
    473       }
    474 
    475       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
    476              g0, brw_imm_uw(0x3f80));
    477    } else {
    478       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
    479       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
    480 
    481       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
    482        *
    483        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
    484        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
    485        *
    486        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
    487        *
    488        * This negation looks like it's safe in practice, because bits 0:4 will
    489        * surely be TRIANGLES
    490        */
    491 
    492       if (value1->f32[0] == -1.0f) {
    493          g1_6.negate = true;
    494       }
    495 
    496       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
    497    }
    498    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
    499 
    500    return true;
    501 }
    502 
    503 static void
    504 emit_find_msb_using_lzd(const fs_builder &bld,
    505                         const fs_reg &result,
    506                         const fs_reg &src,
    507                         bool is_signed)
    508 {
    509    fs_inst *inst;
    510    fs_reg temp = src;
    511 
    512    if (is_signed) {
    513       /* LZD of an absolute value source almost always does the right
    514        * thing.  There are two problem values:
    515        * thing.  There are three problem values:
    516        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
    517        *   0.  However, findMSB(int(0x80000000)) == 30.
    518        *
    519        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
    520        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
    521        *
    522        *    For a value of zero or negative one, -1 will be returned.
    523        *
    524        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
    525        *   findMSB(-(1<<x)) should return x-1.
    526        *
    527        * For all negative number cases, including 0x80000000 and
    528        * 0xffffffff, the correct value is obtained from LZD if instead of
    529        * negating the (already negative) value the logical-not is used.  A
    530        * conditional logical-not can be achieved in two instructions.
    531        */
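              /* For example, src = -8: the ASR gives ~0, the XOR yields
               * ~(-8) = 7, LZD(7) is 29, and 31 - 29 = 2, which matches
               * findMSB(-8) (the position of the most significant 0 bit).
               */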
    532       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
    533 
    534       bld.ASR(temp, src, brw_imm_d(31));
    535       bld.XOR(temp, temp, src);
    536    }
    537 
    538    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
    539            retype(temp, BRW_REGISTER_TYPE_UD));
    540 
    541    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    542     * from the LSB side. Subtract the result from 31 to convert the MSB
    543     * count into an LSB count.  If no bits are set, LZD will return 32.
    544     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    545     */
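           /* For example, src = 0x10: LZD returns 27, and 31 - 27 = 4, which is
            * findMSB(0x10).
            */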
    546    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
    547    inst->src[0].negate = true;
    548 }
    549 
    550 void
    551 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    552 {
    553    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
    554    fs_inst *inst;
    555 
    556    fs_reg result = get_nir_dest(instr->dest.dest);
    557    result.type = brw_type_for_nir_type(
    558       (nir_alu_type)(nir_op_infos[instr->op].output_type |
    559                      nir_dest_bit_size(instr->dest.dest)));
    560 
    561    fs_reg op[4];
    562    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
    563       op[i] = get_nir_src(instr->src[i].src);
    564       op[i].type = brw_type_for_nir_type(
    565          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
    566                         nir_src_bit_size(instr->src[i].src)));
    567       op[i].abs = instr->src[i].abs;
    568       op[i].negate = instr->src[i].negate;
    569    }
    570 
    571    /* We get a bunch of MOVs out of the from_ssa pass and they may still
    572     * be vectorized.  We'll handle them as a special case.  We'll also
    573     * handle vecN here because it's basically the same thing.
    574     */
    575    switch (instr->op) {
    576    case nir_op_imov:
    577    case nir_op_fmov:
    578    case nir_op_vec2:
    579    case nir_op_vec3:
    580    case nir_op_vec4: {
    581       fs_reg temp = result;
    582       bool need_extra_copy = false;
    583       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
    584          if (!instr->src[i].src.is_ssa &&
    585              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
    586             need_extra_copy = true;
    587             temp = bld.vgrf(result.type, 4);
    588             break;
    589          }
    590       }
    591 
    592       for (unsigned i = 0; i < 4; i++) {
    593          if (!(instr->dest.write_mask & (1 << i)))
    594             continue;
    595 
    596          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
    597             inst = bld.MOV(offset(temp, bld, i),
    598                            offset(op[0], bld, instr->src[0].swizzle[i]));
    599          } else {
    600             inst = bld.MOV(offset(temp, bld, i),
    601                            offset(op[i], bld, instr->src[i].swizzle[0]));
    602          }
    603          inst->saturate = instr->dest.saturate;
    604       }
    605 
    606       /* In this case the source and destination registers were the same,
    607        * so we need to insert an extra set of moves in order to deal with
    608        * any swizzling.
    609        */
    610       if (need_extra_copy) {
    611          for (unsigned i = 0; i < 4; i++) {
    612             if (!(instr->dest.write_mask & (1 << i)))
    613                continue;
    614 
    615             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
    616          }
    617       }
    618       return;
    619    }
    620    default:
    621       break;
    622    }
    623 
    624    /* At this point, we have dealt with any instruction that operates on
    625     * more than a single channel.  Therefore, we can just adjust the source
    626     * and destination registers for that channel and emit the instruction.
    627     */
    628    unsigned channel = 0;
    629    if (nir_op_infos[instr->op].output_size == 0) {
    630       /* Since NIR is doing the scalarizing for us, we should only ever see
    631        * vectorized operations with a single channel.
    632        */
    633       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
    634       channel = ffs(instr->dest.write_mask) - 1;
    635 
    636       result = offset(result, bld, channel);
    637    }
    638 
    639    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
    640       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
    641       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
    642    }
    643 
    644    switch (instr->op) {
    645    case nir_op_i2f:
    646    case nir_op_u2f:
    647       if (optimize_extract_to_float(instr, result))
    648          return;
    649       inst = bld.MOV(result, op[0]);
    650       inst->saturate = instr->dest.saturate;
    651       break;
    652 
    653    case nir_op_f2d:
    654    case nir_op_i2d:
    655    case nir_op_u2d:
    656       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
    657        *
    658        *    "When source or destination is 64b (...), regioning in Align1
    659        *     must follow these rules:
    660        *
    661        *     1. Source and destination horizontal stride must be aligned to
    662        *        the same qword.
    663        *     (...)"
    664        *
    665        * This means that 32-bit to 64-bit conversions need to have the 32-bit
    666        * data elements aligned to 64-bit. This restriction does not apply to
    667        * BDW and later.
    668        */
    669       if (devinfo->is_cherryview || devinfo->is_broxton) {
    670          fs_reg tmp = bld.vgrf(result.type, 1);
    671          tmp = subscript(tmp, op[0].type, 0);
    672          inst = bld.MOV(tmp, op[0]);
    673          inst = bld.MOV(result, tmp);
    674          inst->saturate = instr->dest.saturate;
    675          break;
    676       }
    677       /* fallthrough */
    678    case nir_op_d2f:
    679    case nir_op_d2i:
    680    case nir_op_d2u:
    681       inst = bld.MOV(result, op[0]);
    682       inst->saturate = instr->dest.saturate;
    683       break;
    684 
    685    case nir_op_f2i:
    686    case nir_op_f2u:
    687       bld.MOV(result, op[0]);
    688       break;
    689 
    690    case nir_op_fsign: {
    691       if (type_sz(op[0].type) < 8) {
    692          /* AND(val, 0x80000000) gives the sign bit.
    693           *
    694           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
    695           * zero.
    696           */
    697          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
    698 
    699          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
    700          op[0].type = BRW_REGISTER_TYPE_UD;
    701          result.type = BRW_REGISTER_TYPE_UD;
    702          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
    703 
    704          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
    705          inst->predicate = BRW_PREDICATE_NORMAL;
    706          if (instr->dest.saturate) {
    707             inst = bld.MOV(result, result);
    708             inst->saturate = true;
    709          }
    710       } else {
    711          /* For doubles we do the same but we need to consider:
    712           *
    713           * - 2-src instructions can't operate with 64-bit immediates
    714           * - The sign is encoded in the high 32-bit of each DF
    715           * - CMP with DF requires special handling in SIMD16
    716           * - We need to produce a DF result.
    717           */
    718 
    719          /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
    720           * a register and compare with that.
    721           */
    722          fs_reg tmp = vgrf(glsl_type::double_type);
    723          bld.MOV(tmp, setup_imm_df(bld, 0.0));
    724 
    725          /* A direct DF CMP using the flag register (null dst) won't work in
    726           * SIMD16 because the CMP will be split in two by lower_simd_width,
    727           * resulting in two CMP instructions with the same dst (NULL),
    728           * leading to dead code elimination of the first one. In SIMD8,
    729           * however, there is no need to split the CMP and we can save some
    730           * work.
    731           */
    732          fs_reg dst_tmp = vgrf(glsl_type::double_type);
    733          bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
    734 
    735          /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
    736           * so we store the result of the comparison in a vgrf instead and
    737           * then we generate a UD comparison from it that won't have to
    738           * be split by lower_simd_width. This is what NIR does to handle
    739           * double comparisons in the general case.
    740           */
    741          if (bld.dispatch_width() == 16) {
    742             fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
    743             bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
    744             bld.CMP(bld.null_reg_ud(),
    745                     dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
    746          }
    747 
    748          /* Get the high 32-bit of each double component where the sign is */
    749          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
    750          bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
    751 
    752          /* Get the sign bit */
    753          bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
    754 
    755          /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
    756          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
    757          inst->predicate = BRW_PREDICATE_NORMAL;
    758 
    759          /* Convert from 32-bit float to 64-bit double */
    760          result.type = BRW_REGISTER_TYPE_DF;
    761          inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
    762 
    763          if (instr->dest.saturate) {
    764             inst = bld.MOV(result, result);
    765             inst->saturate = true;
    766          }
    767       }
    768       break;
    769    }
    770 
    771    case nir_op_isign:
    772       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
    773        *               -> non-negative val generates 0x00000000.
    774        *  Predicated OR sets 1 if val is positive.
    775        */
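              /* For example, val = 5: the CMP sets the predicate, the ASR
               * gives 0, and the predicated OR makes it 1.  For val = -5 the
               * predicate is false, so the ASR result of -1 is kept, and
               * val = 0 stays 0.
               */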
    776       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    777       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
    778       bld.ASR(result, op[0], brw_imm_d(31));
    779       inst = bld.OR(result, result, brw_imm_d(1));
    780       inst->predicate = BRW_PREDICATE_NORMAL;
    781       break;
    782 
    783    case nir_op_frcp:
    784       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
    785       inst->saturate = instr->dest.saturate;
    786       break;
    787 
    788    case nir_op_fexp2:
    789       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
    790       inst->saturate = instr->dest.saturate;
    791       break;
    792 
    793    case nir_op_flog2:
    794       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
    795       inst->saturate = instr->dest.saturate;
    796       break;
    797 
    798    case nir_op_fsin:
    799       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
    800       inst->saturate = instr->dest.saturate;
    801       break;
    802 
    803    case nir_op_fcos:
    804       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
    805       inst->saturate = instr->dest.saturate;
    806       break;
    807 
    808    case nir_op_fddx:
    809       if (fs_key->high_quality_derivatives) {
    810          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
    811       } else {
    812          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
    813       }
    814       inst->saturate = instr->dest.saturate;
    815       break;
    816    case nir_op_fddx_fine:
    817       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
    818       inst->saturate = instr->dest.saturate;
    819       break;
    820    case nir_op_fddx_coarse:
    821       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
    822       inst->saturate = instr->dest.saturate;
    823       break;
    824    case nir_op_fddy:
    825       if (fs_key->high_quality_derivatives) {
    826          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
    827       } else {
    828          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
    829       }
    830       inst->saturate = instr->dest.saturate;
    831       break;
    832    case nir_op_fddy_fine:
    833       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
    834       inst->saturate = instr->dest.saturate;
    835       break;
    836    case nir_op_fddy_coarse:
    837       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
    838       inst->saturate = instr->dest.saturate;
    839       break;
    840 
    841    case nir_op_iadd:
    842       assert(nir_dest_bit_size(instr->dest.dest) < 64);
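              /* fallthrough */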
    843    case nir_op_fadd:
    844       inst = bld.ADD(result, op[0], op[1]);
    845       inst->saturate = instr->dest.saturate;
    846       break;
    847 
    848    case nir_op_fmul:
    849       inst = bld.MUL(result, op[0], op[1]);
    850       inst->saturate = instr->dest.saturate;
    851       break;
    852 
    853    case nir_op_imul:
    854       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    855       bld.MUL(result, op[0], op[1]);
    856       break;
    857 
    858    case nir_op_imul_high:
    859    case nir_op_umul_high:
    860       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    861       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
    862       break;
    863 
    864    case nir_op_idiv:
    865    case nir_op_udiv:
    866       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    867       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
    868       break;
    869 
    870    case nir_op_uadd_carry:
    871       unreachable("Should have been lowered by carry_to_arith().");
    872 
    873    case nir_op_usub_borrow:
    874       unreachable("Should have been lowered by borrow_to_arith().");
    875 
    876    case nir_op_umod:
    877    case nir_op_irem:
    878       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
    879        * appears that our hardware just does the right thing for signed
    880        * remainder.
    881        */
    882       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    883       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
    884       break;
    885 
    886    case nir_op_imod: {
    887       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
    888       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
    889 
    890       /* Math instructions don't support conditional mod */
    891       inst = bld.MOV(bld.null_reg_d(), result);
    892       inst->conditional_mod = BRW_CONDITIONAL_NZ;
    893 
    894       /* Now, we need to determine if the signs of the sources are different.
    895        * When we XOR the sources, the top bit is 0 if they are the same and 1
    896        * if they are different.  We can then use a conditional modifier to
    897        * turn that into a predicate.  This leads us to an XOR.l instruction.
    898        *
    899        * Technically, according to the PRM, you're not allowed to use .l on an
    900        * XOR instruction.  However, empirical experiments and Curro's reading
    901        * of the simulator source both indicate that it's safe.
    902        */
    903       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
    904       inst = bld.XOR(tmp, op[0], op[1]);
    905       inst->predicate = BRW_PREDICATE_NORMAL;
    906       inst->conditional_mod = BRW_CONDITIONAL_L;
    907 
    908       /* If the result of the initial remainder operation is non-zero and the
    909        * two sources have different signs, add in a copy of op[1] to get the
    910        * final integer modulus value.
    911        */
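              /* For example, imod(-7, 3): the remainder is -1 (non-zero) and
               * the signs differ, so the predicated ADD yields -1 + 3 = 2.
               */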
    912       inst = bld.ADD(result, result, op[1]);
    913       inst->predicate = BRW_PREDICATE_NORMAL;
    914       break;
    915    }
    916 
    917    case nir_op_flt:
    918    case nir_op_fge:
    919    case nir_op_feq:
    920    case nir_op_fne: {
    921       fs_reg dest = result;
    922       if (nir_src_bit_size(instr->src[0].src) > 32) {
    923          dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
    924       }
    925       brw_conditional_mod cond;
    926       switch (instr->op) {
    927       case nir_op_flt:
    928          cond = BRW_CONDITIONAL_L;
    929          break;
    930       case nir_op_fge:
    931          cond = BRW_CONDITIONAL_GE;
    932          break;
    933       case nir_op_feq:
    934          cond = BRW_CONDITIONAL_Z;
    935          break;
    936       case nir_op_fne:
    937          cond = BRW_CONDITIONAL_NZ;
    938          break;
    939       default:
    940          unreachable("bad opcode");
    941       }
    942       bld.CMP(dest, op[0], op[1], cond);
    943       if (nir_src_bit_size(instr->src[0].src) > 32) {
    944          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
    945       }
    946       break;
    947    }
    948 
    949    case nir_op_ilt:
    950    case nir_op_ult:
    951       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    952       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
    953       break;
    954 
    955    case nir_op_ige:
    956    case nir_op_uge:
    957       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    958       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
    959       break;
    960 
    961    case nir_op_ieq:
    962       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    963       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
    964       break;
    965 
    966    case nir_op_ine:
    967       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    968       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
    969       break;
    970 
    971    case nir_op_inot:
    972       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    973       if (devinfo->gen >= 8) {
    974          op[0] = resolve_source_modifiers(op[0]);
    975       }
    976       bld.NOT(result, op[0]);
    977       break;
    978    case nir_op_ixor:
    979       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    980       if (devinfo->gen >= 8) {
    981          op[0] = resolve_source_modifiers(op[0]);
    982          op[1] = resolve_source_modifiers(op[1]);
    983       }
    984       bld.XOR(result, op[0], op[1]);
    985       break;
    986    case nir_op_ior:
    987       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    988       if (devinfo->gen >= 8) {
    989          op[0] = resolve_source_modifiers(op[0]);
    990          op[1] = resolve_source_modifiers(op[1]);
    991       }
    992       bld.OR(result, op[0], op[1]);
    993       break;
    994    case nir_op_iand:
    995       assert(nir_dest_bit_size(instr->dest.dest) < 64);
    996       if (devinfo->gen >= 8) {
    997          op[0] = resolve_source_modifiers(op[0]);
    998          op[1] = resolve_source_modifiers(op[1]);
    999       }
   1000       bld.AND(result, op[0], op[1]);
   1001       break;
   1002 
   1003    case nir_op_fdot2:
   1004    case nir_op_fdot3:
   1005    case nir_op_fdot4:
   1006    case nir_op_ball_fequal2:
   1007    case nir_op_ball_iequal2:
   1008    case nir_op_ball_fequal3:
   1009    case nir_op_ball_iequal3:
   1010    case nir_op_ball_fequal4:
   1011    case nir_op_ball_iequal4:
   1012    case nir_op_bany_fnequal2:
   1013    case nir_op_bany_inequal2:
   1014    case nir_op_bany_fnequal3:
   1015    case nir_op_bany_inequal3:
   1016    case nir_op_bany_fnequal4:
   1017    case nir_op_bany_inequal4:
   1018       unreachable("Lowered by nir_lower_alu_reductions");
   1019 
   1020    case nir_op_fnoise1_1:
   1021    case nir_op_fnoise1_2:
   1022    case nir_op_fnoise1_3:
   1023    case nir_op_fnoise1_4:
   1024    case nir_op_fnoise2_1:
   1025    case nir_op_fnoise2_2:
   1026    case nir_op_fnoise2_3:
   1027    case nir_op_fnoise2_4:
   1028    case nir_op_fnoise3_1:
   1029    case nir_op_fnoise3_2:
   1030    case nir_op_fnoise3_3:
   1031    case nir_op_fnoise3_4:
   1032    case nir_op_fnoise4_1:
   1033    case nir_op_fnoise4_2:
   1034    case nir_op_fnoise4_3:
   1035    case nir_op_fnoise4_4:
   1036       unreachable("not reached: should be handled by lower_noise");
   1037 
   1038    case nir_op_ldexp:
   1039       unreachable("not reached: should be handled by ldexp_to_arith()");
   1040 
   1041    case nir_op_fsqrt:
   1042       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
   1043       inst->saturate = instr->dest.saturate;
   1044       break;
   1045 
   1046    case nir_op_frsq:
   1047       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
   1048       inst->saturate = instr->dest.saturate;
   1049       break;
   1050 
   1051    case nir_op_b2i:
   1052    case nir_op_b2f:
   1053       bld.MOV(result, negate(op[0]));
   1054       break;
   1055 
   1056    case nir_op_f2b:
   1057       bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
   1058       break;
   1059    case nir_op_d2b: {
   1060       /* two-argument instructions can't take 64-bit immediates */
   1061       fs_reg zero = vgrf(glsl_type::double_type);
   1062       bld.MOV(zero, setup_imm_df(bld, 0.0));
   1063       /* A SIMD16 execution needs to be split into two instructions, so use
   1064        * a vgrf instead of the flag register as dst so instruction splitting
   1065        * works.
   1066        */
   1067       fs_reg tmp = vgrf(glsl_type::double_type);
   1068       bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
   1069       bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
   1070       break;
   1071    }
   1072    case nir_op_i2b:
   1073       bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
   1074       break;
   1075 
   1076    case nir_op_ftrunc:
   1077       inst = bld.RNDZ(result, op[0]);
   1078       inst->saturate = instr->dest.saturate;
   1079       break;
   1080 
   1081    case nir_op_fceil: {
   1082       op[0].negate = !op[0].negate;
   1083       fs_reg temp = vgrf(glsl_type::float_type);
   1084       bld.RNDD(temp, op[0]);
   1085       temp.negate = true;
   1086       inst = bld.MOV(result, temp);
   1087       inst->saturate = instr->dest.saturate;
   1088       break;
   1089    }
   1090    case nir_op_ffloor:
   1091       inst = bld.RNDD(result, op[0]);
   1092       inst->saturate = instr->dest.saturate;
   1093       break;
   1094    case nir_op_ffract:
   1095       inst = bld.FRC(result, op[0]);
   1096       inst->saturate = instr->dest.saturate;
   1097       break;
   1098    case nir_op_fround_even:
   1099       inst = bld.RNDE(result, op[0]);
   1100       inst->saturate = instr->dest.saturate;
   1101       break;
   1102 
   1103    case nir_op_fquantize2f16: {
   1104       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
   1105       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
   1106       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
   1107 
   1108       /* The destination stride must be at least as big as the source stride. */
   1109       tmp16.type = BRW_REGISTER_TYPE_W;
   1110       tmp16.stride = 2;
   1111 
   1112       /* Check for denormal */
   1113       fs_reg abs_src0 = op[0];
   1114       abs_src0.abs = true;
   1115       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
   1116               BRW_CONDITIONAL_L);
   1117       /* Get the appropriately signed zero */
   1118       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
   1119               retype(op[0], BRW_REGISTER_TYPE_UD),
   1120               brw_imm_ud(0x80000000));
   1121       /* Do the actual F32 -> F16 -> F32 conversion */
   1122       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
   1123       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
   1124       /* Select that or zero based on normal status */
   1125       inst = bld.SEL(result, zero, tmp32);
   1126       inst->predicate = BRW_PREDICATE_NORMAL;
   1127       inst->saturate = instr->dest.saturate;
   1128       break;
   1129    }
   1130 
   1131    case nir_op_imin:
   1132    case nir_op_umin:
   1133       assert(nir_dest_bit_size(instr->dest.dest) < 64);
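              /* fallthrough */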
   1134    case nir_op_fmin:
   1135       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
   1136       inst->saturate = instr->dest.saturate;
   1137       break;
   1138 
   1139    case nir_op_imax:
   1140    case nir_op_umax:
   1141       assert(nir_dest_bit_size(instr->dest.dest) < 64);
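              /* fallthrough */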
   1142    case nir_op_fmax:
   1143       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
   1144       inst->saturate = instr->dest.saturate;
   1145       break;
   1146 
   1147    case nir_op_pack_snorm_2x16:
   1148    case nir_op_pack_snorm_4x8:
   1149    case nir_op_pack_unorm_2x16:
   1150    case nir_op_pack_unorm_4x8:
   1151    case nir_op_unpack_snorm_2x16:
   1152    case nir_op_unpack_snorm_4x8:
   1153    case nir_op_unpack_unorm_2x16:
   1154    case nir_op_unpack_unorm_4x8:
   1155    case nir_op_unpack_half_2x16:
   1156    case nir_op_pack_half_2x16:
   1157       unreachable("not reached: should be handled by lower_packing_builtins");
   1158 
   1159    case nir_op_unpack_half_2x16_split_x:
   1160       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
   1161       inst->saturate = instr->dest.saturate;
   1162       break;
   1163    case nir_op_unpack_half_2x16_split_y:
   1164       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
   1165       inst->saturate = instr->dest.saturate;
   1166       break;
   1167 
   1168    case nir_op_pack_double_2x32_split:
   1169       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
   1170       break;
   1171 
   1172    case nir_op_unpack_double_2x32_split_x:
   1173    case nir_op_unpack_double_2x32_split_y: {
   1174       /* Optimize the common case where we are unpacking from a double we have
   1175        * previously packed. In this case we can just bypass the pack operation
   1176        * and source directly from its arguments.
   1177        */
   1178       unsigned index = (instr->op == nir_op_unpack_double_2x32_split_x) ? 0 : 1;
   1179       if (instr->src[0].src.is_ssa) {
   1180          nir_instr *parent_instr = instr->src[0].src.ssa->parent_instr;
   1181          if (parent_instr->type == nir_instr_type_alu) {
   1182             nir_alu_instr *alu_parent = nir_instr_as_alu(parent_instr);
   1183             if (alu_parent->op == nir_op_pack_double_2x32_split &&
   1184                 alu_parent->src[index].src.is_ssa) {
   1185                op[0] = retype(get_nir_src(alu_parent->src[index].src),
   1186                               BRW_REGISTER_TYPE_UD);
   1187                op[0] =
   1188                   offset(op[0], bld, alu_parent->src[index].swizzle[channel]);
   1189                bld.MOV(result, op[0]);
   1190                break;
   1191             }
   1192          }
   1193       }
   1194 
   1195       if (instr->op == nir_op_unpack_double_2x32_split_x)
   1196          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
   1197       else
   1198          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
   1199       break;
   1200    }
   1201 
   1202    case nir_op_fpow:
   1203       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
   1204       inst->saturate = instr->dest.saturate;
   1205       break;
   1206 
   1207    case nir_op_bitfield_reverse:
   1208       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1209       bld.BFREV(result, op[0]);
   1210       break;
   1211 
   1212    case nir_op_bit_count:
   1213       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1214       bld.CBIT(result, op[0]);
   1215       break;
   1216 
   1217    case nir_op_ufind_msb: {
   1218       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1219       emit_find_msb_using_lzd(bld, result, op[0], false);
   1220       break;
   1221    }
   1222 
   1223    case nir_op_ifind_msb: {
   1224       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1225 
   1226       if (devinfo->gen < 7) {
   1227          emit_find_msb_using_lzd(bld, result, op[0], true);
   1228       } else {
   1229          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
   1230 
   1231          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
   1232           * count from the LSB side. If FBH didn't return an error
   1233           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
   1234           * count into an LSB count.
   1235           */
   1236          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
   1237 
   1238          inst = bld.ADD(result, result, brw_imm_d(31));
   1239          inst->predicate = BRW_PREDICATE_NORMAL;
   1240          inst->src[0].negate = true;
   1241       }
   1242       break;
   1243    }
   1244 
   1245    case nir_op_find_lsb:
   1246       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1247 
   1248       if (devinfo->gen < 7) {
   1249          fs_reg temp = vgrf(glsl_type::int_type);
   1250 
   1251          /* (x & -x) generates a value that consists of only the LSB of x.
   1252           * For all powers of 2, findMSB(y) == findLSB(y).
   1253           */
   1254          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
   1255          fs_reg negated_src = src;
   1256 
   1257          /* One must be negated, and the other must be non-negated.  It
   1258           * doesn't matter which is which.
   1259           */
   1260          negated_src.negate = true;
   1261          src.negate = false;
   1262 
   1263          bld.AND(temp, src, negated_src);
   1264          emit_find_msb_using_lzd(bld, result, temp, false);
   1265       } else {
   1266          bld.FBL(result, op[0]);
   1267       }
   1268       break;
   1269 
   1270    case nir_op_ubitfield_extract:
   1271    case nir_op_ibitfield_extract:
   1272       unreachable("should have been lowered");
   1273    case nir_op_ubfe:
   1274    case nir_op_ibfe:
   1275       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1276       bld.BFE(result, op[2], op[1], op[0]);
   1277       break;
   1278    case nir_op_bfm:
   1279       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1280       bld.BFI1(result, op[0], op[1]);
   1281       break;
   1282    case nir_op_bfi:
   1283       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1284       bld.BFI2(result, op[0], op[1], op[2]);
   1285       break;
   1286 
   1287    case nir_op_bitfield_insert:
   1288       unreachable("not reached: should have been lowered");
   1289 
   1290    case nir_op_ishl:
   1291       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1292       bld.SHL(result, op[0], op[1]);
   1293       break;
   1294    case nir_op_ishr:
   1295       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1296       bld.ASR(result, op[0], op[1]);
   1297       break;
   1298    case nir_op_ushr:
   1299       assert(nir_dest_bit_size(instr->dest.dest) < 64);
   1300       bld.SHR(result, op[0], op[1]);
   1301       break;
   1302 
   1303    case nir_op_pack_half_2x16_split:
   1304       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
   1305       break;
   1306 
   1307    case nir_op_ffma:
   1308       inst = bld.MAD(result, op[2], op[1], op[0]);
   1309       inst->saturate = instr->dest.saturate;
   1310       break;
   1311 
   1312    case nir_op_flrp:
   1313       inst = bld.LRP(result, op[0], op[1], op[2]);
   1314       inst->saturate = instr->dest.saturate;
   1315       break;
   1316 
   1317    case nir_op_bcsel:
   1318       if (optimize_frontfacing_ternary(instr, result))
   1319          return;
   1320 
   1321       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
   1322       inst = bld.SEL(result, op[1], op[2]);
   1323       inst->predicate = BRW_PREDICATE_NORMAL;
   1324       break;
   1325 
   1326    case nir_op_extract_u8:
   1327    case nir_op_extract_i8: {
   1328       const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
   1329       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
   1330       assert(byte != NULL);
   1331       bld.MOV(result, subscript(op[0], type, byte->u32[0]));
   1332       break;
   1333    }
   1334 
   1335    case nir_op_extract_u16:
   1336    case nir_op_extract_i16: {
   1337       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
   1338       nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
   1339       assert(word != NULL);
   1340       bld.MOV(result, subscript(op[0], type, word->u32[0]));
   1341       break;
   1342    }
   1343 
   1344    default:
   1345       unreachable("unhandled instruction");
   1346    }
   1347 
   1348    /* If we need to do a boolean resolve, replace the result with -(x & 1)
   1349     * to sign extend the low bit to 0/~0
   1350     */
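           /* For example, a result with its low bit set becomes -(1) = ~0,
            * while a result with a clear low bit stays 0.
            */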
   1351    if (devinfo->gen <= 5 &&
   1352        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
   1353       fs_reg masked = vgrf(glsl_type::int_type);
   1354       bld.AND(masked, result, brw_imm_d(1));
   1355       masked.negate = true;
   1356       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
   1357    }
   1358 }
   1359 
   1360 void
   1361 fs_visitor::nir_emit_load_const(const fs_builder &bld,
   1362                                 nir_load_const_instr *instr)
   1363 {
   1364    const brw_reg_type reg_type =
   1365       instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
   1366    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
   1367 
   1368    switch (instr->def.bit_size) {
   1369    case 32:
   1370       for (unsigned i = 0; i < instr->def.num_components; i++)
   1371          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
   1372       break;
   1373 
   1374    case 64:
   1375       for (unsigned i = 0; i < instr->def.num_components; i++)
   1376          bld.MOV(offset(reg, bld, i),
   1377                  setup_imm_df(bld, instr->value.f64[i]));
   1378       break;
   1379 
   1380    default:
   1381       unreachable("Invalid bit size");
   1382    }
   1383 
   1384    nir_ssa_values[instr->def.index] = reg;
   1385 }
   1386 
   1387 fs_reg
   1388 fs_visitor::get_nir_src(const nir_src &src)
   1389 {
   1390    fs_reg reg;
   1391    if (src.is_ssa) {
   1392       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
   1393          const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
   1394             BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
   1395          reg = bld.vgrf(reg_type, src.ssa->num_components);
   1396       } else {
   1397          reg = nir_ssa_values[src.ssa->index];
   1398       }
   1399    } else {
   1400       /* We don't handle indirects on locals */
   1401       assert(src.reg.indirect == NULL);
   1402       reg = offset(nir_locals[src.reg.reg->index], bld,
   1403                    src.reg.base_offset * src.reg.reg->num_components);
   1404    }
   1405 
    1406    /* To avoid floating-point denorm flushing problems, default the type to
    1407     * D.  Instructions that need floating-point semantics will retype this to
    1408     * F themselves.
   1409     */
   1410    return retype(reg, BRW_REGISTER_TYPE_D);
   1411 }
   1412 
   1413 /**
   1414  * Return an IMM for constants; otherwise call get_nir_src() as normal.
   1415  */
   1416 fs_reg
   1417 fs_visitor::get_nir_src_imm(const nir_src &src)
   1418 {
   1419    nir_const_value *val = nir_src_as_const_value(src);
   1420    return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
   1421 }
   1422 
   1423 fs_reg
   1424 fs_visitor::get_nir_dest(const nir_dest &dest)
   1425 {
   1426    if (dest.is_ssa) {
   1427       const brw_reg_type reg_type =
   1428          dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
   1429       nir_ssa_values[dest.ssa.index] =
   1430          bld.vgrf(reg_type, dest.ssa.num_components);
   1431       return nir_ssa_values[dest.ssa.index];
   1432    } else {
   1433       /* We don't handle indirects on locals */
   1434       assert(dest.reg.indirect == NULL);
   1435       return offset(nir_locals[dest.reg.reg->index], bld,
   1436                     dest.reg.base_offset * dest.reg.reg->num_components);
   1437    }
   1438 }
   1439 
   1440 fs_reg
   1441 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
   1442 {
   1443    fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
   1444                 BRW_REGISTER_TYPE_UD);
   1445    fs_reg indirect;
   1446    unsigned indirect_max = 0;
   1447 
   1448    for (const nir_deref *tail = &deref->deref; tail->child;
   1449         tail = tail->child) {
   1450       const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
   1451       assert(tail->child->deref_type == nir_deref_type_array);
   1452       const unsigned size = glsl_get_length(tail->type);
   1453       const unsigned element_size = type_size_scalar(deref_array->deref.type);
   1454       const unsigned base = MIN2(deref_array->base_offset, size - 1);
   1455       image = offset(image, bld, base * element_size);
   1456 
   1457       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
   1458          fs_reg tmp = vgrf(glsl_type::uint_type);
   1459 
   1460          /* Accessing an invalid surface index with the dataport can result
   1461           * in a hang.  According to the spec "if the index used to
   1462           * select an individual element is negative or greater than or
   1463           * equal to the size of the array, the results of the operation
   1464           * are undefined but may not lead to termination" -- which is one
   1465           * of the possible outcomes of the hang.  Clamp the index to
   1466           * prevent access outside of the array bounds.
   1467           */
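                  /* Illustrative example (not from the original source): for an
                   * image array of length 4 with a constant base offset of 0,
                   * indexed by a dynamic value i, the MIN below yields
                   * min(i, 3), so even an out-of-range i selects the last valid
                   * element rather than an invalid surface index.
                   */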
   1468          bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
   1469                                      BRW_REGISTER_TYPE_UD),
   1470                          brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
   1471 
   1472          indirect_max += element_size * (tail->type->length - 1);
   1473 
   1474          bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
   1475          if (indirect.file == BAD_FILE) {
   1476             indirect = tmp;
   1477          } else {
   1478             bld.ADD(indirect, indirect, tmp);
   1479          }
   1480       }
   1481    }
   1482 
   1483    if (indirect.file == BAD_FILE) {
   1484       return image;
   1485    } else {
   1486       /* Emit a pile of MOVs to load the uniform into a temporary.  The
   1487        * dead-code elimination pass will get rid of what we don't use.
   1488        */
   1489       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
   1490       for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
   1491          bld.emit(SHADER_OPCODE_MOV_INDIRECT,
   1492                   offset(tmp, bld, j), offset(image, bld, j),
   1493                   indirect, brw_imm_ud((indirect_max + 1) * 4));
   1494       }
   1495       return tmp;
   1496    }
   1497 }
   1498 
   1499 void
   1500 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
   1501                          unsigned wr_mask)
   1502 {
   1503    for (unsigned i = 0; i < 4; i++) {
   1504       if (!((wr_mask >> i) & 1))
   1505          continue;
   1506 
   1507       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
   1508       new_inst->dst = offset(new_inst->dst, bld, i);
   1509       for (unsigned j = 0; j < new_inst->sources; j++)
   1510          if (new_inst->src[j].file == VGRF)
   1511             new_inst->src[j] = offset(new_inst->src[j], bld, i);
   1512 
   1513       bld.emit(new_inst);
   1514    }
   1515 }
   1516 
   1517 /**
   1518  * Get the matching channel register datatype for an image intrinsic of the
   1519  * specified GLSL image type.
   1520  */
   1521 static brw_reg_type
   1522 get_image_base_type(const glsl_type *type)
   1523 {
   1524    switch ((glsl_base_type)type->sampled_type) {
   1525    case GLSL_TYPE_UINT:
   1526       return BRW_REGISTER_TYPE_UD;
   1527    case GLSL_TYPE_INT:
   1528       return BRW_REGISTER_TYPE_D;
   1529    case GLSL_TYPE_FLOAT:
   1530       return BRW_REGISTER_TYPE_F;
   1531    default:
   1532       unreachable("Not reached.");
   1533    }
   1534 }
   1535 
   1536 /**
   1537  * Get the appropriate atomic op for an image atomic intrinsic.
   1538  */
   1539 static unsigned
   1540 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
   1541 {
   1542    switch (op) {
   1543    case nir_intrinsic_image_atomic_add:
   1544       return BRW_AOP_ADD;
   1545    case nir_intrinsic_image_atomic_min:
   1546       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
   1547               BRW_AOP_IMIN : BRW_AOP_UMIN);
   1548    case nir_intrinsic_image_atomic_max:
   1549       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
   1550               BRW_AOP_IMAX : BRW_AOP_UMAX);
   1551    case nir_intrinsic_image_atomic_and:
   1552       return BRW_AOP_AND;
   1553    case nir_intrinsic_image_atomic_or:
   1554       return BRW_AOP_OR;
   1555    case nir_intrinsic_image_atomic_xor:
   1556       return BRW_AOP_XOR;
   1557    case nir_intrinsic_image_atomic_exchange:
   1558       return BRW_AOP_MOV;
   1559    case nir_intrinsic_image_atomic_comp_swap:
   1560       return BRW_AOP_CMPWR;
   1561    default:
   1562       unreachable("Not reachable.");
   1563    }
   1564 }
   1565 
   1566 static fs_inst *
   1567 emit_pixel_interpolater_send(const fs_builder &bld,
   1568                              enum opcode opcode,
   1569                              const fs_reg &dst,
   1570                              const fs_reg &src,
   1571                              const fs_reg &desc,
   1572                              glsl_interp_mode interpolation)
   1573 {
   1574    struct brw_wm_prog_data *wm_prog_data =
   1575       brw_wm_prog_data(bld.shader->stage_prog_data);
   1576    fs_inst *inst;
   1577    fs_reg payload;
   1578    int mlen;
   1579 
   1580    if (src.file == BAD_FILE) {
   1581       /* Dummy payload */
   1582       payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
   1583       mlen = 1;
   1584    } else {
   1585       payload = src;
   1586       mlen = 2 * bld.dispatch_width() / 8;
   1587    }
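            /* Illustrative example (not from the original source): at SIMD16 a
             * real payload carries two 32-bit values per channel, i.e.
             * 2 * 16 * 4 = 128 bytes = 4 GRFs, so mlen = 2 * 16 / 8 = 4; the
             * dummy payload above is always a single GRF (mlen = 1).
             */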
   1588 
   1589    inst = bld.emit(opcode, dst, payload, desc);
   1590    inst->mlen = mlen;
   1591    /* 2 floats per slot returned */
   1592    inst->size_written = 2 * dst.component_size(inst->exec_size);
   1593    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
   1594 
   1595    wm_prog_data->pulls_bary = true;
   1596 
   1597    return inst;
   1598 }
   1599 
   1600 /**
   1601  * Computes 1 << x, given a D/UD register containing some value x.
   1602  */
   1603 static fs_reg
   1604 intexp2(const fs_builder &bld, const fs_reg &x)
   1605 {
   1606    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
   1607 
   1608    fs_reg result = bld.vgrf(x.type, 1);
   1609    fs_reg one = bld.vgrf(x.type, 1);
   1610 
   1611    bld.MOV(one, retype(brw_imm_d(1), one.type));
   1612    bld.SHL(result, one, x);
   1613    return result;
   1614 }
   1615 
   1616 void
   1617 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
   1618 {
   1619    assert(stage == MESA_SHADER_GEOMETRY);
   1620 
   1621    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   1622 
   1623    if (gs_compile->control_data_header_size_bits == 0)
   1624       return;
   1625 
   1626    /* We can only do EndPrimitive() functionality when the control data
   1627     * consists of cut bits.  Fortunately, the only time it isn't is when the
   1628     * output type is points, in which case EndPrimitive() is a no-op.
   1629     */
   1630    if (gs_prog_data->control_data_format !=
   1631        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
   1632       return;
   1633    }
   1634 
   1635    /* Cut bits use one bit per vertex. */
   1636    assert(gs_compile->control_data_bits_per_vertex == 1);
   1637 
   1638    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
   1639    vertex_count.type = BRW_REGISTER_TYPE_UD;
   1640 
   1641    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
   1642     * vertex n, 0 otherwise.  So all we need to do here is mark bit
   1643     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
   1644     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
   1645     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
   1646     *
   1647     * Note that if EndPrimitive() is called before emitting any vertices, this
   1648     * will cause us to set bit 31 of the control_data_bits register to 1.
   1649     * That's fine because:
   1650     *
   1651     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
   1652     *   output, so the hardware will ignore cut bit 31.
   1653     *
   1654     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
   1655     *   last vertex, so setting cut bit 31 has no effect (since the primitive
   1656     *   is automatically ended when the GS terminates).
   1657     *
   1658     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
   1659     *   control_data_bits register to 0 when the first vertex is emitted.
   1660     */
   1661 
   1662    const fs_builder abld = bld.annotate("end primitive");
   1663 
   1664    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
   1665    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1666    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
   1667    fs_reg mask = intexp2(abld, prev_count);
   1668    /* Note: we're relying on the fact that the GEN SHL instruction only pays
   1669     * attention to the lower 5 bits of its second source argument, so on this
   1670     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
   1671     * ((vertex_count - 1) % 32).
   1672     */
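            /* Worked example (illustrative, not from the original source): if
             * EndPrimitive() follows the 33rd emitted vertex, vertex_count = 33
             * and prev_count = 32, so the SHL computes 1 << 32, which the
             * hardware treats as 1 << (32 & 31) = 1 << 0 and sets cut bit 0 of
             * the DWord currently being accumulated.
             */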
   1673    abld.OR(this->control_data_bits, this->control_data_bits, mask);
   1674 }
   1675 
   1676 void
   1677 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
   1678 {
   1679    assert(stage == MESA_SHADER_GEOMETRY);
   1680    assert(gs_compile->control_data_bits_per_vertex != 0);
   1681 
   1682    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   1683 
   1684    const fs_builder abld = bld.annotate("emit control data bits");
   1685    const fs_builder fwa_bld = bld.exec_all();
   1686 
   1687    /* We use a single UD register to accumulate control data bits (32 bits
   1688     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
   1689     * at a time.
   1690     *
   1691     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
    1692     * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
   1693     * use the Channel Mask phase to enable/disable which DWord within that
   1694     * group to write.  (Remember, different SIMD8 channels may have emitted
   1695     * different numbers of vertices, so we may need per-slot offsets.)
   1696     *
   1697     * Channel masking presents an annoying problem: we may have to replicate
   1698     * the data up to 4 times:
   1699     *
   1700     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
   1701     *
   1702     * To avoid penalizing shaders that emit a small number of vertices, we
   1703     * can avoid these sometimes: if the size of the control data header is
   1704     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
    1705     * in the same 128-bit group, so we can skip per-slot offsets.
   1706     *
   1707     * Similarly, if the control data header is <= 32 bits, there is only one
   1708     * DWord, so we can skip channel masks.
   1709     */
   1710    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
   1711 
   1712    fs_reg channel_mask, per_slot_offset;
   1713 
   1714    if (gs_compile->control_data_header_size_bits > 32) {
   1715       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
   1716       channel_mask = vgrf(glsl_type::uint_type);
   1717    }
   1718 
   1719    if (gs_compile->control_data_header_size_bits > 128) {
   1720       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
   1721       per_slot_offset = vgrf(glsl_type::uint_type);
   1722    }
   1723 
   1724    /* Figure out which DWord we're trying to write to using the formula:
   1725     *
   1726     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
   1727     *
   1728     * Since bits_per_vertex is a power of two, and is known at compile
   1729     * time, this can be optimized to:
   1730     *
    1731     *    dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
   1732     */
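            /* Worked example (illustrative, not from the original source): with
             * bits_per_vertex = 2 and vertex_count = 65, prev_count = 64 and
             * dword_index = 64 >> (5 - log2(2)) = 64 >> 4 = 4.  The code below
             * computes the shift count as 6 - util_last_bit(2) = 4, which is
             * the same value since util_last_bit(2^n) == n + 1.  It then
             * derives per_slot_offset = 4 >> 2 = 1 (the second OWord) and
             * channel_mask = 1 << (4 & 3) = 1, later shifted into bits 23:16.
             */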
   1733    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
   1734       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1735       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1736       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
   1737       unsigned log2_bits_per_vertex =
   1738          util_last_bit(gs_compile->control_data_bits_per_vertex);
   1739       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
   1740 
   1741       if (per_slot_offset.file != BAD_FILE) {
   1742          /* Set the per-slot offset to dword_index / 4, so that we'll write to
   1743           * the appropriate OWord within the control data header.
   1744           */
   1745          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
   1746       }
   1747 
   1748       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
   1749        * write to the appropriate DWORD within the OWORD.
   1750        */
   1751       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1752       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
   1753       channel_mask = intexp2(fwa_bld, channel);
   1754       /* Then the channel masks need to be in bits 23:16. */
   1755       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
   1756    }
   1757 
   1758    /* Store the control data bits in the message payload and send it. */
   1759    int mlen = 2;
   1760    if (channel_mask.file != BAD_FILE)
   1761       mlen += 4; /* channel masks, plus 3 extra copies of the data */
   1762    if (per_slot_offset.file != BAD_FILE)
   1763       mlen++;
   1764 
   1765    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
   1766    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
   1767    int i = 0;
   1768    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
   1769    if (per_slot_offset.file != BAD_FILE)
   1770       sources[i++] = per_slot_offset;
   1771    if (channel_mask.file != BAD_FILE)
   1772       sources[i++] = channel_mask;
   1773    while (i < mlen) {
   1774       sources[i++] = this->control_data_bits;
   1775    }
   1776 
   1777    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
   1778    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
   1779    inst->mlen = mlen;
    1780    /* We need to increment Global Offset by 256 bits to make room for
   1781     * Broadwell's extra "Vertex Count" payload at the beginning of the
   1782     * URB entry.  Since this is an OWord message, Global Offset is counted
   1783     * in 128-bit units, so we must set it to 2.
   1784     */
   1785    if (gs_prog_data->static_vertex_count == -1)
   1786       inst->offset = 2;
   1787 }
   1788 
   1789 void
   1790 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
   1791                                             unsigned stream_id)
   1792 {
   1793    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
   1794 
   1795    /* Note: we are calling this *before* increasing vertex_count, so
   1796     * this->vertex_count == vertex_count - 1 in the formula above.
   1797     */
   1798 
   1799    /* Stream mode uses 2 bits per vertex */
   1800    assert(gs_compile->control_data_bits_per_vertex == 2);
   1801 
   1802    /* Must be a valid stream */
   1803    assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
   1804 
   1805    /* Control data bits are initialized to 0 so we don't have to set any
   1806     * bits when sending vertices to stream 0.
   1807     */
   1808    if (stream_id == 0)
   1809       return;
   1810 
   1811    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
   1812 
   1813    /* reg::sid = stream_id */
   1814    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1815    abld.MOV(sid, brw_imm_ud(stream_id));
   1816 
   1817    /* reg:shift_count = 2 * (vertex_count - 1) */
   1818    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1819    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
   1820 
   1821    /* Note: we're relying on the fact that the GEN SHL instruction only pays
   1822     * attention to the lower 5 bits of its second source argument, so on this
   1823     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
   1824     * stream_id << ((2 * (vertex_count - 1)) % 32).
   1825     */
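            /* Worked example (illustrative, not from the original source): when
             * the vertex with 0-based index 3 is sent to stream 2,
             * shift_count = 2 * 3 = 6 and mask = 2 << 6 = 0x80, so bits 7:6 of
             * control_data_bits become 0b10.
             */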
   1826    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1827    abld.SHL(mask, sid, shift_count);
   1828    abld.OR(this->control_data_bits, this->control_data_bits, mask);
   1829 }
   1830 
   1831 void
   1832 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
   1833                            unsigned stream_id)
   1834 {
   1835    assert(stage == MESA_SHADER_GEOMETRY);
   1836 
   1837    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   1838 
   1839    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
   1840    vertex_count.type = BRW_REGISTER_TYPE_UD;
   1841 
   1842    /* Haswell and later hardware ignores the "Render Stream Select" bits
   1843     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
   1844     * and instead sends all primitives down the pipeline for rasterization.
   1845     * If the SOL stage is enabled, "Render Stream Select" is honored and
   1846     * primitives bound to non-zero streams are discarded after stream output.
   1847     *
    1848     * Since the only purpose of primitives sent to non-zero streams is to
   1849     * be recorded by transform feedback, we can simply discard all geometry
   1850     * bound to these streams when transform feedback is disabled.
   1851     */
   1852    if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
   1853       return;
   1854 
   1855    /* If we're outputting 32 control data bits or less, then we can wait
   1856     * until the shader is over to output them all.  Otherwise we need to
   1857     * output them as we go.  Now is the time to do it, since we're about to
   1858     * output the vertex_count'th vertex, so it's guaranteed that the
   1859     * control data bits associated with the (vertex_count - 1)th vertex are
   1860     * correct.
   1861     */
   1862    if (gs_compile->control_data_header_size_bits > 32) {
   1863       const fs_builder abld =
   1864          bld.annotate("emit vertex: emit control data bits");
   1865 
   1866       /* Only emit control data bits if we've finished accumulating a batch
   1867        * of 32 bits.  This is the case when:
   1868        *
   1869        *     (vertex_count * bits_per_vertex) % 32 == 0
   1870        *
   1871        * (in other words, when the last 5 bits of vertex_count *
   1872        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
   1873        * integer n (which is always the case, since bits_per_vertex is
   1874        * always 1 or 2), this is equivalent to requiring that the last 5-n
   1875        * bits of vertex_count are 0:
   1876        *
   1877        *     vertex_count & (2^(5-n) - 1) == 0
   1878        *
   1879        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
   1880        * equivalent to:
   1881        *
   1882        *     vertex_count & (32 / bits_per_vertex - 1) == 0
   1883        *
   1884        * TODO: If vertex_count is an immediate, we could do some of this math
   1885        *       at compile time...
   1886        */
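               /* Worked example (illustrative, not from the original source):
                * with bits_per_vertex = 2 the AND mask below is 32/2 - 1 = 15,
                * so the conditional fires exactly when vertex_count is a
                * multiple of 16, i.e. whenever 16 vertices * 2 bits = one full
                * DWord of control data has been accumulated.
                */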
   1887       fs_inst *inst =
   1888          abld.AND(bld.null_reg_d(), vertex_count,
   1889                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
   1890       inst->conditional_mod = BRW_CONDITIONAL_Z;
   1891 
   1892       abld.IF(BRW_PREDICATE_NORMAL);
   1893       /* If vertex_count is 0, then no control data bits have been
   1894        * accumulated yet, so we can skip emitting them.
   1895        */
   1896       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
   1897                BRW_CONDITIONAL_NEQ);
   1898       abld.IF(BRW_PREDICATE_NORMAL);
   1899       emit_gs_control_data_bits(vertex_count);
   1900       abld.emit(BRW_OPCODE_ENDIF);
   1901 
   1902       /* Reset control_data_bits to 0 so we can start accumulating a new
   1903        * batch.
   1904        *
   1905        * Note: in the case where vertex_count == 0, this neutralizes the
   1906        * effect of any call to EndPrimitive() that the shader may have
   1907        * made before outputting its first vertex.
   1908        */
   1909       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
   1910       inst->force_writemask_all = true;
   1911       abld.emit(BRW_OPCODE_ENDIF);
   1912    }
   1913 
   1914    emit_urb_writes(vertex_count);
   1915 
   1916    /* In stream mode we have to set control data bits for all vertices
   1917     * unless we have disabled control data bits completely (which we do
    1918     * for GL_POINTS outputs that don't use streams).
   1919     */
   1920    if (gs_compile->control_data_header_size_bits > 0 &&
   1921        gs_prog_data->control_data_format ==
   1922           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
   1923       set_gs_stream_control_data_bits(vertex_count, stream_id);
   1924    }
   1925 }
   1926 
   1927 void
   1928 fs_visitor::emit_gs_input_load(const fs_reg &dst,
   1929                                const nir_src &vertex_src,
   1930                                unsigned base_offset,
   1931                                const nir_src &offset_src,
   1932                                unsigned num_components,
   1933                                unsigned first_component)
   1934 {
   1935    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   1936 
   1937    nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
   1938    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
   1939    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
   1940 
   1941    /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
   1942     * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
   1943     * gl_PointSize is available as a GS input, however, so it must be that.
   1944     */
   1945    const bool is_point_size = (base_offset == 0);
   1946 
   1947    /* TODO: figure out push input layout for invocations == 1 */
   1948    if (gs_prog_data->invocations == 1 &&
   1949        offset_const != NULL && vertex_const != NULL &&
   1950        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
   1951       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
   1952                        vertex_const->u32[0] * push_reg_count;
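               /* Illustrative example (not from the original source): with
                * urb_read_length = 2 (push_reg_count = 16), reading vertex 1 at
                * base_offset 1 with a constant offset of 0 gives
                * imm_offset = 1 * 4 + 1 * 16 = 20.
                */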
   1953       /* This input was pushed into registers. */
   1954       if (is_point_size) {
   1955          /* gl_PointSize comes in .w */
   1956          bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
   1957       } else {
   1958          for (unsigned i = 0; i < num_components; i++) {
   1959             bld.MOV(offset(dst, bld, i),
   1960                     fs_reg(ATTR, imm_offset + i + first_component, dst.type));
   1961          }
   1962       }
   1963       return;
   1964    }
   1965 
   1966    /* Resort to the pull model.  Ensure the VUE handles are provided. */
   1967    gs_prog_data->base.include_vue_handles = true;
   1968 
   1969    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
   1970    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1971 
   1972    if (gs_prog_data->invocations == 1) {
   1973       if (vertex_const) {
   1974          /* The vertex index is constant; just select the proper URB handle. */
   1975          icp_handle =
   1976             retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
   1977                    BRW_REGISTER_TYPE_UD);
   1978       } else {
   1979          /* The vertex index is non-constant.  We need to use indirect
   1980           * addressing to fetch the proper URB handle.
   1981           *
   1982           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
   1983           * indicating that channel <n> should read the handle from
   1984           * DWord <n>.  We convert that to bytes by multiplying by 4.
   1985           *
   1986           * Next, we convert the vertex index to bytes by multiplying
   1987           * by 32 (shifting by 5), and add the two together.  This is
   1988           * the final indirect byte offset.
   1989           */
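                  /* Worked example (illustrative, not from the original
                   * source): for vertex index 2, SIMD channel 3 ends up with
                   * byte offset 2 * 32 + 3 * 4 = 76, i.e. DWord 3 of the third
                   * URB-handle register counting from g(first_icp_handle).
                   */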
   1990          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
   1991          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1992          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1993          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1994 
   1995          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
   1996          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
   1997          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
   1998          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
   1999          /* Convert vertex_index to bytes (multiply by 32) */
   2000          bld.SHL(vertex_offset_bytes,
   2001                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
   2002                  brw_imm_ud(5u));
   2003          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
   2004 
   2005          /* Use first_icp_handle as the base offset.  There is one register
   2006           * of URB handles per vertex, so inform the register allocator that
   2007           * we might read up to nir->info->gs.vertices_in registers.
   2008           */
   2009          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
   2010                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
   2011                   fs_reg(icp_offset_bytes),
   2012                   brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
   2013       }
   2014    } else {
   2015       assert(gs_prog_data->invocations > 1);
   2016 
   2017       if (vertex_const) {
   2018          assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
   2019          bld.MOV(icp_handle,
   2020                  retype(brw_vec1_grf(first_icp_handle +
   2021                                      vertex_const->i32[0] / 8,
   2022                                      vertex_const->i32[0] % 8),
   2023                         BRW_REGISTER_TYPE_UD));
   2024       } else {
   2025          /* The vertex index is non-constant.  We need to use indirect
   2026           * addressing to fetch the proper URB handle.
    2027           */
   2029          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2030 
   2031          /* Convert vertex_index to bytes (multiply by 4) */
   2032          bld.SHL(icp_offset_bytes,
   2033                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
   2034                  brw_imm_ud(2u));
   2035 
   2036          /* Use first_icp_handle as the base offset.  There is one DWord
   2037           * of URB handles per vertex, so inform the register allocator that
   2038           * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
   2039           */
   2040          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
   2041                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
   2042                   fs_reg(icp_offset_bytes),
   2043                   brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
   2044                              REG_SIZE));
   2045       }
   2046    }
   2047 
   2048    fs_inst *inst;
   2049 
   2050    fs_reg tmp_dst = dst;
   2051    fs_reg indirect_offset = get_nir_src(offset_src);
   2052    unsigned num_iterations = 1;
   2053    unsigned orig_num_components = num_components;
   2054 
   2055    if (type_sz(dst.type) == 8) {
   2056       if (num_components > 2) {
   2057          num_iterations = 2;
   2058          num_components = 2;
   2059       }
   2060       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
   2061       tmp_dst = tmp;
   2062       first_component = first_component / 2;
   2063    }
   2064 
   2065    for (unsigned iter = 0; iter < num_iterations; iter++) {
   2066       if (offset_const) {
   2067          /* Constant indexing - use global offset. */
   2068          if (first_component != 0) {
   2069             unsigned read_components = num_components + first_component;
   2070             fs_reg tmp = bld.vgrf(dst.type, read_components);
   2071             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
   2072             inst->size_written = read_components *
   2073                                  tmp.component_size(inst->exec_size);
   2074             for (unsigned i = 0; i < num_components; i++) {
   2075                bld.MOV(offset(tmp_dst, bld, i),
   2076                        offset(tmp, bld, i + first_component));
   2077             }
   2078          } else {
   2079             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
   2080                             icp_handle);
   2081             inst->size_written = num_components *
   2082                                  tmp_dst.component_size(inst->exec_size);
   2083          }
   2084          inst->offset = base_offset + offset_const->u32[0];
   2085          inst->mlen = 1;
   2086       } else {
   2087          /* Indirect indexing - use per-slot offsets as well. */
   2088          const fs_reg srcs[] = { icp_handle, indirect_offset };
   2089          unsigned read_components = num_components + first_component;
   2090          fs_reg tmp = bld.vgrf(dst.type, read_components);
   2091          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   2092          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
   2093          if (first_component != 0) {
   2094             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
   2095                             payload);
   2096             inst->size_written = read_components *
   2097                                  tmp.component_size(inst->exec_size);
   2098             for (unsigned i = 0; i < num_components; i++) {
   2099                bld.MOV(offset(tmp_dst, bld, i),
   2100                        offset(tmp, bld, i + first_component));
   2101             }
   2102          } else {
   2103             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
   2104                          payload);
   2105             inst->size_written = num_components *
   2106                                  tmp_dst.component_size(inst->exec_size);
   2107          }
   2108          inst->offset = base_offset;
   2109          inst->mlen = 2;
   2110       }
   2111 
   2112       if (type_sz(dst.type) == 8) {
   2113          shuffle_32bit_load_result_to_64bit_data(
   2114             bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
   2115 
   2116          for (unsigned c = 0; c < num_components; c++)
   2117             bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
   2118       }
   2119 
   2120       if (num_iterations > 1) {
   2121          num_components = orig_num_components - 2;
    2122          if (offset_const) {
   2123             base_offset++;
   2124          } else {
   2125             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2126             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
   2127             indirect_offset = new_indirect;
   2128          }
   2129       }
   2130    }
   2131 
   2132    if (is_point_size) {
   2133       /* Read the whole VUE header (because of alignment) and read .w. */
   2134       fs_reg tmp = bld.vgrf(dst.type, 4);
   2135       inst->dst = tmp;
   2136       inst->size_written = 4 * REG_SIZE;
   2137       bld.MOV(dst, offset(tmp, bld, 3));
   2138    }
   2139 }
   2140 
   2141 fs_reg
   2142 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
   2143 {
   2144    nir_src *offset_src = nir_get_io_offset_src(instr);
   2145    nir_const_value *const_value = nir_src_as_const_value(*offset_src);
   2146 
   2147    if (const_value) {
   2148       /* The only constant offset we should find is 0.  brw_nir.c's
   2149        * add_const_offset_to_base() will fold other constant offsets
   2150        * into instr->const_index[0].
   2151        */
   2152       assert(const_value->u32[0] == 0);
   2153       return fs_reg();
   2154    }
   2155 
   2156    return get_nir_src(*offset_src);
   2157 }
   2158 
   2159 static void
   2160 do_untyped_vector_read(const fs_builder &bld,
   2161                        const fs_reg dest,
   2162                        const fs_reg surf_index,
   2163                        const fs_reg offset_reg,
   2164                        unsigned num_components)
   2165 {
   2166    if (type_sz(dest.type) == 4) {
   2167       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
   2168                                              1 /* dims */,
   2169                                              num_components,
   2170                                              BRW_PREDICATE_NONE);
   2171       read_result.type = dest.type;
   2172       for (unsigned i = 0; i < num_components; i++)
   2173          bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
   2174    } else if (type_sz(dest.type) == 8) {
   2175       /* Reading a dvec, so we need to:
   2176        *
   2177        * 1. Multiply num_components by 2, to account for the fact that we
   2178        *    need to read 64-bit components.
   2179        * 2. Shuffle the result of the load to form valid 64-bit elements
   2180        * 3. Emit a second load (for components z/w) if needed.
   2181        */
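               /* Illustrative example (not from the original source): a dvec3
                * read takes two iterations: the first loads 4 DWords
                * (components x/y) at offset_reg, the second loads 2 DWords
                * (component z) at offset_reg + 16, and each result is shuffled
                * into 64-bit elements.
                */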
   2182       fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
   2183       bld.MOV(read_offset, offset_reg);
   2184 
   2185       int iters = num_components <= 2 ? 1 : 2;
   2186 
    2187       /* Load the dvec: the first iteration loads components x/y; the second
    2188        * iteration, if needed, loads components z/w.
   2189        */
   2190       for (int it = 0; it < iters; it++) {
   2191          /* Compute number of components to read in this iteration */
   2192          int iter_components = MIN2(2, num_components);
   2193          num_components -= iter_components;
   2194 
   2195          /* Read. Since this message reads 32-bit components, we need to
   2196           * read twice as many components.
   2197           */
   2198          fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
   2199                                                 1 /* dims */,
   2200                                                 iter_components * 2,
   2201                                                 BRW_PREDICATE_NONE);
   2202 
   2203          /* Shuffle the 32-bit load result into valid 64-bit data */
   2204          const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
   2205          shuffle_32bit_load_result_to_64bit_data(
   2206             bld, packed_result, read_result, iter_components);
   2207 
   2208          /* Move each component to its destination */
   2209          read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
   2210          for (int c = 0; c < iter_components; c++) {
   2211             bld.MOV(offset(dest, bld, it * 2 + c),
   2212                     offset(packed_result, bld, c));
   2213          }
   2214 
   2215          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
   2216       }
   2217    } else {
   2218       unreachable("Unsupported type");
   2219    }
   2220 }
   2221 
   2222 void
   2223 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
   2224                                   nir_intrinsic_instr *instr)
   2225 {
   2226    assert(stage == MESA_SHADER_VERTEX);
   2227 
   2228    fs_reg dest;
   2229    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   2230       dest = get_nir_dest(instr->dest);
   2231 
   2232    switch (instr->intrinsic) {
   2233    case nir_intrinsic_load_vertex_id:
   2234       unreachable("should be lowered by lower_vertex_id()");
   2235 
   2236    case nir_intrinsic_load_vertex_id_zero_base:
   2237    case nir_intrinsic_load_base_vertex:
   2238    case nir_intrinsic_load_instance_id:
   2239    case nir_intrinsic_load_base_instance:
   2240    case nir_intrinsic_load_draw_id: {
   2241       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
   2242       fs_reg val = nir_system_values[sv];
   2243       assert(val.file != BAD_FILE);
   2244       dest.type = val.type;
   2245       bld.MOV(dest, val);
   2246       break;
   2247    }
   2248 
   2249    case nir_intrinsic_load_input: {
   2250       fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
   2251       unsigned first_component = nir_intrinsic_component(instr);
   2252       unsigned num_components = instr->num_components;
   2253       enum brw_reg_type type = dest.type;
   2254 
   2255       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
   2256       assert(const_offset && "Indirect input loads not allowed");
   2257       src = offset(src, bld, const_offset->u32[0]);
   2258 
   2259       for (unsigned j = 0; j < num_components; j++) {
   2260          bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
   2261       }
   2262 
   2263       if (type == BRW_REGISTER_TYPE_DF) {
    2264          /* Once the double vector is read, restore its original register
    2265           * type so that normal execution can continue.
   2266           */
   2267          src = retype(src, type);
   2268          dest = retype(dest, type);
   2269       }
   2270 
   2271       if (type_sz(src.type) == 8) {
   2272          shuffle_32bit_load_result_to_64bit_data(bld,
   2273                                                  dest,
   2274                                                  retype(dest, BRW_REGISTER_TYPE_F),
   2275                                                  instr->num_components);
   2276       }
   2277       break;
   2278    }
   2279 
   2280    default:
   2281       nir_emit_intrinsic(bld, instr);
   2282       break;
   2283    }
   2284 }
   2285 
   2286 void
   2287 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
   2288                                    nir_intrinsic_instr *instr)
   2289 {
   2290    assert(stage == MESA_SHADER_TESS_CTRL);
   2291    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
   2292    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
   2293 
   2294    fs_reg dst;
   2295    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   2296       dst = get_nir_dest(instr->dest);
   2297 
   2298    switch (instr->intrinsic) {
   2299    case nir_intrinsic_load_primitive_id:
   2300       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
   2301       break;
   2302    case nir_intrinsic_load_invocation_id:
   2303       bld.MOV(retype(dst, invocation_id.type), invocation_id);
   2304       break;
   2305    case nir_intrinsic_load_patch_vertices_in:
   2306       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
   2307               brw_imm_d(tcs_key->input_vertices));
   2308       break;
   2309 
   2310    case nir_intrinsic_barrier: {
   2311       if (tcs_prog_data->instances == 1)
   2312          break;
   2313 
   2314       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2315       fs_reg m0_2 = component(m0, 2);
   2316 
   2317       const fs_builder chanbld = bld.exec_all().group(1, 0);
   2318 
   2319       /* Zero the message header */
   2320       bld.exec_all().MOV(m0, brw_imm_ud(0u));
   2321 
   2322       /* Copy "Barrier ID" from r0.2, bits 16:13 */
   2323       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
   2324                   brw_imm_ud(INTEL_MASK(16, 13)));
   2325 
   2326       /* Shift it up to bits 27:24. */
   2327       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
   2328 
   2329       /* Set the Barrier Count and the enable bit */
   2330       chanbld.OR(m0_2, m0_2,
   2331                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
   2332 
   2333       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
   2334       break;
   2335    }
   2336 
   2337    case nir_intrinsic_load_input:
   2338       unreachable("nir_lower_io should never give us these.");
   2339       break;
   2340 
   2341    case nir_intrinsic_load_per_vertex_input: {
   2342       fs_reg indirect_offset = get_indirect_offset(instr);
   2343       unsigned imm_offset = instr->const_index[0];
   2344 
   2345       const nir_src &vertex_src = instr->src[0];
   2346       nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
   2347 
   2348       fs_inst *inst;
   2349 
   2350       fs_reg icp_handle;
   2351 
   2352       if (vertex_const) {
   2353          /* Emit a MOV to resolve <0,1,0> regioning. */
   2354          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2355          bld.MOV(icp_handle,
   2356                  retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
   2357                                      vertex_const->i32[0] & 7),
   2358                         BRW_REGISTER_TYPE_UD));
   2359       } else if (tcs_prog_data->instances == 1 &&
   2360                  vertex_src.is_ssa &&
   2361                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
   2362                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
   2363          /* For the common case of only 1 instance, an array index of
   2364           * gl_InvocationID means reading g1.  Skip all the indirect work.
   2365           */
   2366          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
   2367       } else {
   2368          /* The vertex index is non-constant.  We need to use indirect
   2369           * addressing to fetch the proper URB handle.
   2370           */
   2371          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2372 
   2373          /* Each ICP handle is a single DWord (4 bytes) */
   2374          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2375          bld.SHL(vertex_offset_bytes,
   2376                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
   2377                  brw_imm_ud(2u));
   2378 
   2379          /* Start at g1.  We might read up to 4 registers. */
   2380          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
   2381                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
   2382                   brw_imm_ud(4 * REG_SIZE));
   2383       }
   2384 
   2385       /* We can only read two double components with each URB read, so
   2386        * we send two read messages in that case, each one loading up to
   2387        * two double components.
   2388        */
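               /* Illustrative example (not from the original source): loading a
                * dvec4 per-vertex input therefore uses two messages of two
                * double components each; the second iteration advances
                * imm_offset by one vec4 slot.
                */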
   2389       unsigned num_iterations = 1;
   2390       unsigned num_components = instr->num_components;
   2391       unsigned first_component = nir_intrinsic_component(instr);
   2392       fs_reg orig_dst = dst;
   2393       if (type_sz(dst.type) == 8) {
   2394          first_component = first_component / 2;
   2395          if (instr->num_components > 2) {
   2396             num_iterations = 2;
   2397             num_components = 2;
   2398          }
   2399 
   2400          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
   2401          dst = tmp;
   2402       }
   2403 
   2404       for (unsigned iter = 0; iter < num_iterations; iter++) {
   2405          if (indirect_offset.file == BAD_FILE) {
   2406             /* Constant indexing - use global offset. */
   2407             if (first_component != 0) {
   2408                unsigned read_components = num_components + first_component;
   2409                fs_reg tmp = bld.vgrf(dst.type, read_components);
   2410                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
   2411                for (unsigned i = 0; i < num_components; i++) {
   2412                   bld.MOV(offset(dst, bld, i),
   2413                           offset(tmp, bld, i + first_component));
   2414                }
   2415             } else {
   2416                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
   2417             }
   2418             inst->offset = imm_offset;
   2419             inst->mlen = 1;
   2420          } else {
   2421             /* Indirect indexing - use per-slot offsets as well. */
   2422             const fs_reg srcs[] = { icp_handle, indirect_offset };
   2423             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   2424             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
   2425             if (first_component != 0) {
   2426                unsigned read_components = num_components + first_component;
   2427                fs_reg tmp = bld.vgrf(dst.type, read_components);
   2428                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
   2429                                payload);
   2430                for (unsigned i = 0; i < num_components; i++) {
   2431                   bld.MOV(offset(dst, bld, i),
   2432                           offset(tmp, bld, i + first_component));
   2433                }
   2434             } else {
   2435                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
   2436                                payload);
   2437             }
   2438             inst->offset = imm_offset;
   2439             inst->mlen = 2;
   2440          }
   2441          inst->size_written = (num_components + first_component) *
   2442                               inst->dst.component_size(inst->exec_size);
   2443 
   2444          /* If we are reading 64-bit data using 32-bit read messages we need
    2445           * to build proper 64-bit data elements by shuffling the low and high
   2446           * 32-bit components around like we do for other things like UBOs
   2447           * or SSBOs.
   2448           */
   2449          if (type_sz(dst.type) == 8) {
   2450             shuffle_32bit_load_result_to_64bit_data(
   2451                bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
   2452 
   2453             for (unsigned c = 0; c < num_components; c++) {
   2454                bld.MOV(offset(orig_dst, bld, iter * 2 + c),
   2455                        offset(dst, bld, c));
   2456             }
   2457          }
   2458 
   2459          /* Copy the temporary to the destination to deal with writemasking.
   2460           *
   2461           * Also attempt to deal with gl_PointSize being in the .w component.
   2462           */
   2463          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
   2464             assert(type_sz(dst.type) < 8);
   2465             inst->dst = bld.vgrf(dst.type, 4);
   2466             inst->size_written = 4 * REG_SIZE;
   2467             bld.MOV(dst, offset(inst->dst, bld, 3));
   2468          }
   2469 
    2470          /* If we are loading double data and we need a second read message,
    2471           * adjust the write offset.
   2472           */
   2473          if (num_iterations > 1) {
   2474             num_components = instr->num_components - 2;
   2475             imm_offset++;
   2476          }
   2477       }
   2478       break;
   2479    }
   2480 
   2481    case nir_intrinsic_load_output:
   2482    case nir_intrinsic_load_per_vertex_output: {
   2483       fs_reg indirect_offset = get_indirect_offset(instr);
   2484       unsigned imm_offset = instr->const_index[0];
   2485       unsigned first_component = nir_intrinsic_component(instr);
   2486 
   2487       fs_inst *inst;
   2488       if (indirect_offset.file == BAD_FILE) {
   2489          /* Replicate the patch handle to all enabled channels */
   2490          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2491          bld.MOV(patch_handle,
   2492                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   2493 
   2494          {
   2495             if (first_component != 0) {
   2496                unsigned read_components =
   2497                   instr->num_components + first_component;
   2498                fs_reg tmp = bld.vgrf(dst.type, read_components);
   2499                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
   2500                                patch_handle);
   2501                inst->size_written = read_components * REG_SIZE;
   2502                for (unsigned i = 0; i < instr->num_components; i++) {
   2503                   bld.MOV(offset(dst, bld, i),
   2504                           offset(tmp, bld, i + first_component));
   2505                }
   2506             } else {
   2507                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
   2508                                patch_handle);
   2509                inst->size_written = instr->num_components * REG_SIZE;
   2510             }
   2511             inst->offset = imm_offset;
   2512             inst->mlen = 1;
   2513          }
   2514       } else {
   2515          /* Indirect indexing - use per-slot offsets as well. */
   2516          const fs_reg srcs[] = {
   2517             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
   2518             indirect_offset
   2519          };
   2520          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   2521          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
   2522          if (first_component != 0) {
   2523             unsigned read_components =
   2524                instr->num_components + first_component;
   2525             fs_reg tmp = bld.vgrf(dst.type, read_components);
   2526             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
   2527                             payload);
   2528             inst->size_written = read_components * REG_SIZE;
   2529             for (unsigned i = 0; i < instr->num_components; i++) {
   2530                bld.MOV(offset(dst, bld, i),
   2531                        offset(tmp, bld, i + first_component));
   2532             }
   2533          } else {
   2534             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
   2535                             payload);
   2536             inst->size_written = instr->num_components * REG_SIZE;
   2537          }
   2538          inst->offset = imm_offset;
   2539          inst->mlen = 2;
   2540       }
   2541       break;
   2542    }
   2543 
   2544    case nir_intrinsic_store_output:
   2545    case nir_intrinsic_store_per_vertex_output: {
   2546       fs_reg value = get_nir_src(instr->src[0]);
   2547       bool is_64bit = (instr->src[0].is_ssa ?
   2548          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
   2549       fs_reg indirect_offset = get_indirect_offset(instr);
   2550       unsigned imm_offset = instr->const_index[0];
   2551       unsigned swiz = BRW_SWIZZLE_XYZW;
   2552       unsigned mask = instr->const_index[1];
   2553       unsigned header_regs = 0;
   2554       fs_reg srcs[7];
   2555       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
   2556 
   2557       if (indirect_offset.file != BAD_FILE) {
   2558          srcs[header_regs++] = indirect_offset;
   2559       }
   2560 
   2561       if (mask == 0)
   2562          break;
   2563 
   2564       unsigned num_components = util_last_bit(mask);
   2565       enum opcode opcode;
   2566 
   2567       /* We can only pack two 64-bit components in a single message, so send
   2568        * 2 messages if we have more components
   2569        */
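               /* Illustrative example (not from the original source): storing a
                * dvec4 therefore takes two iterations of two 64-bit components
                * each; after the first message the writemask is shifted right
                * by 2 and imm_offset is incremented to address the next vec4
                * slot.
                */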
   2570       unsigned num_iterations = 1;
   2571       unsigned iter_components = num_components;
   2572       unsigned first_component = nir_intrinsic_component(instr);
   2573       if (is_64bit) {
   2574          first_component = first_component / 2;
   2575          if (instr->num_components > 2) {
   2576             num_iterations = 2;
   2577             iter_components = 2;
   2578          }
   2579       }
   2580 
    2581       /* 64-bit data needs to be shuffled before we can write it to the URB.
   2582        * We will use this temporary to shuffle the components in each
   2583        * iteration.
   2584        */
   2585       fs_reg tmp =
   2586          fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
   2587 
   2588       mask = mask << first_component;
   2589 
   2590       for (unsigned iter = 0; iter < num_iterations; iter++) {
   2591          if (!is_64bit && mask != WRITEMASK_XYZW) {
   2592             srcs[header_regs++] = brw_imm_ud(mask << 16);
   2593             opcode = indirect_offset.file != BAD_FILE ?
   2594                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
   2595                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
   2596          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
   2597             /* Expand the 64-bit mask to 32-bit channels. We only handle
   2598              * two channels in each iteration, so we only care about X/Y.
   2599              */
   2600             unsigned mask32 = 0;
   2601             if (mask & WRITEMASK_X)
   2602                mask32 |= WRITEMASK_XY;
   2603             if (mask & WRITEMASK_Y)
   2604                mask32 |= WRITEMASK_ZW;
   2605 
   2606             /* If the mask does not include any of the channels X or Y there
   2607              * is nothing to do in this iteration. Move on to the next couple
   2608              * of 64-bit channels.
   2609              */
   2610             if (!mask32) {
   2611                mask >>= 2;
   2612                imm_offset++;
   2613                continue;
   2614             }
   2615 
   2616             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
   2617             opcode = indirect_offset.file != BAD_FILE ?
   2618                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
   2619                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
   2620          } else {
   2621             opcode = indirect_offset.file != BAD_FILE ?
   2622                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
   2623                SHADER_OPCODE_URB_WRITE_SIMD8;
   2624          }
   2625 
   2626          for (unsigned i = 0; i < iter_components; i++) {
   2627             if (!(mask & (1 << (i + first_component))))
   2628                continue;
   2629 
   2630             if (!is_64bit) {
   2631                srcs[header_regs + i + first_component] =
   2632                   offset(value, bld, BRW_GET_SWZ(swiz, i));
   2633             } else {
   2634                /* We need to shuffle the 64-bit data to match the layout
   2635                 * expected by our 32-bit URB write messages. We use a temporary
   2636                 * for that.
   2637                 */
   2638                unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
   2639                shuffle_64bit_data_for_32bit_write(bld,
   2640                   retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
   2641                   retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
   2642                   1);
   2643 
   2644                /* Now copy the data to the destination */
   2645                fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
   2646                unsigned idx = 2 * i;
   2647                bld.MOV(dest, offset(tmp, bld, idx));
   2648                bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
   2649                srcs[header_regs + idx + first_component * 2] = dest;
   2650                srcs[header_regs + idx + 1 + first_component * 2] =
   2651                   offset(dest, bld, 1);
   2652             }
   2653          }
   2654 
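                  /* The message is the header registers followed by one register
                   * per 32-bit component written (two per 64-bit component),
                   * offset by the position of the first component in the slot.
                   */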
   2655          unsigned mlen =
   2656             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
   2657             (is_64bit ? 2 * first_component : first_component);
   2658          fs_reg payload =
   2659             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
   2660          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
   2661 
   2662          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
   2663          inst->offset = imm_offset;
   2664          inst->mlen = mlen;
   2665 
   2666          /* If this is a 64-bit attribute, select the next two 64-bit channels
   2667           * to be handled in the next iteration.
   2668           */
   2669          if (is_64bit) {
   2670             mask >>= 2;
   2671             imm_offset++;
   2672          }
   2673       }
   2674       break;
   2675    }
   2676 
   2677    default:
   2678       nir_emit_intrinsic(bld, instr);
   2679       break;
   2680    }
   2681 }
   2682 
   2683 void
   2684 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
   2685                                    nir_intrinsic_instr *instr)
   2686 {
   2687    assert(stage == MESA_SHADER_TESS_EVAL);
   2688    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
   2689 
   2690    fs_reg dest;
   2691    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   2692       dest = get_nir_dest(instr->dest);
   2693 
   2694    switch (instr->intrinsic) {
   2695    case nir_intrinsic_load_primitive_id:
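               /* The primitive ID of the patch is read from r0.1 of the thread
                * payload.
                */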
   2696       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
   2697       break;
   2698    case nir_intrinsic_load_tess_coord:
   2699       /* gl_TessCoord is part of the payload in g1-3 */
   2700       for (unsigned i = 0; i < 3; i++) {
   2701          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
   2702       }
   2703       break;
   2704 
   2705    case nir_intrinsic_load_input:
   2706    case nir_intrinsic_load_per_vertex_input: {
   2707       fs_reg indirect_offset = get_indirect_offset(instr);
   2708       unsigned imm_offset = instr->const_index[0];
   2709       unsigned first_component = nir_intrinsic_component(instr);
   2710 
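               /* The component qualifier is expressed in 32-bit units; for
                * 64-bit data convert it to 64-bit components, each of which
                * occupies two 32-bit channels.
                */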
   2711       if (type_sz(dest.type) == 8) {
   2712          first_component = first_component / 2;
   2713       }
   2714 
   2715       fs_inst *inst;
   2716       if (indirect_offset.file == BAD_FILE) {
   2717          /* Arbitrarily only push up to 32 vec4 slots worth of data,
   2718           * which is 16 registers (since each holds 2 vec4 slots).
   2719           */
   2720          const unsigned max_push_slots = 32;
   2721          if (imm_offset < max_push_slots) {
   2722             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
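                     /* Each pushed register holds two vec4 slots, so an odd slot
                      * offset starts half-way into the register: four 32-bit or
                      * two 64-bit components in.
                      */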
   2723             for (int i = 0; i < instr->num_components; i++) {
   2724                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
   2725                   i + first_component;
   2726                bld.MOV(offset(dest, bld, i), component(src, comp));
   2727             }
   2728             tes_prog_data->base.urb_read_length =
   2729                MAX2(tes_prog_data->base.urb_read_length,
   2730                     DIV_ROUND_UP(imm_offset + 1, 2));
   2731          } else {
   2732             /* Replicate the patch handle to all enabled channels */
   2733             const fs_reg srcs[] = {
   2734                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
   2735             };
   2736             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   2737             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
   2738 
   2739             if (first_component != 0) {
   2740                unsigned read_components =
   2741                   instr->num_components + first_component;
   2742                fs_reg tmp = bld.vgrf(dest.type, read_components);
   2743                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
   2744                                patch_handle);
   2745                inst->size_written = read_components * REG_SIZE;
   2746                for (unsigned i = 0; i < instr->num_components; i++) {
   2747                   bld.MOV(offset(dest, bld, i),
   2748                           offset(tmp, bld, i + first_component));
   2749                }
   2750             } else {
   2751                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
   2752                                patch_handle);
   2753                inst->size_written = instr->num_components * REG_SIZE;
   2754             }
   2755             inst->mlen = 1;
   2756             inst->offset = imm_offset;
   2757          }
   2758       } else {
   2759          /* Indirect indexing - use per-slot offsets as well. */
   2760 
   2761          /* We can only read two double components with each URB read, so
   2762           * we send two read messages in that case, each one loading up to
   2763           * two double components.
   2764           */
   2765          unsigned num_iterations = 1;
   2766          unsigned num_components = instr->num_components;
   2767          fs_reg orig_dest = dest;
   2768          if (type_sz(dest.type) == 8) {
   2769             if (instr->num_components > 2) {
   2770                num_iterations = 2;
   2771                num_components = 2;
   2772             }
   2773             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
   2774             dest = tmp;
   2775          }
   2776 
   2777          for (unsigned iter = 0; iter < num_iterations; iter++) {
   2778             const fs_reg srcs[] = {
   2779                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
   2780                indirect_offset
   2781             };
   2782             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   2783             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
   2784 
   2785             if (first_component != 0) {
   2786                unsigned read_components =
   2787                    num_components + first_component;
   2788                fs_reg tmp = bld.vgrf(dest.type, read_components);
   2789                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
   2790                                payload);
   2791                for (unsigned i = 0; i < num_components; i++) {
   2792                   bld.MOV(offset(dest, bld, i),
   2793                           offset(tmp, bld, i + first_component));
   2794                }
   2795             } else {
   2796                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
   2797                                payload);
   2798             }
   2799             inst->mlen = 2;
   2800             inst->offset = imm_offset;
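                     /* Account for every component actually read, including any
                      * leading components that are never copied into the final
                      * destination.
                      */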
   2801             inst->size_written = (num_components + first_component) *
   2802                                  inst->dst.component_size(inst->exec_size);
   2803 
    2804             /* If we are reading 64-bit data using 32-bit read messages we
    2805              * need to build proper 64-bit data elements by shuffling the low and high
   2806              * 32-bit components around like we do for other things like UBOs
   2807              * or SSBOs.
   2808              */
   2809             if (type_sz(dest.type) == 8) {
   2810                shuffle_32bit_load_result_to_64bit_data(
   2811                   bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
   2812 
   2813                for (unsigned c = 0; c < num_components; c++) {
   2814                   bld.MOV(offset(orig_dest, bld, iter * 2 + c),
   2815                           offset(dest, bld, c));
   2816                }
   2817             }
   2818 
    2819             /* If we are loading double data and need a second read message,
    2820              * adjust the offset.
   2821              */
   2822             if (num_iterations > 1) {
   2823                num_components = instr->num_components - 2;
   2824                imm_offset++;
   2825             }
   2826          }
   2827       }
   2828       break;
   2829    }
   2830    default:
   2831       nir_emit_intrinsic(bld, instr);
   2832       break;
   2833    }
   2834 }
   2835 
   2836 void
   2837 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
   2838                                   nir_intrinsic_instr *instr)
   2839 {
   2840    assert(stage == MESA_SHADER_GEOMETRY);
   2841    fs_reg indirect_offset;
   2842 
   2843    fs_reg dest;
   2844    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   2845       dest = get_nir_dest(instr->dest);
   2846 
   2847    switch (instr->intrinsic) {
   2848    case nir_intrinsic_load_primitive_id:
   2849       assert(stage == MESA_SHADER_GEOMETRY);
   2850       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
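               /* When included in the thread payload, the primitive ID is
                * delivered in r2.
                */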
   2851       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
   2852               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
   2853       break;
   2854 
   2855    case nir_intrinsic_load_input:
   2856       unreachable("load_input intrinsics are invalid for the GS stage");
   2857 
   2858    case nir_intrinsic_load_per_vertex_input:
   2859       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
   2860                          instr->src[1], instr->num_components,
   2861                          nir_intrinsic_component(instr));
   2862       break;
   2863 
   2864    case nir_intrinsic_emit_vertex_with_counter:
   2865       emit_gs_vertex(instr->src[0], instr->const_index[0]);
   2866       break;
   2867 
   2868    case nir_intrinsic_end_primitive_with_counter:
   2869       emit_gs_end_primitive(instr->src[0]);
   2870       break;
   2871 
   2872    case nir_intrinsic_set_vertex_count:
   2873       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
   2874       break;
   2875 
   2876    case nir_intrinsic_load_invocation_id: {
   2877       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
   2878       assert(val.file != BAD_FILE);
   2879       dest.type = val.type;
   2880       bld.MOV(dest, val);
   2881       break;
   2882    }
   2883 
   2884    default:
   2885       nir_emit_intrinsic(bld, instr);
   2886       break;
   2887    }
   2888 }
   2889 
   2890 /**
   2891  * Fetch the current render target layer index.
   2892  */
   2893 static fs_reg
   2894 fetch_render_target_array_index(const fs_builder &bld)
   2895 {
   2896    if (bld.shader->devinfo->gen >= 6) {
   2897       /* The render target array index is provided in the thread payload as
   2898        * bits 26:16 of r0.0.
   2899        */
   2900       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
   2901       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
   2902               brw_imm_uw(0x7ff));
   2903       return idx;
   2904    } else {
   2905       /* Pre-SNB we only ever render into the first layer of the framebuffer
   2906        * since layered rendering is not implemented.
   2907        */
   2908       return brw_imm_ud(0);
   2909    }
   2910 }
   2911 
   2912 /**
   2913  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
   2914  * framebuffer at the current fragment coordinates and sample index.
   2915  */
   2916 fs_inst *
   2917 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
   2918                                       unsigned target)
   2919 {
   2920    const struct gen_device_info *devinfo = bld.shader->devinfo;
   2921 
   2922    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   2923    const brw_wm_prog_key *wm_key =
   2924       reinterpret_cast<const brw_wm_prog_key *>(key);
   2925    assert(!wm_key->coherent_fb_fetch);
   2926    const struct brw_wm_prog_data *wm_prog_data =
   2927       brw_wm_prog_data(stage_prog_data);
   2928 
   2929    /* Calculate the surface index relative to the start of the texture binding
   2930     * table block, since that's what the texturing messages expect.
   2931     */
   2932    const unsigned surface = target +
   2933       wm_prog_data->binding_table.render_target_read_start -
   2934       wm_prog_data->base.binding_table.texture_start;
   2935 
   2936    brw_mark_surface_used(
   2937       bld.shader->stage_prog_data,
   2938       wm_prog_data->binding_table.render_target_read_start + target);
   2939 
   2940    /* Calculate the fragment coordinates. */
   2941    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
   2942    bld.MOV(offset(coords, bld, 0), pixel_x);
   2943    bld.MOV(offset(coords, bld, 1), pixel_y);
   2944    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
   2945 
   2946    /* Calculate the sample index and MCS payload when multisampling.  Luckily
   2947     * the MCS fetch message behaves deterministically for UMS surfaces, so it
   2948     * shouldn't be necessary to recompile based on whether the framebuffer is
   2949     * CMS or UMS.
   2950     */
   2951    if (wm_key->multisample_fbo &&
   2952        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
   2953       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
   2954 
   2955    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
   2956    const fs_reg mcs = wm_key->multisample_fbo ?
   2957       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
   2958 
   2959    /* Use either a normal or a CMS texel fetch message depending on whether
   2960     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
   2961     * message just in case the framebuffer uses 16x multisampling, it should
   2962     * be equivalent to the normal CMS fetch for lower multisampling modes.
   2963     */
   2964    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
   2965                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
   2966                      SHADER_OPCODE_TXF_CMS_LOGICAL;
   2967 
   2968    /* Emit the instruction. */
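            /* The source list follows the TEX_LOGICAL layout: the coordinates,
             * the sample index and MCS data, the surface index computed above,
             * sampler zero and a coordinate component count of three.
             */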
   2969    const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
   2970                            sample, mcs,
   2971                            brw_imm_ud(surface), brw_imm_ud(0),
   2972                            fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
   2973    STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
   2974 
   2975    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
   2976    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
   2977 
   2978    return inst;
   2979 }
   2980 
   2981 /**
   2982  * Actual coherent framebuffer read implemented using the native render target
   2983  * read message.  Requires SKL+.
   2984  */
   2985 static fs_inst *
   2986 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
   2987 {
   2988    assert(bld.shader->devinfo->gen >= 9);
   2989    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
   2990    inst->target = target;
   2991    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
   2992 
   2993    return inst;
   2994 }
   2995 
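         /**
          * Return the temporary already backing the first of the @n output
          * registers in @regs, or allocate a new @size-component temporary and
          * make all @n entries alias it.
          */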
   2996 static fs_reg
   2997 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
   2998 {
   2999    if (n && regs[0].file != BAD_FILE) {
   3000       return regs[0];
   3001 
   3002    } else {
   3003       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
   3004 
   3005       for (unsigned i = 0; i < n; i++)
   3006          regs[i] = tmp;
   3007 
   3008       return tmp;
   3009    }
   3010 }
   3011 
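         /**
          * Return (allocating on first use) the temporary register backing the
          * fragment shader output described by the packed @location, which
          * encodes both the output location and the dual-source blend index.
          */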
   3012 static fs_reg
   3013 alloc_frag_output(fs_visitor *v, unsigned location)
   3014 {
   3015    assert(v->stage == MESA_SHADER_FRAGMENT);
   3016    const brw_wm_prog_key *const key =
   3017       reinterpret_cast<const brw_wm_prog_key *>(v->key);
   3018    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
   3019    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
   3020 
   3021    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
   3022       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
   3023 
   3024    else if (l == FRAG_RESULT_COLOR)
   3025       return alloc_temporary(v->bld, 4, v->outputs,
   3026                              MAX2(key->nr_color_regions, 1));
   3027 
   3028    else if (l == FRAG_RESULT_DEPTH)
   3029       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
   3030 
   3031    else if (l == FRAG_RESULT_STENCIL)
   3032       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
   3033 
   3034    else if (l == FRAG_RESULT_SAMPLE_MASK)
   3035       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
   3036 
   3037    else if (l >= FRAG_RESULT_DATA0 &&
   3038             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
   3039       return alloc_temporary(v->bld, 4,
   3040                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
   3041 
   3042    else
   3043       unreachable("Invalid location");
   3044 }
   3045 
   3046 void
   3047 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
   3048                                   nir_intrinsic_instr *instr)
   3049 {
   3050    assert(stage == MESA_SHADER_FRAGMENT);
   3051 
   3052    fs_reg dest;
   3053    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   3054       dest = get_nir_dest(instr->dest);
   3055 
   3056    switch (instr->intrinsic) {
   3057    case nir_intrinsic_load_front_face:
   3058       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
   3059               *emit_frontfacing_interpolation());
   3060       break;
   3061 
   3062    case nir_intrinsic_load_sample_pos: {
   3063       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
   3064       assert(sample_pos.file != BAD_FILE);
   3065       dest.type = sample_pos.type;
   3066       bld.MOV(dest, sample_pos);
   3067       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
   3068       break;
   3069    }
   3070 
   3071    case nir_intrinsic_load_layer_id:
   3072       dest.type = BRW_REGISTER_TYPE_UD;
   3073       bld.MOV(dest, fetch_render_target_array_index(bld));
   3074       break;
   3075 
   3076    case nir_intrinsic_load_helper_invocation:
   3077    case nir_intrinsic_load_sample_mask_in:
   3078    case nir_intrinsic_load_sample_id: {
   3079       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
   3080       fs_reg val = nir_system_values[sv];
   3081       assert(val.file != BAD_FILE);
   3082       dest.type = val.type;
   3083       bld.MOV(dest, val);
   3084       break;
   3085    }
   3086 
   3087    case nir_intrinsic_store_output: {
   3088       const fs_reg src = get_nir_src(instr->src[0]);
   3089       const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   3090       assert(const_offset && "Indirect output stores not allowed");
   3091       const unsigned location = nir_intrinsic_base(instr) +
   3092          SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
   3093       const fs_reg new_dest = retype(alloc_frag_output(this, location),
   3094                                      src.type);
   3095 
   3096       for (unsigned j = 0; j < instr->num_components; j++)
   3097          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
   3098                  offset(src, bld, j));
   3099 
   3100       break;
   3101    }
   3102 
   3103    case nir_intrinsic_load_output: {
   3104       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
   3105                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
   3106       assert(l >= FRAG_RESULT_DATA0);
   3107       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
   3108       assert(const_offset && "Indirect output loads not allowed");
   3109       const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
   3110       const fs_reg tmp = bld.vgrf(dest.type, 4);
   3111 
   3112       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
   3113          emit_coherent_fb_read(bld, tmp, target);
   3114       else
   3115          emit_non_coherent_fb_read(bld, tmp, target);
   3116 
   3117       for (unsigned j = 0; j < instr->num_components; j++) {
   3118          bld.MOV(offset(dest, bld, j),
   3119                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
   3120       }
   3121 
   3122       break;
   3123    }
   3124 
   3125    case nir_intrinsic_discard:
   3126    case nir_intrinsic_discard_if: {
   3127       /* We track our discarded pixels in f0.1.  By predicating on it, we can
   3128        * update just the flag bits that aren't yet discarded.  If there's no
   3129        * condition, we emit a CMP of g0 != g0, so all currently executing
   3130        * channels will get turned off.
   3131        */
   3132       fs_inst *cmp;
   3133       if (instr->intrinsic == nir_intrinsic_discard_if) {
   3134          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
   3135                        brw_imm_d(0), BRW_CONDITIONAL_Z);
   3136       } else {
   3137          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
   3138                                        BRW_REGISTER_TYPE_UW));
   3139          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
   3140       }
   3141       cmp->predicate = BRW_PREDICATE_NORMAL;
   3142       cmp->flag_subreg = 1;
   3143 
   3144       if (devinfo->gen >= 6) {
   3145          emit_discard_jump();
   3146       }
   3147       break;
   3148    }
   3149 
   3150    case nir_intrinsic_load_input: {
   3151       /* load_input is only used for flat inputs */
   3152       unsigned base = nir_intrinsic_base(instr);
   3153       unsigned component = nir_intrinsic_component(instr);
   3154       unsigned num_components = instr->num_components;
   3155       enum brw_reg_type type = dest.type;
   3156 
   3157       /* Special case fields in the VUE header */
   3158       if (base == VARYING_SLOT_LAYER)
   3159          component = 1;
   3160       else if (base == VARYING_SLOT_VIEWPORT)
   3161          component = 2;
   3162 
   3163       if (nir_dest_bit_size(instr->dest) == 64) {
    3164          /* const_index is in 32-bit type size units, so it may not be aligned
    3165           * for DF access. We need to read the double vector as if it were a
    3166           * float vector of twice the number of components to fetch the right data.
   3167           */
   3168          type = BRW_REGISTER_TYPE_F;
   3169          num_components *= 2;
   3170       }
   3171 
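               /* Constant interpolation reads the flat-shaded value from the
                * fourth component of the attribute setup data, hence the
                * sub-register offset of 3.
                */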
   3172       for (unsigned int i = 0; i < num_components; i++) {
   3173          struct brw_reg interp = interp_reg(base, component + i);
   3174          interp = suboffset(interp, 3);
   3175          bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
   3176                   retype(fs_reg(interp), type));
   3177       }
   3178 
   3179       if (nir_dest_bit_size(instr->dest) == 64) {
   3180          shuffle_32bit_load_result_to_64bit_data(bld,
   3181                                                  dest,
   3182                                                  retype(dest, type),
   3183                                                  instr->num_components);
   3184       }
   3185       break;
   3186    }
   3187 
   3188    case nir_intrinsic_load_barycentric_pixel:
   3189    case nir_intrinsic_load_barycentric_centroid:
   3190    case nir_intrinsic_load_barycentric_sample:
    3191       /* Do nothing - the load_interpolated_input handling will take care of it later. */
   3192       break;
   3193 
   3194    case nir_intrinsic_load_barycentric_at_sample: {
   3195       const glsl_interp_mode interpolation =
   3196          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
   3197 
   3198       nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
   3199 
   3200       if (const_sample) {
   3201          unsigned msg_data = const_sample->i32[0] << 4;
   3202 
   3203          emit_pixel_interpolater_send(bld,
   3204                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
   3205                                       dest,
   3206                                       fs_reg(), /* src */
   3207                                       brw_imm_ud(msg_data),
   3208                                       interpolation);
   3209       } else {
   3210          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
   3211                                           BRW_REGISTER_TYPE_UD);
   3212 
   3213          if (nir_src_is_dynamically_uniform(instr->src[0])) {
   3214             const fs_reg sample_id = bld.emit_uniformize(sample_src);
   3215             const fs_reg msg_data = vgrf(glsl_type::uint_type);
   3216             bld.exec_all().group(1, 0)
   3217                .SHL(msg_data, sample_id, brw_imm_ud(4u));
   3218             emit_pixel_interpolater_send(bld,
   3219                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
   3220                                          dest,
   3221                                          fs_reg(), /* src */
   3222                                          msg_data,
   3223                                          interpolation);
   3224          } else {
   3225             /* Make a loop that sends a message to the pixel interpolater
   3226              * for the sample number in each live channel. If there are
   3227              * multiple channels with the same sample number then these
    3228              * will be handled simultaneously with a single iteration of
   3229              * the loop.
   3230              */
   3231             bld.emit(BRW_OPCODE_DO);
   3232 
   3233             /* Get the next live sample number into sample_id_reg */
   3234             const fs_reg sample_id = bld.emit_uniformize(sample_src);
   3235 
   3236             /* Set the flag register so that we can perform the send
   3237              * message on all channels that have the same sample number
   3238              */
   3239             bld.CMP(bld.null_reg_ud(),
   3240                     sample_src, sample_id,
   3241                     BRW_CONDITIONAL_EQ);
   3242             const fs_reg msg_data = vgrf(glsl_type::uint_type);
   3243             bld.exec_all().group(1, 0)
   3244                .SHL(msg_data, sample_id, brw_imm_ud(4u));
   3245             fs_inst *inst =
   3246                emit_pixel_interpolater_send(bld,
   3247                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
   3248                                             dest,
   3249                                             fs_reg(), /* src */
   3250                                             msg_data,
   3251                                             interpolation);
   3252             set_predicate(BRW_PREDICATE_NORMAL, inst);
   3253 
   3254             /* Continue the loop if there are any live channels left */
   3255             set_predicate_inv(BRW_PREDICATE_NORMAL,
   3256                               true, /* inverse */
   3257                               bld.emit(BRW_OPCODE_WHILE));
   3258          }
   3259       }
   3260       break;
   3261    }
   3262 
   3263    case nir_intrinsic_load_barycentric_at_offset: {
   3264       const glsl_interp_mode interpolation =
   3265          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
   3266 
   3267       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
   3268 
   3269       if (const_offset) {
   3270          unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
   3271          unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
   3272 
   3273          emit_pixel_interpolater_send(bld,
   3274                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
   3275                                       dest,
   3276                                       fs_reg(), /* src */
   3277                                       brw_imm_ud(off_x | (off_y << 4)),
   3278                                       interpolation);
   3279       } else {
   3280          fs_reg src = vgrf(glsl_type::ivec2_type);
   3281          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
   3282                                     BRW_REGISTER_TYPE_F);
   3283          for (int i = 0; i < 2; i++) {
   3284             fs_reg temp = vgrf(glsl_type::float_type);
   3285             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
   3286             fs_reg itemp = vgrf(glsl_type::int_type);
   3287             /* float to int */
   3288             bld.MOV(itemp, temp);
   3289 
   3290             /* Clamp the upper end of the range to +7/16.
   3291              * ARB_gpu_shader5 requires that we support a maximum offset
   3292              * of +0.5, which isn't representable in a S0.4 value -- if
   3293              * we didn't clamp it, we'd end up with -8/16, which is the
   3294              * opposite of what the shader author wanted.
   3295              *
   3296              * This is legal due to ARB_gpu_shader5's quantization
   3297              * rules:
   3298              *
   3299              * "Not all values of <offset> may be supported; x and y
   3300              * offsets may be rounded to fixed-point values with the
   3301              * number of fraction bits given by the
   3302              * implementation-dependent constant
   3303              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
   3304              */
   3305             set_condmod(BRW_CONDITIONAL_L,
   3306                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
   3307          }
   3308 
   3309          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
   3310          emit_pixel_interpolater_send(bld,
   3311                                       opcode,
   3312                                       dest,
   3313                                       src,
   3314                                       brw_imm_ud(0u),
   3315                                       interpolation);
   3316       }
   3317       break;
   3318    }
   3319 
   3320    case nir_intrinsic_load_interpolated_input: {
   3321       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
   3322          emit_fragcoord_interpolation(dest);
   3323          break;
   3324       }
   3325 
   3326       assert(instr->src[0].ssa &&
   3327              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
   3328       nir_intrinsic_instr *bary_intrinsic =
   3329          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
   3330       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
   3331       enum glsl_interp_mode interp_mode =
   3332          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
   3333       fs_reg dst_xy;
   3334 
   3335       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
   3336           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
   3337          /* Use the result of the PI message */
   3338          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
   3339       } else {
   3340          /* Use the delta_xy values computed from the payload */
   3341          enum brw_barycentric_mode bary =
   3342             brw_barycentric_mode(interp_mode, bary_intrin);
   3343 
   3344          dst_xy = this->delta_xy[bary];
   3345       }
   3346 
   3347       for (unsigned int i = 0; i < instr->num_components; i++) {
   3348          fs_reg interp =
   3349             fs_reg(interp_reg(nir_intrinsic_base(instr),
   3350                               nir_intrinsic_component(instr) + i));
   3351          interp.type = BRW_REGISTER_TYPE_F;
   3352          dest.type = BRW_REGISTER_TYPE_F;
   3353 
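                  /* Prior to Gen6 smooth inputs are interpolated as
                   * attribute/w, so multiply by the pixel W to recover the
                   * perspective-correct value.
                   */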
   3354          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
   3355             fs_reg tmp = vgrf(glsl_type::float_type);
   3356             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
   3357             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
   3358          } else {
   3359             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
   3360          }
   3361       }
   3362       break;
   3363    }
   3364 
   3365    default:
   3366       nir_emit_intrinsic(bld, instr);
   3367       break;
   3368    }
   3369 }
   3370 
   3371 void
   3372 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
   3373                                   nir_intrinsic_instr *instr)
   3374 {
   3375    assert(stage == MESA_SHADER_COMPUTE);
   3376    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
   3377 
   3378    fs_reg dest;
   3379    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   3380       dest = get_nir_dest(instr->dest);
   3381 
   3382    switch (instr->intrinsic) {
   3383    case nir_intrinsic_barrier:
   3384       emit_barrier();
   3385       cs_prog_data->uses_barrier = true;
   3386       break;
   3387 
   3388    case nir_intrinsic_load_local_invocation_id:
   3389    case nir_intrinsic_load_work_group_id: {
   3390       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
   3391       fs_reg val = nir_system_values[sv];
   3392       assert(val.file != BAD_FILE);
   3393       dest.type = val.type;
   3394       for (unsigned i = 0; i < 3; i++)
   3395          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
   3396       break;
   3397    }
   3398 
   3399    case nir_intrinsic_load_num_work_groups: {
   3400       const unsigned surface =
   3401          cs_prog_data->binding_table.work_groups_start;
   3402 
   3403       cs_prog_data->uses_num_work_groups = true;
   3404 
   3405       fs_reg surf_index = brw_imm_ud(surface);
   3406       brw_mark_surface_used(prog_data, surface);
   3407 
   3408       /* Read the 3 GLuint components of gl_NumWorkGroups */
   3409       for (unsigned i = 0; i < 3; i++) {
   3410          fs_reg read_result =
   3411             emit_untyped_read(bld, surf_index,
   3412                               brw_imm_ud(i << 2),
   3413                               1 /* dims */, 1 /* size */,
   3414                               BRW_PREDICATE_NONE);
   3415          read_result.type = dest.type;
   3416          bld.MOV(dest, read_result);
   3417          dest = offset(dest, bld, 1);
   3418       }
   3419       break;
   3420    }
   3421 
   3422    case nir_intrinsic_shared_atomic_add:
   3423       nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
   3424       break;
   3425    case nir_intrinsic_shared_atomic_imin:
   3426       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
   3427       break;
   3428    case nir_intrinsic_shared_atomic_umin:
   3429       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
   3430       break;
   3431    case nir_intrinsic_shared_atomic_imax:
   3432       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
   3433       break;
   3434    case nir_intrinsic_shared_atomic_umax:
   3435       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
   3436       break;
   3437    case nir_intrinsic_shared_atomic_and:
   3438       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
   3439       break;
   3440    case nir_intrinsic_shared_atomic_or:
   3441       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
   3442       break;
   3443    case nir_intrinsic_shared_atomic_xor:
   3444       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
   3445       break;
   3446    case nir_intrinsic_shared_atomic_exchange:
   3447       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
   3448       break;
   3449    case nir_intrinsic_shared_atomic_comp_swap:
   3450       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
   3451       break;
   3452 
   3453    case nir_intrinsic_load_shared: {
   3454       assert(devinfo->gen >= 7);
   3455 
   3456       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
   3457 
   3458       /* Get the offset to read from */
   3459       fs_reg offset_reg;
   3460       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
   3461       if (const_offset) {
   3462          offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
   3463       } else {
   3464          offset_reg = vgrf(glsl_type::uint_type);
   3465          bld.ADD(offset_reg,
   3466                  retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
   3467                  brw_imm_ud(instr->const_index[0]));
   3468       }
   3469 
   3470       /* Read the vector */
   3471       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
   3472                              instr->num_components);
   3473       break;
   3474    }
   3475 
   3476    case nir_intrinsic_store_shared: {
   3477       assert(devinfo->gen >= 7);
   3478 
   3479       /* Block index */
   3480       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
   3481 
   3482       /* Value */
   3483       fs_reg val_reg = get_nir_src(instr->src[0]);
   3484 
   3485       /* Writemask */
   3486       unsigned writemask = instr->const_index[1];
   3487 
   3488       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
   3489        * since the untyped writes below operate in units of 32-bits, which
   3490        * means that we need to write twice as many components each time.
    3491        * Also, we have to shuffle 64-bit data to be in the appropriate layout
   3492        * expected by our 32-bit write messages.
   3493        */
   3494       unsigned type_size = 4;
   3495       unsigned bit_size = instr->src[0].is_ssa ?
   3496          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
   3497       if (bit_size == 64) {
   3498          type_size = 8;
   3499          fs_reg tmp =
   3500            fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
   3501          shuffle_64bit_data_for_32bit_write(
   3502             bld,
   3503             retype(tmp, BRW_REGISTER_TYPE_F),
   3504             retype(val_reg, BRW_REGISTER_TYPE_DF),
   3505             instr->num_components);
   3506          val_reg = tmp;
   3507       }
   3508 
   3509       unsigned type_slots = type_size / 4;
   3510 
   3511       /* Combine groups of consecutive enabled channels in one write
   3512        * message. We use ffs to find the first enabled channel and then ffs on
   3513        * the bit-inverse, down-shifted writemask to determine the length of
   3514        * the block of enabled bits.
   3515        */
   3516       while (writemask) {
   3517          unsigned first_component = ffs(writemask) - 1;
   3518          unsigned length = ffs(~(writemask >> first_component)) - 1;
   3519 
   3520          /* We can't write more than 2 64-bit components at once. Limit the
   3521           * length of the write to what we can do and let the next iteration
   3522           * handle the rest
   3523           */
   3524          if (type_size > 4)
   3525             length = MIN2(2, length);
   3526 
   3527          fs_reg offset_reg;
   3528          nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   3529          if (const_offset) {
   3530             offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
   3531                                     type_size * first_component);
   3532          } else {
   3533             offset_reg = vgrf(glsl_type::uint_type);
   3534             bld.ADD(offset_reg,
   3535                     retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
   3536                     brw_imm_ud(instr->const_index[0] + type_size * first_component));
   3537          }
   3538 
   3539          emit_untyped_write(bld, surf_index, offset_reg,
   3540                             offset(val_reg, bld, first_component * type_slots),
   3541                             1 /* dims */, length * type_slots,
   3542                             BRW_PREDICATE_NONE);
   3543 
   3544          /* Clear the bits in the writemask that we just wrote, then try
   3545           * again to see if more channels are left.
   3546           */
   3547          writemask &= (15 << (first_component + length));
   3548       }
   3549 
   3550       break;
   3551    }
   3552 
   3553    default:
   3554       nir_emit_intrinsic(bld, instr);
   3555       break;
   3556    }
   3557 }
   3558 
   3559 void
   3560 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
   3561 {
   3562    fs_reg dest;
   3563    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   3564       dest = get_nir_dest(instr->dest);
   3565 
   3566    switch (instr->intrinsic) {
   3567    case nir_intrinsic_atomic_counter_inc:
   3568    case nir_intrinsic_atomic_counter_dec:
   3569    case nir_intrinsic_atomic_counter_read:
   3570    case nir_intrinsic_atomic_counter_add:
   3571    case nir_intrinsic_atomic_counter_min:
   3572    case nir_intrinsic_atomic_counter_max:
   3573    case nir_intrinsic_atomic_counter_and:
   3574    case nir_intrinsic_atomic_counter_or:
   3575    case nir_intrinsic_atomic_counter_xor:
   3576    case nir_intrinsic_atomic_counter_exchange:
   3577    case nir_intrinsic_atomic_counter_comp_swap: {
   3578       if (stage == MESA_SHADER_FRAGMENT &&
   3579           instr->intrinsic != nir_intrinsic_atomic_counter_read)
   3580          brw_wm_prog_data(prog_data)->has_side_effects = true;
   3581 
   3582       /* Get some metadata from the image intrinsic. */
   3583       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
   3584 
   3585       /* Get the arguments of the atomic intrinsic. */
   3586       const fs_reg offset = get_nir_src(instr->src[0]);
   3587       const unsigned surface = (stage_prog_data->binding_table.abo_start +
   3588                                 instr->const_index[0]);
   3589       const fs_reg src0 = (info->num_srcs >= 2
   3590                            ? get_nir_src(instr->src[1]) : fs_reg());
   3591       const fs_reg src1 = (info->num_srcs >= 3
   3592                            ? get_nir_src(instr->src[2]) : fs_reg());
   3593       fs_reg tmp;
   3594 
   3595       assert(info->num_srcs <= 3);
   3596 
   3597       /* Emit a surface read or atomic op. */
   3598       if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
   3599          tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
   3600       } else {
   3601          tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
   3602                                    src1, 1, 1,
   3603                                    get_atomic_counter_op(instr->intrinsic));
   3604       }
   3605 
   3606       /* Assign the result. */
   3607       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
   3608 
   3609       /* Mark the surface as used. */
   3610       brw_mark_surface_used(stage_prog_data, surface);
   3611       break;
   3612    }
   3613 
   3614    case nir_intrinsic_image_load:
   3615    case nir_intrinsic_image_store:
   3616    case nir_intrinsic_image_atomic_add:
   3617    case nir_intrinsic_image_atomic_min:
   3618    case nir_intrinsic_image_atomic_max:
   3619    case nir_intrinsic_image_atomic_and:
   3620    case nir_intrinsic_image_atomic_or:
   3621    case nir_intrinsic_image_atomic_xor:
   3622    case nir_intrinsic_image_atomic_exchange:
   3623    case nir_intrinsic_image_atomic_comp_swap: {
   3624       using namespace image_access;
   3625 
   3626       if (stage == MESA_SHADER_FRAGMENT &&
   3627           instr->intrinsic != nir_intrinsic_image_load)
   3628          brw_wm_prog_data(prog_data)->has_side_effects = true;
   3629 
   3630       /* Get the referenced image variable and type. */
   3631       const nir_variable *var = instr->variables[0]->var;
   3632       const glsl_type *type = var->type->without_array();
   3633       const brw_reg_type base_type = get_image_base_type(type);
   3634 
   3635       /* Get some metadata from the image intrinsic. */
   3636       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
   3637       const unsigned arr_dims = type->sampler_array ? 1 : 0;
   3638       const unsigned surf_dims = type->coordinate_components() - arr_dims;
   3639       const unsigned format = var->data.image.format;
   3640 
   3641       /* Get the arguments of the image intrinsic. */
   3642       const fs_reg image = get_nir_image_deref(instr->variables[0]);
   3643       const fs_reg addr = retype(get_nir_src(instr->src[0]),
   3644                                  BRW_REGISTER_TYPE_UD);
   3645       const fs_reg src0 = (info->num_srcs >= 3 ?
   3646                            retype(get_nir_src(instr->src[2]), base_type) :
   3647                            fs_reg());
   3648       const fs_reg src1 = (info->num_srcs >= 4 ?
   3649                            retype(get_nir_src(instr->src[3]), base_type) :
   3650                            fs_reg());
   3651       fs_reg tmp;
   3652 
   3653       /* Emit an image load, store or atomic op. */
   3654       if (instr->intrinsic == nir_intrinsic_image_load)
   3655          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
   3656 
   3657       else if (instr->intrinsic == nir_intrinsic_image_store)
   3658          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
   3659                           var->data.image.write_only ? GL_NONE : format);
   3660 
   3661       else
   3662          tmp = emit_image_atomic(bld, image, addr, src0, src1,
   3663                                  surf_dims, arr_dims, info->dest_components,
   3664                                  get_image_atomic_op(instr->intrinsic, type));
   3665 
   3666       /* Assign the result. */
   3667       for (unsigned c = 0; c < info->dest_components; ++c)
   3668          bld.MOV(offset(retype(dest, base_type), bld, c),
   3669                  offset(tmp, bld, c));
   3670       break;
   3671    }
   3672 
   3673    case nir_intrinsic_memory_barrier_atomic_counter:
   3674    case nir_intrinsic_memory_barrier_buffer:
   3675    case nir_intrinsic_memory_barrier_image:
   3676    case nir_intrinsic_memory_barrier: {
   3677       const fs_builder ubld = bld.group(8, 0);
   3678       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   3679       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
   3680          ->size_written = 2 * REG_SIZE;
   3681       break;
   3682    }
   3683 
   3684    case nir_intrinsic_group_memory_barrier:
   3685    case nir_intrinsic_memory_barrier_shared:
   3686       /* We treat these workgroup-level barriers as no-ops.  This should be
   3687        * safe at present and as long as:
   3688        *
   3689        *  - Memory access instructions are not subsequently reordered by the
   3690        *    compiler back-end.
   3691        *
   3692        *  - All threads from a given compute shader workgroup fit within a
   3693        *    single subslice and therefore talk to the same HDC shared unit
    3694        *    which supposedly guarantees ordering and coherency between threads
   3695        *    from the same workgroup.  This may change in the future when we
   3696        *    start splitting workgroups across multiple subslices.
   3697        *
   3698        *  - The context is not in fault-and-stream mode, which could cause
   3699        *    memory transactions (including to SLM) prior to the barrier to be
   3700        *    replayed after the barrier if a pagefault occurs.  This shouldn't
   3701        *    be a problem up to and including SKL because fault-and-stream is
   3702        *    not usable due to hardware issues, but that's likely to change in
   3703        *    the future.
   3704        */
   3705       break;
   3706 
   3707    case nir_intrinsic_shader_clock: {
   3708       /* We cannot do anything if there is an event, so ignore it for now */
   3709       const fs_reg shader_clock = get_timestamp(bld);
   3710       const fs_reg srcs[] = { component(shader_clock, 0),
   3711                               component(shader_clock, 1) };
   3712       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
   3713       break;
   3714    }
   3715 
   3716    case nir_intrinsic_image_size: {
   3717       /* Get the referenced image variable and type. */
   3718       const nir_variable *var = instr->variables[0]->var;
   3719       const glsl_type *type = var->type->without_array();
   3720 
   3721       /* Get the size of the image. */
   3722       const fs_reg image = get_nir_image_deref(instr->variables[0]);
   3723       const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
   3724 
   3725       /* For 1DArray image types, the array index is stored in the Z component.
   3726        * Fix this by swizzling the Z component to the Y component.
   3727        */
   3728       const bool is_1d_array_image =
   3729                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
   3730                   type->sampler_array;
   3731 
   3732       /* For CubeArray images, we should count the number of cubes instead
    3733        * of the number of faces. Fix it by dividing the Z component by 6.
   3734        */
   3735       const bool is_cube_array_image =
   3736                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
   3737                   type->sampler_array;
   3738 
   3739       /* Copy all the components. */
   3740       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
   3741       for (unsigned c = 0; c < info->dest_components; ++c) {
   3742          if ((int)c >= type->coordinate_components()) {
   3743              bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
   3744                      brw_imm_d(1));
   3745          } else if (c == 1 && is_1d_array_image) {
   3746             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
   3747                     offset(size, bld, 2));
   3748          } else if (c == 2 && is_cube_array_image) {
   3749             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
   3750                      offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
   3751                      offset(size, bld, c), brw_imm_d(6));
   3752          } else {
   3753             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
   3754                     offset(size, bld, c));
   3755          }
   3756        }
   3757 
   3758       break;
   3759    }
   3760 
   3761    case nir_intrinsic_image_samples:
   3762       /* The driver does not support multi-sampled images. */
   3763       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
   3764       break;
   3765 
   3766    case nir_intrinsic_load_uniform: {
   3767       /* Offsets are in bytes but they should always be multiples of 4 */
   3768       assert(instr->const_index[0] % 4 == 0);
   3769 
   3770       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
   3771 
   3772       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
   3773       if (const_offset) {
   3774          /* Offsets are in bytes but they should always be multiples of 4 */
   3775          assert(const_offset->u32[0] % 4 == 0);
   3776          src.offset = const_offset->u32[0];
   3777 
   3778          for (unsigned j = 0; j < instr->num_components; j++) {
   3779             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
   3780          }
   3781       } else {
   3782          fs_reg indirect = retype(get_nir_src(instr->src[0]),
   3783                                   BRW_REGISTER_TYPE_UD);
   3784 
   3785          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
   3786           * go past the end of the uniform.  In order to keep the n'th
   3787           * component from running past, we subtract off the size of all but
   3788           * one component of the vector.
   3789           */
   3790          assert(instr->const_index[1] >=
   3791                 instr->num_components * (int) type_sz(dest.type));
   3792          unsigned read_size = instr->const_index[1] -
   3793             (instr->num_components - 1) * type_sz(dest.type);
   3794 
   3795          bool supports_64bit_indirects =
   3796             !devinfo->is_cherryview && !devinfo->is_broxton;
   3797 
   3798          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
   3799             for (unsigned j = 0; j < instr->num_components; j++) {
   3800                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
   3801                         offset(dest, bld, j), offset(src, bld, j),
   3802                         indirect, brw_imm_ud(read_size));
   3803             }
   3804          } else {
   3805             const unsigned num_mov_indirects =
   3806                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
    3807             /* We read a little bit less per MOV_INDIRECT, as they are now
    3808              * 32-bit ones instead of 64-bit ones. Adjust read_size accordingly.
   3809              */
   3810             const unsigned read_size_32bit = read_size -
   3811                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
   3812             for (unsigned j = 0; j < instr->num_components; j++) {
   3813                for (unsigned i = 0; i < num_mov_indirects; i++) {
   3814                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
   3815                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
   3816                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
   3817                            indirect, brw_imm_ud(read_size_32bit));
   3818                }
   3819             }
   3820          }
   3821       }
   3822       break;
   3823    }
   3824 
   3825    case nir_intrinsic_load_ubo: {
   3826       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
   3827       fs_reg surf_index;
   3828 
   3829       if (const_index) {
   3830          const unsigned index = stage_prog_data->binding_table.ubo_start +
   3831                                 const_index->u32[0];
   3832          surf_index = brw_imm_ud(index);
   3833          brw_mark_surface_used(prog_data, index);
   3834       } else {
   3835          /* The block index is not a constant. Evaluate the index expression
   3836           * per channel, add the base UBO index, and then uniformize the
   3837           * result by selecting a value from any live channel.
   3838           */
   3839          surf_index = vgrf(glsl_type::uint_type);
   3840          bld.ADD(surf_index, get_nir_src(instr->src[0]),
   3841                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
   3842          surf_index = bld.emit_uniformize(surf_index);
   3843 
   3844          /* Assume this may touch any UBO. It would be nice to provide
   3845           * a tighter bound, but the array information is already lowered away.
   3846           */
   3847          brw_mark_surface_used(prog_data,
   3848                                stage_prog_data->binding_table.ubo_start +
   3849                                nir->info->num_ubos - 1);
   3850       }
   3851 
   3852       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   3853       if (const_offset == NULL) {
   3854          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
   3855                                      BRW_REGISTER_TYPE_UD);
   3856 
   3857          for (int i = 0; i < instr->num_components; i++)
   3858             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
   3859                                        base_offset, i * type_sz(dest.type));
   3860       } else {
   3861          /* Even if we are loading doubles, a pull constant load will load
   3862           * a 32-bit vec4, so we should only reserve vgrf space for that. If we
   3863           * need to load a full dvec4 we will have to emit 2 loads. This is
   3864           * similar to demote_pull_constants(), except that in that case we
   3865           * see individual accesses to each component of the vector and then
   3866           * we let CSE deal with duplicate loads. Here we see a vector access
   3867           * and we have to split it if necessary.
   3868           */
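                 /* For example, a dvec4 load at byte offset 48 spans two
                  * cachelines: the first iteration fetches the block at offset 0
                  * and covers components 0-1 (bytes 48-63), the second fetches
                  * the block at offset 64 and covers components 2-3.
                  */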
   3869          const unsigned type_size = type_sz(dest.type);
   3870          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
   3871          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
   3872          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   3873 
   3874          for (unsigned c = 0; c < instr->num_components;) {
   3875             const unsigned base = const_offset->u32[0] + c * type_size;
   3876             /* Number of usable components in the next block-aligned load. */
   3877             const unsigned count = MIN2(instr->num_components - c,
   3878                                         (block_sz - base % block_sz) / type_size);
   3879 
   3880             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
   3881                       packed_consts, surf_index,
   3882                       brw_imm_ud(base & ~(block_sz - 1)));
   3883 
   3884             const fs_reg consts =
   3885                retype(byte_offset(packed_consts, base & (block_sz - 1)),
   3886                       dest.type);
   3887 
   3888             for (unsigned d = 0; d < count; d++)
   3889                bld.MOV(offset(dest, bld, c + d), component(consts, d));
   3890 
   3891             c += count;
   3892          }
   3893       }
   3894       break;
   3895    }
   3896 
   3897    case nir_intrinsic_load_ssbo: {
   3898       assert(devinfo->gen >= 7);
   3899 
   3900       nir_const_value *const_uniform_block =
   3901          nir_src_as_const_value(instr->src[0]);
   3902 
   3903       fs_reg surf_index;
   3904       if (const_uniform_block) {
   3905          unsigned index = stage_prog_data->binding_table.ssbo_start +
   3906                           const_uniform_block->u32[0];
   3907          surf_index = brw_imm_ud(index);
   3908          brw_mark_surface_used(prog_data, index);
   3909       } else {
   3910          surf_index = vgrf(glsl_type::uint_type);
   3911          bld.ADD(surf_index, get_nir_src(instr->src[0]),
   3912                  brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
   3913 
   3914          /* Assume this may touch any SSBO. It would be nice to provide
   3915           * a tighter bound, but the array information is already lowered away.
   3916           */
   3917          brw_mark_surface_used(prog_data,
   3918                                stage_prog_data->binding_table.ssbo_start +
   3919                                nir->info->num_ssbos - 1);
   3920       }
   3921 
   3922       fs_reg offset_reg;
   3923       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   3924       if (const_offset) {
   3925          offset_reg = brw_imm_ud(const_offset->u32[0]);
   3926       } else {
   3927          offset_reg = get_nir_src(instr->src[1]);
   3928       }
   3929 
   3930       /* Read the vector */
   3931       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
   3932                              instr->num_components);
   3933 
   3934       break;
   3935    }
   3936 
   3937    case nir_intrinsic_store_ssbo: {
   3938       assert(devinfo->gen >= 7);
   3939 
   3940       if (stage == MESA_SHADER_FRAGMENT)
   3941          brw_wm_prog_data(prog_data)->has_side_effects = true;
   3942 
   3943       /* Block index */
   3944       fs_reg surf_index;
   3945       nir_const_value *const_uniform_block =
   3946          nir_src_as_const_value(instr->src[1]);
   3947       if (const_uniform_block) {
   3948          unsigned index = stage_prog_data->binding_table.ssbo_start +
   3949                           const_uniform_block->u32[0];
   3950          surf_index = brw_imm_ud(index);
   3951          brw_mark_surface_used(prog_data, index);
   3952       } else {
   3953          surf_index = vgrf(glsl_type::uint_type);
   3954          bld.ADD(surf_index, get_nir_src(instr->src[1]),
   3955                   brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
   3956 
   3957          brw_mark_surface_used(prog_data,
   3958                                stage_prog_data->binding_table.ssbo_start +
   3959                                nir->info->num_ssbos - 1);
   3960       }
   3961 
   3962       /* Value */
   3963       fs_reg val_reg = get_nir_src(instr->src[0]);
   3964 
   3965       /* Writemask */
   3966       unsigned writemask = instr->const_index[0];
   3967 
   3968       /* get_nir_src() retypes to integer. Be wary of 64-bit types though,
   3969        * since the untyped writes below operate in 32-bit units, which
   3970        * means that we need to write twice as many components each time.
   3971        * Also, we have to shuffle 64-bit data into the layout
   3972        * expected by our 32-bit write messages.
   3973        */
   3974       unsigned type_size = 4;
   3975       unsigned bit_size = instr->src[0].is_ssa ?
   3976          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
   3977       if (bit_size == 64) {
   3978          type_size = 8;
   3979          fs_reg tmp =
   3980            fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
   3981          shuffle_64bit_data_for_32bit_write(bld,
   3982             retype(tmp, BRW_REGISTER_TYPE_F),
   3983             retype(val_reg, BRW_REGISTER_TYPE_DF),
   3984             instr->num_components);
   3985          val_reg = tmp;
   3986       }
   3987 
   3988       unsigned type_slots = type_size / 4;
   3989 
   3990       /* Combine groups of consecutive enabled channels in one write
   3991        * message. We use ffs to find the first enabled channel and then ffs on
   3992        * the bit-inverse, down-shifted writemask to determine the length of
   3993        * the block of enabled bits.
   3994        */
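              /* For example, a writemask of 0b1101 takes two iterations: the
               * first writes component 0 (length 1), the second writes
               * components 2-3 as a single untyped write of length 2.
               */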
   3995       while (writemask) {
   3996          unsigned first_component = ffs(writemask) - 1;
   3997          unsigned length = ffs(~(writemask >> first_component)) - 1;
   3998 
   3999          /* We can't write more than 2 64-bit components at once. Limit the
   4000           * length of the write to what we can do and let the next iteration
   4001           * handle the rest.
   4002           */
   4003          if (type_size > 4)
   4004             length = MIN2(2, length);
   4005 
   4006          fs_reg offset_reg;
   4007          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
   4008          if (const_offset) {
   4009             offset_reg = brw_imm_ud(const_offset->u32[0] +
   4010                                     type_size * first_component);
   4011          } else {
   4012             offset_reg = vgrf(glsl_type::uint_type);
   4013             bld.ADD(offset_reg,
   4014                     retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
   4015                     brw_imm_ud(type_size * first_component));
   4016          }
   4017 
   4018 
   4019          emit_untyped_write(bld, surf_index, offset_reg,
   4020                             offset(val_reg, bld, first_component * type_slots),
   4021                             1 /* dims */, length * type_slots,
   4022                             BRW_PREDICATE_NONE);
   4023 
   4024          /* Clear the bits in the writemask that we just wrote, then try
   4025           * again to see if more channels are left.
   4026           */
   4027          writemask &= (15 << (first_component + length));
   4028       }
   4029       break;
   4030    }
   4031 
   4032    case nir_intrinsic_store_output: {
   4033       fs_reg src = get_nir_src(instr->src[0]);
   4034 
   4035       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   4036       assert(const_offset && "Indirect output stores not allowed");
   4037       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
   4038                                       4 * const_offset->u32[0]), src.type);
   4039 
   4040       unsigned num_components = instr->num_components;
   4041       unsigned first_component = nir_intrinsic_component(instr);
   4042       unsigned bit_size = instr->src[0].is_ssa ?
   4043          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
   4044       if (bit_size == 64) {
   4045          fs_reg tmp =
   4046             fs_reg(VGRF, alloc.allocate(2 * num_components),
   4047                    BRW_REGISTER_TYPE_F);
   4048          shuffle_64bit_data_for_32bit_write(
   4049             bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
   4050          src = retype(tmp, src.type);
   4051          num_components *= 2;
   4052       }
   4053 
   4054       for (unsigned j = 0; j < num_components; j++) {
   4055          bld.MOV(offset(new_dest, bld, j + first_component),
   4056                  offset(src, bld, j));
   4057       }
   4058       break;
   4059    }
   4060 
   4061    case nir_intrinsic_ssbo_atomic_add:
   4062       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
   4063       break;
   4064    case nir_intrinsic_ssbo_atomic_imin:
   4065       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
   4066       break;
   4067    case nir_intrinsic_ssbo_atomic_umin:
   4068       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
   4069       break;
   4070    case nir_intrinsic_ssbo_atomic_imax:
   4071       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
   4072       break;
   4073    case nir_intrinsic_ssbo_atomic_umax:
   4074       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
   4075       break;
   4076    case nir_intrinsic_ssbo_atomic_and:
   4077       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
   4078       break;
   4079    case nir_intrinsic_ssbo_atomic_or:
   4080       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
   4081       break;
   4082    case nir_intrinsic_ssbo_atomic_xor:
   4083       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
   4084       break;
   4085    case nir_intrinsic_ssbo_atomic_exchange:
   4086       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
   4087       break;
   4088    case nir_intrinsic_ssbo_atomic_comp_swap:
   4089       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
   4090       break;
   4091 
   4092    case nir_intrinsic_get_buffer_size: {
   4093       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
   4094       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
   4095 
   4096       /* The resinfo sampler message is used to get the buffer size.  The
   4097        * SIMD8 writeback message consists of four registers and the SIMD16
   4098        * writeback message consists of eight destination registers (two per
   4099        * component).  Because we are only interested in the first channel of
   4100        * the first returned component, where resinfo returns the buffer size
   4101        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
   4102        * the dispatch width.
   4103        */
   4104       const fs_builder ubld = bld.exec_all().group(8, 0);
   4105       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   4106       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
   4107 
   4108       /* Set LOD = 0 */
   4109       ubld.MOV(src_payload, brw_imm_d(0));
   4110 
   4111       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
   4112       fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
   4113                                 src_payload, brw_imm_ud(index));
   4114       inst->header_size = 0;
   4115       inst->mlen = 1;
   4116       inst->size_written = 4 * REG_SIZE;
   4117 
   4118       bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
   4119       brw_mark_surface_used(prog_data, index);
   4120       break;
   4121    }
   4122 
   4123    case nir_intrinsic_load_channel_num: {
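              /* Build the channel indices from the immediate vector 0x76543210,
               * which gives channels 0-7 their index directly; the upper half of
               * a SIMD16 thread adds 8 and the upper half of a SIMD32 thread
               * adds 16 on top of that.
               */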
   4124       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
   4125       dest = retype(dest, BRW_REGISTER_TYPE_UD);
   4126       const fs_builder allbld8 = bld.group(8, 0).exec_all();
   4127       allbld8.MOV(tmp, brw_imm_v(0x76543210));
   4128       if (dispatch_width > 8)
   4129          allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
   4130       if (dispatch_width > 16) {
   4131          const fs_builder allbld16 = bld.group(16, 0).exec_all();
   4132          allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
   4133       }
   4134       bld.MOV(dest, tmp);
   4135       break;
   4136    }
   4137 
   4138    default:
   4139       unreachable("unknown intrinsic");
   4140    }
   4141 }
   4142 
   4143 void
   4144 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
   4145                                  int op, nir_intrinsic_instr *instr)
   4146 {
   4147    if (stage == MESA_SHADER_FRAGMENT)
   4148       brw_wm_prog_data(prog_data)->has_side_effects = true;
   4149 
   4150    fs_reg dest;
   4151    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   4152       dest = get_nir_dest(instr->dest);
   4153 
   4154    fs_reg surface;
   4155    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
   4156    if (const_surface) {
   4157       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
   4158                             const_surface->u32[0];
   4159       surface = brw_imm_ud(surf_index);
   4160       brw_mark_surface_used(prog_data, surf_index);
   4161    } else {
   4162       surface = vgrf(glsl_type::uint_type);
   4163       bld.ADD(surface, get_nir_src(instr->src[0]),
   4164               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
   4165 
   4166       /* Assume this may touch any SSBO. This is the same as we do for other
   4167        * UBO/SSBO accesses with a non-constant surface index.
   4168        */
   4169       brw_mark_surface_used(prog_data,
   4170                             stage_prog_data->binding_table.ssbo_start +
   4171                             nir->info->num_ssbos - 1);
   4172    }
   4173 
   4174    fs_reg offset = get_nir_src(instr->src[1]);
   4175    fs_reg data1 = get_nir_src(instr->src[2]);
   4176    fs_reg data2;
   4177    if (op == BRW_AOP_CMPWR)
   4178       data2 = get_nir_src(instr->src[3]);
   4179 
   4180    /* Emit the actual atomic operation */
   4181 
   4182    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
   4183                                               data1, data2,
   4184                                               1 /* dims */, 1 /* rsize */,
   4185                                               op,
   4186                                               BRW_PREDICATE_NONE);
   4187    dest.type = atomic_result.type;
   4188    bld.MOV(dest, atomic_result);
   4189 }
   4190 
   4191 void
   4192 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
   4193                                    int op, nir_intrinsic_instr *instr)
   4194 {
   4195    fs_reg dest;
   4196    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
   4197       dest = get_nir_dest(instr->dest);
   4198 
   4199    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
   4200    fs_reg offset;
   4201    fs_reg data1 = get_nir_src(instr->src[1]);
   4202    fs_reg data2;
   4203    if (op == BRW_AOP_CMPWR)
   4204       data2 = get_nir_src(instr->src[2]);
   4205 
   4206    /* Get the offset */
   4207    nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
   4208    if (const_offset) {
   4209       offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
   4210    } else {
   4211       offset = vgrf(glsl_type::uint_type);
   4212       bld.ADD(offset,
   4213               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
   4214               brw_imm_ud(instr->const_index[0]));
   4215    }
   4216 
   4217    /* Emit the actual atomic operation */
   4218 
   4219    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
   4220                                               data1, data2,
   4221                                               1 /* dims */, 1 /* rsize */,
   4222                                               op,
   4223                                               BRW_PREDICATE_NONE);
   4224    dest.type = atomic_result.type;
   4225    bld.MOV(dest, atomic_result);
   4226 }
   4227 
   4228 void
   4229 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
   4230 {
   4231    unsigned texture = instr->texture_index;
   4232    unsigned sampler = instr->sampler_index;
   4233 
   4234    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
   4235 
   4236    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
   4237    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
   4238 
   4239    int lod_components = 0;
   4240 
   4241    /* The hardware requires a LOD for buffer textures */
   4242    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
   4243       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
   4244 
   4245    uint32_t header_bits = 0;
   4246    for (unsigned i = 0; i < instr->num_srcs; i++) {
   4247       fs_reg src = get_nir_src(instr->src[i].src);
   4248       switch (instr->src[i].src_type) {
   4249       case nir_tex_src_bias:
   4250          srcs[TEX_LOGICAL_SRC_LOD] =
   4251             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
   4252          break;
   4253       case nir_tex_src_comparator:
   4254          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
   4255          break;
   4256       case nir_tex_src_coord:
   4257          switch (instr->op) {
   4258          case nir_texop_txf:
   4259          case nir_texop_txf_ms:
   4260          case nir_texop_txf_ms_mcs:
   4261          case nir_texop_samples_identical:
   4262             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
   4263             break;
   4264          default:
   4265             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
   4266             break;
   4267          }
   4268          break;
   4269       case nir_tex_src_ddx:
   4270          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
   4271          lod_components = nir_tex_instr_src_size(instr, i);
   4272          break;
   4273       case nir_tex_src_ddy:
   4274          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
   4275          break;
   4276       case nir_tex_src_lod:
   4277          switch (instr->op) {
   4278          case nir_texop_txs:
   4279             srcs[TEX_LOGICAL_SRC_LOD] =
   4280                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
   4281             break;
   4282          case nir_texop_txf:
   4283             srcs[TEX_LOGICAL_SRC_LOD] =
   4284                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
   4285             break;
   4286          default:
   4287             srcs[TEX_LOGICAL_SRC_LOD] =
   4288                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
   4289             break;
   4290          }
   4291          break;
   4292       case nir_tex_src_ms_index:
   4293          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
   4294          break;
   4295 
   4296       case nir_tex_src_offset: {
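                 /* Constant texel offsets that fit in the message header are
                  * baked into header_bits; anything else (e.g. non-constant
                  * gather offsets) is passed through the explicit offset source.
                  */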
   4297          nir_const_value *const_offset =
   4298             nir_src_as_const_value(instr->src[i].src);
   4299          unsigned offset_bits = 0;
   4300          if (const_offset &&
   4301              brw_texture_offset(const_offset->i32,
   4302                                 nir_tex_instr_src_size(instr, i),
   4303                                 &offset_bits)) {
   4304             header_bits |= offset_bits;
   4305          } else {
   4306             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
   4307                retype(src, BRW_REGISTER_TYPE_D);
   4308          }
   4309          break;
   4310       }
   4311 
   4312       case nir_tex_src_projector:
   4313          unreachable("should be lowered");
   4314 
   4315       case nir_tex_src_texture_offset: {
   4316          /* Figure out the highest possible texture index and mark it as used */
   4317          uint32_t max_used = texture + instr->texture_array_size - 1;
   4318          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
   4319             max_used += stage_prog_data->binding_table.gather_texture_start;
   4320          } else {
   4321             max_used += stage_prog_data->binding_table.texture_start;
   4322          }
   4323          brw_mark_surface_used(prog_data, max_used);
   4324 
   4325          /* Emit code to evaluate the actual indexing expression */
   4326          fs_reg tmp = vgrf(glsl_type::uint_type);
   4327          bld.ADD(tmp, src, brw_imm_ud(texture));
   4328          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
   4329          break;
   4330       }
   4331 
   4332       case nir_tex_src_sampler_offset: {
   4333          /* Emit code to evaluate the actual indexing expression */
   4334          fs_reg tmp = vgrf(glsl_type::uint_type);
   4335          bld.ADD(tmp, src, brw_imm_ud(sampler));
   4336          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
   4337          break;
   4338       }
   4339 
   4340       case nir_tex_src_ms_mcs:
   4341          assert(instr->op == nir_texop_txf_ms);
   4342          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
   4343          break;
   4344 
   4345       case nir_tex_src_plane: {
   4346          nir_const_value *const_plane =
   4347             nir_src_as_const_value(instr->src[i].src);
   4348          const uint32_t plane = const_plane->u32[0];
   4349          const uint32_t texture_index =
   4350             instr->texture_index +
   4351             stage_prog_data->binding_table.plane_start[plane] -
   4352             stage_prog_data->binding_table.texture_start;
   4353 
   4354          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
   4355          break;
   4356       }
   4357 
   4358       default:
   4359          unreachable("unknown texture source");
   4360       }
   4361    }
   4362 
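           /* txf_ms and samples_identical need an MCS value: fetch it from the
            * surface when the texture uses the compressed multisample layout,
            * otherwise pass 0 to indicate that there is no MCS data.
            */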
   4363    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
   4364        (instr->op == nir_texop_txf_ms ||
   4365         instr->op == nir_texop_samples_identical)) {
   4366       if (devinfo->gen >= 7 &&
   4367           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
   4368          srcs[TEX_LOGICAL_SRC_MCS] =
   4369             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
   4370                            instr->coord_components,
   4371                            srcs[TEX_LOGICAL_SRC_SURFACE]);
   4372       } else {
   4373          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
   4374       }
   4375    }
   4376 
   4377    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
   4378    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
   4379 
   4380    enum opcode opcode;
   4381    switch (instr->op) {
   4382    case nir_texop_tex:
   4383       opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
   4384                 SHADER_OPCODE_TXL_LOGICAL);
   4385       break;
   4386    case nir_texop_txb:
   4387       opcode = FS_OPCODE_TXB_LOGICAL;
   4388       break;
   4389    case nir_texop_txl:
   4390       opcode = SHADER_OPCODE_TXL_LOGICAL;
   4391       break;
   4392    case nir_texop_txd:
   4393       opcode = SHADER_OPCODE_TXD_LOGICAL;
   4394       break;
   4395    case nir_texop_txf:
   4396       opcode = SHADER_OPCODE_TXF_LOGICAL;
   4397       break;
   4398    case nir_texop_txf_ms:
   4399       if ((key_tex->msaa_16 & (1 << sampler)))
   4400          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
   4401       else
   4402          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
   4403       break;
   4404    case nir_texop_txf_ms_mcs:
   4405       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
   4406       break;
   4407    case nir_texop_query_levels:
   4408    case nir_texop_txs:
   4409       opcode = SHADER_OPCODE_TXS_LOGICAL;
   4410       break;
   4411    case nir_texop_lod:
   4412       opcode = SHADER_OPCODE_LOD_LOGICAL;
   4413       break;
   4414    case nir_texop_tg4:
   4415       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
   4416          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
   4417       else
   4418          opcode = SHADER_OPCODE_TG4_LOGICAL;
   4419       break;
   4420    case nir_texop_texture_samples:
   4421       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
   4422       break;
   4423    case nir_texop_samples_identical: {
   4424       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
   4425 
   4426       /* If mcs is an immediate value, it means there is no MCS.  In that case
   4427        * just return false.
   4428        */
   4429       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
   4430          bld.MOV(dst, brw_imm_ud(0u));
   4431       } else if ((key_tex->msaa_16 & (1 << sampler))) {
   4432          fs_reg tmp = vgrf(glsl_type::uint_type);
   4433          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
   4434                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
   4435          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
   4436       } else {
   4437          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
   4438                  BRW_CONDITIONAL_EQ);
   4439       }
   4440       return;
   4441    }
   4442    default:
   4443       unreachable("unknown texture opcode");
   4444    }
   4445 
   4446    /* TXS and TXL require a LOD but not everything we implement using those
   4447     * two opcodes provides one.  Provide a default LOD of 0.
   4448     */
   4449    if ((opcode == SHADER_OPCODE_TXS_LOGICAL ||
   4450         opcode == SHADER_OPCODE_TXL_LOGICAL) &&
   4451        srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE) {
   4452       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
   4453    }
   4454 
   4455    if (instr->op == nir_texop_tg4) {
   4456       if (instr->component == 1 &&
   4457           key_tex->gather_channel_quirk_mask & (1 << texture)) {
   4458          /* gather4 sampler is broken for green channel on RG32F --
   4459           * we must ask for blue instead.
   4460           */
   4461          header_bits |= 2 << 16;
   4462       } else {
   4463          header_bits |= instr->component << 16;
   4464       }
   4465    }
   4466 
   4467    fs_reg dst = bld.vgrf(brw_type_for_nir_type(instr->dest_type), 4);
   4468    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
   4469    inst->offset = header_bits;
   4470 
   4471    const unsigned dest_size = nir_tex_instr_dest_size(instr);
   4472    if (devinfo->gen >= 9 &&
   4473        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
   4474       unsigned write_mask = instr->dest.is_ssa ?
   4475                             nir_ssa_def_components_read(&instr->dest.ssa):
   4476                             (1 << dest_size) - 1;
   4477       assert(write_mask != 0); /* dead code should have been eliminated */
   4478       inst->size_written = util_last_bit(write_mask) *
   4479                            inst->dst.component_size(inst->exec_size);
   4480    } else {
   4481       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
   4482    }
   4483 
   4484    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
   4485       inst->shadow_compare = true;
   4486 
   4487    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
   4488       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
   4489 
   4490    fs_reg nir_dest[4];
   4491    for (unsigned i = 0; i < dest_size; i++)
   4492       nir_dest[i] = offset(dst, bld, i);
   4493 
   4494    if (instr->op == nir_texop_query_levels) {
   4495       /* # levels is in .w */
   4496       nir_dest[0] = offset(dst, bld, 3);
   4497    } else if (instr->op == nir_texop_txs &&
   4498               dest_size >= 3 && devinfo->gen < 7) {
   4499       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
   4500       fs_reg depth = offset(dst, bld, 2);
   4501       nir_dest[2] = vgrf(glsl_type::int_type);
   4502       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
   4503    }
   4504 
   4505    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
   4506 }
   4507 
   4508 void
   4509 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
   4510 {
   4511    switch (instr->type) {
   4512    case nir_jump_break:
   4513       bld.emit(BRW_OPCODE_BREAK);
   4514       break;
   4515    case nir_jump_continue:
   4516       bld.emit(BRW_OPCODE_CONTINUE);
   4517       break;
   4518    case nir_jump_return:
   4519    default:
   4520       unreachable("unknown jump");
   4521    }
   4522 }
   4523 
   4524 /**
   4525  * This helper takes the result of a load operation that reads 32-bit elements
   4526  * in this format:
   4527  *
   4528  * x x x x x x x x
   4529  * y y y y y y y y
   4530  * z z z z z z z z
   4531  * w w w w w w w w
   4532  *
   4533  * and shuffles the data to get this:
   4534  *
   4535  * x y x y x y x y
   4536  * x y x y x y x y
   4537  * z w z w z w z w
   4538  * z w z w z w z w
   4539  *
   4540  * Which is exactly what we want if the load is reading 64-bit components
   4541  * like doubles, where x represents the low 32 bits of the x double component
   4542  * and y represents the high 32 bits of the x double component (likewise with
   4543  * z and w for double component y). The parameter @components represents
   4544  * the number of 64-bit components present in @src. This would typically be
   4545  * 2 at most, since we can only fit 2 double elements in the result of a
   4546  * vec4 load.
   4547  *
   4548  * Notice that @dst and @src can be the same register.
   4549  */
   4550 void
   4551 shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
   4552                                         const fs_reg &dst,
   4553                                         const fs_reg &src,
   4554                                         uint32_t components)
   4555 {
   4556    assert(type_sz(src.type) == 4);
   4557    assert(type_sz(dst.type) == 8);
   4558 
   4559    /* A temporary that we will use to shuffle the 32-bit data of each
   4560     * component in the vector into valid 64-bit data. We can't write directly
   4561     * to dst because dst can be (and would usually be) the same as src
   4562     * and in that case the first MOV in the loop below would overwrite the
   4563     * data read in the second MOV.
   4564     */
   4565    fs_reg tmp = bld.vgrf(dst.type);
   4566 
   4567    for (unsigned i = 0; i < components; i++) {
   4568       const fs_reg component_i = offset(src, bld, 2 * i);
   4569 
   4570       bld.MOV(subscript(tmp, src.type, 0), component_i);
   4571       bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
   4572 
   4573       bld.MOV(offset(dst, bld, i), tmp);
   4574    }
   4575 }
   4576 
   4577 /**
   4578  * This helper does the inverse operation of
   4579  * shuffle_32bit_load_result_to_64bit_data().
   4580  *
   4581  * We need to do this when we are going to use untyped write messages that
   4582  * operate with 32-bit components in order to arrange our 64-bit data to be
   4583  * in the expected layout.
   4584  *
   4585  * Notice that callers of this function, unlike in the case of the inverse
   4586  * operation, would typically need to call this with dst and src being
   4587  * different registers, since they would otherwise corrupt the original
   4588  * 64-bit data they are about to write. Because of this the function checks
   4589  * that the src and dst regions involved in the operation do not overlap.
   4590  */
   4591 void
   4592 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
   4593                                    const fs_reg &dst,
   4594                                    const fs_reg &src,
   4595                                    uint32_t components)
   4596 {
   4597    assert(type_sz(src.type) == 8);
   4598    assert(type_sz(dst.type) == 4);
   4599 
   4600    assert(!regions_overlap(
   4601              dst, 2 * components * dst.component_size(bld.dispatch_width()),
   4602              src, components * src.component_size(bld.dispatch_width())));
   4603 
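           /* Split each 64-bit component of src into its low and high dwords
            * and write them to consecutive 32-bit components of dst.
            */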
   4604    for (unsigned i = 0; i < components; i++) {
   4605       const fs_reg component_i = offset(src, bld, i);
   4606       bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
   4607       bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
   4608    }
   4609 }
   4610 
   4611 fs_reg
   4612 setup_imm_df(const fs_builder &bld, double v)
   4613 {
   4614    const struct gen_device_info *devinfo = bld.shader->devinfo;
   4615    assert(devinfo->gen >= 7);
   4616 
   4617    if (devinfo->gen >= 8)
   4618       return brw_imm_df(v);
   4619 
   4620    /* gen7.5 does not support DF immediates directly, but the DIM
   4621     * instruction allows us to set a 64-bit immediate value.
   4622     */
   4623    if (devinfo->is_haswell) {
   4624       const fs_builder ubld = bld.exec_all().group(1, 0);
   4625       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
   4626       ubld.DIM(dst, brw_imm_df(v));
   4627       return component(dst, 0);
   4628    }
   4629 
   4630    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
   4631     * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
   4632     * the high 32 bits to suboffset 4, and then applying a stride of 0.
   4633     *
   4634     * Alternatively, we could produce a normal VGRF (without stride 0)
   4635     * by writing to all of its channels, but that would hit the gen7
   4636     * restriction where writes spanning more than one register must be split
   4637     * into instructions with a width of 4 (otherwise the write to the second
   4638     * register runs into an execmask hardware bug), which isn't very
   4639     * nice.
   4640     */
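           /* For example, v = 1.0 (0x3ff0000000000000) gives i1 = 0x00000000
            * and i2 = 0x3ff00000 on a little-endian host.
            */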
   4641    union {
   4642       double d;
   4643       struct {
   4644          uint32_t i1;
   4645          uint32_t i2;
   4646       };
   4647    } di;
   4648 
   4649    di.d = v;
   4650 
   4651    const fs_builder ubld = bld.exec_all().group(1, 0);
   4652    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   4653    ubld.MOV(tmp, brw_imm_ud(di.i1));
   4654    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
   4655 
   4656    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
   4657 }
   4658