Home | History | Annotate | Download | only in compiler
      1 /*
      2  * Copyright  2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 /** @file brw_fs.cpp
     25  *
     26  * This file drives the GLSL IR -> LIR translation, contains the
     27  * optimizations on the LIR, and drives the generation of native code
     28  * from the LIR.
     29  */
     30 
     31 #include "main/macros.h"
     32 #include "brw_eu.h"
     33 #include "brw_fs.h"
     34 #include "brw_nir.h"
     35 #include "brw_vec4_gs_visitor.h"
     36 #include "brw_cfg.h"
     37 #include "brw_dead_control_flow.h"
     38 #include "common/gen_debug.h"
     39 #include "compiler/glsl_types.h"
     40 #include "compiler/nir/nir_builder.h"
     41 #include "program/prog_parameter.h"
     42 
     43 using namespace brw;
     44 
     45 static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
     46                                        const fs_inst *inst);
     47 
     48 void
     49 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
     50               const fs_reg *src, unsigned sources)
     51 {
     52    memset(this, 0, sizeof(*this));
     53 
     54    this->src = new fs_reg[MAX2(sources, 3)];
     55    for (unsigned i = 0; i < sources; i++)
     56       this->src[i] = src[i];
     57 
     58    this->opcode = opcode;
     59    this->dst = dst;
     60    this->sources = sources;
     61    this->exec_size = exec_size;
     62    this->base_mrf = -1;
     63 
     64    assert(dst.file != IMM && dst.file != UNIFORM);
     65 
     66    assert(this->exec_size != 0);
     67 
     68    this->conditional_mod = BRW_CONDITIONAL_NONE;
     69 
     70    /* This will be the case for almost all instructions. */
     71    switch (dst.file) {
     72    case VGRF:
     73    case ARF:
     74    case FIXED_GRF:
     75    case MRF:
     76    case ATTR:
     77       this->size_written = dst.component_size(exec_size);
     78       break;
     79    case BAD_FILE:
     80       this->size_written = 0;
     81       break;
     82    case IMM:
     83    case UNIFORM:
     84       unreachable("Invalid destination register file");
     85    }
     86 
     87    this->writes_accumulator = false;
     88 }
     89 
     90 fs_inst::fs_inst()
     91 {
     92    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
     93 }
     94 
     95 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
     96 {
     97    init(opcode, exec_size, reg_undef, NULL, 0);
     98 }
     99 
    100 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
    101 {
    102    init(opcode, exec_size, dst, NULL, 0);
    103 }
    104 
    105 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    106                  const fs_reg &src0)
    107 {
    108    const fs_reg src[1] = { src0 };
    109    init(opcode, exec_size, dst, src, 1);
    110 }
    111 
    112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    113                  const fs_reg &src0, const fs_reg &src1)
    114 {
    115    const fs_reg src[2] = { src0, src1 };
    116    init(opcode, exec_size, dst, src, 2);
    117 }
    118 
    119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    120                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
    121 {
    122    const fs_reg src[3] = { src0, src1, src2 };
    123    init(opcode, exec_size, dst, src, 3);
    124 }
    125 
    126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
    127                  const fs_reg src[], unsigned sources)
    128 {
    129    init(opcode, exec_width, dst, src, sources);
    130 }
    131 
    132 fs_inst::fs_inst(const fs_inst &that)
    133 {
    134    memcpy(this, &that, sizeof(that));
    135 
    136    this->src = new fs_reg[MAX2(that.sources, 3)];
    137 
    138    for (unsigned i = 0; i < that.sources; i++)
    139       this->src[i] = that.src[i];
    140 }
    141 
    142 fs_inst::~fs_inst()
    143 {
    144    delete[] this->src;
    145 }
    146 
    147 void
    148 fs_inst::resize_sources(uint8_t num_sources)
    149 {
    150    if (this->sources != num_sources) {
    151       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
    152 
    153       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
    154          src[i] = this->src[i];
    155 
    156       delete[] this->src;
    157       this->src = src;
    158       this->sources = num_sources;
    159    }
    160 }
    161 
    162 void
    163 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    164                                        const fs_reg &dst,
    165                                        const fs_reg &surf_index,
    166                                        const fs_reg &varying_offset,
    167                                        uint32_t const_offset)
    168 {
    169    /* We have our constant surface use a pitch of 4 bytes, so our index can
    170     * be any component of a vector, and then we load 4 contiguous
    171     * components starting from that.
    172     *
    173     * We break down the const_offset to a portion added to the variable offset
    174     * and a portion done using fs_reg::offset, which means that if you have
    175     * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
    176     * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
    177     * later notice that those loads are all the same and eliminate the
    178     * redundant ones.
    179     */
    180    fs_reg vec4_offset = vgrf(glsl_type::uint_type);
    181    bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
    182 
    183    /* The pull load message will load a vec4 (16 bytes). If we are loading
    184     * a double this means we are only loading 2 elements worth of data.
    185     * We also want to use a 32-bit data type for the dst of the load operation
    186     * so other parts of the driver don't get confused about the size of the
    187     * result.
    188     */
    189    fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
    190    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
    191                             vec4_result, surf_index, vec4_offset);
    192    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
    193 
    194    fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
    195    switch (type_sz(dst.type)) {
    196    case 2:
    197       shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1);
    198       bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
    199       break;
    200    case 4:
    201       bld.MOV(dst, retype(dw, dst.type));
    202       break;
    203    case 8:
    204       shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
    205       break;
    206    default:
    207       unreachable("Unsupported bit_size");
    208    }
    209 }
    210 
    211 /**
    212  * A helper for MOV generation for fixing up broken hardware SEND dependency
    213  * handling.
    214  */
    215 void
    216 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
    217 {
    218    /* The caller always wants uncompressed to emit the minimal extra
    219     * dependencies, and to avoid having to deal with aligning its regs to 2.
    220     */
    221    const fs_builder ubld = bld.annotate("send dependency resolve")
    222                               .half(0);
    223 
    224    ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
    225 }
    226 
    227 bool
    228 fs_inst::equals(fs_inst *inst) const
    229 {
    230    return (opcode == inst->opcode &&
    231            dst.equals(inst->dst) &&
    232            src[0].equals(inst->src[0]) &&
    233            src[1].equals(inst->src[1]) &&
    234            src[2].equals(inst->src[2]) &&
    235            saturate == inst->saturate &&
    236            predicate == inst->predicate &&
    237            conditional_mod == inst->conditional_mod &&
    238            mlen == inst->mlen &&
    239            base_mrf == inst->base_mrf &&
    240            target == inst->target &&
    241            eot == inst->eot &&
    242            header_size == inst->header_size &&
    243            shadow_compare == inst->shadow_compare &&
    244            exec_size == inst->exec_size &&
    245            offset == inst->offset);
    246 }
    247 
    248 bool
    249 fs_inst::is_send_from_grf() const
    250 {
    251    switch (opcode) {
    252    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
    253    case SHADER_OPCODE_SHADER_TIME_ADD:
    254    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    255    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
    256    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
    257    case SHADER_OPCODE_UNTYPED_ATOMIC:
    258    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    259    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
    260    case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
    261    case SHADER_OPCODE_BYTE_SCATTERED_READ:
    262    case SHADER_OPCODE_TYPED_ATOMIC:
    263    case SHADER_OPCODE_TYPED_SURFACE_READ:
    264    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    265    case SHADER_OPCODE_URB_WRITE_SIMD8:
    266    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    267    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    268    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    269    case SHADER_OPCODE_URB_READ_SIMD8:
    270    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
    271       return true;
    272    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    273       return src[1].file == VGRF;
    274    case FS_OPCODE_FB_WRITE:
    275    case FS_OPCODE_FB_READ:
    276       return src[0].file == VGRF;
    277    default:
    278       if (is_tex())
    279          return src[0].file == VGRF;
    280 
    281       return false;
    282    }
    283 }
    284 
    285 /**
    286  * Returns true if this instruction's sources and destinations cannot
    287  * safely be the same register.
    288  *
    289  * In most cases, a register can be written over safely by the same
    290  * instruction that is its last use.  For a single instruction, the
    291  * sources are dereferenced before writing of the destination starts
    292  * (naturally).
    293  *
    294  * However, there are a few cases where this can be problematic:
    295  *
    296  * - Virtual opcodes that translate to multiple instructions in the
    297  *   code generator: if src == dst and one instruction writes the
    298  *   destination before a later instruction reads the source, then
    299  *   src will have been clobbered.
    300  *
    301  * - SIMD16 compressed instructions with certain regioning (see below).
    302  *
    303  * The register allocator uses this information to set up conflicts between
    304  * GRF sources and the destination.
    305  */
    306 bool
    307 fs_inst::has_source_and_destination_hazard() const
    308 {
    309    switch (opcode) {
    310    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
    311       /* Multiple partial writes to the destination */
    312       return true;
    313    default:
    314       /* The SIMD16 compressed instruction
    315        *
    316        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
    317        *
    318        * is actually decoded in hardware as:
    319        *
    320        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
    321        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
    322        *
    323        * Which is safe.  However, if we have uniform accesses
    324        * happening, we get into trouble:
    325        *
    326        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
    327        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
    328        *
    329        * Now our destination for the first instruction overwrote the
    330        * second instruction's src0, and we get garbage for those 8
    331        * pixels.  There's a similar issue for the pre-gen6
    332        * pixel_x/pixel_y, which are registers of 16-bit values and thus
    333        * would get stomped by the first decode as well.
    334        */
    335       if (exec_size == 16) {
    336          for (int i = 0; i < sources; i++) {
    337             if (src[i].file == VGRF && (src[i].stride == 0 ||
    338                                         src[i].type == BRW_REGISTER_TYPE_UW ||
    339                                         src[i].type == BRW_REGISTER_TYPE_W ||
    340                                         src[i].type == BRW_REGISTER_TYPE_UB ||
    341                                         src[i].type == BRW_REGISTER_TYPE_B)) {
    342                return true;
    343             }
    344          }
    345       }
    346       return false;
    347    }
    348 }
    349 
    350 bool
    351 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
    352 {
    353    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
    354       return false;
    355 
    356    fs_reg reg = this->src[0];
    357    if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1)
    358       return false;
    359 
    360    if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
    361       return false;
    362 
    363    for (int i = 0; i < this->sources; i++) {
    364       reg.type = this->src[i].type;
    365       if (!this->src[i].equals(reg))
    366          return false;
    367 
    368       if (i < this->header_size) {
    369          reg.offset += REG_SIZE;
    370       } else {
    371          reg = horiz_offset(reg, this->exec_size);
    372       }
    373    }
    374 
    375    return true;
    376 }
    377 
    378 bool
    379 fs_inst::can_do_source_mods(const struct gen_device_info *devinfo)
    380 {
    381    if (devinfo->gen == 6 && is_math())
    382       return false;
    383 
    384    if (is_send_from_grf())
    385       return false;
    386 
    387    if (!backend_instruction::can_do_source_mods())
    388       return false;
    389 
    390    return true;
    391 }
    392 
    393 bool
    394 fs_inst::can_change_types() const
    395 {
    396    return dst.type == src[0].type &&
    397           !src[0].abs && !src[0].negate && !saturate &&
    398           (opcode == BRW_OPCODE_MOV ||
    399            (opcode == BRW_OPCODE_SEL &&
    400             dst.type == src[1].type &&
    401             predicate != BRW_PREDICATE_NONE &&
    402             !src[1].abs && !src[1].negate));
    403 }
    404 
    405 void
    406 fs_reg::init()
    407 {
    408    memset(this, 0, sizeof(*this));
    409    type = BRW_REGISTER_TYPE_UD;
    410    stride = 1;
    411 }
    412 
    413 /** Generic unset register constructor. */
    414 fs_reg::fs_reg()
    415 {
    416    init();
    417    this->file = BAD_FILE;
    418 }
    419 
    420 fs_reg::fs_reg(struct ::brw_reg reg) :
    421    backend_reg(reg)
    422 {
    423    this->offset = 0;
    424    this->stride = 1;
    425    if (this->file == IMM &&
    426        (this->type != BRW_REGISTER_TYPE_V &&
    427         this->type != BRW_REGISTER_TYPE_UV &&
    428         this->type != BRW_REGISTER_TYPE_VF)) {
    429       this->stride = 0;
    430    }
    431 }
    432 
    433 bool
    434 fs_reg::equals(const fs_reg &r) const
    435 {
    436    return (this->backend_reg::equals(r) &&
    437            stride == r.stride);
    438 }
    439 
    440 bool
    441 fs_reg::is_contiguous() const
    442 {
    443    return stride == 1;
    444 }
    445 
    446 unsigned
    447 fs_reg::component_size(unsigned width) const
    448 {
    449    const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
    450                             hstride == 0 ? 0 :
    451                             1 << (hstride - 1));
    452    return MAX2(width * stride, 1) * type_sz(type);
    453 }
    454 
    455 extern "C" int
    456 type_size_scalar(const struct glsl_type *type)
    457 {
    458    unsigned int size, i;
    459 
    460    switch (type->base_type) {
    461    case GLSL_TYPE_UINT:
    462    case GLSL_TYPE_INT:
    463    case GLSL_TYPE_FLOAT:
    464    case GLSL_TYPE_BOOL:
    465       return type->components();
    466    case GLSL_TYPE_UINT16:
    467    case GLSL_TYPE_INT16:
    468    case GLSL_TYPE_FLOAT16:
    469       return DIV_ROUND_UP(type->components(), 2);
    470    case GLSL_TYPE_DOUBLE:
    471    case GLSL_TYPE_UINT64:
    472    case GLSL_TYPE_INT64:
    473       return type->components() * 2;
    474    case GLSL_TYPE_ARRAY:
    475       return type_size_scalar(type->fields.array) * type->length;
    476    case GLSL_TYPE_STRUCT:
    477       size = 0;
    478       for (i = 0; i < type->length; i++) {
    479 	 size += type_size_scalar(type->fields.structure[i].type);
    480       }
    481       return size;
    482    case GLSL_TYPE_SAMPLER:
    483       /* Samplers take up no register space, since they're baked in at
    484        * link time.
    485        */
    486       return 0;
    487    case GLSL_TYPE_ATOMIC_UINT:
    488       return 0;
    489    case GLSL_TYPE_SUBROUTINE:
    490       return 1;
    491    case GLSL_TYPE_IMAGE:
    492       return BRW_IMAGE_PARAM_SIZE;
    493    case GLSL_TYPE_VOID:
    494    case GLSL_TYPE_ERROR:
    495    case GLSL_TYPE_INTERFACE:
    496    case GLSL_TYPE_FUNCTION:
    497       unreachable("not reached");
    498    }
    499 
    500    return 0;
    501 }
    502 
    503 /**
    504  * Create a MOV to read the timestamp register.
    505  *
    506  * The caller is responsible for emitting the MOV.  The return value is
    507  * the destination of the MOV, with extra parameters set.
    508  */
    509 fs_reg
    510 fs_visitor::get_timestamp(const fs_builder &bld)
    511 {
    512    assert(devinfo->gen >= 7);
    513 
    514    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
    515                                           BRW_ARF_TIMESTAMP,
    516                                           0),
    517                              BRW_REGISTER_TYPE_UD));
    518 
    519    fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    520 
    521    /* We want to read the 3 fields we care about even if it's not enabled in
    522     * the dispatch.
    523     */
    524    bld.group(4, 0).exec_all().MOV(dst, ts);
    525 
    526    return dst;
    527 }
    528 
    529 void
    530 fs_visitor::emit_shader_time_begin()
    531 {
    532    /* We want only the low 32 bits of the timestamp.  Since it's running
    533     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    534     * which is plenty of time for our purposes.  It is identical across the
    535     * EUs, but since it's tracking GPU core speed it will increment at a
    536     * varying rate as render P-states change.
    537     */
    538    shader_start_time = component(
    539       get_timestamp(bld.annotate("shader time start")), 0);
    540 }
    541 
    542 void
    543 fs_visitor::emit_shader_time_end()
    544 {
    545    /* Insert our code just before the final SEND with EOT. */
    546    exec_node *end = this->instructions.get_tail();
    547    assert(end && ((fs_inst *) end)->eot);
    548    const fs_builder ibld = bld.annotate("shader time end")
    549                               .exec_all().at(NULL, end);
    550    const fs_reg timestamp = get_timestamp(ibld);
    551 
    552    /* We only use the low 32 bits of the timestamp - see
    553     * emit_shader_time_begin()).
    554     *
    555     * We could also check if render P-states have changed (or anything
    556     * else that might disrupt timing) by setting smear to 2 and checking if
    557     * that field is != 0.
    558     */
    559    const fs_reg shader_end_time = component(timestamp, 0);
    560 
    561    /* Check that there weren't any timestamp reset events (assuming these
    562     * were the only two timestamp reads that happened).
    563     */
    564    const fs_reg reset = component(timestamp, 2);
    565    set_condmod(BRW_CONDITIONAL_Z,
    566                ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
    567    ibld.IF(BRW_PREDICATE_NORMAL);
    568 
    569    fs_reg start = shader_start_time;
    570    start.negate = true;
    571    const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
    572                                         BRW_REGISTER_TYPE_UD),
    573                                  0);
    574    const fs_builder cbld = ibld.group(1, 0);
    575    cbld.group(1, 0).ADD(diff, start, shader_end_time);
    576 
    577    /* If there were no instructions between the two timestamp gets, the diff
    578     * is 2 cycles.  Remove that overhead, so I can forget about that when
    579     * trying to determine the time taken for single instructions.
    580     */
    581    cbld.ADD(diff, diff, brw_imm_ud(-2u));
    582    SHADER_TIME_ADD(cbld, 0, diff);
    583    SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
    584    ibld.emit(BRW_OPCODE_ELSE);
    585    SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
    586    ibld.emit(BRW_OPCODE_ENDIF);
    587 }
    588 
    589 void
    590 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
    591                             int shader_time_subindex,
    592                             fs_reg value)
    593 {
    594    int index = shader_time_index * 3 + shader_time_subindex;
    595    struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);
    596 
    597    fs_reg payload;
    598    if (dispatch_width == 8)
    599       payload = vgrf(glsl_type::uvec2_type);
    600    else
    601       payload = vgrf(glsl_type::uint_type);
    602 
    603    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
    604 }
    605 
    606 void
    607 fs_visitor::vfail(const char *format, va_list va)
    608 {
    609    char *msg;
    610 
    611    if (failed)
    612       return;
    613 
    614    failed = true;
    615 
    616    msg = ralloc_vasprintf(mem_ctx, format, va);
    617    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
    618 
    619    this->fail_msg = msg;
    620 
    621    if (debug_enabled) {
    622       fprintf(stderr, "%s",  msg);
    623    }
    624 }
    625 
    626 void
    627 fs_visitor::fail(const char *format, ...)
    628 {
    629    va_list va;
    630 
    631    va_start(va, format);
    632    vfail(format, va);
    633    va_end(va);
    634 }
    635 
    636 /**
    637  * Mark this program as impossible to compile with dispatch width greater
    638  * than n.
    639  *
    640  * During the SIMD8 compile (which happens first), we can detect and flag
    641  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
    642  * SIMD16+ compile altogether.
    643  *
    644  * During a compile of dispatch width greater than n (if one happens anyway),
    645  * this just calls fail().
    646  */
    647 void
    648 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
    649 {
    650    if (dispatch_width > n) {
    651       fail("%s", msg);
    652    } else {
    653       max_dispatch_width = n;
    654       compiler->shader_perf_log(log_data,
    655                                 "Shader dispatch width limited to SIMD%d: %s",
    656                                 n, msg);
    657    }
    658 }
    659 
    660 /**
    661  * Returns true if the instruction has a flag that means it won't
    662  * update an entire destination register.
    663  *
    664  * For example, dead code elimination and live variable analysis want to know
    665  * when a write to a variable screens off any preceding values that were in
    666  * it.
    667  */
    668 bool
    669 fs_inst::is_partial_write() const
    670 {
    671    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
    672            (this->exec_size * type_sz(this->dst.type)) < 32 ||
    673            !this->dst.is_contiguous() ||
    674            this->dst.offset % REG_SIZE != 0);
    675 }
    676 
    677 unsigned
    678 fs_inst::components_read(unsigned i) const
    679 {
    680    /* Return zero if the source is not present. */
    681    if (src[i].file == BAD_FILE)
    682       return 0;
    683 
    684    switch (opcode) {
    685    case FS_OPCODE_LINTERP:
    686       if (i == 0)
    687          return 2;
    688       else
    689          return 1;
    690 
    691    case FS_OPCODE_PIXEL_X:
    692    case FS_OPCODE_PIXEL_Y:
    693       assert(i == 0);
    694       return 2;
    695 
    696    case FS_OPCODE_FB_WRITE_LOGICAL:
    697       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
    698       /* First/second FB write color. */
    699       if (i < 2)
    700          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
    701       else
    702          return 1;
    703 
    704    case SHADER_OPCODE_TEX_LOGICAL:
    705    case SHADER_OPCODE_TXD_LOGICAL:
    706    case SHADER_OPCODE_TXF_LOGICAL:
    707    case SHADER_OPCODE_TXL_LOGICAL:
    708    case SHADER_OPCODE_TXS_LOGICAL:
    709    case FS_OPCODE_TXB_LOGICAL:
    710    case SHADER_OPCODE_TXF_CMS_LOGICAL:
    711    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
    712    case SHADER_OPCODE_TXF_UMS_LOGICAL:
    713    case SHADER_OPCODE_TXF_MCS_LOGICAL:
    714    case SHADER_OPCODE_LOD_LOGICAL:
    715    case SHADER_OPCODE_TG4_LOGICAL:
    716    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
    717    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
    718       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
    719              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
    720       /* Texture coordinates. */
    721       if (i == TEX_LOGICAL_SRC_COORDINATE)
    722          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
    723       /* Texture derivatives. */
    724       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
    725                opcode == SHADER_OPCODE_TXD_LOGICAL)
    726          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
    727       /* Texture offset. */
    728       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
    729          return 2;
    730       /* MCS */
    731       else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
    732          return 2;
    733       else
    734          return 1;
    735 
    736    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
    737    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
    738       assert(src[3].file == IMM);
    739       /* Surface coordinates. */
    740       if (i == 0)
    741          return src[3].ud;
    742       /* Surface operation source (ignored for reads). */
    743       else if (i == 1)
    744          return 0;
    745       else
    746          return 1;
    747 
    748    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
    749    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    750       assert(src[3].file == IMM &&
    751              src[4].file == IMM);
    752       /* Surface coordinates. */
    753       if (i == 0)
    754          return src[3].ud;
    755       /* Surface operation source. */
    756       else if (i == 1)
    757          return src[4].ud;
    758       else
    759          return 1;
    760 
    761    case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
    762       /* Scattered logical opcodes use the following params:
    763        * src[0] Surface coordinates
    764        * src[1] Surface operation source (ignored for reads)
    765        * src[2] Surface
    766        * src[3] IMM with always 1 dimension.
    767        * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
    768        */
    769       assert(src[3].file == IMM &&
    770              src[4].file == IMM);
    771       return i == 1 ? 0 : 1;
    772 
    773    case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
    774       assert(src[3].file == IMM &&
    775              src[4].file == IMM);
    776       return 1;
    777 
    778    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
    779    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
    780       assert(src[3].file == IMM &&
    781              src[4].file == IMM);
    782       const unsigned op = src[4].ud;
    783       /* Surface coordinates. */
    784       if (i == 0)
    785          return src[3].ud;
    786       /* Surface operation source. */
    787       else if (i == 1 && op == BRW_AOP_CMPWR)
    788          return 2;
    789       else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
    790                           op == BRW_AOP_PREDEC))
    791          return 0;
    792       else
    793          return 1;
    794    }
    795 
    796    default:
    797       return 1;
    798    }
    799 }
    800 
    801 unsigned
    802 fs_inst::size_read(int arg) const
    803 {
    804    switch (opcode) {
    805    case FS_OPCODE_FB_WRITE:
    806    case FS_OPCODE_FB_READ:
    807    case SHADER_OPCODE_URB_WRITE_SIMD8:
    808    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    809    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    810    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    811    case SHADER_OPCODE_URB_READ_SIMD8:
    812    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
    813    case SHADER_OPCODE_UNTYPED_ATOMIC:
    814    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    815    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
    816    case SHADER_OPCODE_TYPED_ATOMIC:
    817    case SHADER_OPCODE_TYPED_SURFACE_READ:
    818    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    819    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    820    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
    821    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
    822    case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
    823    case SHADER_OPCODE_BYTE_SCATTERED_READ:
    824       if (arg == 0)
    825          return mlen * REG_SIZE;
    826       break;
    827 
    828    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
    829       /* The payload is actually stored in src1 */
    830       if (arg == 1)
    831          return mlen * REG_SIZE;
    832       break;
    833 
    834    case FS_OPCODE_LINTERP:
    835       if (arg == 1)
    836          return 16;
    837       break;
    838 
    839    case SHADER_OPCODE_LOAD_PAYLOAD:
    840       if (arg < this->header_size)
    841          return REG_SIZE;
    842       break;
    843 
    844    case CS_OPCODE_CS_TERMINATE:
    845    case SHADER_OPCODE_BARRIER:
    846       return REG_SIZE;
    847 
    848    case SHADER_OPCODE_MOV_INDIRECT:
    849       if (arg == 0) {
    850          assert(src[2].file == IMM);
    851          return src[2].ud;
    852       }
    853       break;
    854 
    855    default:
    856       if (is_tex() && arg == 0 && src[0].file == VGRF)
    857          return mlen * REG_SIZE;
    858       break;
    859    }
    860 
    861    switch (src[arg].file) {
    862    case UNIFORM:
    863    case IMM:
    864       return components_read(arg) * type_sz(src[arg].type);
    865    case BAD_FILE:
    866    case ARF:
    867    case FIXED_GRF:
    868    case VGRF:
    869    case ATTR:
    870       return components_read(arg) * src[arg].component_size(exec_size);
    871    case MRF:
    872       unreachable("MRF registers are not allowed as sources");
    873    }
    874    return 0;
    875 }
    876 
    877 namespace {
    878    /* Return the subset of flag registers that an instruction could
    879     * potentially read or write based on the execution controls and flag
    880     * subregister number of the instruction.
    881     */
    882    unsigned
    883    flag_mask(const fs_inst *inst)
    884    {
    885       const unsigned start = inst->flag_subreg * 16 + inst->group;
    886       const unsigned end = start + inst->exec_size;
    887       return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
    888    }
    889 
    890    unsigned
    891    bit_mask(unsigned n)
    892    {
    893       return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
    894    }
    895 
    896    unsigned
    897    flag_mask(const fs_reg &r, unsigned sz)
    898    {
    899       if (r.file == ARF) {
    900          const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
    901          const unsigned end = start + sz;
    902          return bit_mask(end) & ~bit_mask(start);
    903       } else {
    904          return 0;
    905       }
    906    }
    907 }
    908 
    909 unsigned
    910 fs_inst::flags_read(const gen_device_info *devinfo) const
    911 {
    912    if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
    913        predicate == BRW_PREDICATE_ALIGN1_ALLV) {
    914       /* The vertical predication modes combine corresponding bits from
    915        * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
    916        */
    917       const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
    918       return flag_mask(this) << shift | flag_mask(this);
    919    } else if (predicate) {
    920       return flag_mask(this);
    921    } else {
    922       unsigned mask = 0;
    923       for (int i = 0; i < sources; i++) {
    924          mask |= flag_mask(src[i], size_read(i));
    925       }
    926       return mask;
    927    }
    928 }
    929 
    930 unsigned
    931 fs_inst::flags_written() const
    932 {
    933    if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
    934                             opcode != BRW_OPCODE_IF &&
    935                             opcode != BRW_OPCODE_WHILE)) ||
    936        opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
    937       return flag_mask(this);
    938    } else {
    939       return flag_mask(dst, size_written);
    940    }
    941 }
    942 
    943 /**
    944  * Returns how many MRFs an FS opcode will write over.
    945  *
    946  * Note that this is not the 0 or 1 implied writes in an actual gen
    947  * instruction -- the FS opcodes often generate MOVs in addition.
    948  */
    949 int
    950 fs_visitor::implied_mrf_writes(fs_inst *inst) const
    951 {
    952    if (inst->mlen == 0)
    953       return 0;
    954 
    955    if (inst->base_mrf == -1)
    956       return 0;
    957 
    958    switch (inst->opcode) {
    959    case SHADER_OPCODE_RCP:
    960    case SHADER_OPCODE_RSQ:
    961    case SHADER_OPCODE_SQRT:
    962    case SHADER_OPCODE_EXP2:
    963    case SHADER_OPCODE_LOG2:
    964    case SHADER_OPCODE_SIN:
    965    case SHADER_OPCODE_COS:
    966       return 1 * dispatch_width / 8;
    967    case SHADER_OPCODE_POW:
    968    case SHADER_OPCODE_INT_QUOTIENT:
    969    case SHADER_OPCODE_INT_REMAINDER:
    970       return 2 * dispatch_width / 8;
    971    case SHADER_OPCODE_TEX:
    972    case FS_OPCODE_TXB:
    973    case SHADER_OPCODE_TXD:
    974    case SHADER_OPCODE_TXF:
    975    case SHADER_OPCODE_TXF_CMS:
    976    case SHADER_OPCODE_TXF_MCS:
    977    case SHADER_OPCODE_TG4:
    978    case SHADER_OPCODE_TG4_OFFSET:
    979    case SHADER_OPCODE_TXL:
    980    case SHADER_OPCODE_TXS:
    981    case SHADER_OPCODE_LOD:
    982    case SHADER_OPCODE_SAMPLEINFO:
    983       return 1;
    984    case FS_OPCODE_FB_WRITE:
    985       return 2;
    986    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    987    case SHADER_OPCODE_GEN4_SCRATCH_READ:
    988       return 1;
    989    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
    990       return inst->mlen;
    991    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
    992       return inst->mlen;
    993    default:
    994       unreachable("not reached");
    995    }
    996 }
    997 
    998 fs_reg
    999 fs_visitor::vgrf(const glsl_type *const type)
   1000 {
   1001    int reg_width = dispatch_width / 8;
   1002    return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
   1003                  brw_type_for_base_type(type));
   1004 }
   1005 
   1006 fs_reg::fs_reg(enum brw_reg_file file, int nr)
   1007 {
   1008    init();
   1009    this->file = file;
   1010    this->nr = nr;
   1011    this->type = BRW_REGISTER_TYPE_F;
   1012    this->stride = (file == UNIFORM ? 0 : 1);
   1013 }
   1014 
   1015 fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
   1016 {
   1017    init();
   1018    this->file = file;
   1019    this->nr = nr;
   1020    this->type = type;
   1021    this->stride = (file == UNIFORM ? 0 : 1);
   1022 }
   1023 
   1024 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
   1025  * This brings in those uniform definitions
   1026  */
   1027 void
   1028 fs_visitor::import_uniforms(fs_visitor *v)
   1029 {
   1030    this->push_constant_loc = v->push_constant_loc;
   1031    this->pull_constant_loc = v->pull_constant_loc;
   1032    this->uniforms = v->uniforms;
   1033    this->subgroup_id = v->subgroup_id;
   1034 }
   1035 
   1036 void
   1037 fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
   1038 {
   1039    assert(stage == MESA_SHADER_FRAGMENT);
   1040 
   1041    /* gl_FragCoord.x */
   1042    bld.MOV(wpos, this->pixel_x);
   1043    wpos = offset(wpos, bld, 1);
   1044 
   1045    /* gl_FragCoord.y */
   1046    bld.MOV(wpos, this->pixel_y);
   1047    wpos = offset(wpos, bld, 1);
   1048 
   1049    /* gl_FragCoord.z */
   1050    if (devinfo->gen >= 6) {
   1051       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
   1052    } else {
   1053       bld.emit(FS_OPCODE_LINTERP, wpos,
   1054            this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
   1055            interp_reg(VARYING_SLOT_POS, 2));
   1056    }
   1057    wpos = offset(wpos, bld, 1);
   1058 
   1059    /* gl_FragCoord.w: Already set up in emit_interpolation */
   1060    bld.MOV(wpos, this->wpos_w);
   1061 }
   1062 
   1063 enum brw_barycentric_mode
   1064 brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
   1065 {
   1066    /* Barycentric modes don't make sense for flat inputs. */
   1067    assert(mode != INTERP_MODE_FLAT);
   1068 
   1069    unsigned bary;
   1070    switch (op) {
   1071    case nir_intrinsic_load_barycentric_pixel:
   1072    case nir_intrinsic_load_barycentric_at_offset:
   1073       bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
   1074       break;
   1075    case nir_intrinsic_load_barycentric_centroid:
   1076       bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
   1077       break;
   1078    case nir_intrinsic_load_barycentric_sample:
   1079    case nir_intrinsic_load_barycentric_at_sample:
   1080       bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
   1081       break;
   1082    default:
   1083       unreachable("invalid intrinsic");
   1084    }
   1085 
   1086    if (mode == INTERP_MODE_NOPERSPECTIVE)
   1087       bary += 3;
   1088 
   1089    return (enum brw_barycentric_mode) bary;
   1090 }
   1091 
   1092 /**
   1093  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
   1094  */
   1095 static enum brw_barycentric_mode
   1096 centroid_to_pixel(enum brw_barycentric_mode bary)
   1097 {
   1098    assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
   1099           bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
   1100    return (enum brw_barycentric_mode) ((unsigned) bary - 1);
   1101 }
   1102 
   1103 fs_reg *
   1104 fs_visitor::emit_frontfacing_interpolation()
   1105 {
   1106    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
   1107 
   1108    if (devinfo->gen >= 6) {
   1109       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
   1110        * a boolean result from this (~0/true or 0/false).
   1111        *
   1112        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
   1113        * this task in only one instruction:
   1114        *    - a negation source modifier will flip the bit; and
   1115        *    - a W -> D type conversion will sign extend the bit into the high
   1116        *      word of the destination.
   1117        *
   1118        * An ASR 15 fills the low word of the destination.
   1119        */
   1120       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
   1121       g0.negate = true;
   1122 
   1123       bld.ASR(*reg, g0, brw_imm_d(15));
   1124    } else {
   1125       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
   1126        * a boolean result from this (1/true or 0/false).
   1127        *
   1128        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
   1129        * the negation source modifier to flip it. Unfortunately the SHR
   1130        * instruction only operates on UD (or D with an abs source modifier)
   1131        * sources without negation.
   1132        *
   1133        * Instead, use ASR (which will give ~0/true or 0/false).
   1134        */
   1135       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
   1136       g1_6.negate = true;
   1137 
   1138       bld.ASR(*reg, g1_6, brw_imm_d(31));
   1139    }
   1140 
   1141    return reg;
   1142 }
   1143 
   1144 void
   1145 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
   1146 {
   1147    assert(stage == MESA_SHADER_FRAGMENT);
   1148    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   1149    assert(dst.type == BRW_REGISTER_TYPE_F);
   1150 
   1151    if (wm_prog_data->persample_dispatch) {
   1152       /* Convert int_sample_pos to floating point */
   1153       bld.MOV(dst, int_sample_pos);
   1154       /* Scale to the range [0, 1] */
   1155       bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
   1156    }
   1157    else {
   1158       /* From ARB_sample_shading specification:
   1159        * "When rendering to a non-multisample buffer, or if multisample
   1160        *  rasterization is disabled, gl_SamplePosition will always be
   1161        *  (0.5, 0.5).
   1162        */
   1163       bld.MOV(dst, brw_imm_f(0.5f));
   1164    }
   1165 }
   1166 
   1167 fs_reg *
   1168 fs_visitor::emit_samplepos_setup()
   1169 {
   1170    assert(devinfo->gen >= 6);
   1171 
   1172    const fs_builder abld = bld.annotate("compute sample position");
   1173    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   1174    fs_reg pos = *reg;
   1175    fs_reg int_sample_x = vgrf(glsl_type::int_type);
   1176    fs_reg int_sample_y = vgrf(glsl_type::int_type);
   1177 
   1178    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
   1179     * mode will be enabled.
   1180     *
   1181     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
   1182     * R31.1:0         Position Offset X/Y for Slot[3:0]
   1183     * R31.3:2         Position Offset X/Y for Slot[7:4]
   1184     * .....
   1185     *
   1186     * The X, Y sample positions come in as bytes in  thread payload. So, read
   1187     * the positions using vstride=16, width=8, hstride=2.
   1188     */
   1189    struct brw_reg sample_pos_reg =
   1190       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
   1191                     BRW_REGISTER_TYPE_B), 16, 8, 2);
   1192 
   1193    if (dispatch_width == 8) {
   1194       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
   1195    } else {
   1196       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
   1197       abld.half(1).MOV(half(int_sample_x, 1),
   1198                        fs_reg(suboffset(sample_pos_reg, 16)));
   1199    }
   1200    /* Compute gl_SamplePosition.x */
   1201    compute_sample_position(pos, int_sample_x);
   1202    pos = offset(pos, abld, 1);
   1203    if (dispatch_width == 8) {
   1204       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
   1205    } else {
   1206       abld.half(0).MOV(half(int_sample_y, 0),
   1207                        fs_reg(suboffset(sample_pos_reg, 1)));
   1208       abld.half(1).MOV(half(int_sample_y, 1),
   1209                        fs_reg(suboffset(sample_pos_reg, 17)));
   1210    }
   1211    /* Compute gl_SamplePosition.y */
   1212    compute_sample_position(pos, int_sample_y);
   1213    return reg;
   1214 }
   1215 
   1216 fs_reg *
   1217 fs_visitor::emit_sampleid_setup()
   1218 {
   1219    assert(stage == MESA_SHADER_FRAGMENT);
   1220    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   1221    assert(devinfo->gen >= 6);
   1222 
   1223    const fs_builder abld = bld.annotate("compute sample id");
   1224    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
   1225 
   1226    if (!key->multisample_fbo) {
   1227       /* As per GL_ARB_sample_shading specification:
   1228        * "When rendering to a non-multisample buffer, or if multisample
   1229        *  rasterization is disabled, gl_SampleID will always be zero."
   1230        */
   1231       abld.MOV(*reg, brw_imm_d(0));
   1232    } else if (devinfo->gen >= 8) {
   1233       /* Sample ID comes in as 4-bit numbers in g1.0:
   1234        *
   1235        *    15:12 Slot 3 SampleID (only used in SIMD16)
   1236        *     11:8 Slot 2 SampleID (only used in SIMD16)
   1237        *      7:4 Slot 1 SampleID
   1238        *      3:0 Slot 0 SampleID
   1239        *
   1240        * Each slot corresponds to four channels, so we want to replicate each
   1241        * half-byte value to 4 channels in a row:
   1242        *
   1243        *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
   1244        *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
   1245        *
   1246        *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
   1247        *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
   1248        *
   1249        * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
   1250        * channels to read the first byte (7:0), and the second group of 8
   1251        * channels to read the second byte (15:8).  Then, we shift right by
   1252        * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
   1253        * values into place.  Finally, we AND with 0xf to keep the low nibble.
   1254        *
   1255        *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
   1256        *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
   1257        *
   1258        * TODO: These payload bits exist on Gen7 too, but they appear to always
   1259        *       be zero, so this code fails to work.  We should find out why.
   1260        */
   1261       fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
   1262 
   1263       abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
   1264                                          BRW_REGISTER_TYPE_UB), 1, 8, 0)),
   1265                     brw_imm_v(0x44440000));
   1266       abld.AND(*reg, tmp, brw_imm_w(0xf));
   1267    } else {
   1268       const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
   1269                                          BRW_REGISTER_TYPE_UD), 0);
   1270       const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
   1271 
   1272       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
   1273        * 8x multisampling, subspan 0 will represent sample N (where N
   1274        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
   1275        * 7. We can find the value of N by looking at R0.0 bits 7:6
   1276        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
   1277        * (since samples are always delivered in pairs). That is, we
   1278        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
   1279        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
   1280        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
   1281        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
   1282        * populating a temporary variable with the sequence (0, 1, 2, 3),
   1283        * and then reading from it using vstride=1, width=4, hstride=0.
   1284        * These computations hold good for 4x multisampling as well.
   1285        *
   1286        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
   1287        * the first four slots are sample 0 of subspan 0; the next four
   1288        * are sample 1 of subspan 0; the third group is sample 0 of
   1289        * subspan 1, and finally sample 1 of subspan 1.
   1290        */
   1291 
   1292       /* SKL+ has an extra bit for the Starting Sample Pair Index to
   1293        * accomodate 16x MSAA.
   1294        */
   1295       abld.exec_all().group(1, 0)
   1296           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
   1297                brw_imm_ud(0xc0));
   1298       abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
   1299 
   1300       /* This works for both SIMD8 and SIMD16 */
   1301       abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
   1302 
   1303       /* This special instruction takes care of setting vstride=1,
   1304        * width=4, hstride=0 of t2 during an ADD instruction.
   1305        */
   1306       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   1307    }
   1308 
   1309    return reg;
   1310 }
   1311 
   1312 fs_reg *
   1313 fs_visitor::emit_samplemaskin_setup()
   1314 {
   1315    assert(stage == MESA_SHADER_FRAGMENT);
   1316    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   1317    assert(devinfo->gen >= 6);
   1318 
   1319    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
   1320 
   1321    fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
   1322                                BRW_REGISTER_TYPE_D));
   1323 
   1324    if (wm_prog_data->persample_dispatch) {
   1325       /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
   1326        * and a mask representing which sample is being processed by the
   1327        * current shader invocation.
   1328        *
   1329        * From the OES_sample_variables specification:
   1330        * "When per-sample shading is active due to the use of a fragment input
   1331        *  qualified by "sample" or due to the use of the gl_SampleID or
   1332        *  gl_SamplePosition variables, only the bit for the current sample is
   1333        *  set in gl_SampleMaskIn."
   1334        */
   1335       const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
   1336 
   1337       if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
   1338          nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
   1339 
   1340       fs_reg one = vgrf(glsl_type::int_type);
   1341       fs_reg enabled_mask = vgrf(glsl_type::int_type);
   1342       abld.MOV(one, brw_imm_d(1));
   1343       abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
   1344       abld.AND(*reg, enabled_mask, coverage_mask);
   1345    } else {
   1346       /* In per-pixel mode, the coverage mask is sufficient. */
   1347       *reg = coverage_mask;
   1348    }
   1349    return reg;
   1350 }
   1351 
   1352 fs_reg
   1353 fs_visitor::resolve_source_modifiers(const fs_reg &src)
   1354 {
   1355    if (!src.abs && !src.negate)
   1356       return src;
   1357 
   1358    fs_reg temp = bld.vgrf(src.type);
   1359    bld.MOV(temp, src);
   1360 
   1361    return temp;
   1362 }
   1363 
   1364 void
   1365 fs_visitor::emit_discard_jump()
   1366 {
   1367    assert(brw_wm_prog_data(this->prog_data)->uses_kill);
   1368 
   1369    /* For performance, after a discard, jump to the end of the
   1370     * shader if all relevant channels have been discarded.
   1371     */
   1372    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
   1373    discard_jump->flag_subreg = 1;
   1374 
   1375    discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
   1376    discard_jump->predicate_inverse = true;
   1377 }
   1378 
   1379 void
   1380 fs_visitor::emit_gs_thread_end()
   1381 {
   1382    assert(stage == MESA_SHADER_GEOMETRY);
   1383 
   1384    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   1385 
   1386    if (gs_compile->control_data_header_size_bits > 0) {
   1387       emit_gs_control_data_bits(this->final_gs_vertex_count);
   1388    }
   1389 
   1390    const fs_builder abld = bld.annotate("thread end");
   1391    fs_inst *inst;
   1392 
   1393    if (gs_prog_data->static_vertex_count != -1) {
   1394       foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
   1395          if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
   1396              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
   1397              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
   1398              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
   1399             prev->eot = true;
   1400 
   1401             /* Delete now dead instructions. */
   1402             foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
   1403                if (dead == prev)
   1404                   break;
   1405                dead->remove();
   1406             }
   1407             return;
   1408          } else if (prev->is_control_flow() || prev->has_side_effects()) {
   1409             break;
   1410          }
   1411       }
   1412       fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1413       abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
   1414       inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
   1415       inst->mlen = 1;
   1416    } else {
   1417       fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   1418       fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
   1419       sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
   1420       sources[1] = this->final_gs_vertex_count;
   1421       abld.LOAD_PAYLOAD(payload, sources, 2, 2);
   1422       inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
   1423       inst->mlen = 2;
   1424    }
   1425    inst->eot = true;
   1426    inst->offset = 0;
   1427 }
   1428 
   1429 void
   1430 fs_visitor::assign_curb_setup()
   1431 {
   1432    unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
   1433 
   1434    unsigned ubo_push_length = 0;
   1435    unsigned ubo_push_start[4];
   1436    for (int i = 0; i < 4; i++) {
   1437       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
   1438       ubo_push_length += stage_prog_data->ubo_ranges[i].length;
   1439    }
   1440 
   1441    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
   1442 
   1443    /* Map the offsets in the UNIFORM file to fixed HW regs. */
   1444    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1445       for (unsigned int i = 0; i < inst->sources; i++) {
   1446 	 if (inst->src[i].file == UNIFORM) {
   1447             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
   1448             int constant_nr;
   1449             if (inst->src[i].nr >= UBO_START) {
   1450                /* constant_nr is in 32-bit units, the rest are in bytes */
   1451                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
   1452                              inst->src[i].offset / 4;
   1453             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
   1454                constant_nr = push_constant_loc[uniform_nr];
   1455             } else {
   1456                /* Section 5.11 of the OpenGL 4.1 spec says:
   1457                 * "Out-of-bounds reads return undefined values, which include
   1458                 *  values from other variables of the active program or zero."
   1459                 * Just return the first push constant.
   1460                 */
   1461                constant_nr = 0;
   1462             }
   1463 
   1464 	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
   1465 						  constant_nr / 8,
   1466 						  constant_nr % 8);
   1467             brw_reg.abs = inst->src[i].abs;
   1468             brw_reg.negate = inst->src[i].negate;
   1469 
   1470             assert(inst->src[i].stride == 0);
   1471             inst->src[i] = byte_offset(
   1472                retype(brw_reg, inst->src[i].type),
   1473                inst->src[i].offset % 4);
   1474 	 }
   1475       }
   1476    }
   1477 
   1478    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   1479    this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
   1480 }
   1481 
   1482 void
   1483 fs_visitor::calculate_urb_setup()
   1484 {
   1485    assert(stage == MESA_SHADER_FRAGMENT);
   1486    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   1487    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   1488 
   1489    memset(prog_data->urb_setup, -1,
   1490           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
   1491 
   1492    int urb_next = 0;
   1493    /* Figure out where each of the incoming setup attributes lands. */
   1494    if (devinfo->gen >= 6) {
   1495       if (_mesa_bitcount_64(nir->info.inputs_read &
   1496                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
   1497          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
   1498           * first 16 varying inputs, so we can put them wherever we want.
   1499           * Just put them in order.
   1500           *
   1501           * This is useful because it means that (a) inputs not used by the
   1502           * fragment shader won't take up valuable register space, and (b) we
   1503           * won't have to recompile the fragment shader if it gets paired with
   1504           * a different vertex (or geometry) shader.
   1505           */
   1506          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
   1507             if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
   1508                 BITFIELD64_BIT(i)) {
   1509                prog_data->urb_setup[i] = urb_next++;
   1510             }
   1511          }
   1512       } else {
   1513          /* We have enough input varyings that the SF/SBE pipeline stage can't
   1514           * arbitrarily rearrange them to suit our whim; we have to put them
   1515           * in an order that matches the output of the previous pipeline stage
   1516           * (geometry or vertex shader).
   1517           */
   1518          struct brw_vue_map prev_stage_vue_map;
   1519          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
   1520                              key->input_slots_valid,
   1521                              nir->info.separate_shader);
   1522 
   1523          int first_slot =
   1524             brw_compute_first_urb_slot_required(nir->info.inputs_read,
   1525                                                 &prev_stage_vue_map);
   1526 
   1527          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
   1528          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
   1529               slot++) {
   1530             int varying = prev_stage_vue_map.slot_to_varying[slot];
   1531             if (varying != BRW_VARYING_SLOT_PAD &&
   1532                 (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
   1533                  BITFIELD64_BIT(varying))) {
   1534                prog_data->urb_setup[varying] = slot - first_slot;
   1535             }
   1536          }
   1537          urb_next = prev_stage_vue_map.num_slots - first_slot;
   1538       }
   1539    } else {
   1540       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
   1541       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
   1542          /* Point size is packed into the header, not as a general attribute */
   1543          if (i == VARYING_SLOT_PSIZ)
   1544             continue;
   1545 
   1546 	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
   1547 	    /* The back color slot is skipped when the front color is
   1548 	     * also written to.  In addition, some slots can be
   1549 	     * written in the vertex shader and not read in the
   1550 	     * fragment shader.  So the register number must always be
   1551 	     * incremented, mapped or not.
   1552 	     */
   1553 	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
   1554 	       prog_data->urb_setup[i] = urb_next;
   1555             urb_next++;
   1556 	 }
   1557       }
   1558 
   1559       /*
   1560        * It's a FS only attribute, and we did interpolation for this attribute
   1561        * in SF thread. So, count it here, too.
   1562        *
   1563        * See compile_sf_prog() for more info.
   1564        */
   1565       if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
   1566          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   1567    }
   1568 
   1569    prog_data->num_varying_inputs = urb_next;
   1570 }
   1571 
   1572 void
   1573 fs_visitor::assign_urb_setup()
   1574 {
   1575    assert(stage == MESA_SHADER_FRAGMENT);
   1576    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   1577 
   1578    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
   1579 
   1580    /* Offset all the urb_setup[] index by the actual position of the
   1581     * setup regs, now that the location of the constants has been chosen.
   1582     */
   1583    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1584       if (inst->opcode == FS_OPCODE_LINTERP) {
   1585 	 assert(inst->src[1].file == FIXED_GRF);
   1586          inst->src[1].nr += urb_start;
   1587       }
   1588 
   1589       if (inst->opcode == FS_OPCODE_CINTERP) {
   1590 	 assert(inst->src[0].file == FIXED_GRF);
   1591          inst->src[0].nr += urb_start;
   1592       }
   1593    }
   1594 
   1595    /* Each attribute is 4 setup channels, each of which is half a reg. */
   1596    this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
   1597 }
   1598 
   1599 void
   1600 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
   1601 {
   1602    for (int i = 0; i < inst->sources; i++) {
   1603       if (inst->src[i].file == ATTR) {
   1604          int grf = payload.num_regs +
   1605                    prog_data->curb_read_length +
   1606                    inst->src[i].nr +
   1607                    inst->src[i].offset / REG_SIZE;
   1608 
   1609          /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
   1610           *
   1611           * VertStride must be used to cross GRF register boundaries. This
   1612           * rule implies that elements within a 'Width' cannot cross GRF
   1613           * boundaries.
   1614           *
   1615           * So, for registers that are large enough, we have to split the exec
   1616           * size in two and trust the compression state to sort it out.
   1617           */
   1618          unsigned total_size = inst->exec_size *
   1619                                inst->src[i].stride *
   1620                                type_sz(inst->src[i].type);
   1621 
   1622          assert(total_size <= 2 * REG_SIZE);
   1623          const unsigned exec_size =
   1624             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
   1625 
   1626          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
   1627          struct brw_reg reg =
   1628             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
   1629                                inst->src[i].offset % REG_SIZE),
   1630                    exec_size * inst->src[i].stride,
   1631                    width, inst->src[i].stride);
   1632          reg.abs = inst->src[i].abs;
   1633          reg.negate = inst->src[i].negate;
   1634 
   1635          inst->src[i] = reg;
   1636       }
   1637    }
   1638 }
   1639 
   1640 void
   1641 fs_visitor::assign_vs_urb_setup()
   1642 {
   1643    struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
   1644 
   1645    assert(stage == MESA_SHADER_VERTEX);
   1646 
   1647    /* Each attribute is 4 regs. */
   1648    this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
   1649 
   1650    assert(vs_prog_data->base.urb_read_length <= 15);
   1651 
   1652    /* Rewrite all ATTR file references to the hw grf that they land in. */
   1653    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1654       convert_attr_sources_to_hw_regs(inst);
   1655    }
   1656 }
   1657 
   1658 void
   1659 fs_visitor::assign_tcs_single_patch_urb_setup()
   1660 {
   1661    assert(stage == MESA_SHADER_TESS_CTRL);
   1662 
   1663    /* Rewrite all ATTR file references to HW_REGs. */
   1664    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1665       convert_attr_sources_to_hw_regs(inst);
   1666    }
   1667 }
   1668 
   1669 void
   1670 fs_visitor::assign_tes_urb_setup()
   1671 {
   1672    assert(stage == MESA_SHADER_TESS_EVAL);
   1673 
   1674    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   1675 
   1676    first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
   1677 
   1678    /* Rewrite all ATTR file references to HW_REGs. */
   1679    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1680       convert_attr_sources_to_hw_regs(inst);
   1681    }
   1682 }
   1683 
   1684 void
   1685 fs_visitor::assign_gs_urb_setup()
   1686 {
   1687    assert(stage == MESA_SHADER_GEOMETRY);
   1688 
   1689    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   1690 
   1691    first_non_payload_grf +=
   1692       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
   1693 
   1694    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1695       /* Rewrite all ATTR file references to GRFs. */
   1696       convert_attr_sources_to_hw_regs(inst);
   1697    }
   1698 }
   1699 
   1700 
   1701 /**
   1702  * Split large virtual GRFs into separate components if we can.
   1703  *
   1704  * This is mostly duplicated with what brw_fs_vector_splitting does,
   1705  * but that's really conservative because it's afraid of doing
   1706  * splitting that doesn't result in real progress after the rest of
   1707  * the optimization phases, which would cause infinite looping in
   1708  * optimization.  We can do it once here, safely.  This also has the
   1709  * opportunity to split interpolated values, or maybe even uniforms,
   1710  * which we don't have at the IR level.
   1711  *
   1712  * We want to split, because virtual GRFs are what we register
   1713  * allocate and spill (due to contiguousness requirements for some
   1714  * instructions), and they're what we naturally generate in the
   1715  * codegen process, but most virtual GRFs don't actually need to be
   1716  * contiguous sets of GRFs.  If we split, we'll end up with reduced
   1717  * live intervals and better dead code elimination and coalescing.
   1718  */
   1719 void
   1720 fs_visitor::split_virtual_grfs()
   1721 {
   1722    /* Compact the register file so we eliminate dead vgrfs.  This
   1723     * only defines split points for live registers, so if we have
   1724     * too large dead registers they will hit assertions later.
   1725     */
   1726    compact_virtual_grfs();
   1727 
   1728    int num_vars = this->alloc.count;
   1729 
   1730    /* Count the total number of registers */
   1731    int reg_count = 0;
   1732    int vgrf_to_reg[num_vars];
   1733    for (int i = 0; i < num_vars; i++) {
   1734       vgrf_to_reg[i] = reg_count;
   1735       reg_count += alloc.sizes[i];
   1736    }
   1737 
   1738    /* An array of "split points".  For each register slot, this indicates
   1739     * if this slot can be separated from the previous slot.  Every time an
   1740     * instruction uses multiple elements of a register (as a source or
   1741     * destination), we mark the used slots as inseparable.  Then we go
   1742     * through and split the registers into the smallest pieces we can.
   1743     */
   1744    bool split_points[reg_count];
   1745    memset(split_points, 0, sizeof(split_points));
   1746 
   1747    /* Mark all used registers as fully splittable */
   1748    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1749       if (inst->dst.file == VGRF) {
   1750          int reg = vgrf_to_reg[inst->dst.nr];
   1751          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
   1752             split_points[reg + j] = true;
   1753       }
   1754 
   1755       for (int i = 0; i < inst->sources; i++) {
   1756          if (inst->src[i].file == VGRF) {
   1757             int reg = vgrf_to_reg[inst->src[i].nr];
   1758             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
   1759                split_points[reg + j] = true;
   1760          }
   1761       }
   1762    }
   1763 
   1764    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1765       if (inst->dst.file == VGRF) {
   1766          int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
   1767          for (unsigned j = 1; j < regs_written(inst); j++)
   1768             split_points[reg + j] = false;
   1769       }
   1770       for (int i = 0; i < inst->sources; i++) {
   1771          if (inst->src[i].file == VGRF) {
   1772             int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
   1773             for (unsigned j = 1; j < regs_read(inst, i); j++)
   1774                split_points[reg + j] = false;
   1775          }
   1776       }
   1777    }
   1778 
   1779    int new_virtual_grf[reg_count];
   1780    int new_reg_offset[reg_count];
   1781 
   1782    int reg = 0;
   1783    for (int i = 0; i < num_vars; i++) {
   1784       /* The first one should always be 0 as a quick sanity check. */
   1785       assert(split_points[reg] == false);
   1786 
   1787       /* j = 0 case */
   1788       new_reg_offset[reg] = 0;
   1789       reg++;
   1790       int offset = 1;
   1791 
   1792       /* j > 0 case */
   1793       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
   1794          /* If this is a split point, reset the offset to 0 and allocate a
   1795           * new virtual GRF for the previous offset many registers
   1796           */
   1797          if (split_points[reg]) {
   1798             assert(offset <= MAX_VGRF_SIZE);
   1799             int grf = alloc.allocate(offset);
   1800             for (int k = reg - offset; k < reg; k++)
   1801                new_virtual_grf[k] = grf;
   1802             offset = 0;
   1803          }
   1804          new_reg_offset[reg] = offset;
   1805          offset++;
   1806          reg++;
   1807       }
   1808 
   1809       /* The last one gets the original register number */
   1810       assert(offset <= MAX_VGRF_SIZE);
   1811       alloc.sizes[i] = offset;
   1812       for (int k = reg - offset; k < reg; k++)
   1813          new_virtual_grf[k] = i;
   1814    }
   1815    assert(reg == reg_count);
   1816 
   1817    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1818       if (inst->dst.file == VGRF) {
   1819          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
   1820          inst->dst.nr = new_virtual_grf[reg];
   1821          inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
   1822                             inst->dst.offset % REG_SIZE;
   1823          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
   1824       }
   1825       for (int i = 0; i < inst->sources; i++) {
   1826 	 if (inst->src[i].file == VGRF) {
   1827             reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
   1828             inst->src[i].nr = new_virtual_grf[reg];
   1829             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
   1830                                   inst->src[i].offset % REG_SIZE;
   1831             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
   1832          }
   1833       }
   1834    }
   1835    invalidate_live_intervals();
   1836 }
   1837 
   1838 /**
   1839  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
   1840  *
   1841  * During code generation, we create tons of temporary variables, many of
   1842  * which get immediately killed and are never used again.  Yet, in later
   1843  * optimization and analysis passes, such as compute_live_intervals, we need
   1844  * to loop over all the virtual GRFs.  Compacting them can save a lot of
   1845  * overhead.
   1846  */
   1847 bool
   1848 fs_visitor::compact_virtual_grfs()
   1849 {
   1850    bool progress = false;
   1851    int remap_table[this->alloc.count];
   1852    memset(remap_table, -1, sizeof(remap_table));
   1853 
   1854    /* Mark which virtual GRFs are used. */
   1855    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
   1856       if (inst->dst.file == VGRF)
   1857          remap_table[inst->dst.nr] = 0;
   1858 
   1859       for (int i = 0; i < inst->sources; i++) {
   1860          if (inst->src[i].file == VGRF)
   1861             remap_table[inst->src[i].nr] = 0;
   1862       }
   1863    }
   1864 
   1865    /* Compact the GRF arrays. */
   1866    int new_index = 0;
   1867    for (unsigned i = 0; i < this->alloc.count; i++) {
   1868       if (remap_table[i] == -1) {
   1869          /* We just found an unused register.  This means that we are
   1870           * actually going to compact something.
   1871           */
   1872          progress = true;
   1873       } else {
   1874          remap_table[i] = new_index;
   1875          alloc.sizes[new_index] = alloc.sizes[i];
   1876          invalidate_live_intervals();
   1877          ++new_index;
   1878       }
   1879    }
   1880 
   1881    this->alloc.count = new_index;
   1882 
   1883    /* Patch all the instructions to use the newly renumbered registers */
   1884    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1885       if (inst->dst.file == VGRF)
   1886          inst->dst.nr = remap_table[inst->dst.nr];
   1887 
   1888       for (int i = 0; i < inst->sources; i++) {
   1889          if (inst->src[i].file == VGRF)
   1890             inst->src[i].nr = remap_table[inst->src[i].nr];
   1891       }
   1892    }
   1893 
   1894    /* Patch all the references to delta_xy, since they're used in register
   1895     * allocation.  If they're unused, switch them to BAD_FILE so we don't
   1896     * think some random VGRF is delta_xy.
   1897     */
   1898    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
   1899       if (delta_xy[i].file == VGRF) {
   1900          if (remap_table[delta_xy[i].nr] != -1) {
   1901             delta_xy[i].nr = remap_table[delta_xy[i].nr];
   1902          } else {
   1903             delta_xy[i].file = BAD_FILE;
   1904          }
   1905       }
   1906    }
   1907 
   1908    return progress;
   1909 }
   1910 
   1911 static int
   1912 get_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
   1913 {
   1914    if (prog_data->nr_params == 0)
   1915       return -1;
   1916 
   1917    /* The local thread id is always the last parameter in the list */
   1918    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   1919    if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
   1920       return prog_data->nr_params - 1;
   1921 
   1922    return -1;
   1923 }
   1924 
   1925 /**
   1926  * Struct for handling complex alignments.
   1927  *
   1928  * A complex alignment is stored as multiplier and an offset.  A value is
   1929  * considered to be aligned if it is {offset} larger than a multiple of {mul}.
   1930  * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
   1931  * following:
   1932  *
   1933  *  N  | cplx_align_apply({8, 2}, N)
   1934  * ----+-----------------------------
   1935  *  4  | 6
   1936  *  6  | 6
   1937  *  8  | 14
   1938  *  10 | 14
   1939  *  12 | 14
   1940  *  14 | 14
   1941  *  16 | 22
   1942  */
   1943 struct cplx_align {
   1944    unsigned mul:4;
   1945    unsigned offset:4;
   1946 };
   1947 
   1948 #define CPLX_ALIGN_MAX_MUL 8
   1949 
   1950 static void
   1951 cplx_align_assert_sane(struct cplx_align a)
   1952 {
   1953    assert(a.mul > 0 && util_is_power_of_two(a.mul));
   1954    assert(a.offset < a.mul);
   1955 }
   1956 
   1957 /**
   1958  * Combines two alignments to produce a least multiple of sorts.
   1959  *
   1960  * The returned alignment is the smallest (in terms of multiplier) such that
   1961  * anything aligned to both a and b will be aligned to the new alignment.
   1962  * This function will assert-fail if a and b are not compatible, i.e. if the
   1963  * offset parameters are such that no common alignment is possible.
   1964  */
   1965 static struct cplx_align
   1966 cplx_align_combine(struct cplx_align a, struct cplx_align b)
   1967 {
   1968    cplx_align_assert_sane(a);
   1969    cplx_align_assert_sane(b);
   1970 
   1971    /* Assert that the alignments agree. */
   1972    assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
   1973 
   1974    return a.mul > b.mul ? a : b;
   1975 }
   1976 
   1977 /**
   1978  * Apply a complex alignment
   1979  *
   1980  * This function will return the smallest number greater than or equal to
   1981  * offset that is aligned to align.
   1982  */
   1983 static unsigned
   1984 cplx_align_apply(struct cplx_align align, unsigned offset)
   1985 {
   1986    return ALIGN(offset - align.offset, align.mul) + align.offset;
   1987 }
   1988 
   1989 #define UNIFORM_SLOT_SIZE 4
   1990 
   1991 struct uniform_slot_info {
   1992    /** True if the given uniform slot is live */
   1993    unsigned is_live:1;
   1994 
   1995    /** True if this slot and the next slot must remain contiguous */
   1996    unsigned contiguous:1;
   1997 
   1998    struct cplx_align align;
   1999 };
   2000 
   2001 static void
   2002 mark_uniform_slots_read(struct uniform_slot_info *slots,
   2003                         unsigned num_slots, unsigned alignment)
   2004 {
   2005    assert(alignment > 0 && util_is_power_of_two(alignment));
   2006    assert(alignment <= CPLX_ALIGN_MAX_MUL);
   2007 
   2008    /* We can't align a slot to anything less than the slot size */
   2009    alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
   2010 
   2011    struct cplx_align align = {alignment, 0};
   2012    cplx_align_assert_sane(align);
   2013 
   2014    for (unsigned i = 0; i < num_slots; i++) {
   2015       slots[i].is_live = true;
   2016       if (i < num_slots - 1)
   2017          slots[i].contiguous = true;
   2018 
   2019       align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
   2020       if (slots[i].align.mul == 0) {
   2021          slots[i].align = align;
   2022       } else {
   2023          slots[i].align = cplx_align_combine(slots[i].align, align);
   2024       }
   2025    }
   2026 }
   2027 
   2028 /**
   2029  * Assign UNIFORM file registers to either push constants or pull constants.
   2030  *
   2031  * We allow a fragment shader to have more than the specified minimum
   2032  * maximum number of fragment shader uniform components (64).  If
   2033  * there are too many of these, they'd fill up all of register space.
   2034  * So, this will push some of them out to the pull constant buffer and
   2035  * update the program to load them.
   2036  */
   2037 void
   2038 fs_visitor::assign_constant_locations()
   2039 {
   2040    /* Only the first compile gets to decide on locations. */
   2041    if (push_constant_loc) {
   2042       assert(pull_constant_loc);
   2043       return;
   2044    }
   2045 
   2046    struct uniform_slot_info slots[uniforms];
   2047    memset(slots, 0, sizeof(slots));
   2048 
   2049    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   2050       for (int i = 0 ; i < inst->sources; i++) {
   2051          if (inst->src[i].file != UNIFORM)
   2052             continue;
   2053 
   2054          /* NIR tightly packs things so the uniform number might not be
   2055           * aligned (if we have a double right after a float, for instance).
   2056           * This is fine because the process of re-arranging them will ensure
   2057           * that things are properly aligned.  The offset into that uniform,
   2058           * however, must be aligned.
   2059           *
   2060           * In Vulkan, we have explicit offsets but everything is crammed
   2061           * into a single "variable" so inst->src[i].nr will always be 0.
   2062           * Everything will be properly aligned relative to that one base.
   2063           */
   2064          assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
   2065 
   2066          unsigned u = inst->src[i].nr +
   2067                       inst->src[i].offset / UNIFORM_SLOT_SIZE;
   2068 
   2069          if (u >= uniforms)
   2070             continue;
   2071 
   2072          unsigned slots_read;
   2073          if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
   2074             slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
   2075          } else {
   2076             unsigned bytes_read = inst->components_read(i) *
   2077                                   type_sz(inst->src[i].type);
   2078             slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
   2079          }
   2080 
   2081          assert(u + slots_read <= uniforms);
   2082          mark_uniform_slots_read(&slots[u], slots_read,
   2083                                  type_sz(inst->src[i].type));
   2084       }
   2085    }
   2086 
   2087    int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
   2088 
   2089    /* Only allow 16 registers (128 uniform components) as push constants.
   2090     *
   2091     * Just demote the end of the list.  We could probably do better
   2092     * here, demoting things that are rarely used in the program first.
   2093     *
   2094     * If changing this value, note the limitation about total_regs in
   2095     * brw_curbe.c.
   2096     */
   2097    unsigned int max_push_components = 16 * 8;
   2098    if (subgroup_id_index >= 0)
   2099       max_push_components--; /* Save a slot for the thread ID */
   2100 
   2101    /* We push small arrays, but no bigger than 16 floats.  This is big enough
   2102     * for a vec4 but hopefully not large enough to push out other stuff.  We
   2103     * should probably use a better heuristic at some point.
   2104     */
   2105    const unsigned int max_chunk_size = 16;
   2106 
   2107    unsigned int num_push_constants = 0;
   2108    unsigned int num_pull_constants = 0;
   2109 
   2110    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   2111    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   2112 
   2113    /* Default to -1 meaning no location */
   2114    memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
   2115    memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
   2116 
   2117    int chunk_start = -1;
   2118    struct cplx_align align;
   2119    for (unsigned u = 0; u < uniforms; u++) {
   2120       if (!slots[u].is_live) {
   2121          assert(chunk_start == -1);
   2122          continue;
   2123       }
   2124 
   2125       /* Skip subgroup_id_index to put it in the last push register. */
   2126       if (subgroup_id_index == (int)u)
   2127          continue;
   2128 
   2129       if (chunk_start == -1) {
   2130          chunk_start = u;
   2131          align = slots[u].align;
   2132       } else {
   2133          /* Offset into the chunk */
   2134          unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
   2135 
   2136          /* Shift the slot alignment down by the chunk offset so it is
   2137           * comparable with the base chunk alignment.
   2138           */
   2139          struct cplx_align slot_align = slots[u].align;
   2140          slot_align.offset =
   2141             (slot_align.offset - chunk_offset) & (align.mul - 1);
   2142 
   2143          align = cplx_align_combine(align, slot_align);
   2144       }
   2145 
   2146       /* Sanity check the alignment */
   2147       cplx_align_assert_sane(align);
   2148 
   2149       if (slots[u].contiguous)
   2150          continue;
   2151 
   2152       /* Adjust the alignment to be in terms of slots, not bytes */
   2153       assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
   2154       assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
   2155       align.mul /= UNIFORM_SLOT_SIZE;
   2156       align.offset /= UNIFORM_SLOT_SIZE;
   2157 
   2158       unsigned push_start_align = cplx_align_apply(align, num_push_constants);
   2159       unsigned chunk_size = u - chunk_start + 1;
   2160       if ((!compiler->supports_pull_constants && u < UBO_START) ||
   2161           (chunk_size < max_chunk_size &&
   2162            push_start_align + chunk_size <= max_push_components)) {
   2163          /* Align up the number of push constants */
   2164          num_push_constants = push_start_align;
   2165          for (unsigned i = 0; i < chunk_size; i++)
   2166             push_constant_loc[chunk_start + i] = num_push_constants++;
   2167       } else {
   2168          /* We need to pull this one */
   2169          num_pull_constants = cplx_align_apply(align, num_pull_constants);
   2170          for (unsigned i = 0; i < chunk_size; i++)
   2171             pull_constant_loc[chunk_start + i] = num_pull_constants++;
   2172       }
   2173 
   2174       /* Reset the chunk and start again */
   2175       chunk_start = -1;
   2176    }
   2177 
   2178    /* Add the CS local thread ID uniform at the end of the push constants */
   2179    if (subgroup_id_index >= 0)
   2180       push_constant_loc[subgroup_id_index] = num_push_constants++;
   2181 
   2182    /* As the uniforms are going to be reordered, stash the old array and
   2183     * create two new arrays for push/pull params.
   2184     */
   2185    uint32_t *param = stage_prog_data->param;
   2186    stage_prog_data->nr_params = num_push_constants;
   2187    if (num_push_constants) {
   2188       stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
   2189                                              num_push_constants);
   2190    } else {
   2191       stage_prog_data->param = NULL;
   2192    }
   2193    assert(stage_prog_data->nr_pull_params == 0);
   2194    assert(stage_prog_data->pull_param == NULL);
   2195    if (num_pull_constants > 0) {
   2196       stage_prog_data->nr_pull_params = num_pull_constants;
   2197       stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
   2198                                                   num_pull_constants);
   2199    }
   2200 
   2201    /* Now that we know how many regular uniforms we'll push, reduce the
   2202     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
   2203     */
   2204    unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
   2205    for (int i = 0; i < 4; i++) {
   2206       struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
   2207 
   2208       if (push_length + range->length > 64)
   2209          range->length = 64 - push_length;
   2210 
   2211       push_length += range->length;
   2212    }
   2213    assert(push_length <= 64);
   2214 
   2215    /* Up until now, the param[] array has been indexed by reg + offset
   2216     * of UNIFORM registers.  Move pull constants into pull_param[] and
   2217     * condense param[] to only contain the uniforms we chose to push.
   2218     *
   2219     * NOTE: Because we are condensing the params[] array, we know that
   2220     * push_constant_loc[i] <= i and we can do it in one smooth loop without
   2221     * having to make a copy.
   2222     */
   2223    for (unsigned int i = 0; i < uniforms; i++) {
   2224       uint32_t value = param[i];
   2225       if (pull_constant_loc[i] != -1) {
   2226          stage_prog_data->pull_param[pull_constant_loc[i]] = value;
   2227       } else if (push_constant_loc[i] != -1) {
   2228          stage_prog_data->param[push_constant_loc[i]] = value;
   2229       }
   2230    }
   2231    ralloc_free(param);
   2232 }
   2233 
   2234 bool
   2235 fs_visitor::get_pull_locs(const fs_reg &src,
   2236                           unsigned *out_surf_index,
   2237                           unsigned *out_pull_index)
   2238 {
   2239    assert(src.file == UNIFORM);
   2240 
   2241    if (src.nr >= UBO_START) {
   2242       const struct brw_ubo_range *range =
   2243          &prog_data->ubo_ranges[src.nr - UBO_START];
   2244 
   2245       /* If this access is in our (reduced) range, use the push data. */
   2246       if (src.offset / 32 < range->length)
   2247          return false;
   2248 
   2249       *out_surf_index = prog_data->binding_table.ubo_start + range->block;
   2250       *out_pull_index = (32 * range->start + src.offset) / 4;
   2251       return true;
   2252    }
   2253 
   2254    const unsigned location = src.nr + src.offset / 4;
   2255 
   2256    if (location < uniforms && pull_constant_loc[location] != -1) {
   2257       /* A regular uniform push constant */
   2258       *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
   2259       *out_pull_index = pull_constant_loc[location];
   2260       return true;
   2261    }
   2262 
   2263    return false;
   2264 }
   2265 
   2266 /**
   2267  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
   2268  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
   2269  */
   2270 void
   2271 fs_visitor::lower_constant_loads()
   2272 {
   2273    unsigned index, pull_index;
   2274 
   2275    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   2276       /* Set up the annotation tracking for new generated instructions. */
   2277       const fs_builder ibld(this, block, inst);
   2278 
   2279       for (int i = 0; i < inst->sources; i++) {
   2280 	 if (inst->src[i].file != UNIFORM)
   2281 	    continue;
   2282 
   2283          /* We'll handle this case later */
   2284          if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
   2285             continue;
   2286 
   2287          if (!get_pull_locs(inst->src[i], &index, &pull_index))
   2288 	    continue;
   2289 
   2290          assert(inst->src[i].stride == 0);
   2291 
   2292          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
   2293          const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
   2294          const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   2295          const unsigned base = pull_index * 4;
   2296 
   2297          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
   2298                    dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
   2299 
   2300          /* Rewrite the instruction to use the temporary VGRF. */
   2301          inst->src[i].file = VGRF;
   2302          inst->src[i].nr = dst.nr;
   2303          inst->src[i].offset = (base & (block_sz - 1)) +
   2304                                inst->src[i].offset % 4;
   2305 
   2306          brw_mark_surface_used(prog_data, index);
   2307       }
   2308 
   2309       if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
   2310           inst->src[0].file == UNIFORM) {
   2311 
   2312          if (!get_pull_locs(inst->src[0], &index, &pull_index))
   2313             continue;
   2314 
   2315          VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
   2316                                     brw_imm_ud(index),
   2317                                     inst->src[1],
   2318                                     pull_index * 4);
   2319          inst->remove(block);
   2320 
   2321          brw_mark_surface_used(prog_data, index);
   2322       }
   2323    }
   2324    invalidate_live_intervals();
   2325 }
   2326 
   2327 bool
   2328 fs_visitor::opt_algebraic()
   2329 {
   2330    bool progress = false;
   2331 
   2332    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   2333       switch (inst->opcode) {
   2334       case BRW_OPCODE_MOV:
   2335          if (inst->src[0].file != IMM)
   2336             break;
   2337 
   2338          if (inst->saturate) {
   2339             if (inst->dst.type != inst->src[0].type)
   2340                assert(!"unimplemented: saturate mixed types");
   2341 
   2342             if (brw_saturate_immediate(inst->dst.type,
   2343                                        &inst->src[0].as_brw_reg())) {
   2344                inst->saturate = false;
   2345                progress = true;
   2346             }
   2347          }
   2348          break;
   2349 
   2350       case BRW_OPCODE_MUL:
   2351 	 if (inst->src[1].file != IMM)
   2352 	    continue;
   2353 
   2354 	 /* a * 1.0 = a */
   2355 	 if (inst->src[1].is_one()) {
   2356 	    inst->opcode = BRW_OPCODE_MOV;
   2357 	    inst->src[1] = reg_undef;
   2358 	    progress = true;
   2359 	    break;
   2360 	 }
   2361 
   2362          /* a * -1.0 = -a */
   2363          if (inst->src[1].is_negative_one()) {
   2364             inst->opcode = BRW_OPCODE_MOV;
   2365             inst->src[0].negate = !inst->src[0].negate;
   2366             inst->src[1] = reg_undef;
   2367             progress = true;
   2368             break;
   2369          }
   2370 
   2371          /* a * 0.0 = 0.0 */
   2372          if (inst->src[1].is_zero()) {
   2373             inst->opcode = BRW_OPCODE_MOV;
   2374             inst->src[0] = inst->src[1];
   2375             inst->src[1] = reg_undef;
   2376             progress = true;
   2377             break;
   2378          }
   2379 
   2380          if (inst->src[0].file == IMM) {
   2381             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
   2382             inst->opcode = BRW_OPCODE_MOV;
   2383             inst->src[0].f *= inst->src[1].f;
   2384             inst->src[1] = reg_undef;
   2385             progress = true;
   2386             break;
   2387          }
   2388 	 break;
   2389       case BRW_OPCODE_ADD:
   2390          if (inst->src[1].file != IMM)
   2391             continue;
   2392 
   2393          /* a + 0.0 = a */
   2394          if (inst->src[1].is_zero()) {
   2395             inst->opcode = BRW_OPCODE_MOV;
   2396             inst->src[1] = reg_undef;
   2397             progress = true;
   2398             break;
   2399          }
   2400 
   2401          if (inst->src[0].file == IMM) {
   2402             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
   2403             inst->opcode = BRW_OPCODE_MOV;
   2404             inst->src[0].f += inst->src[1].f;
   2405             inst->src[1] = reg_undef;
   2406             progress = true;
   2407             break;
   2408          }
   2409          break;
   2410       case BRW_OPCODE_OR:
   2411          if (inst->src[0].equals(inst->src[1])) {
   2412             inst->opcode = BRW_OPCODE_MOV;
   2413             inst->src[1] = reg_undef;
   2414             progress = true;
   2415             break;
   2416          }
   2417          break;
   2418       case BRW_OPCODE_LRP:
   2419          if (inst->src[1].equals(inst->src[2])) {
   2420             inst->opcode = BRW_OPCODE_MOV;
   2421             inst->src[0] = inst->src[1];
   2422             inst->src[1] = reg_undef;
   2423             inst->src[2] = reg_undef;
   2424             progress = true;
   2425             break;
   2426          }
   2427          break;
   2428       case BRW_OPCODE_CMP:
   2429          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
   2430              inst->src[0].abs &&
   2431              inst->src[0].negate &&
   2432              inst->src[1].is_zero()) {
   2433             inst->src[0].abs = false;
   2434             inst->src[0].negate = false;
   2435             inst->conditional_mod = BRW_CONDITIONAL_Z;
   2436             progress = true;
   2437             break;
   2438          }
   2439          break;
   2440       case BRW_OPCODE_SEL:
   2441          if (inst->src[0].equals(inst->src[1])) {
   2442             inst->opcode = BRW_OPCODE_MOV;
   2443             inst->src[1] = reg_undef;
   2444             inst->predicate = BRW_PREDICATE_NONE;
   2445             inst->predicate_inverse = false;
   2446             progress = true;
   2447          } else if (inst->saturate && inst->src[1].file == IMM) {
   2448             switch (inst->conditional_mod) {
   2449             case BRW_CONDITIONAL_LE:
   2450             case BRW_CONDITIONAL_L:
   2451                switch (inst->src[1].type) {
   2452                case BRW_REGISTER_TYPE_F:
   2453                   if (inst->src[1].f >= 1.0f) {
   2454                      inst->opcode = BRW_OPCODE_MOV;
   2455                      inst->src[1] = reg_undef;
   2456                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
   2457                      progress = true;
   2458                   }
   2459                   break;
   2460                default:
   2461                   break;
   2462                }
   2463                break;
   2464             case BRW_CONDITIONAL_GE:
   2465             case BRW_CONDITIONAL_G:
   2466                switch (inst->src[1].type) {
   2467                case BRW_REGISTER_TYPE_F:
   2468                   if (inst->src[1].f <= 0.0f) {
   2469                      inst->opcode = BRW_OPCODE_MOV;
   2470                      inst->src[1] = reg_undef;
   2471                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
   2472                      progress = true;
   2473                   }
   2474                   break;
   2475                default:
   2476                   break;
   2477                }
   2478             default:
   2479                break;
   2480             }
   2481          }
   2482          break;
   2483       case BRW_OPCODE_MAD:
   2484          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
   2485             inst->opcode = BRW_OPCODE_MOV;
   2486             inst->src[1] = reg_undef;
   2487             inst->src[2] = reg_undef;
   2488             progress = true;
   2489          } else if (inst->src[0].is_zero()) {
   2490             inst->opcode = BRW_OPCODE_MUL;
   2491             inst->src[0] = inst->src[2];
   2492             inst->src[2] = reg_undef;
   2493             progress = true;
   2494          } else if (inst->src[1].is_one()) {
   2495             inst->opcode = BRW_OPCODE_ADD;
   2496             inst->src[1] = inst->src[2];
   2497             inst->src[2] = reg_undef;
   2498             progress = true;
   2499          } else if (inst->src[2].is_one()) {
   2500             inst->opcode = BRW_OPCODE_ADD;
   2501             inst->src[2] = reg_undef;
   2502             progress = true;
   2503          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
   2504             inst->opcode = BRW_OPCODE_ADD;
   2505             inst->src[1].f *= inst->src[2].f;
   2506             inst->src[2] = reg_undef;
   2507             progress = true;
   2508          }
   2509          break;
   2510       case SHADER_OPCODE_BROADCAST:
   2511          if (is_uniform(inst->src[0])) {
   2512             inst->opcode = BRW_OPCODE_MOV;
   2513             inst->sources = 1;
   2514             inst->force_writemask_all = true;
   2515             progress = true;
   2516          } else if (inst->src[1].file == IMM) {
   2517             inst->opcode = BRW_OPCODE_MOV;
   2518             /* It's possible that the selected component will be too large and
   2519              * overflow the register.  This can happen if someone does a
   2520              * readInvocation() from GLSL or SPIR-V and provides an OOB
   2521              * invocationIndex.  If this happens and we some how manage
   2522              * to constant fold it in and get here, then component() may cause
   2523              * us to start reading outside of the VGRF which will lead to an
   2524              * assert later.  Instead, just let it wrap around if it goes over
   2525              * exec_size.
   2526              */
   2527             const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
   2528             inst->src[0] = component(inst->src[0], comp);
   2529             inst->sources = 1;
   2530             inst->force_writemask_all = true;
   2531             progress = true;
   2532          }
   2533          break;
   2534 
   2535       default:
   2536 	 break;
   2537       }
   2538 
   2539       /* Swap if src[0] is immediate. */
   2540       if (progress && inst->is_commutative()) {
   2541          if (inst->src[0].file == IMM) {
   2542             fs_reg tmp = inst->src[1];
   2543             inst->src[1] = inst->src[0];
   2544             inst->src[0] = tmp;
   2545          }
   2546       }
   2547    }
   2548    return progress;
   2549 }
   2550 
   2551 /**
   2552  * Optimize sample messages that have constant zero values for the trailing
   2553  * texture coordinates. We can just reduce the message length for these
   2554  * instructions instead of reserving a register for it. Trailing parameters
   2555  * that aren't sent default to zero anyway. This will cause the dead code
   2556  * eliminator to remove the MOV instruction that would otherwise be emitted to
   2557  * set up the zero value.
   2558  */
   2559 bool
   2560 fs_visitor::opt_zero_samples()
   2561 {
   2562    /* Gen4 infers the texturing opcode based on the message length so we can't
   2563     * change it.
   2564     */
   2565    if (devinfo->gen < 5)
   2566       return false;
   2567 
   2568    bool progress = false;
   2569 
   2570    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   2571       if (!inst->is_tex())
   2572          continue;
   2573 
   2574       fs_inst *load_payload = (fs_inst *) inst->prev;
   2575 
   2576       if (load_payload->is_head_sentinel() ||
   2577           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
   2578          continue;
   2579 
   2580       /* We don't want to remove the message header or the first parameter.
   2581        * Removing the first parameter is not allowed, see the Haswell PRM
   2582        * volume 7, page 149:
   2583        *
   2584        *     "Parameter 0 is required except for the sampleinfo message, which
   2585        *      has no parameter 0"
   2586        */
   2587       while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
   2588              load_payload->src[(inst->mlen - inst->header_size) /
   2589                                (inst->exec_size / 8) +
   2590                                inst->header_size - 1].is_zero()) {
   2591          inst->mlen -= inst->exec_size / 8;
   2592          progress = true;
   2593       }
   2594    }
   2595 
   2596    if (progress)
   2597       invalidate_live_intervals();
   2598 
   2599    return progress;
   2600 }
   2601 
   2602 /**
   2603  * Optimize sample messages which are followed by the final RT write.
   2604  *
   2605  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
   2606  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
   2607  * final texturing results copied to the framebuffer write payload and modify
   2608  * them to write to the framebuffer directly.
   2609  */
   2610 bool
   2611 fs_visitor::opt_sampler_eot()
   2612 {
   2613    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   2614 
   2615    if (stage != MESA_SHADER_FRAGMENT)
   2616       return false;
   2617 
   2618    if (devinfo->gen != 9 && !devinfo->is_cherryview)
   2619       return false;
   2620 
   2621    /* FINISHME: It should be possible to implement this optimization when there
   2622     * are multiple drawbuffers.
   2623     */
   2624    if (key->nr_color_regions != 1)
   2625       return false;
   2626 
   2627    /* Requires emitting a bunch of saturating MOV instructions during logical
   2628     * send lowering to clamp the color payload, which the sampler unit isn't
   2629     * going to do for us.
   2630     */
   2631    if (key->clamp_fragment_color)
   2632       return false;
   2633 
   2634    /* Look for a texturing instruction immediately before the final FB_WRITE. */
   2635    bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
   2636    fs_inst *fb_write = (fs_inst *)block->end();
   2637    assert(fb_write->eot);
   2638    assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);
   2639 
   2640    /* There wasn't one; nothing to do. */
   2641    if (unlikely(fb_write->prev->is_head_sentinel()))
   2642       return false;
   2643 
   2644    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
   2645 
   2646    /* 3D Sampler  Messages  Message Format
   2647     *
   2648     * Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
   2649     *  messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*
   2650     */
   2651    if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
   2652        tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL &&
   2653        tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
   2654        tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL &&
   2655        tex_inst->opcode != FS_OPCODE_TXB_LOGICAL &&
   2656        tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL &&
   2657        tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL &&
   2658        tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL)
   2659       return false;
   2660 
   2661    /* XXX - This shouldn't be necessary. */
   2662    if (tex_inst->prev->is_head_sentinel())
   2663       return false;
   2664 
   2665    /* Check that the FB write sources are fully initialized by the single
   2666     * texturing instruction.
   2667     */
   2668    for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
   2669       if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
   2670          if (!fb_write->src[i].equals(tex_inst->dst) ||
   2671              fb_write->size_read(i) != tex_inst->size_written)
   2672          return false;
   2673       } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
   2674          if (fb_write->src[i].file != BAD_FILE)
   2675             return false;
   2676       }
   2677    }
   2678 
   2679    assert(!tex_inst->eot); /* We can't get here twice */
   2680    assert((tex_inst->offset & (0xff << 24)) == 0);
   2681 
   2682    const fs_builder ibld(this, block, tex_inst);
   2683 
   2684    tex_inst->offset |= fb_write->target << 24;
   2685    tex_inst->eot = true;
   2686    tex_inst->dst = ibld.null_reg_ud();
   2687    tex_inst->size_written = 0;
   2688    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
   2689 
   2690    /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT
   2691     * flag and submit a header together with the sampler message as required
   2692     * by the hardware.
   2693     */
   2694    invalidate_live_intervals();
   2695    return true;
   2696 }
   2697 
   2698 bool
   2699 fs_visitor::opt_register_renaming()
   2700 {
   2701    bool progress = false;
   2702    int depth = 0;
   2703 
   2704    int remap[alloc.count];
   2705    memset(remap, -1, sizeof(int) * alloc.count);
   2706 
   2707    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   2708       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
   2709          depth++;
   2710       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
   2711                  inst->opcode == BRW_OPCODE_WHILE) {
   2712          depth--;
   2713       }
   2714 
   2715       /* Rewrite instruction sources. */
   2716       for (int i = 0; i < inst->sources; i++) {
   2717          if (inst->src[i].file == VGRF &&
   2718              remap[inst->src[i].nr] != -1 &&
   2719              remap[inst->src[i].nr] != inst->src[i].nr) {
   2720             inst->src[i].nr = remap[inst->src[i].nr];
   2721             progress = true;
   2722          }
   2723       }
   2724 
   2725       const int dst = inst->dst.nr;
   2726 
   2727       if (depth == 0 &&
   2728           inst->dst.file == VGRF &&
   2729           alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
   2730           !inst->is_partial_write()) {
   2731          if (remap[dst] == -1) {
   2732             remap[dst] = dst;
   2733          } else {
   2734             remap[dst] = alloc.allocate(regs_written(inst));
   2735             inst->dst.nr = remap[dst];
   2736             progress = true;
   2737          }
   2738       } else if (inst->dst.file == VGRF &&
   2739                  remap[dst] != -1 &&
   2740                  remap[dst] != dst) {
   2741          inst->dst.nr = remap[dst];
   2742          progress = true;
   2743       }
   2744    }
   2745 
   2746    if (progress) {
   2747       invalidate_live_intervals();
   2748 
   2749       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
   2750          if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
   2751             delta_xy[i].nr = remap[delta_xy[i].nr];
   2752          }
   2753       }
   2754    }
   2755 
   2756    return progress;
   2757 }
   2758 
   2759 /**
   2760  * Remove redundant or useless discard jumps.
   2761  *
   2762  * For example, we can eliminate jumps in the following sequence:
   2763  *
   2764  * discard-jump       (redundant with the next jump)
   2765  * discard-jump       (useless; jumps to the next instruction)
   2766  * placeholder-halt
   2767  */
   2768 bool
   2769 fs_visitor::opt_redundant_discard_jumps()
   2770 {
   2771    bool progress = false;
   2772 
   2773    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
   2774 
   2775    fs_inst *placeholder_halt = NULL;
   2776    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
   2777       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
   2778          placeholder_halt = inst;
   2779          break;
   2780       }
   2781    }
   2782 
   2783    if (!placeholder_halt)
   2784       return false;
   2785 
   2786    /* Delete any HALTs immediately before the placeholder halt. */
   2787    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
   2788         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
   2789         prev = (fs_inst *) placeholder_halt->prev) {
   2790       prev->remove(last_bblock);
   2791       progress = true;
   2792    }
   2793 
   2794    if (progress)
   2795       invalidate_live_intervals();
   2796 
   2797    return progress;
   2798 }
   2799 
   2800 /**
   2801  * Compute a bitmask with GRF granularity with a bit set for each GRF starting
   2802  * from \p r.offset which overlaps the region starting at \p s.offset and
   2803  * spanning \p ds bytes.
   2804  */
   2805 static inline unsigned
   2806 mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
   2807 {
   2808    const int rel_offset = reg_offset(s) - reg_offset(r);
   2809    const int shift = rel_offset / REG_SIZE;
   2810    const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
   2811    assert(reg_space(r) == reg_space(s) &&
   2812           shift >= 0 && shift < int(8 * sizeof(unsigned)));
   2813    return ((1 << n) - 1) << shift;
   2814 }
   2815 
   2816 bool
   2817 fs_visitor::compute_to_mrf()
   2818 {
   2819    bool progress = false;
   2820    int next_ip = 0;
   2821 
   2822    /* No MRFs on Gen >= 7. */
   2823    if (devinfo->gen >= 7)
   2824       return false;
   2825 
   2826    calculate_live_intervals();
   2827 
   2828    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   2829       int ip = next_ip;
   2830       next_ip++;
   2831 
   2832       if (inst->opcode != BRW_OPCODE_MOV ||
   2833 	  inst->is_partial_write() ||
   2834 	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
   2835 	  inst->dst.type != inst->src[0].type ||
   2836 	  inst->src[0].abs || inst->src[0].negate ||
   2837           !inst->src[0].is_contiguous() ||
   2838           inst->src[0].offset % REG_SIZE != 0)
   2839 	 continue;
   2840 
   2841       /* Can't compute-to-MRF this GRF if someone else was going to
   2842        * read it later.
   2843        */
   2844       if (this->virtual_grf_end[inst->src[0].nr] > ip)
   2845 	 continue;
   2846 
   2847       /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
   2848        * things that computed the value of all GRFs of the source region.  The
   2849        * regs_left bitset keeps track of the registers we haven't yet found a
   2850        * generating instruction for.
   2851        */
   2852       unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
   2853 
   2854       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
   2855          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
   2856                              inst->src[0], inst->size_read(0))) {
   2857 	    /* Found the last thing to write our reg we want to turn
   2858 	     * into a compute-to-MRF.
   2859 	     */
   2860 
   2861 	    /* If this one instruction didn't populate all the
   2862 	     * channels, bail.  We might be able to rewrite everything
   2863 	     * that writes that reg, but it would require smarter
   2864 	     * tracking.
   2865 	     */
   2866 	    if (scan_inst->is_partial_write())
   2867 	       break;
   2868 
   2869             /* Handling things not fully contained in the source of the copy
   2870              * would need us to understand coalescing out more than one MOV at
   2871              * a time.
   2872              */
   2873             if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
   2874                                      inst->src[0], inst->size_read(0)))
   2875                break;
   2876 
   2877 	    /* SEND instructions can't have MRF as a destination. */
   2878 	    if (scan_inst->mlen)
   2879 	       break;
   2880 
   2881 	    if (devinfo->gen == 6) {
   2882 	       /* gen6 math instructions must have the destination be
   2883 		* GRF, so no compute-to-MRF for them.
   2884 		*/
   2885 	       if (scan_inst->is_math()) {
   2886 		  break;
   2887 	       }
   2888 	    }
   2889 
   2890             /* Clear the bits for any registers this instruction overwrites. */
   2891             regs_left &= ~mask_relative_to(
   2892                inst->src[0], scan_inst->dst, scan_inst->size_written);
   2893             if (!regs_left)
   2894                break;
   2895 	 }
   2896 
   2897 	 /* We don't handle control flow here.  Most computation of
   2898 	  * values that end up in MRFs are shortly before the MRF
   2899 	  * write anyway.
   2900 	  */
   2901 	 if (block->start() == scan_inst)
   2902 	    break;
   2903 
   2904 	 /* You can't read from an MRF, so if someone else reads our
   2905 	  * MRF's source GRF that we wanted to rewrite, that stops us.
   2906 	  */
   2907 	 bool interfered = false;
   2908 	 for (int i = 0; i < scan_inst->sources; i++) {
   2909             if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
   2910                                 inst->src[0], inst->size_read(0))) {
   2911 	       interfered = true;
   2912 	    }
   2913 	 }
   2914 	 if (interfered)
   2915 	    break;
   2916 
   2917          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
   2918                              inst->dst, inst->size_written)) {
   2919 	    /* If somebody else writes our MRF here, we can't
   2920 	     * compute-to-MRF before that.
   2921 	     */
   2922             break;
   2923          }
   2924 
   2925          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
   2926              regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
   2927                              inst->dst, inst->size_written)) {
   2928 	    /* Found a SEND instruction, which means that there are
   2929 	     * live values in MRFs from base_mrf to base_mrf +
   2930 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
   2931 	     * above it.
   2932 	     */
   2933             break;
   2934          }
   2935       }
   2936 
   2937       if (regs_left)
   2938          continue;
   2939 
   2940       /* Found all generating instructions of our MRF's source value, so it
   2941        * should be safe to rewrite them to point to the MRF directly.
   2942        */
   2943       regs_left = (1 << regs_read(inst, 0)) - 1;
   2944 
   2945       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
   2946          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
   2947                              inst->src[0], inst->size_read(0))) {
   2948             /* Clear the bits for any registers this instruction overwrites. */
   2949             regs_left &= ~mask_relative_to(
   2950                inst->src[0], scan_inst->dst, scan_inst->size_written);
   2951 
   2952             const unsigned rel_offset = reg_offset(scan_inst->dst) -
   2953                                         reg_offset(inst->src[0]);
   2954 
   2955             if (inst->dst.nr & BRW_MRF_COMPR4) {
   2956                /* Apply the same address transformation done by the hardware
   2957                 * for COMPR4 MRF writes.
   2958                 */
   2959                assert(rel_offset < 2 * REG_SIZE);
   2960                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
   2961 
   2962                /* Clear the COMPR4 bit if the generating instruction is not
   2963                 * compressed.
   2964                 */
   2965                if (scan_inst->size_written < 2 * REG_SIZE)
   2966                   scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
   2967 
   2968             } else {
   2969                /* Calculate the MRF number the result of this instruction is
   2970                 * ultimately written to.
   2971                 */
   2972                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
   2973             }
   2974 
   2975             scan_inst->dst.file = MRF;
   2976             scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
   2977             scan_inst->saturate |= inst->saturate;
   2978             if (!regs_left)
   2979                break;
   2980          }
   2981       }
   2982 
   2983       assert(!regs_left);
   2984       inst->remove(block);
   2985       progress = true;
   2986    }
   2987 
   2988    if (progress)
   2989       invalidate_live_intervals();
   2990 
   2991    return progress;
   2992 }
   2993 
   2994 /**
   2995  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
   2996  * flow.  We could probably do better here with some form of divergence
   2997  * analysis.
   2998  */
   2999 bool
   3000 fs_visitor::eliminate_find_live_channel()
   3001 {
   3002    bool progress = false;
   3003    unsigned depth = 0;
   3004 
   3005    if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
   3006       /* The optimization below assumes that channel zero is live on thread
   3007        * dispatch, which may not be the case if the fixed function dispatches
   3008        * threads sparsely.
   3009        */
   3010       return false;
   3011    }
   3012 
   3013    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   3014       switch (inst->opcode) {
   3015       case BRW_OPCODE_IF:
   3016       case BRW_OPCODE_DO:
   3017          depth++;
   3018          break;
   3019 
   3020       case BRW_OPCODE_ENDIF:
   3021       case BRW_OPCODE_WHILE:
   3022          depth--;
   3023          break;
   3024 
   3025       case FS_OPCODE_DISCARD_JUMP:
   3026          /* This can potentially make control flow non-uniform until the end
   3027           * of the program.
   3028           */
   3029          return progress;
   3030 
   3031       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   3032          if (depth == 0) {
   3033             inst->opcode = BRW_OPCODE_MOV;
   3034             inst->src[0] = brw_imm_ud(0u);
   3035             inst->sources = 1;
   3036             inst->force_writemask_all = true;
   3037             progress = true;
   3038          }
   3039          break;
   3040 
   3041       default:
   3042          break;
   3043       }
   3044    }
   3045 
   3046    return progress;
   3047 }
   3048 
   3049 /**
   3050  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
   3051  * instructions to FS_OPCODE_REP_FB_WRITE.
   3052  */
   3053 void
   3054 fs_visitor::emit_repclear_shader()
   3055 {
   3056    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   3057    int base_mrf = 0;
   3058    int color_mrf = base_mrf + 2;
   3059    fs_inst *mov;
   3060 
   3061    if (uniforms > 0) {
   3062       mov = bld.exec_all().group(4, 0)
   3063                .MOV(brw_message_reg(color_mrf),
   3064                     fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
   3065    } else {
   3066       struct brw_reg reg =
   3067          brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
   3068                  BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
   3069                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   3070 
   3071       mov = bld.exec_all().group(4, 0)
   3072                .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
   3073    }
   3074 
   3075    fs_inst *write;
   3076    if (key->nr_color_regions == 1) {
   3077       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
   3078       write->saturate = key->clamp_fragment_color;
   3079       write->base_mrf = color_mrf;
   3080       write->target = 0;
   3081       write->header_size = 0;
   3082       write->mlen = 1;
   3083    } else {
   3084       assume(key->nr_color_regions > 0);
   3085       for (int i = 0; i < key->nr_color_regions; ++i) {
   3086          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
   3087          write->saturate = key->clamp_fragment_color;
   3088          write->base_mrf = base_mrf;
   3089          write->target = i;
   3090          write->header_size = 2;
   3091          write->mlen = 3;
   3092       }
   3093    }
   3094    write->eot = true;
   3095 
   3096    calculate_cfg();
   3097 
   3098    assign_constant_locations();
   3099    assign_curb_setup();
   3100 
   3101    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   3102    if (uniforms > 0) {
   3103       assert(mov->src[0].file == FIXED_GRF);
   3104       mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
   3105    }
   3106 }
   3107 
   3108 /**
   3109  * Walks through basic blocks, looking for repeated MRF writes and
   3110  * removing the later ones.
   3111  */
   3112 bool
   3113 fs_visitor::remove_duplicate_mrf_writes()
   3114 {
   3115    fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
   3116    bool progress = false;
   3117 
   3118    /* Need to update the MRF tracking for compressed instructions. */
   3119    if (dispatch_width >= 16)
   3120       return false;
   3121 
   3122    memset(last_mrf_move, 0, sizeof(last_mrf_move));
   3123 
   3124    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   3125       if (inst->is_control_flow()) {
   3126 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
   3127       }
   3128 
   3129       if (inst->opcode == BRW_OPCODE_MOV &&
   3130 	  inst->dst.file == MRF) {
   3131          fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
   3132 	 if (prev_inst && inst->equals(prev_inst)) {
   3133 	    inst->remove(block);
   3134 	    progress = true;
   3135 	    continue;
   3136 	 }
   3137       }
   3138 
   3139       /* Clear out the last-write records for MRFs that were overwritten. */
   3140       if (inst->dst.file == MRF) {
   3141          last_mrf_move[inst->dst.nr] = NULL;
   3142       }
   3143 
   3144       if (inst->mlen > 0 && inst->base_mrf != -1) {
   3145 	 /* Found a SEND instruction, which will include two or fewer
   3146 	  * implied MRF writes.  We could do better here.
   3147 	  */
   3148 	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
   3149 	    last_mrf_move[inst->base_mrf + i] = NULL;
   3150 	 }
   3151       }
   3152 
   3153       /* Clear out any MRF move records whose sources got overwritten. */
   3154       for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
   3155          if (last_mrf_move[i] &&
   3156              regions_overlap(inst->dst, inst->size_written,
   3157                              last_mrf_move[i]->src[0],
   3158                              last_mrf_move[i]->size_read(0))) {
   3159             last_mrf_move[i] = NULL;
   3160          }
   3161       }
   3162 
   3163       if (inst->opcode == BRW_OPCODE_MOV &&
   3164 	  inst->dst.file == MRF &&
   3165 	  inst->src[0].file != ARF &&
   3166 	  !inst->is_partial_write()) {
   3167          last_mrf_move[inst->dst.nr] = inst;
   3168       }
   3169    }
   3170 
   3171    if (progress)
   3172       invalidate_live_intervals();
   3173 
   3174    return progress;
   3175 }
   3176 
   3177 /**
   3178  * Rounding modes for conversion instructions are included for each
   3179  * conversion, but right now it is a state. So once it is set,
   3180  * we don't need to call it again for subsequent calls.
   3181  *
   3182  * This is useful for vector/matrices conversions, as setting the
   3183  * mode once is enough for the full vector/matrix
   3184  */
   3185 bool
   3186 fs_visitor::remove_extra_rounding_modes()
   3187 {
   3188    bool progress = false;
   3189 
   3190    foreach_block (block, cfg) {
   3191       brw_rnd_mode prev_mode = BRW_RND_MODE_UNSPECIFIED;
   3192 
   3193       foreach_inst_in_block_safe (fs_inst, inst, block) {
   3194          if (inst->opcode == SHADER_OPCODE_RND_MODE) {
   3195             assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
   3196             const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
   3197             if (mode == prev_mode) {
   3198                inst->remove(block);
   3199                progress = true;
   3200             } else {
   3201                prev_mode = mode;
   3202             }
   3203          }
   3204       }
   3205    }
   3206 
   3207    if (progress)
   3208       invalidate_live_intervals();
   3209 
   3210    return progress;
   3211 }
   3212 
   3213 static void
   3214 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
   3215 {
   3216    /* Clear the flag for registers that actually got read (as expected). */
   3217    for (int i = 0; i < inst->sources; i++) {
   3218       int grf;
   3219       if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
   3220          grf = inst->src[i].nr;
   3221       } else {
   3222          continue;
   3223       }
   3224 
   3225       if (grf >= first_grf &&
   3226           grf < first_grf + grf_len) {
   3227          deps[grf - first_grf] = false;
   3228          if (inst->exec_size == 16)
   3229             deps[grf - first_grf + 1] = false;
   3230       }
   3231    }
   3232 }
   3233 
   3234 /**
   3235  * Implements this workaround for the original 965:
   3236  *
   3237  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
   3238  *      check for post destination dependencies on this instruction, software
   3239  *      must ensure that there is no destination hazard for the case of write
   3240  *      followed by a posted write shown in the following example.
   3241  *
   3242  *      1. mov r3 0
   3243  *      2. send r3.xy <rest of send instruction>
   3244  *      3. mov r2 r3
   3245  *
   3246  *      Due to no post-destination dependency check on the send, the above
   3247  *      code sequence could have two instructions (1 and 2) in flight at the
   3248  *      same time that both consider r3 as the target of their final writes.
   3249  */
   3250 void
   3251 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
   3252                                                         fs_inst *inst)
   3253 {
   3254    int write_len = regs_written(inst);
   3255    int first_write_grf = inst->dst.nr;
   3256    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   3257    assert(write_len < (int)sizeof(needs_dep) - 1);
   3258 
   3259    memset(needs_dep, false, sizeof(needs_dep));
   3260    memset(needs_dep, true, write_len);
   3261 
   3262    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
   3263 
   3264    /* Walk backwards looking for writes to registers we're writing which
   3265     * aren't read since being written.  If we hit the start of the program,
   3266     * we assume that there are no outstanding dependencies on entry to the
   3267     * program.
   3268     */
   3269    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
   3270       /* If we hit control flow, assume that there *are* outstanding
   3271        * dependencies, and force their cleanup before our instruction.
   3272        */
   3273       if (block->start() == scan_inst && block->num != 0) {
   3274          for (int i = 0; i < write_len; i++) {
   3275             if (needs_dep[i])
   3276                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
   3277                                first_write_grf + i);
   3278          }
   3279          return;
   3280       }
   3281 
   3282       /* We insert our reads as late as possible on the assumption that any
   3283        * instruction but a MOV that might have left us an outstanding
   3284        * dependency has more latency than a MOV.
   3285        */
   3286       if (scan_inst->dst.file == VGRF) {
   3287          for (unsigned i = 0; i < regs_written(scan_inst); i++) {
   3288             int reg = scan_inst->dst.nr + i;
   3289 
   3290             if (reg >= first_write_grf &&
   3291                 reg < first_write_grf + write_len &&
   3292                 needs_dep[reg - first_write_grf]) {
   3293                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
   3294                needs_dep[reg - first_write_grf] = false;
   3295                if (scan_inst->exec_size == 16)
   3296                   needs_dep[reg - first_write_grf + 1] = false;
   3297             }
   3298          }
   3299       }
   3300 
   3301       /* Clear the flag for registers that actually got read (as expected). */
   3302       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
   3303 
   3304       /* Continue the loop only if we haven't resolved all the dependencies */
   3305       int i;
   3306       for (i = 0; i < write_len; i++) {
   3307          if (needs_dep[i])
   3308             break;
   3309       }
   3310       if (i == write_len)
   3311          return;
   3312    }
   3313 }
   3314 
   3315 /**
   3316  * Implements this workaround for the original 965:
   3317  *
   3318  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
   3319  *      used as a destination register until after it has been sourced by an
   3320  *      instruction with a different destination register.
   3321  */
   3322 void
   3323 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
   3324 {
   3325    int write_len = regs_written(inst);
   3326    int first_write_grf = inst->dst.nr;
   3327    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   3328    assert(write_len < (int)sizeof(needs_dep) - 1);
   3329 
   3330    memset(needs_dep, false, sizeof(needs_dep));
   3331    memset(needs_dep, true, write_len);
   3332    /* Walk forwards looking for writes to registers we're writing which aren't
   3333     * read before being written.
   3334     */
   3335    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
   3336       /* If we hit control flow, force resolve all remaining dependencies. */
   3337       if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
   3338          for (int i = 0; i < write_len; i++) {
   3339             if (needs_dep[i])
   3340                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
   3341                                first_write_grf + i);
   3342          }
   3343          return;
   3344       }
   3345 
   3346       /* Clear the flag for registers that actually got read (as expected). */
   3347       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
   3348 
   3349       /* We insert our reads as late as possible since they're reading the
   3350        * result of a SEND, which has massive latency.
   3351        */
   3352       if (scan_inst->dst.file == VGRF &&
   3353           scan_inst->dst.nr >= first_write_grf &&
   3354           scan_inst->dst.nr < first_write_grf + write_len &&
   3355           needs_dep[scan_inst->dst.nr - first_write_grf]) {
   3356          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
   3357                          scan_inst->dst.nr);
   3358          needs_dep[scan_inst->dst.nr - first_write_grf] = false;
   3359       }
   3360 
   3361       /* Continue the loop only if we haven't resolved all the dependencies */
   3362       int i;
   3363       for (i = 0; i < write_len; i++) {
   3364          if (needs_dep[i])
   3365             break;
   3366       }
   3367       if (i == write_len)
   3368          return;
   3369    }
   3370 }
   3371 
   3372 void
   3373 fs_visitor::insert_gen4_send_dependency_workarounds()
   3374 {
   3375    if (devinfo->gen != 4 || devinfo->is_g4x)
   3376       return;
   3377 
   3378    bool progress = false;
   3379 
   3380    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   3381       if (inst->mlen != 0 && inst->dst.file == VGRF) {
   3382          insert_gen4_pre_send_dependency_workarounds(block, inst);
   3383          insert_gen4_post_send_dependency_workarounds(block, inst);
   3384          progress = true;
   3385       }
   3386    }
   3387 
   3388    if (progress)
   3389       invalidate_live_intervals();
   3390 }
   3391 
   3392 /**
   3393  * Turns the generic expression-style uniform pull constant load instruction
   3394  * into a hardware-specific series of instructions for loading a pull
   3395  * constant.
   3396  *
   3397  * The expression style allows the CSE pass before this to optimize out
   3398  * repeated loads from the same offset, and gives the pre-register-allocation
   3399  * scheduling full flexibility, while the conversion to native instructions
   3400  * allows the post-register-allocation scheduler the best information
   3401  * possible.
   3402  *
   3403  * Note that execution masking for setting up pull constant loads is special:
   3404  * the channels that need to be written are unrelated to the current execution
   3405  * mask, since a later instruction will use one of the result channels as a
   3406  * source operand for all 8 or 16 of its channels.
   3407  */
   3408 void
   3409 fs_visitor::lower_uniform_pull_constant_loads()
   3410 {
   3411    foreach_block_and_inst (block, fs_inst, inst, cfg) {
   3412       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
   3413          continue;
   3414 
   3415       if (devinfo->gen >= 7) {
   3416          const fs_builder ubld = fs_builder(this, block, inst).exec_all();
   3417          const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
   3418 
   3419          ubld.group(8, 0).MOV(payload,
   3420                               retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   3421          ubld.group(1, 0).MOV(component(payload, 2),
   3422                               brw_imm_ud(inst->src[1].ud / 16));
   3423 
   3424          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
   3425          inst->src[1] = payload;
   3426          inst->header_size = 1;
   3427          inst->mlen = 1;
   3428 
   3429          invalidate_live_intervals();
   3430       } else {
   3431          /* Before register allocation, we didn't tell the scheduler about the
   3432           * MRF we use.  We know it's safe to use this MRF because nothing
   3433           * else does except for register spill/unspill, which generates and
   3434           * uses its MRF within a single IR instruction.
   3435           */
   3436          inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
   3437          inst->mlen = 1;
   3438       }
   3439    }
   3440 }
   3441 
   3442 bool
   3443 fs_visitor::lower_load_payload()
   3444 {
   3445    bool progress = false;
   3446 
   3447    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   3448       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
   3449          continue;
   3450 
   3451       assert(inst->dst.file == MRF || inst->dst.file == VGRF);
   3452       assert(inst->saturate == false);
   3453       fs_reg dst = inst->dst;
   3454 
   3455       /* Get rid of COMPR4.  We'll add it back in if we need it */
   3456       if (dst.file == MRF)
   3457          dst.nr = dst.nr & ~BRW_MRF_COMPR4;
   3458 
   3459       const fs_builder ibld(this, block, inst);
   3460       const fs_builder hbld = ibld.exec_all().group(8, 0);
   3461 
   3462       for (uint8_t i = 0; i < inst->header_size; i++) {
   3463          if (inst->src[i].file != BAD_FILE) {
   3464             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
   3465             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
   3466             hbld.MOV(mov_dst, mov_src);
   3467          }
   3468          dst = offset(dst, hbld, 1);
   3469       }
   3470 
   3471       if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
   3472           inst->exec_size > 8) {
   3473          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
   3474           * a straightforward copy.  Instead, the result of the
   3475           * LOAD_PAYLOAD is treated as interleaved and the first four
   3476           * non-header sources are unpacked as:
   3477           *
   3478           * m + 0: r0
   3479           * m + 1: g0
   3480           * m + 2: b0
   3481           * m + 3: a0
   3482           * m + 4: r1
   3483           * m + 5: g1
   3484           * m + 6: b1
   3485           * m + 7: a1
   3486           *
   3487           * This is used for gen <= 5 fb writes.
   3488           */
   3489          assert(inst->exec_size == 16);
   3490          assert(inst->header_size + 4 <= inst->sources);
   3491          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
   3492             if (inst->src[i].file != BAD_FILE) {
   3493                if (devinfo->has_compr4) {
   3494                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
   3495                   compr4_dst.nr |= BRW_MRF_COMPR4;
   3496                   ibld.MOV(compr4_dst, inst->src[i]);
   3497                } else {
   3498                   /* Platform doesn't have COMPR4.  We have to fake it */
   3499                   fs_reg mov_dst = retype(dst, inst->src[i].type);
   3500                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
   3501                   mov_dst.nr += 4;
   3502                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
   3503                }
   3504             }
   3505 
   3506             dst.nr++;
   3507          }
   3508 
   3509          /* The loop above only ever incremented us through the first set
   3510           * of 4 registers.  However, thanks to the magic of COMPR4, we
   3511           * actually wrote to the first 8 registers, so we need to take
   3512           * that into account now.
   3513           */
   3514          dst.nr += 4;
   3515 
   3516          /* The COMPR4 code took care of the first 4 sources.  We'll let
   3517           * the regular path handle any remaining sources.  Yes, we are
   3518           * modifying the instruction but we're about to delete it so
   3519           * this really doesn't hurt anything.
   3520           */
   3521          inst->header_size += 4;
   3522       }
   3523 
   3524       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
   3525          if (inst->src[i].file != BAD_FILE)
   3526             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
   3527          dst = offset(dst, ibld, 1);
   3528       }
   3529 
   3530       inst->remove(block);
   3531       progress = true;
   3532    }
   3533 
   3534    if (progress)
   3535       invalidate_live_intervals();
   3536 
   3537    return progress;
   3538 }
   3539 
   3540 bool
   3541 fs_visitor::lower_integer_multiplication()
   3542 {
   3543    bool progress = false;
   3544 
   3545    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   3546       const fs_builder ibld(this, block, inst);
   3547 
   3548       if (inst->opcode == BRW_OPCODE_MUL) {
   3549          if (inst->dst.is_accumulator() ||
   3550              (inst->dst.type != BRW_REGISTER_TYPE_D &&
   3551               inst->dst.type != BRW_REGISTER_TYPE_UD))
   3552             continue;
   3553 
   3554          /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
   3555           * operation directly, but CHV/BXT cannot.
   3556           */
   3557          if (devinfo->gen >= 8 &&
   3558              !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo))
   3559             continue;
   3560 
   3561          if (inst->src[1].file == IMM &&
   3562              inst->src[1].ud < (1 << 16)) {
   3563             /* The MUL instruction isn't commutative. On Gen <= 6, only the low
   3564              * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
   3565              * src1 are used.
   3566              *
   3567              * If multiplying by an immediate value that fits in 16-bits, do a
   3568              * single MUL instruction with that value in the proper location.
   3569              */
   3570             if (devinfo->gen < 7) {
   3571                fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
   3572                           inst->dst.type);
   3573                ibld.MOV(imm, inst->src[1]);
   3574                ibld.MUL(inst->dst, imm, inst->src[0]);
   3575             } else {
   3576                const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
   3577                ibld.MUL(inst->dst, inst->src[0],
   3578                         ud ? brw_imm_uw(inst->src[1].ud)
   3579                            : brw_imm_w(inst->src[1].d));
   3580             }
   3581          } else {
   3582             /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
   3583              * do 32-bit integer multiplication in one instruction, but instead
   3584              * must do a sequence (which actually calculates a 64-bit result):
   3585              *
   3586              *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
   3587              *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
   3588              *    mov(8)  g2<1>D     acc0<8,8,1>D
   3589              *
   3590              * But on Gen > 6, the ability to use second accumulator register
   3591              * (acc1) for non-float data types was removed, preventing a simple
   3592              * implementation in SIMD16. A 16-channel result can be calculated by
   3593              * executing the three instructions twice in SIMD8, once with quarter
   3594              * control of 1Q for the first eight channels and again with 2Q for
   3595              * the second eight channels.
   3596              *
   3597              * Which accumulator register is implicitly accessed (by AccWrEnable
   3598              * for instance) is determined by the quarter control. Unfortunately
   3599              * Ivybridge (and presumably Baytrail) has a hardware bug in which an
   3600              * implicit accumulator access by an instruction with 2Q will access
   3601              * acc1 regardless of whether the data type is usable in acc1.
   3602              *
   3603              * Specifically, the 2Q mach(8) writes acc1 which does not exist for
   3604              * integer data types.
   3605              *
   3606              * Since we only want the low 32-bits of the result, we can do two
   3607              * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
   3608              * adjust the high result and add them (like the mach is doing):
   3609              *
   3610              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
   3611              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
   3612              *    shl(8)  g9<1>D     g8<8,8,1>D      16D
   3613              *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
   3614              *
   3615              * We avoid the shl instruction by realizing that we only want to add
   3616              * the low 16-bits of the "high" result to the high 16-bits of the
   3617              * "low" result and using proper regioning on the add:
   3618              *
   3619              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
   3620              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
   3621              *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
   3622              *
   3623              * Since it does not use the (single) accumulator register, we can
   3624              * schedule multi-component multiplications much better.
   3625              */
   3626 
   3627             bool needs_mov = false;
   3628             fs_reg orig_dst = inst->dst;
   3629             fs_reg low = inst->dst;
   3630             if (orig_dst.is_null() || orig_dst.file == MRF ||
   3631                 regions_overlap(inst->dst, inst->size_written,
   3632                                 inst->src[0], inst->size_read(0)) ||
   3633                 regions_overlap(inst->dst, inst->size_written,
   3634                                 inst->src[1], inst->size_read(1))) {
   3635                needs_mov = true;
   3636                /* Get a new VGRF but keep the same stride as inst->dst */
   3637                low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
   3638                             inst->dst.type);
   3639                low.stride = inst->dst.stride;
   3640                low.offset = inst->dst.offset % REG_SIZE;
   3641             }
   3642 
   3643             /* Get a new VGRF but keep the same stride as inst->dst */
   3644             fs_reg high(VGRF, alloc.allocate(regs_written(inst)),
   3645                         inst->dst.type);
   3646             high.stride = inst->dst.stride;
   3647             high.offset = inst->dst.offset % REG_SIZE;
   3648 
   3649             if (devinfo->gen >= 7) {
   3650                if (inst->src[1].file == IMM) {
   3651                   ibld.MUL(low, inst->src[0],
   3652                            brw_imm_uw(inst->src[1].ud & 0xffff));
   3653                   ibld.MUL(high, inst->src[0],
   3654                            brw_imm_uw(inst->src[1].ud >> 16));
   3655                } else {
   3656                   ibld.MUL(low, inst->src[0],
   3657                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
   3658                   ibld.MUL(high, inst->src[0],
   3659                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
   3660                }
   3661             } else {
   3662                ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
   3663                         inst->src[1]);
   3664                ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
   3665                         inst->src[1]);
   3666             }
   3667 
   3668             ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
   3669                      subscript(low, BRW_REGISTER_TYPE_UW, 1),
   3670                      subscript(high, BRW_REGISTER_TYPE_UW, 0));
   3671 
   3672             if (needs_mov || inst->conditional_mod) {
   3673                set_condmod(inst->conditional_mod,
   3674                            ibld.MOV(orig_dst, low));
   3675             }
   3676          }
   3677 
   3678       } else if (inst->opcode == SHADER_OPCODE_MULH) {
   3679          /* Should have been lowered to 8-wide. */
   3680          assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
   3681          const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
   3682                                    inst->dst.type);
   3683          fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
   3684          fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
   3685 
   3686          if (devinfo->gen >= 8) {
   3687             /* Until Gen8, integer multiplies read 32-bits from one source,
   3688              * and 16-bits from the other, and relying on the MACH instruction
   3689              * to generate the high bits of the result.
   3690              *
   3691              * On Gen8, the multiply instruction does a full 32x32-bit
   3692              * multiply, but in order to do a 64-bit multiply we can simulate
   3693              * the previous behavior and then use a MACH instruction.
   3694              *
   3695              * FINISHME: Don't use source modifiers on src1.
   3696              */
   3697             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
   3698                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
   3699             mul->src[1].type = BRW_REGISTER_TYPE_UW;
   3700             mul->src[1].stride *= 2;
   3701 
   3702          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
   3703                     inst->group > 0) {
   3704             /* Among other things the quarter control bits influence which
   3705              * accumulator register is used by the hardware for instructions
   3706              * that access the accumulator implicitly (e.g. MACH).  A
   3707              * second-half instruction would normally map to acc1, which
   3708              * doesn't exist on Gen7 and up (the hardware does emulate it for
   3709              * floating-point instructions *only* by taking advantage of the
   3710              * extra precision of acc0 not normally used for floating point
   3711              * arithmetic).
   3712              *
   3713              * HSW and up are careful enough not to try to access an
   3714              * accumulator register that doesn't exist, but on earlier Gen7
   3715              * hardware we need to make sure that the quarter control bits are
   3716              * zero to avoid non-deterministic behaviour and emit an extra MOV
   3717              * to get the result masked correctly according to the current
   3718              * channel enables.
   3719              */
   3720             mach->group = 0;
   3721             mach->force_writemask_all = true;
   3722             mach->dst = ibld.vgrf(inst->dst.type);
   3723             ibld.MOV(inst->dst, mach->dst);
   3724          }
   3725       } else {
   3726          continue;
   3727       }
   3728 
   3729       inst->remove(block);
   3730       progress = true;
   3731    }
   3732 
   3733    if (progress)
   3734       invalidate_live_intervals();
   3735 
   3736    return progress;
   3737 }
   3738 
   3739 bool
   3740 fs_visitor::lower_minmax()
   3741 {
   3742    assert(devinfo->gen < 6);
   3743 
   3744    bool progress = false;
   3745 
   3746    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   3747       const fs_builder ibld(this, block, inst);
   3748 
   3749       if (inst->opcode == BRW_OPCODE_SEL &&
   3750           inst->predicate == BRW_PREDICATE_NONE) {
   3751          /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
   3752           *        the original SEL.L/GE instruction
   3753           */
   3754          ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
   3755                   inst->conditional_mod);
   3756          inst->predicate = BRW_PREDICATE_NORMAL;
   3757          inst->conditional_mod = BRW_CONDITIONAL_NONE;
   3758 
   3759          progress = true;
   3760       }
   3761    }
   3762 
   3763    if (progress)
   3764       invalidate_live_intervals();
   3765 
   3766    return progress;
   3767 }
   3768 
   3769 static void
   3770 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
   3771                     fs_reg *dst, fs_reg color, unsigned components)
   3772 {
   3773    if (key->clamp_fragment_color) {
   3774       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
   3775       assert(color.type == BRW_REGISTER_TYPE_F);
   3776 
   3777       for (unsigned i = 0; i < components; i++)
   3778          set_saturate(true,
   3779                       bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
   3780 
   3781       color = tmp;
   3782    }
   3783 
   3784    for (unsigned i = 0; i < components; i++)
   3785       dst[i] = offset(color, bld, i);
   3786 }
   3787 
   3788 static void
   3789 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
   3790                             const struct brw_wm_prog_data *prog_data,
   3791                             const brw_wm_prog_key *key,
   3792                             const fs_visitor::thread_payload &payload)
   3793 {
   3794    assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   3795    const gen_device_info *devinfo = bld.shader->devinfo;
   3796    const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   3797    const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   3798    const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   3799    const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   3800    const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   3801    const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   3802    fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   3803    const unsigned components =
   3804       inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
   3805 
   3806    /* We can potentially have a message length of up to 15, so we have to set
   3807     * base_mrf to either 0 or 1 in order to fit in m0..m15.
   3808     */
   3809    fs_reg sources[15];
   3810    int header_size = 2, payload_header_size;
   3811    unsigned length = 0;
   3812 
   3813    /* From the Sandy Bridge PRM, volume 4, page 198:
   3814     *
   3815     *     "Dispatched Pixel Enables. One bit per pixel indicating
   3816     *      which pixels were originally enabled when the thread was
   3817     *      dispatched. This field is only required for the end-of-
   3818     *      thread message and on all dual-source messages."
   3819     */
   3820    if (devinfo->gen >= 6 &&
   3821        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
   3822        color1.file == BAD_FILE &&
   3823        key->nr_color_regions == 1) {
   3824       header_size = 0;
   3825    }
   3826 
   3827    if (header_size != 0) {
   3828       assert(header_size == 2);
   3829       /* Allocate 2 registers for a header */
   3830       length += 2;
   3831    }
   3832 
   3833    if (payload.aa_dest_stencil_reg) {
   3834       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
   3835       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
   3836          .MOV(sources[length],
   3837               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
   3838       length++;
   3839    }
   3840 
   3841    if (sample_mask.file != BAD_FILE) {
   3842       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
   3843                                BRW_REGISTER_TYPE_UD);
   3844 
   3845       /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
   3846        * relevant.  Since it's unsigned single words one vgrf is always
   3847        * 16-wide, but only the lower or higher 8 channels will be used by the
   3848        * hardware when doing a SIMD8 write depending on whether we have
   3849        * selected the subspans for the first or second half respectively.
   3850        */
   3851       assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
   3852       sample_mask.type = BRW_REGISTER_TYPE_UW;
   3853       sample_mask.stride *= 2;
   3854 
   3855       bld.exec_all().annotate("FB write oMask")
   3856          .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
   3857                            inst->group),
   3858               sample_mask);
   3859       length++;
   3860    }
   3861 
   3862    payload_header_size = length;
   3863 
   3864    if (src0_alpha.file != BAD_FILE) {
   3865       /* FIXME: This is being passed at the wrong location in the payload and
   3866        * doesn't work when gl_SampleMask and MRTs are used simultaneously.
   3867        * It's supposed to be immediately before oMask but there seems to be no
   3868        * reasonable way to pass them in the correct order because LOAD_PAYLOAD
   3869        * requires header sources to form a contiguous segment at the beginning
   3870        * of the message and src0_alpha has per-channel semantics.
   3871        */
   3872       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
   3873       length++;
   3874    } else if (key->replicate_alpha && inst->target != 0) {
   3875       /* Handle the case when fragment shader doesn't write to draw buffer
   3876        * zero. No need to call setup_color_payload() for src0_alpha because
   3877        * alpha value will be undefined.
   3878        */
   3879       length++;
   3880    }
   3881 
   3882    setup_color_payload(bld, key, &sources[length], color0, components);
   3883    length += 4;
   3884 
   3885    if (color1.file != BAD_FILE) {
   3886       setup_color_payload(bld, key, &sources[length], color1, components);
   3887       length += 4;
   3888    }
   3889 
   3890    if (src_depth.file != BAD_FILE) {
   3891       sources[length] = src_depth;
   3892       length++;
   3893    }
   3894 
   3895    if (dst_depth.file != BAD_FILE) {
   3896       sources[length] = dst_depth;
   3897       length++;
   3898    }
   3899 
   3900    if (src_stencil.file != BAD_FILE) {
   3901       assert(devinfo->gen >= 9);
   3902       assert(bld.dispatch_width() != 16);
   3903 
   3904       /* XXX: src_stencil is only available on gen9+. dst_depth is never
   3905        * available on gen9+. As such it's impossible to have both enabled at the
   3906        * same time and therefore length cannot overrun the array.
   3907        */
   3908       assert(length < 15);
   3909 
   3910       sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
   3911       bld.exec_all().annotate("FB write OS")
   3912          .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
   3913               subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
   3914       length++;
   3915    }
   3916 
   3917    fs_inst *load;
   3918    if (devinfo->gen >= 7) {
   3919       /* Send from the GRF */
   3920       fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
   3921       load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
   3922       payload.nr = bld.shader->alloc.allocate(regs_written(load));
   3923       load->dst = payload;
   3924 
   3925       inst->src[0] = payload;
   3926       inst->resize_sources(1);
   3927    } else {
   3928       /* Send from the MRF */
   3929       load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
   3930                               sources, length, payload_header_size);
   3931 
   3932       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
   3933        * will do this for us if we just give it a COMPR4 destination.
   3934        */
   3935       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
   3936          load->dst.nr |= BRW_MRF_COMPR4;
   3937 
   3938       inst->resize_sources(0);
   3939       inst->base_mrf = 1;
   3940    }
   3941 
   3942    inst->opcode = FS_OPCODE_FB_WRITE;
   3943    inst->mlen = regs_written(load);
   3944    inst->header_size = header_size;
   3945 }
   3946 
   3947 static void
   3948 lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
   3949 {
   3950    const fs_builder &ubld = bld.exec_all();
   3951    const unsigned length = 2;
   3952    const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length);
   3953 
   3954    ubld.group(16, 0)
   3955        .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   3956 
   3957    inst->resize_sources(1);
   3958    inst->src[0] = header;
   3959    inst->opcode = FS_OPCODE_FB_READ;
   3960    inst->mlen = length;
   3961    inst->header_size = length;
   3962 }
   3963 
   3964 static void
   3965 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
   3966                                 const fs_reg &coordinate,
   3967                                 const fs_reg &shadow_c,
   3968                                 const fs_reg &lod, const fs_reg &lod2,
   3969                                 const fs_reg &surface,
   3970                                 const fs_reg &sampler,
   3971                                 unsigned coord_components,
   3972                                 unsigned grad_components)
   3973 {
   3974    const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
   3975                          op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
   3976    fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
   3977    fs_reg msg_end = msg_begin;
   3978 
   3979    /* g0 header. */
   3980    msg_end = offset(msg_end, bld.group(8, 0), 1);
   3981 
   3982    for (unsigned i = 0; i < coord_components; i++)
   3983       bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
   3984               offset(coordinate, bld, i));
   3985 
   3986    msg_end = offset(msg_end, bld, coord_components);
   3987 
   3988    /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
   3989     * require all three components to be present and zero if they are unused.
   3990     */
   3991    if (coord_components > 0 &&
   3992        (has_lod || shadow_c.file != BAD_FILE ||
   3993         (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
   3994       for (unsigned i = coord_components; i < 3; i++)
   3995          bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
   3996 
   3997       msg_end = offset(msg_end, bld, 3 - coord_components);
   3998    }
   3999 
   4000    if (op == SHADER_OPCODE_TXD) {
   4001       /* TXD unsupported in SIMD16 mode. */
   4002       assert(bld.dispatch_width() == 8);
   4003 
   4004       /* the slots for u and v are always present, but r is optional */
   4005       if (coord_components < 2)
   4006          msg_end = offset(msg_end, bld, 2 - coord_components);
   4007 
   4008       /*  P   = u, v, r
   4009        * dPdx = dudx, dvdx, drdx
   4010        * dPdy = dudy, dvdy, drdy
   4011        *
   4012        * 1-arg: Does not exist.
   4013        *
   4014        * 2-arg: dudx   dvdx   dudy   dvdy
   4015        *        dPdx.x dPdx.y dPdy.x dPdy.y
   4016        *        m4     m5     m6     m7
   4017        *
   4018        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
   4019        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
   4020        *        m5     m6     m7     m8     m9     m10
   4021        */
   4022       for (unsigned i = 0; i < grad_components; i++)
   4023          bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
   4024 
   4025       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   4026 
   4027       for (unsigned i = 0; i < grad_components; i++)
   4028          bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
   4029 
   4030       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   4031    }
   4032 
   4033    if (has_lod) {
   4034       /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
   4035        * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
   4036        */
   4037       assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
   4038              bld.dispatch_width() == 16);
   4039 
   4040       const brw_reg_type type =
   4041          (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
   4042           BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
   4043       bld.MOV(retype(msg_end, type), lod);
   4044       msg_end = offset(msg_end, bld, 1);
   4045    }
   4046 
   4047    if (shadow_c.file != BAD_FILE) {
   4048       if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
   4049          /* There's no plain shadow compare message, so we use shadow
   4050           * compare with a bias of 0.0.
   4051           */
   4052          bld.MOV(msg_end, brw_imm_f(0.0f));
   4053          msg_end = offset(msg_end, bld, 1);
   4054       }
   4055 
   4056       bld.MOV(msg_end, shadow_c);
   4057       msg_end = offset(msg_end, bld, 1);
   4058    }
   4059 
   4060    inst->opcode = op;
   4061    inst->src[0] = reg_undef;
   4062    inst->src[1] = surface;
   4063    inst->src[2] = sampler;
   4064    inst->resize_sources(3);
   4065    inst->base_mrf = msg_begin.nr;
   4066    inst->mlen = msg_end.nr - msg_begin.nr;
   4067    inst->header_size = 1;
   4068 }
   4069 
   4070 static void
   4071 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
   4072                                 const fs_reg &coordinate,
   4073                                 const fs_reg &shadow_c,
   4074                                 const fs_reg &lod, const fs_reg &lod2,
   4075                                 const fs_reg &sample_index,
   4076                                 const fs_reg &surface,
   4077                                 const fs_reg &sampler,
   4078                                 unsigned coord_components,
   4079                                 unsigned grad_components)
   4080 {
   4081    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   4082    fs_reg msg_coords = message;
   4083    unsigned header_size = 0;
   4084 
   4085    if (inst->offset != 0) {
   4086       /* The offsets set up by the visitor are in the m1 header, so we can't
   4087        * go headerless.
   4088        */
   4089       header_size = 1;
   4090       message.nr--;
   4091    }
   4092 
   4093    for (unsigned i = 0; i < coord_components; i++)
   4094       bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
   4095               offset(coordinate, bld, i));
   4096 
   4097    fs_reg msg_end = offset(msg_coords, bld, coord_components);
   4098    fs_reg msg_lod = offset(msg_coords, bld, 4);
   4099 
   4100    if (shadow_c.file != BAD_FILE) {
   4101       fs_reg msg_shadow = msg_lod;
   4102       bld.MOV(msg_shadow, shadow_c);
   4103       msg_lod = offset(msg_shadow, bld, 1);
   4104       msg_end = msg_lod;
   4105    }
   4106 
   4107    switch (op) {
   4108    case SHADER_OPCODE_TXL:
   4109    case FS_OPCODE_TXB:
   4110       bld.MOV(msg_lod, lod);
   4111       msg_end = offset(msg_lod, bld, 1);
   4112       break;
   4113    case SHADER_OPCODE_TXD:
   4114       /**
   4115        *  P   =  u,    v,    r
   4116        * dPdx = dudx, dvdx, drdx
   4117        * dPdy = dudy, dvdy, drdy
   4118        *
   4119        * Load up these values:
   4120        * - dudx   dudy   dvdx   dvdy   drdx   drdy
   4121        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
   4122        */
   4123       msg_end = msg_lod;
   4124       for (unsigned i = 0; i < grad_components; i++) {
   4125          bld.MOV(msg_end, offset(lod, bld, i));
   4126          msg_end = offset(msg_end, bld, 1);
   4127 
   4128          bld.MOV(msg_end, offset(lod2, bld, i));
   4129          msg_end = offset(msg_end, bld, 1);
   4130       }
   4131       break;
   4132    case SHADER_OPCODE_TXS:
   4133       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
   4134       bld.MOV(msg_lod, lod);
   4135       msg_end = offset(msg_lod, bld, 1);
   4136       break;
   4137    case SHADER_OPCODE_TXF:
   4138       msg_lod = offset(msg_coords, bld, 3);
   4139       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
   4140       msg_end = offset(msg_lod, bld, 1);
   4141       break;
   4142    case SHADER_OPCODE_TXF_CMS:
   4143       msg_lod = offset(msg_coords, bld, 3);
   4144       /* lod */
   4145       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
   4146       /* sample index */
   4147       bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
   4148       msg_end = offset(msg_lod, bld, 2);
   4149       break;
   4150    default:
   4151       break;
   4152    }
   4153 
   4154    inst->opcode = op;
   4155    inst->src[0] = reg_undef;
   4156    inst->src[1] = surface;
   4157    inst->src[2] = sampler;
   4158    inst->resize_sources(3);
   4159    inst->base_mrf = message.nr;
   4160    inst->mlen = msg_end.nr - message.nr;
   4161    inst->header_size = header_size;
   4162 
   4163    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   4164    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
   4165 }
   4166 
   4167 static bool
   4168 is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
   4169 {
   4170    if (devinfo->gen < 8 && !devinfo->is_haswell)
   4171       return false;
   4172 
   4173    return sampler.file != IMM || sampler.ud >= 16;
   4174 }
   4175 
   4176 static void
   4177 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
   4178                                 const fs_reg &coordinate,
   4179                                 const fs_reg &shadow_c,
   4180                                 fs_reg lod, const fs_reg &lod2,
   4181                                 const fs_reg &sample_index,
   4182                                 const fs_reg &mcs,
   4183                                 const fs_reg &surface,
   4184                                 const fs_reg &sampler,
   4185                                 const fs_reg &tg4_offset,
   4186                                 unsigned coord_components,
   4187                                 unsigned grad_components)
   4188 {
   4189    const gen_device_info *devinfo = bld.shader->devinfo;
   4190    unsigned reg_width = bld.dispatch_width() / 8;
   4191    unsigned header_size = 0, length = 0;
   4192    fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   4193    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
   4194       sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
   4195 
   4196    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
   4197        inst->offset != 0 || inst->eot ||
   4198        op == SHADER_OPCODE_SAMPLEINFO ||
   4199        is_high_sampler(devinfo, sampler)) {
   4200       /* For general texture offsets (no txf workaround), we need a header to
   4201        * put them in.
   4202        *
   4203        * TG4 needs to place its channel select in the header, for interaction
   4204        * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
   4205        * larger sampler numbers we need to offset the Sampler State Pointer in
   4206        * the header.
   4207        */
   4208       fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
   4209       header_size = 1;
   4210       length++;
   4211 
   4212       /* If we're requesting fewer than four channels worth of response,
   4213        * and we have an explicit header, we need to set up the sampler
   4214        * writemask.  It's reversed from normal: 1 means "don't write".
   4215        */
   4216       if (!inst->eot && regs_written(inst) != 4 * reg_width) {
   4217          assert(regs_written(inst) % reg_width == 0);
   4218          unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
   4219          inst->offset |= mask << 12;
   4220       }
   4221 
   4222       /* Build the actual header */
   4223       const fs_builder ubld = bld.exec_all().group(8, 0);
   4224       const fs_builder ubld1 = ubld.group(1, 0);
   4225       ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   4226       if (inst->offset) {
   4227          ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
   4228       } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
   4229                  bld.shader->stage != MESA_SHADER_FRAGMENT) {
   4230          /* The vertex and fragment stages have g0.2 set to 0, so
   4231           * header0.2 is 0 when g0 is copied. Other stages may not, so we
   4232           * must set it to 0 to avoid setting undesirable bits in the
   4233           * message.
   4234           */
   4235          ubld1.MOV(component(header, 2), brw_imm_ud(0));
   4236       }
   4237 
   4238       if (is_high_sampler(devinfo, sampler)) {
   4239          if (sampler.file == BRW_IMMEDIATE_VALUE) {
   4240             assert(sampler.ud >= 16);
   4241             const int sampler_state_size = 16; /* 16 bytes */
   4242 
   4243             ubld1.ADD(component(header, 3),
   4244                       retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
   4245                       brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
   4246          } else {
   4247             fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
   4248             ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
   4249             ubld1.SHL(tmp, tmp, brw_imm_ud(4));
   4250             ubld1.ADD(component(header, 3),
   4251                       retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
   4252                       tmp);
   4253          }
   4254       }
   4255    }
   4256 
   4257    if (shadow_c.file != BAD_FILE) {
   4258       bld.MOV(sources[length], shadow_c);
   4259       length++;
   4260    }
   4261 
   4262    bool coordinate_done = false;
   4263 
   4264    /* Set up the LOD info */
   4265    switch (op) {
   4266    case FS_OPCODE_TXB:
   4267    case SHADER_OPCODE_TXL:
   4268       if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
   4269          op = SHADER_OPCODE_TXL_LZ;
   4270          break;
   4271       }
   4272       bld.MOV(sources[length], lod);
   4273       length++;
   4274       break;
   4275    case SHADER_OPCODE_TXD:
   4276       /* TXD should have been lowered in SIMD16 mode. */
   4277       assert(bld.dispatch_width() == 8);
   4278 
   4279       /* Load dPdx and the coordinate together:
   4280        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
   4281        */
   4282       for (unsigned i = 0; i < coord_components; i++) {
   4283          bld.MOV(sources[length++], offset(coordinate, bld, i));
   4284 
   4285          /* For cube map array, the coordinate is (u,v,r,ai) but there are
   4286           * only derivatives for (u, v, r).
   4287           */
   4288          if (i < grad_components) {
   4289             bld.MOV(sources[length++], offset(lod, bld, i));
   4290             bld.MOV(sources[length++], offset(lod2, bld, i));
   4291          }
   4292       }
   4293 
   4294       coordinate_done = true;
   4295       break;
   4296    case SHADER_OPCODE_TXS:
   4297       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
   4298       length++;
   4299       break;
   4300    case SHADER_OPCODE_TXF:
   4301       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
   4302        * On Gen9 they are u, v, lod, r
   4303        */
   4304       bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
   4305 
   4306       if (devinfo->gen >= 9) {
   4307          if (coord_components >= 2) {
   4308             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
   4309                     offset(coordinate, bld, 1));
   4310          } else {
   4311             sources[length] = brw_imm_d(0);
   4312          }
   4313          length++;
   4314       }
   4315 
   4316       if (devinfo->gen >= 9 && lod.is_zero()) {
   4317          op = SHADER_OPCODE_TXF_LZ;
   4318       } else {
   4319          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
   4320          length++;
   4321       }
   4322 
   4323       for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
   4324          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
   4325                  offset(coordinate, bld, i));
   4326 
   4327       coordinate_done = true;
   4328       break;
   4329 
   4330    case SHADER_OPCODE_TXF_CMS:
   4331    case SHADER_OPCODE_TXF_CMS_W:
   4332    case SHADER_OPCODE_TXF_UMS:
   4333    case SHADER_OPCODE_TXF_MCS:
   4334       if (op == SHADER_OPCODE_TXF_UMS ||
   4335           op == SHADER_OPCODE_TXF_CMS ||
   4336           op == SHADER_OPCODE_TXF_CMS_W) {
   4337          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
   4338          length++;
   4339       }
   4340 
   4341       if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
   4342          /* Data from the multisample control surface. */
   4343          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
   4344          length++;
   4345 
   4346          /* On Gen9+ we'll use ld2dms_w instead which has two registers for
   4347           * the MCS data.
   4348           */
   4349          if (op == SHADER_OPCODE_TXF_CMS_W) {
   4350             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
   4351                     mcs.file == IMM ?
   4352                     mcs :
   4353                     offset(mcs, bld, 1));
   4354             length++;
   4355          }
   4356       }
   4357 
   4358       /* There is no offsetting for this message; just copy in the integer
   4359        * texture coordinates.
   4360        */
   4361       for (unsigned i = 0; i < coord_components; i++)
   4362          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
   4363                  offset(coordinate, bld, i));
   4364 
   4365       coordinate_done = true;
   4366       break;
   4367    case SHADER_OPCODE_TG4_OFFSET:
   4368       /* More crazy intermixing */
   4369       for (unsigned i = 0; i < 2; i++) /* u, v */
   4370          bld.MOV(sources[length++], offset(coordinate, bld, i));
   4371 
   4372       for (unsigned i = 0; i < 2; i++) /* offu, offv */
   4373          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
   4374                  offset(tg4_offset, bld, i));
   4375 
   4376       if (coord_components == 3) /* r if present */
   4377          bld.MOV(sources[length++], offset(coordinate, bld, 2));
   4378 
   4379       coordinate_done = true;
   4380       break;
   4381    default:
   4382       break;
   4383    }
   4384 
   4385    /* Set up the coordinate (except for cases where it was done above) */
   4386    if (!coordinate_done) {
   4387       for (unsigned i = 0; i < coord_components; i++)
   4388          bld.MOV(sources[length++], offset(coordinate, bld, i));
   4389    }
   4390 
   4391    int mlen;
   4392    if (reg_width == 2)
   4393       mlen = length * reg_width - header_size;
   4394    else
   4395       mlen = length * reg_width;
   4396 
   4397    const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
   4398                                      BRW_REGISTER_TYPE_F);
   4399    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
   4400 
   4401    /* Generate the SEND. */
   4402    inst->opcode = op;
   4403    inst->src[0] = src_payload;
   4404    inst->src[1] = surface;
   4405    inst->src[2] = sampler;
   4406    inst->resize_sources(3);
   4407    inst->mlen = mlen;
   4408    inst->header_size = header_size;
   4409 
   4410    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   4411    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
   4412 }
   4413 
   4414 static void
   4415 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
   4416 {
   4417    const gen_device_info *devinfo = bld.shader->devinfo;
   4418    const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   4419    const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   4420    const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
   4421    const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   4422    const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   4423    const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   4424    const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   4425    const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   4426    const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   4427    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   4428    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   4429    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   4430    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   4431 
   4432    if (devinfo->gen >= 7) {
   4433       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
   4434                                       shadow_c, lod, lod2, sample_index,
   4435                                       mcs, surface, sampler, tg4_offset,
   4436                                       coord_components, grad_components);
   4437    } else if (devinfo->gen >= 5) {
   4438       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
   4439                                       shadow_c, lod, lod2, sample_index,
   4440                                       surface, sampler,
   4441                                       coord_components, grad_components);
   4442    } else {
   4443       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
   4444                                       shadow_c, lod, lod2,
   4445                                       surface, sampler,
   4446                                       coord_components, grad_components);
   4447    }
   4448 }
   4449 
   4450 /**
   4451  * Initialize the header present in some typed and untyped surface
   4452  * messages.
   4453  */
   4454 static fs_reg
   4455 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
   4456 {
   4457    fs_builder ubld = bld.exec_all().group(8, 0);
   4458    const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   4459    ubld.MOV(dst, brw_imm_d(0));
   4460    ubld.group(1, 0).MOV(component(dst, 7), sample_mask);
   4461    return dst;
   4462 }
   4463 
   4464 static void
   4465 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
   4466                            const fs_reg &sample_mask)
   4467 {
   4468    /* Get the logical send arguments. */
   4469    const fs_reg &addr = inst->src[0];
   4470    const fs_reg &src = inst->src[1];
   4471    const fs_reg &surface = inst->src[2];
   4472    const UNUSED fs_reg &dims = inst->src[3];
   4473    const fs_reg &arg = inst->src[4];
   4474 
   4475    /* Calculate the total number of components of the payload. */
   4476    const unsigned addr_sz = inst->components_read(0);
   4477    const unsigned src_sz = inst->components_read(1);
   4478    const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
   4479    const unsigned sz = header_sz + addr_sz + src_sz;
   4480 
   4481    /* Allocate space for the payload. */
   4482    fs_reg *const components = new fs_reg[sz];
   4483    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
   4484    unsigned n = 0;
   4485 
   4486    /* Construct the payload. */
   4487    if (header_sz)
   4488       components[n++] = emit_surface_header(bld, sample_mask);
   4489 
   4490    for (unsigned i = 0; i < addr_sz; i++)
   4491       components[n++] = offset(addr, bld, i);
   4492 
   4493    for (unsigned i = 0; i < src_sz; i++)
   4494       components[n++] = offset(src, bld, i);
   4495 
   4496    bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
   4497 
   4498    /* Update the original instruction. */
   4499    inst->opcode = op;
   4500    inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
   4501    inst->header_size = header_sz;
   4502 
   4503    inst->src[0] = payload;
   4504    inst->src[1] = surface;
   4505    inst->src[2] = arg;
   4506    inst->resize_sources(3);
   4507 
   4508    delete[] components;
   4509 }
   4510 
   4511 static void
   4512 lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
   4513 {
   4514    const gen_device_info *devinfo = bld.shader->devinfo;
   4515 
   4516    if (devinfo->gen >= 7) {
   4517       /* We are switching the instruction from an ALU-like instruction to a
   4518        * send-from-grf instruction.  Since sends can't handle strides or
   4519        * source modifiers, we have to make a copy of the offset source.
   4520        */
   4521       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
   4522       bld.MOV(tmp, inst->src[1]);
   4523       inst->src[1] = tmp;
   4524 
   4525       inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   4526 
   4527    } else {
   4528       const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
   4529                            BRW_REGISTER_TYPE_UD);
   4530 
   4531       bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
   4532 
   4533       inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
   4534       inst->resize_sources(1);
   4535       inst->base_mrf = payload.nr;
   4536       inst->header_size = 1;
   4537       inst->mlen = 1 + inst->exec_size / 8;
   4538    }
   4539 }
   4540 
   4541 static void
   4542 lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
   4543 {
   4544    assert(bld.shader->devinfo->gen < 6);
   4545 
   4546    inst->base_mrf = 2;
   4547    inst->mlen = inst->sources * inst->exec_size / 8;
   4548 
   4549    if (inst->sources > 1) {
   4550       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
   4551        * "Message Payload":
   4552        *
   4553        * "Operand0[7].  For the INT DIV functions, this operand is the
   4554        *  denominator."
   4555        *  ...
   4556        * "Operand1[7].  For the INT DIV functions, this operand is the
   4557        *  numerator."
   4558        */
   4559       const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   4560       const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
   4561       const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
   4562 
   4563       inst->resize_sources(1);
   4564       inst->src[0] = src0;
   4565 
   4566       assert(inst->exec_size == 8);
   4567       bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   4568    }
   4569 }
   4570 
   4571 bool
   4572 fs_visitor::lower_logical_sends()
   4573 {
   4574    bool progress = false;
   4575 
   4576    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   4577       const fs_builder ibld(this, block, inst);
   4578 
   4579       switch (inst->opcode) {
   4580       case FS_OPCODE_FB_WRITE_LOGICAL:
   4581          assert(stage == MESA_SHADER_FRAGMENT);
   4582          lower_fb_write_logical_send(ibld, inst,
   4583                                      brw_wm_prog_data(prog_data),
   4584                                      (const brw_wm_prog_key *)key,
   4585                                      payload);
   4586          break;
   4587 
   4588       case FS_OPCODE_FB_READ_LOGICAL:
   4589          lower_fb_read_logical_send(ibld, inst);
   4590          break;
   4591 
   4592       case SHADER_OPCODE_TEX_LOGICAL:
   4593          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
   4594          break;
   4595 
   4596       case SHADER_OPCODE_TXD_LOGICAL:
   4597          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
   4598          break;
   4599 
   4600       case SHADER_OPCODE_TXF_LOGICAL:
   4601          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
   4602          break;
   4603 
   4604       case SHADER_OPCODE_TXL_LOGICAL:
   4605          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
   4606          break;
   4607 
   4608       case SHADER_OPCODE_TXS_LOGICAL:
   4609          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
   4610          break;
   4611 
   4612       case FS_OPCODE_TXB_LOGICAL:
   4613          lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
   4614          break;
   4615 
   4616       case SHADER_OPCODE_TXF_CMS_LOGICAL:
   4617          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
   4618          break;
   4619 
   4620       case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   4621          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
   4622          break;
   4623 
   4624       case SHADER_OPCODE_TXF_UMS_LOGICAL:
   4625          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
   4626          break;
   4627 
   4628       case SHADER_OPCODE_TXF_MCS_LOGICAL:
   4629          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
   4630          break;
   4631 
   4632       case SHADER_OPCODE_LOD_LOGICAL:
   4633          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
   4634          break;
   4635 
   4636       case SHADER_OPCODE_TG4_LOGICAL:
   4637          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
   4638          break;
   4639 
   4640       case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   4641          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
   4642          break;
   4643 
   4644       case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   4645          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
   4646          break;
   4647 
   4648       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   4649          lower_surface_logical_send(ibld, inst,
   4650                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
   4651                                     fs_reg());
   4652          break;
   4653 
   4654       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   4655          lower_surface_logical_send(ibld, inst,
   4656                                     SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
   4657                                     ibld.sample_mask_reg());
   4658          break;
   4659 
   4660       case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
   4661          lower_surface_logical_send(ibld, inst,
   4662                                     SHADER_OPCODE_BYTE_SCATTERED_READ,
   4663                                     fs_reg());
   4664          break;
   4665 
   4666       case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   4667          lower_surface_logical_send(ibld, inst,
   4668                                     SHADER_OPCODE_BYTE_SCATTERED_WRITE,
   4669                                     ibld.sample_mask_reg());
   4670          break;
   4671 
   4672       case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   4673          lower_surface_logical_send(ibld, inst,
   4674                                     SHADER_OPCODE_UNTYPED_ATOMIC,
   4675                                     ibld.sample_mask_reg());
   4676          break;
   4677 
   4678       case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   4679          lower_surface_logical_send(ibld, inst,
   4680                                     SHADER_OPCODE_TYPED_SURFACE_READ,
   4681                                     brw_imm_d(0xffff));
   4682          break;
   4683 
   4684       case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   4685          lower_surface_logical_send(ibld, inst,
   4686                                     SHADER_OPCODE_TYPED_SURFACE_WRITE,
   4687                                     ibld.sample_mask_reg());
   4688          break;
   4689 
   4690       case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   4691          lower_surface_logical_send(ibld, inst,
   4692                                     SHADER_OPCODE_TYPED_ATOMIC,
   4693                                     ibld.sample_mask_reg());
   4694          break;
   4695 
   4696       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   4697          lower_varying_pull_constant_logical_send(ibld, inst);
   4698          break;
   4699 
   4700       case SHADER_OPCODE_RCP:
   4701       case SHADER_OPCODE_RSQ:
   4702       case SHADER_OPCODE_SQRT:
   4703       case SHADER_OPCODE_EXP2:
   4704       case SHADER_OPCODE_LOG2:
   4705       case SHADER_OPCODE_SIN:
   4706       case SHADER_OPCODE_COS:
   4707       case SHADER_OPCODE_POW:
   4708       case SHADER_OPCODE_INT_QUOTIENT:
   4709       case SHADER_OPCODE_INT_REMAINDER:
   4710          /* The math opcodes are overloaded for the send-like and
   4711           * expression-like instructions which seems kind of icky.  Gen6+ has
   4712           * a native (but rather quirky) MATH instruction so we don't need to
   4713           * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
   4714           * logical instructions (which we can easily recognize because they
   4715           * have mlen = 0) into send-like virtual instructions.
   4716           */
   4717          if (devinfo->gen < 6 && inst->mlen == 0) {
   4718             lower_math_logical_send(ibld, inst);
   4719             break;
   4720 
   4721          } else {
   4722             continue;
   4723          }
   4724 
   4725       default:
   4726          continue;
   4727       }
   4728 
   4729       progress = true;
   4730    }
   4731 
   4732    if (progress)
   4733       invalidate_live_intervals();
   4734 
   4735    return progress;
   4736 }
   4737 
   4738 /**
   4739  * Get the closest allowed SIMD width for instruction \p inst accounting for
   4740  * some common regioning and execution control restrictions that apply to FPU
   4741  * instructions.  These restrictions don't necessarily have any relevance to
   4742  * instructions not executed by the FPU pipeline like extended math, control
   4743  * flow or send message instructions.
   4744  *
   4745  * For virtual opcodes it's really up to the instruction -- In some cases
   4746  * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
   4747  * instructions) it may simplify virtual instruction lowering if we can
   4748  * enforce FPU-like regioning restrictions already on the virtual instruction,
   4749  * in other cases (e.g. virtual send-like instructions) this may be
   4750  * excessively restrictive.
   4751  */
   4752 static unsigned
   4753 get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
   4754                            const fs_inst *inst)
   4755 {
   4756    /* Maximum execution size representable in the instruction controls. */
   4757    unsigned max_width = MIN2(32, inst->exec_size);
   4758 
   4759    /* According to the PRMs:
   4760     *  "A. In Direct Addressing mode, a source cannot span more than 2
   4761     *      adjacent GRF registers.
   4762     *   B. A destination cannot span more than 2 adjacent GRF registers."
   4763     *
   4764     * Look for the source or destination with the largest register region
   4765     * which is the one that is going to limit the overall execution size of
   4766     * the instruction due to this rule.
   4767     */
   4768    unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
   4769 
   4770    for (unsigned i = 0; i < inst->sources; i++)
   4771       reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
   4772 
   4773    /* Calculate the maximum execution size of the instruction based on the
   4774     * factor by which it goes over the hardware limit of 2 GRFs.
   4775     */
   4776    if (reg_count > 2)
   4777       max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
   4778 
   4779    /* According to the IVB PRMs:
   4780     *  "When destination spans two registers, the source MUST span two
   4781     *   registers. The exception to the above rule:
   4782     *
   4783     *    - When source is scalar, the source registers are not incremented.
   4784     *    - When source is packed integer Word and destination is packed
   4785     *      integer DWord, the source register is not incremented but the
   4786     *      source sub register is incremented."
   4787     *
   4788     * The hardware specs from Gen4 to Gen7.5 mention similar regioning
   4789     * restrictions.  The code below intentionally doesn't check whether the
   4790     * destination type is integer because empirically the hardware doesn't
   4791     * seem to care what the actual type is as long as it's dword-aligned.
   4792     */
   4793    if (devinfo->gen < 8) {
   4794       for (unsigned i = 0; i < inst->sources; i++) {
   4795          /* IVB implements DF scalars as <0;2,1> regions. */
   4796          const bool is_scalar_exception = is_uniform(inst->src[i]) &&
   4797             (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
   4798          const bool is_packed_word_exception =
   4799             type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
   4800             type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
   4801 
   4802          if (inst->size_written > REG_SIZE &&
   4803              inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE &&
   4804              !is_scalar_exception && !is_packed_word_exception) {
   4805             const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
   4806             max_width = MIN2(max_width, inst->exec_size / reg_count);
   4807          }
   4808       }
   4809    }
   4810 
   4811    /* From the IVB PRMs:
   4812     *  "When an instruction is SIMD32, the low 16 bits of the execution mask
   4813     *   are applied for both halves of the SIMD32 instruction. If different
   4814     *   execution mask channels are required, split the instruction into two
   4815     *   SIMD16 instructions."
   4816     *
   4817     * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
   4818     * 32-wide control flow support in hardware and will behave similarly.
   4819     */
   4820    if (devinfo->gen < 8 && !inst->force_writemask_all)
   4821       max_width = MIN2(max_width, 16);
   4822 
   4823    /* From the IVB PRMs (applies to HSW too):
   4824     *  "Instructions with condition modifiers must not use SIMD32."
   4825     *
   4826     * From the BDW PRMs (applies to later hardware too):
   4827     *  "Ternary instruction with condition modifiers must not use SIMD32."
   4828     */
   4829    if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
   4830       max_width = MIN2(max_width, 16);
   4831 
   4832    /* From the IVB PRMs (applies to other devices that don't have the
   4833     * gen_device_info::supports_simd16_3src flag set):
   4834     *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
   4835     *   SIMD8 is not allowed for DF operations."
   4836     */
   4837    if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
   4838       max_width = MIN2(max_width, inst->exec_size / reg_count);
   4839 
   4840    /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
   4841     * the 8-bit quarter of the execution mask signals specified in the
   4842     * instruction control fields) for the second compressed half of any
   4843     * single-precision instruction (for double-precision instructions
   4844     * it's hardwired to use NibCtrl+1, at least on HSW), which means that
   4845     * the EU will apply the wrong execution controls for the second
   4846     * sequential GRF write if the number of channels per GRF is not exactly
   4847     * eight in single-precision mode (or four in double-float mode).
   4848     *
   4849     * In this situation we calculate the maximum size of the split
   4850     * instructions so they only ever write to a single register.
   4851     */
   4852    if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
   4853        !inst->force_writemask_all) {
   4854       const unsigned channels_per_grf = inst->exec_size /
   4855          DIV_ROUND_UP(inst->size_written, REG_SIZE);
   4856       const unsigned exec_type_size = get_exec_type_size(inst);
   4857       assert(exec_type_size);
   4858 
   4859       /* The hardware shifts exactly 8 channels per compressed half of the
   4860        * instruction in single-precision mode and exactly 4 in double-precision.
   4861        */
   4862       if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
   4863          max_width = MIN2(max_width, channels_per_grf);
   4864 
   4865       /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
   4866        * because HW applies the same channel enable signals to both halves of
   4867        * the compressed instruction which will be just wrong under
   4868        * non-uniform control flow.
   4869        */
   4870       if (devinfo->gen == 7 && !devinfo->is_haswell &&
   4871           (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
   4872          max_width = MIN2(max_width, 4);
   4873    }
   4874 
   4875    /* Only power-of-two execution sizes are representable in the instruction
   4876     * control fields.
   4877     */
   4878    return 1 << _mesa_logbase2(max_width);
   4879 }
   4880 
   4881 /**
   4882  * Get the maximum allowed SIMD width for instruction \p inst accounting for
   4883  * various payload size restrictions that apply to sampler message
   4884  * instructions.
   4885  *
   4886  * This is only intended to provide a maximum theoretical bound for the
   4887  * execution size of the message based on the number of argument components
   4888  * alone, which in most cases will determine whether the SIMD8 or SIMD16
   4889  * variant of the message can be used, though some messages may have
   4890  * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
   4891  * the message length to determine the exact SIMD width and argument count,
   4892  * which makes a number of sampler message combinations impossible to
   4893  * represent).
   4894  */
   4895 static unsigned
   4896 get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
   4897                                const fs_inst *inst)
   4898 {
   4899    /* Calculate the number of coordinate components that have to be present
   4900     * assuming that additional arguments follow the texel coordinates in the
   4901     * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
   4902     * need to pad to four or three components depending on the message,
   4903     * pre-ILK we need to pad to at most three components.
   4904     */
   4905    const unsigned req_coord_components =
   4906       (devinfo->gen >= 7 ||
   4907        !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
   4908       (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
   4909                             inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
   4910       3;
   4911 
   4912    /* On Gen9+ the LOD argument is for free if we're able to use the LZ
   4913     * variant of the TXL or TXF message.
   4914     */
   4915    const bool implicit_lod = devinfo->gen >= 9 &&
   4916                              (inst->opcode == SHADER_OPCODE_TXL ||
   4917                               inst->opcode == SHADER_OPCODE_TXF) &&
   4918                              inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
   4919 
   4920    /* Calculate the total number of argument components that need to be passed
   4921     * to the sampler unit.
   4922     */
   4923    const unsigned num_payload_components =
   4924       MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
   4925            req_coord_components) +
   4926       inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
   4927       (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
   4928       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
   4929       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
   4930       (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
   4931        inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
   4932       inst->components_read(TEX_LOGICAL_SRC_MCS);
   4933 
   4934    /* SIMD16 messages with more than five arguments exceed the maximum message
   4935     * size supported by the sampler, regardless of whether a header is
   4936     * provided or not.
   4937     */
   4938    return MIN2(inst->exec_size,
   4939                num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
   4940 }
   4941 
   4942 /**
   4943  * Get the closest native SIMD width supported by the hardware for instruction
   4944  * \p inst.  The instruction will be left untouched by
   4945  * fs_visitor::lower_simd_width() if the returned value is equal to the
   4946  * original execution size.
   4947  */
   4948 static unsigned
   4949 get_lowered_simd_width(const struct gen_device_info *devinfo,
   4950                        const fs_inst *inst)
   4951 {
   4952    switch (inst->opcode) {
   4953    case BRW_OPCODE_MOV:
   4954    case BRW_OPCODE_SEL:
   4955    case BRW_OPCODE_NOT:
   4956    case BRW_OPCODE_AND:
   4957    case BRW_OPCODE_OR:
   4958    case BRW_OPCODE_XOR:
   4959    case BRW_OPCODE_SHR:
   4960    case BRW_OPCODE_SHL:
   4961    case BRW_OPCODE_ASR:
   4962    case BRW_OPCODE_CMPN:
   4963    case BRW_OPCODE_CSEL:
   4964    case BRW_OPCODE_F32TO16:
   4965    case BRW_OPCODE_F16TO32:
   4966    case BRW_OPCODE_BFREV:
   4967    case BRW_OPCODE_BFE:
   4968    case BRW_OPCODE_ADD:
   4969    case BRW_OPCODE_MUL:
   4970    case BRW_OPCODE_AVG:
   4971    case BRW_OPCODE_FRC:
   4972    case BRW_OPCODE_RNDU:
   4973    case BRW_OPCODE_RNDD:
   4974    case BRW_OPCODE_RNDE:
   4975    case BRW_OPCODE_RNDZ:
   4976    case BRW_OPCODE_LZD:
   4977    case BRW_OPCODE_FBH:
   4978    case BRW_OPCODE_FBL:
   4979    case BRW_OPCODE_CBIT:
   4980    case BRW_OPCODE_SAD2:
   4981    case BRW_OPCODE_MAD:
   4982    case BRW_OPCODE_LRP:
   4983    case FS_OPCODE_PACK:
   4984       return get_fpu_lowered_simd_width(devinfo, inst);
   4985 
   4986    case BRW_OPCODE_CMP: {
   4987       /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
   4988        * when the destination is a GRF the dependency-clear bit on the flag
   4989        * register is cleared early.
   4990        *
   4991        * Suggested workarounds are to disable coissuing CMP instructions
   4992        * or to split CMP(16) instructions into two CMP(8) instructions.
   4993        *
   4994        * We choose to split into CMP(8) instructions since disabling
   4995        * coissuing would affect CMP instructions not otherwise affected by
   4996        * the errata.
   4997        */
   4998       const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
   4999                                   !inst->dst.is_null() ? 8 : ~0);
   5000       return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
   5001    }
   5002    case BRW_OPCODE_BFI1:
   5003    case BRW_OPCODE_BFI2:
   5004       /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
   5005        * should
   5006        *  "Force BFI instructions to be executed always in SIMD8."
   5007        */
   5008       return MIN2(devinfo->is_haswell ? 8 : ~0u,
   5009                   get_fpu_lowered_simd_width(devinfo, inst));
   5010 
   5011    case BRW_OPCODE_IF:
   5012       assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
   5013       return inst->exec_size;
   5014 
   5015    case SHADER_OPCODE_RCP:
   5016    case SHADER_OPCODE_RSQ:
   5017    case SHADER_OPCODE_SQRT:
   5018    case SHADER_OPCODE_EXP2:
   5019    case SHADER_OPCODE_LOG2:
   5020    case SHADER_OPCODE_SIN:
   5021    case SHADER_OPCODE_COS:
   5022       /* Unary extended math instructions are limited to SIMD8 on Gen4 and
   5023        * Gen6.
   5024        */
   5025       return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
   5026               devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
   5027               MIN2(8, inst->exec_size));
   5028 
   5029    case SHADER_OPCODE_POW:
   5030       /* SIMD16 is only allowed on Gen7+. */
   5031       return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
   5032               MIN2(8, inst->exec_size));
   5033 
   5034    case SHADER_OPCODE_INT_QUOTIENT:
   5035    case SHADER_OPCODE_INT_REMAINDER:
   5036       /* Integer division is limited to SIMD8 on all generations. */
   5037       return MIN2(8, inst->exec_size);
   5038 
   5039    case FS_OPCODE_LINTERP:
   5040    case SHADER_OPCODE_GET_BUFFER_SIZE:
   5041    case FS_OPCODE_DDX_COARSE:
   5042    case FS_OPCODE_DDX_FINE:
   5043    case FS_OPCODE_DDY_COARSE:
   5044    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   5045    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   5046    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   5047    case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
   5048    case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
   5049    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   5050    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   5051    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   5052       return MIN2(16, inst->exec_size);
   5053 
   5054    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   5055       /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
   5056        * message used to implement varying pull constant loads, so expand it
   5057        * to SIMD16.  An alternative with longer message payload length but
   5058        * shorter return payload would be to use the SIMD8 sampler message that
   5059        * takes (header, u, v, r) as parameters instead of (header, u).
   5060        */
   5061       return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
   5062 
   5063    case FS_OPCODE_DDY_FINE:
   5064       /* The implementation of this virtual opcode may require emitting
   5065        * compressed Align16 instructions, which are severely limited on some
   5066        * generations.
   5067        *
   5068        * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
   5069        * Region Restrictions):
   5070        *
   5071        *  "In Align16 access mode, SIMD16 is not allowed for DW operations
   5072        *   and SIMD8 is not allowed for DF operations."
   5073        *
   5074        * In this context, "DW operations" means "operations acting on 32-bit
   5075        * values", so it includes operations on floats.
   5076        *
   5077        * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
   5078        * (Instruction Compression -> Rules and Restrictions):
   5079        *
   5080        *  "A compressed instruction must be in Align1 access mode. Align16
   5081        *   mode instructions cannot be compressed."
   5082        *
   5083        * Similar text exists in the g45 PRM.
   5084        *
   5085        * Empirically, compressed align16 instructions using odd register
   5086        * numbers don't appear to work on Sandybridge either.
   5087        */
   5088       return (devinfo->gen == 4 || devinfo->gen == 6 ||
   5089               (devinfo->gen == 7 && !devinfo->is_haswell) ?
   5090               MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
   5091 
   5092    case SHADER_OPCODE_MULH:
   5093       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
   5094        * is 8-wide on Gen7+.
   5095        */
   5096       return (devinfo->gen >= 7 ? 8 :
   5097               get_fpu_lowered_simd_width(devinfo, inst));
   5098 
   5099    case FS_OPCODE_FB_WRITE_LOGICAL:
   5100       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
   5101        * here.
   5102        */
   5103       assert(devinfo->gen != 6 ||
   5104              inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
   5105              inst->exec_size == 8);
   5106       /* Dual-source FB writes are unsupported in SIMD16 mode. */
   5107       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
   5108               8 : MIN2(16, inst->exec_size));
   5109 
   5110    case FS_OPCODE_FB_READ_LOGICAL:
   5111       return MIN2(16, inst->exec_size);
   5112 
   5113    case SHADER_OPCODE_TEX_LOGICAL:
   5114    case SHADER_OPCODE_TXF_CMS_LOGICAL:
   5115    case SHADER_OPCODE_TXF_UMS_LOGICAL:
   5116    case SHADER_OPCODE_TXF_MCS_LOGICAL:
   5117    case SHADER_OPCODE_LOD_LOGICAL:
   5118    case SHADER_OPCODE_TG4_LOGICAL:
   5119    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   5120    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   5121    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   5122       return get_sampler_lowered_simd_width(devinfo, inst);
   5123 
   5124    case SHADER_OPCODE_TXD_LOGICAL:
   5125       /* TXD is unsupported in SIMD16 mode. */
   5126       return 8;
   5127 
   5128    case SHADER_OPCODE_TXL_LOGICAL:
   5129    case FS_OPCODE_TXB_LOGICAL:
   5130       /* Only one execution size is representable pre-ILK depending on whether
   5131        * the shadow reference argument is present.
   5132        */
   5133       if (devinfo->gen == 4)
   5134          return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
   5135       else
   5136          return get_sampler_lowered_simd_width(devinfo, inst);
   5137 
   5138    case SHADER_OPCODE_TXF_LOGICAL:
   5139    case SHADER_OPCODE_TXS_LOGICAL:
   5140       /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
   5141        * messages.  Use SIMD16 instead.
   5142        */
   5143       if (devinfo->gen == 4)
   5144          return 16;
   5145       else
   5146          return get_sampler_lowered_simd_width(devinfo, inst);
   5147 
   5148    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   5149    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   5150    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   5151       return 8;
   5152 
   5153    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   5154    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   5155    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   5156    case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   5157    case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
   5158       return MIN2(16, inst->exec_size);
   5159 
   5160    case SHADER_OPCODE_URB_READ_SIMD8:
   5161    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   5162    case SHADER_OPCODE_URB_WRITE_SIMD8:
   5163    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   5164    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   5165    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   5166       return MIN2(8, inst->exec_size);
   5167 
   5168    case SHADER_OPCODE_MOV_INDIRECT: {
   5169       /* From IVB and HSW PRMs:
   5170        *
   5171        * "2.When the destination requires two registers and the sources are
   5172        *  indirect, the sources must use 1x1 regioning mode.
   5173        *
   5174        * In case of DF instructions in HSW/IVB, the exec_size is limited by
   5175        * the EU decompression logic not handling VxH indirect addressing
   5176        * correctly.
   5177        */
   5178       const unsigned max_size = (devinfo->gen >= 8 ? 2 : 1) * REG_SIZE;
   5179       /* Prior to Broadwell, we only have 8 address subregisters. */
   5180       return MIN3(devinfo->gen >= 8 ? 16 : 8,
   5181                   max_size / (inst->dst.stride * type_sz(inst->dst.type)),
   5182                   inst->exec_size);
   5183    }
   5184 
   5185    case SHADER_OPCODE_LOAD_PAYLOAD: {
   5186       const unsigned reg_count =
   5187          DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
   5188 
   5189       if (reg_count > 2) {
   5190          /* Only LOAD_PAYLOAD instructions with per-channel destination region
   5191           * can be easily lowered (which excludes headers and heterogeneous
   5192           * types).
   5193           */
   5194          assert(!inst->header_size);
   5195          for (unsigned i = 0; i < inst->sources; i++)
   5196             assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
   5197                    inst->src[i].file == BAD_FILE);
   5198 
   5199          return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
   5200       } else {
   5201          return inst->exec_size;
   5202       }
   5203    }
   5204    default:
   5205       return inst->exec_size;
   5206    }
   5207 }
   5208 
   5209 /**
   5210  * Return true if splitting out the group of channels of instruction \p inst
   5211  * given by lbld.group() requires allocating a temporary for the i-th source
   5212  * of the lowered instruction.
   5213  */
   5214 static inline bool
   5215 needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
   5216 {
   5217    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
   5218             (inst->components_read(i) == 1 &&
   5219              lbld.dispatch_width() <= inst->exec_size)) ||
   5220           (inst->flags_written() &
   5221            flag_mask(inst->src[i], type_sz(inst->src[i].type)));
   5222 }
   5223 
   5224 /**
   5225  * Extract the data that would be consumed by the channel group given by
   5226  * lbld.group() from the i-th source region of instruction \p inst and return
   5227  * it as result in packed form.
   5228  */
   5229 static fs_reg
   5230 emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
   5231 {
   5232    /* Specified channel group from the source region. */
   5233    const fs_reg src = horiz_offset(inst->src[i], lbld.group());
   5234 
   5235    if (needs_src_copy(lbld, inst, i)) {
   5236       /* Builder of the right width to perform the copy avoiding uninitialized
   5237        * data if the lowered execution size is greater than the original
   5238        * execution size of the instruction.
   5239        */
   5240       const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
   5241                                               inst->exec_size), 0);
   5242       const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
   5243 
   5244       for (unsigned k = 0; k < inst->components_read(i); ++k)
   5245          cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
   5246 
   5247       return tmp;
   5248 
   5249    } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
   5250       /* The source is invariant for all dispatch_width-wide groups of the
   5251        * original region.
   5252        */
   5253       return inst->src[i];
   5254 
   5255    } else {
   5256       /* We can just point the lowered instruction at the right channel group
   5257        * from the original region.
   5258        */
   5259       return src;
   5260    }
   5261 }
   5262 
   5263 /**
   5264  * Return true if splitting out the group of channels of instruction \p inst
   5265  * given by lbld.group() requires allocating a temporary for the destination
   5266  * of the lowered instruction and copying the data back to the original
   5267  * destination region.
   5268  */
   5269 static inline bool
   5270 needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
   5271 {
   5272    /* If the instruction writes more than one component we'll have to shuffle
   5273     * the results of multiple lowered instructions in order to make sure that
   5274     * they end up arranged correctly in the original destination region.
   5275     */
   5276    if (inst->size_written > inst->dst.component_size(inst->exec_size))
   5277       return true;
   5278 
   5279    /* If the lowered execution size is larger than the original the result of
   5280     * the instruction won't fit in the original destination, so we'll have to
   5281     * allocate a temporary in any case.
   5282     */
   5283    if (lbld.dispatch_width() > inst->exec_size)
   5284       return true;
   5285 
   5286    for (unsigned i = 0; i < inst->sources; i++) {
   5287       /* If we already made a copy of the source for other reasons there won't
   5288        * be any overlap with the destination.
   5289        */
   5290       if (needs_src_copy(lbld, inst, i))
   5291          continue;
   5292 
   5293       /* In order to keep the logic simple we emit a copy whenever the
   5294        * destination region doesn't exactly match an overlapping source, which
   5295        * may point at the source and destination not being aligned group by
   5296        * group which could cause one of the lowered instructions to overwrite
   5297        * the data read from the same source by other lowered instructions.
   5298        */
   5299       if (regions_overlap(inst->dst, inst->size_written,
   5300                           inst->src[i], inst->size_read(i)) &&
   5301           !inst->dst.equals(inst->src[i]))
   5302         return true;
   5303    }
   5304 
   5305    return false;
   5306 }
   5307 
   5308 /**
   5309  * Insert data from a packed temporary into the channel group given by
   5310  * lbld.group() of the destination region of instruction \p inst and return
   5311  * the temporary as result.  Any copy instructions that are required for
   5312  * unzipping the previous value (in the case of partial writes) will be
   5313  * inserted using \p lbld_before and any copy instructions required for
   5314  * zipping up the destination of \p inst will be inserted using \p lbld_after.
   5315  */
   5316 static fs_reg
   5317 emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
   5318          fs_inst *inst)
   5319 {
   5320    assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   5321    assert(lbld_before.group() == lbld_after.group());
   5322 
   5323    /* Specified channel group from the destination region. */
   5324    const fs_reg dst = horiz_offset(inst->dst, lbld_after.group());
   5325    const unsigned dst_size = inst->size_written /
   5326       inst->dst.component_size(inst->exec_size);
   5327 
   5328    if (needs_dst_copy(lbld_after, inst)) {
   5329       const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
   5330 
   5331       if (inst->predicate) {
   5332          /* Handle predication by copying the original contents of
   5333           * the destination into the temporary before emitting the
   5334           * lowered instruction.
   5335           */
   5336          const fs_builder gbld_before =
   5337             lbld_before.group(MIN2(lbld_before.dispatch_width(),
   5338                                    inst->exec_size), 0);
   5339          for (unsigned k = 0; k < dst_size; ++k) {
   5340             gbld_before.MOV(offset(tmp, lbld_before, k),
   5341                             offset(dst, inst->exec_size, k));
   5342          }
   5343       }
   5344 
   5345       const fs_builder gbld_after =
   5346          lbld_after.group(MIN2(lbld_after.dispatch_width(),
   5347                                inst->exec_size), 0);
   5348       for (unsigned k = 0; k < dst_size; ++k) {
   5349          /* Use a builder of the right width to perform the copy avoiding
   5350           * uninitialized data if the lowered execution size is greater than
   5351           * the original execution size of the instruction.
   5352           */
   5353          gbld_after.MOV(offset(dst, inst->exec_size, k),
   5354                         offset(tmp, lbld_after, k));
   5355       }
   5356 
   5357       return tmp;
   5358 
   5359    } else {
   5360       /* No need to allocate a temporary for the lowered instruction, just
   5361        * take the right group of channels from the original region.
   5362        */
   5363       return dst;
   5364    }
   5365 }
   5366 
   5367 bool
   5368 fs_visitor::lower_simd_width()
   5369 {
   5370    bool progress = false;
   5371 
   5372    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   5373       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
   5374 
   5375       if (lower_width != inst->exec_size) {
   5376          /* Builder matching the original instruction.  We may also need to
   5377           * emit an instruction of width larger than the original, set the
   5378           * execution size of the builder to the highest of both for now so
   5379           * we're sure that both cases can be handled.
   5380           */
   5381          const unsigned max_width = MAX2(inst->exec_size, lower_width);
   5382          const fs_builder ibld = bld.at(block, inst)
   5383                                     .exec_all(inst->force_writemask_all)
   5384                                     .group(max_width, inst->group / max_width);
   5385 
   5386          /* Split the copies in chunks of the execution width of either the
   5387           * original or the lowered instruction, whichever is lower.
   5388           */
   5389          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
   5390          const unsigned dst_size = inst->size_written /
   5391             inst->dst.component_size(inst->exec_size);
   5392 
   5393          assert(!inst->writes_accumulator && !inst->mlen);
   5394 
   5395          /* Inserting the zip, unzip, and duplicated instructions in all of
   5396           * the right spots is somewhat tricky.  All of the unzip and any
   5397           * instructions from the zip which unzip the destination prior to
   5398           * writing need to happen before all of the per-group instructions
   5399           * and the zip instructions need to happen after.  In order to sort
   5400           * this all out, we insert the unzip instructions before \p inst,
   5401           * insert the per-group instructions after \p inst (i.e. before
   5402           * inst->next), and insert the zip instructions before the
   5403           * instruction after \p inst.  Since we are inserting instructions
   5404           * after \p inst, inst->next is a moving target and we need to save
   5405           * it off here so that we insert the zip instructions in the right
   5406           * place.
   5407           */
   5408          exec_node *const after_inst = inst->next;
   5409          for (unsigned i = 0; i < n; i++) {
   5410             /* Emit a copy of the original instruction with the lowered width.
   5411              * If the EOT flag was set throw it away except for the last
   5412              * instruction to avoid killing the thread prematurely.
   5413              */
   5414             fs_inst split_inst = *inst;
   5415             split_inst.exec_size = lower_width;
   5416             split_inst.eot = inst->eot && i == 0;
   5417 
   5418             /* Select the correct channel enables for the i-th group, then
   5419              * transform the sources and destination and emit the lowered
   5420              * instruction.
   5421              */
   5422             const fs_builder lbld = ibld.group(lower_width, i);
   5423 
   5424             for (unsigned j = 0; j < inst->sources; j++)
   5425                split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
   5426 
   5427             split_inst.dst = emit_zip(lbld.at(block, inst),
   5428                                       lbld.at(block, after_inst), inst);
   5429             split_inst.size_written =
   5430                split_inst.dst.component_size(lower_width) * dst_size;
   5431 
   5432             lbld.at(block, inst->next).emit(split_inst);
   5433          }
   5434 
   5435          inst->remove(block);
   5436          progress = true;
   5437       }
   5438    }
   5439 
   5440    if (progress)
   5441       invalidate_live_intervals();
   5442 
   5443    return progress;
   5444 }
   5445 
   5446 void
   5447 fs_visitor::dump_instructions()
   5448 {
   5449    dump_instructions(NULL);
   5450 }
   5451 
   5452 void
   5453 fs_visitor::dump_instructions(const char *name)
   5454 {
   5455    FILE *file = stderr;
   5456    if (name && geteuid() != 0) {
   5457       file = fopen(name, "w");
   5458       if (!file)
   5459          file = stderr;
   5460    }
   5461 
   5462    if (cfg) {
   5463       calculate_register_pressure();
   5464       int ip = 0, max_pressure = 0;
   5465       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
   5466          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
   5467          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
   5468          dump_instruction(inst, file);
   5469          ip++;
   5470       }
   5471       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   5472    } else {
   5473       int ip = 0;
   5474       foreach_in_list(backend_instruction, inst, &instructions) {
   5475          fprintf(file, "%4d: ", ip++);
   5476          dump_instruction(inst, file);
   5477       }
   5478    }
   5479 
   5480    if (file != stderr) {
   5481       fclose(file);
   5482    }
   5483 }
   5484 
   5485 void
   5486 fs_visitor::dump_instruction(backend_instruction *be_inst)
   5487 {
   5488    dump_instruction(be_inst, stderr);
   5489 }
   5490 
   5491 void
   5492 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
   5493 {
   5494    fs_inst *inst = (fs_inst *)be_inst;
   5495 
   5496    if (inst->predicate) {
   5497       fprintf(file, "(%cf0.%d) ",
   5498              inst->predicate_inverse ? '-' : '+',
   5499              inst->flag_subreg);
   5500    }
   5501 
   5502    fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
   5503    if (inst->saturate)
   5504       fprintf(file, ".sat");
   5505    if (inst->conditional_mod) {
   5506       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
   5507       if (!inst->predicate &&
   5508           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
   5509                                 inst->opcode != BRW_OPCODE_IF &&
   5510                                 inst->opcode != BRW_OPCODE_WHILE))) {
   5511          fprintf(file, ".f0.%d", inst->flag_subreg);
   5512       }
   5513    }
   5514    fprintf(file, "(%d) ", inst->exec_size);
   5515 
   5516    if (inst->mlen) {
   5517       fprintf(file, "(mlen: %d) ", inst->mlen);
   5518    }
   5519 
   5520    if (inst->eot) {
   5521       fprintf(file, "(EOT) ");
   5522    }
   5523 
   5524    switch (inst->dst.file) {
   5525    case VGRF:
   5526       fprintf(file, "vgrf%d", inst->dst.nr);
   5527       break;
   5528    case FIXED_GRF:
   5529       fprintf(file, "g%d", inst->dst.nr);
   5530       break;
   5531    case MRF:
   5532       fprintf(file, "m%d", inst->dst.nr);
   5533       break;
   5534    case BAD_FILE:
   5535       fprintf(file, "(null)");
   5536       break;
   5537    case UNIFORM:
   5538       fprintf(file, "***u%d***", inst->dst.nr);
   5539       break;
   5540    case ATTR:
   5541       fprintf(file, "***attr%d***", inst->dst.nr);
   5542       break;
   5543    case ARF:
   5544       switch (inst->dst.nr) {
   5545       case BRW_ARF_NULL:
   5546          fprintf(file, "null");
   5547          break;
   5548       case BRW_ARF_ADDRESS:
   5549          fprintf(file, "a0.%d", inst->dst.subnr);
   5550          break;
   5551       case BRW_ARF_ACCUMULATOR:
   5552          fprintf(file, "acc%d", inst->dst.subnr);
   5553          break;
   5554       case BRW_ARF_FLAG:
   5555          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
   5556          break;
   5557       default:
   5558          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
   5559          break;
   5560       }
   5561       break;
   5562    case IMM:
   5563       unreachable("not reached");
   5564    }
   5565 
   5566    if (inst->dst.offset ||
   5567        (inst->dst.file == VGRF &&
   5568         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
   5569       const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
   5570       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
   5571               inst->dst.offset % reg_size);
   5572    }
   5573 
   5574    if (inst->dst.stride != 1)
   5575       fprintf(file, "<%u>", inst->dst.stride);
   5576    fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
   5577 
   5578    for (int i = 0; i < inst->sources; i++) {
   5579       if (inst->src[i].negate)
   5580          fprintf(file, "-");
   5581       if (inst->src[i].abs)
   5582          fprintf(file, "|");
   5583       switch (inst->src[i].file) {
   5584       case VGRF:
   5585          fprintf(file, "vgrf%d", inst->src[i].nr);
   5586          break;
   5587       case FIXED_GRF:
   5588          fprintf(file, "g%d", inst->src[i].nr);
   5589          break;
   5590       case MRF:
   5591          fprintf(file, "***m%d***", inst->src[i].nr);
   5592          break;
   5593       case ATTR:
   5594          fprintf(file, "attr%d", inst->src[i].nr);
   5595          break;
   5596       case UNIFORM:
   5597          fprintf(file, "u%d", inst->src[i].nr);
   5598          break;
   5599       case BAD_FILE:
   5600          fprintf(file, "(null)");
   5601          break;
   5602       case IMM:
   5603          switch (inst->src[i].type) {
   5604          case BRW_REGISTER_TYPE_F:
   5605             fprintf(file, "%-gf", inst->src[i].f);
   5606             break;
   5607          case BRW_REGISTER_TYPE_DF:
   5608             fprintf(file, "%fdf", inst->src[i].df);
   5609             break;
   5610          case BRW_REGISTER_TYPE_W:
   5611          case BRW_REGISTER_TYPE_D:
   5612             fprintf(file, "%dd", inst->src[i].d);
   5613             break;
   5614          case BRW_REGISTER_TYPE_UW:
   5615          case BRW_REGISTER_TYPE_UD:
   5616             fprintf(file, "%uu", inst->src[i].ud);
   5617             break;
   5618          case BRW_REGISTER_TYPE_VF:
   5619             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
   5620                     brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
   5621                     brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
   5622                     brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
   5623                     brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
   5624             break;
   5625          default:
   5626             fprintf(file, "???");
   5627             break;
   5628          }
   5629          break;
   5630       case ARF:
   5631          switch (inst->src[i].nr) {
   5632          case BRW_ARF_NULL:
   5633             fprintf(file, "null");
   5634             break;
   5635          case BRW_ARF_ADDRESS:
   5636             fprintf(file, "a0.%d", inst->src[i].subnr);
   5637             break;
   5638          case BRW_ARF_ACCUMULATOR:
   5639             fprintf(file, "acc%d", inst->src[i].subnr);
   5640             break;
   5641          case BRW_ARF_FLAG:
   5642             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
   5643             break;
   5644          default:
   5645             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
   5646             break;
   5647          }
   5648          break;
   5649       }
   5650 
   5651       if (inst->src[i].offset ||
   5652           (inst->src[i].file == VGRF &&
   5653            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
   5654          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
   5655          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
   5656                  inst->src[i].offset % reg_size);
   5657       }
   5658 
   5659       if (inst->src[i].abs)
   5660          fprintf(file, "|");
   5661 
   5662       if (inst->src[i].file != IMM) {
   5663          unsigned stride;
   5664          if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
   5665             unsigned hstride = inst->src[i].hstride;
   5666             stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
   5667          } else {
   5668             stride = inst->src[i].stride;
   5669          }
   5670          if (stride != 1)
   5671             fprintf(file, "<%u>", stride);
   5672 
   5673          fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
   5674       }
   5675 
   5676       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
   5677          fprintf(file, ", ");
   5678    }
   5679 
   5680    fprintf(file, " ");
   5681 
   5682    if (inst->force_writemask_all)
   5683       fprintf(file, "NoMask ");
   5684 
   5685    if (inst->exec_size != dispatch_width)
   5686       fprintf(file, "group%d ", inst->group);
   5687 
   5688    fprintf(file, "\n");
   5689 }
   5690 
   5691 /**
   5692  * Possibly returns an instruction that set up @param reg.
   5693  *
   5694  * Sometimes we want to take the result of some expression/variable
   5695  * dereference tree and rewrite the instruction generating the result
   5696  * of the tree.  When processing the tree, we know that the
   5697  * instructions generated are all writing temporaries that are dead
   5698  * outside of this tree.  So, if we have some instructions that write
   5699  * a temporary, we're free to point that temp write somewhere else.
   5700  *
   5701  * Note that this doesn't guarantee that the instruction generated
   5702  * only reg -- it might be the size=4 destination of a texture instruction.
   5703  */
   5704 fs_inst *
   5705 fs_visitor::get_instruction_generating_reg(fs_inst *start,
   5706 					   fs_inst *end,
   5707 					   const fs_reg &reg)
   5708 {
   5709    if (end == start ||
   5710        end->is_partial_write() ||
   5711        !reg.equals(end->dst)) {
   5712       return NULL;
   5713    } else {
   5714       return end;
   5715    }
   5716 }
   5717 
   5718 void
   5719 fs_visitor::setup_fs_payload_gen6()
   5720 {
   5721    assert(stage == MESA_SHADER_FRAGMENT);
   5722    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   5723 
   5724    assert(devinfo->gen >= 6);
   5725 
   5726    /* R0-1: masks, pixel X/Y coordinates. */
   5727    payload.num_regs = 2;
   5728    /* R2: only for 32-pixel dispatch.*/
   5729 
   5730    /* R3-26: barycentric interpolation coordinates.  These appear in the
   5731     * same order that they appear in the brw_barycentric_mode
   5732     * enum.  Each set of coordinates occupies 2 registers if dispatch width
   5733     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
   5734     * appear if they were enabled using the "Barycentric Interpolation
   5735     * Mode" bits in WM_STATE.
   5736     */
   5737    for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
   5738       if (prog_data->barycentric_interp_modes & (1 << i)) {
   5739          payload.barycentric_coord_reg[i] = payload.num_regs;
   5740          payload.num_regs += 2;
   5741          if (dispatch_width == 16) {
   5742             payload.num_regs += 2;
   5743          }
   5744       }
   5745    }
   5746 
   5747    /* R27: interpolated depth if uses source depth */
   5748    prog_data->uses_src_depth =
   5749       (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
   5750    if (prog_data->uses_src_depth) {
   5751       payload.source_depth_reg = payload.num_regs;
   5752       payload.num_regs++;
   5753       if (dispatch_width == 16) {
   5754          /* R28: interpolated depth if not SIMD8. */
   5755          payload.num_regs++;
   5756       }
   5757    }
   5758 
   5759    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   5760    prog_data->uses_src_w =
   5761       (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
   5762    if (prog_data->uses_src_w) {
   5763       payload.source_w_reg = payload.num_regs;
   5764       payload.num_regs++;
   5765       if (dispatch_width == 16) {
   5766          /* R30: interpolated W if not SIMD8. */
   5767          payload.num_regs++;
   5768       }
   5769    }
   5770 
   5771    /* R31: MSAA position offsets. */
   5772    if (prog_data->persample_dispatch &&
   5773        (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
   5774       /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
   5775        *
   5776        *    "MSDISPMODE_PERSAMPLE is required in order to select
   5777        *    POSOFFSET_SAMPLE"
   5778        *
   5779        * So we can only really get sample positions if we are doing real
   5780        * per-sample dispatch.  If we need gl_SamplePosition and we don't have
   5781        * persample dispatch, we hard-code it to 0.5.
   5782        */
   5783       prog_data->uses_pos_offset = true;
   5784       payload.sample_pos_reg = payload.num_regs;
   5785       payload.num_regs++;
   5786    }
   5787 
   5788    /* R32: MSAA input coverage mask */
   5789    prog_data->uses_sample_mask =
   5790       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
   5791    if (prog_data->uses_sample_mask) {
   5792       assert(devinfo->gen >= 7);
   5793       payload.sample_mask_in_reg = payload.num_regs;
   5794       payload.num_regs++;
   5795       if (dispatch_width == 16) {
   5796          /* R33: input coverage mask if not SIMD8. */
   5797          payload.num_regs++;
   5798       }
   5799    }
   5800 
   5801    /* R34-: bary for 32-pixel. */
   5802    /* R58-59: interp W for 32-pixel. */
   5803 
   5804    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
   5805       source_depth_to_render_target = true;
   5806    }
   5807 }
   5808 
   5809 void
   5810 fs_visitor::setup_vs_payload()
   5811 {
   5812    /* R0: thread header, R1: urb handles */
   5813    payload.num_regs = 2;
   5814 }
   5815 
   5816 void
   5817 fs_visitor::setup_gs_payload()
   5818 {
   5819    assert(stage == MESA_SHADER_GEOMETRY);
   5820 
   5821    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   5822    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   5823 
   5824    /* R0: thread header, R1: output URB handles */
   5825    payload.num_regs = 2;
   5826 
   5827    if (gs_prog_data->include_primitive_id) {
   5828       /* R2: Primitive ID 0..7 */
   5829       payload.num_regs++;
   5830    }
   5831 
   5832    /* Always enable VUE handles so we can safely use pull model if needed.
   5833     *
   5834     * The push model for a GS uses a ton of register space even for trivial
   5835     * scenarios with just a few inputs, so just make things easier and a bit
   5836     * safer by always having pull model available.
   5837     */
   5838    gs_prog_data->base.include_vue_handles = true;
   5839 
   5840    /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   5841    payload.num_regs += nir->info.gs.vertices_in;
   5842 
   5843    /* Use a maximum of 24 registers for push-model inputs. */
   5844    const unsigned max_push_components = 24;
   5845 
   5846    /* If pushing our inputs would take too many registers, reduce the URB read
   5847     * length (which is in HWords, or 8 registers), and resort to pulling.
   5848     *
   5849     * Note that the GS reads <URB Read Length> HWords for every vertex - so we
   5850     * have to multiply by VerticesIn to obtain the total storage requirement.
   5851     */
   5852    if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
   5853        max_push_components) {
   5854       vue_prog_data->urb_read_length =
   5855          ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
   5856    }
   5857 }
   5858 
   5859 void
   5860 fs_visitor::setup_cs_payload()
   5861 {
   5862    assert(devinfo->gen >= 7);
   5863    payload.num_regs = 1;
   5864 }
   5865 
   5866 void
   5867 fs_visitor::calculate_register_pressure()
   5868 {
   5869    invalidate_live_intervals();
   5870    calculate_live_intervals();
   5871 
   5872    unsigned num_instructions = 0;
   5873    foreach_block(block, cfg)
   5874       num_instructions += block->instructions.length();
   5875 
   5876    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
   5877 
   5878    for (unsigned reg = 0; reg < alloc.count; reg++) {
   5879       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
   5880          regs_live_at_ip[ip] += alloc.sizes[reg];
   5881    }
   5882 }
   5883 
   5884 /**
   5885  * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
   5886  *
   5887  * The needs_unlit_centroid_workaround ends up producing one of these per
   5888  * channel of centroid input, so it's good to clean them up.
   5889  *
   5890  * An assumption here is that nothing ever modifies the dispatched pixels
   5891  * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
   5892  * dictates that anyway.
   5893  */
   5894 bool
   5895 fs_visitor::opt_drop_redundant_mov_to_flags()
   5896 {
   5897    bool flag_mov_found[2] = {false};
   5898    bool progress = false;
   5899 
   5900    /* Instructions removed by this pass can only be added if this were true */
   5901    if (!devinfo->needs_unlit_centroid_workaround)
   5902       return false;
   5903 
   5904    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   5905       if (inst->is_control_flow()) {
   5906          memset(flag_mov_found, 0, sizeof(flag_mov_found));
   5907       } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
   5908          if (!flag_mov_found[inst->flag_subreg]) {
   5909             flag_mov_found[inst->flag_subreg] = true;
   5910          } else {
   5911             inst->remove(block);
   5912             progress = true;
   5913          }
   5914       } else if (inst->flags_written()) {
   5915          flag_mov_found[inst->flag_subreg] = false;
   5916       }
   5917    }
   5918 
   5919    return progress;
   5920 }
   5921 
   5922 void
   5923 fs_visitor::optimize()
   5924 {
   5925    /* Start by validating the shader we currently have. */
   5926    validate();
   5927 
   5928    /* bld is the common builder object pointing at the end of the program we
   5929     * used to translate it into i965 IR.  For the optimization and lowering
   5930     * passes coming next, any code added after the end of the program without
   5931     * having explicitly called fs_builder::at() clearly points at a mistake.
   5932     * Ideally optimization passes wouldn't be part of the visitor so they
   5933     * wouldn't have access to bld at all, but they do, so just in case some
   5934     * pass forgets to ask for a location explicitly set it to NULL here to
   5935     * make it trip.  The dispatch width is initialized to a bogus value to
   5936     * make sure that optimizations set the execution controls explicitly to
   5937     * match the code they are manipulating instead of relying on the defaults.
   5938     */
   5939    bld = fs_builder(this, 64);
   5940 
   5941    assign_constant_locations();
   5942    lower_constant_loads();
   5943 
   5944    validate();
   5945 
   5946    split_virtual_grfs();
   5947    validate();
   5948 
   5949 #define OPT(pass, args...) ({                                           \
   5950       pass_num++;                                                       \
   5951       bool this_progress = pass(args);                                  \
   5952                                                                         \
   5953       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
   5954          char filename[64];                                             \
   5955          snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
   5956                   stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
   5957                                                                         \
   5958          backend_shader::dump_instructions(filename);                   \
   5959       }                                                                 \
   5960                                                                         \
   5961       validate();                                                       \
   5962                                                                         \
   5963       progress = progress || this_progress;                             \
   5964       this_progress;                                                    \
   5965    })
   5966 
   5967    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
   5968       char filename[64];
   5969       snprintf(filename, 64, "%s%d-%s-00-00-start",
   5970                stage_abbrev, dispatch_width, nir->info.name);
   5971 
   5972       backend_shader::dump_instructions(filename);
   5973    }
   5974 
   5975    bool progress = false;
   5976    int iteration = 0;
   5977    int pass_num = 0;
   5978 
   5979    OPT(opt_drop_redundant_mov_to_flags);
   5980    OPT(remove_extra_rounding_modes);
   5981 
   5982    do {
   5983       progress = false;
   5984       pass_num = 0;
   5985       iteration++;
   5986 
   5987       OPT(remove_duplicate_mrf_writes);
   5988 
   5989       OPT(opt_algebraic);
   5990       OPT(opt_cse);
   5991       OPT(opt_copy_propagation);
   5992       OPT(opt_predicated_break, this);
   5993       OPT(opt_cmod_propagation);
   5994       OPT(dead_code_eliminate);
   5995       OPT(opt_peephole_sel);
   5996       OPT(dead_control_flow_eliminate, this);
   5997       OPT(opt_register_renaming);
   5998       OPT(opt_saturate_propagation);
   5999       OPT(register_coalesce);
   6000       OPT(compute_to_mrf);
   6001       OPT(eliminate_find_live_channel);
   6002 
   6003       OPT(compact_virtual_grfs);
   6004    } while (progress);
   6005 
   6006    progress = false;
   6007    pass_num = 0;
   6008 
   6009    if (OPT(lower_pack)) {
   6010       OPT(register_coalesce);
   6011       OPT(dead_code_eliminate);
   6012    }
   6013 
   6014    OPT(lower_simd_width);
   6015 
   6016    /* After SIMD lowering just in case we had to unroll the EOT send. */
   6017    OPT(opt_sampler_eot);
   6018 
   6019    OPT(lower_logical_sends);
   6020 
   6021    if (progress) {
   6022       OPT(opt_copy_propagation);
   6023       /* Only run after logical send lowering because it's easier to implement
   6024        * in terms of physical sends.
   6025        */
   6026       if (OPT(opt_zero_samples))
   6027          OPT(opt_copy_propagation);
   6028       /* Run after logical send lowering to give it a chance to CSE the
   6029        * LOAD_PAYLOAD instructions created to construct the payloads of
   6030        * e.g. texturing messages in cases where it wasn't possible to CSE the
   6031        * whole logical instruction.
   6032        */
   6033       OPT(opt_cse);
   6034       OPT(register_coalesce);
   6035       OPT(compute_to_mrf);
   6036       OPT(dead_code_eliminate);
   6037       OPT(remove_duplicate_mrf_writes);
   6038       OPT(opt_peephole_sel);
   6039    }
   6040 
   6041    OPT(opt_redundant_discard_jumps);
   6042 
   6043    if (OPT(lower_load_payload)) {
   6044       split_virtual_grfs();
   6045       OPT(register_coalesce);
   6046       OPT(compute_to_mrf);
   6047       OPT(dead_code_eliminate);
   6048    }
   6049 
   6050    OPT(opt_combine_constants);
   6051    OPT(lower_integer_multiplication);
   6052 
   6053    if (devinfo->gen <= 5 && OPT(lower_minmax)) {
   6054       OPT(opt_cmod_propagation);
   6055       OPT(opt_cse);
   6056       OPT(opt_copy_propagation);
   6057       OPT(dead_code_eliminate);
   6058    }
   6059 
   6060    if (OPT(lower_conversions)) {
   6061       OPT(opt_copy_propagation);
   6062       OPT(dead_code_eliminate);
   6063       OPT(lower_simd_width);
   6064    }
   6065 
   6066    lower_uniform_pull_constant_loads();
   6067 
   6068    validate();
   6069 }
   6070 
   6071 /**
   6072  * Three source instruction must have a GRF/MRF destination register.
   6073  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
   6074  */
   6075 void
   6076 fs_visitor::fixup_3src_null_dest()
   6077 {
   6078    bool progress = false;
   6079 
   6080    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   6081       if (inst->is_3src(devinfo) && inst->dst.is_null()) {
   6082          inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
   6083                             inst->dst.type);
   6084          progress = true;
   6085       }
   6086    }
   6087 
   6088    if (progress)
   6089       invalidate_live_intervals();
   6090 }
   6091 
   6092 void
   6093 fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
   6094 {
   6095    bool allocated_without_spills;
   6096 
   6097    static const enum instruction_scheduler_mode pre_modes[] = {
   6098       SCHEDULE_PRE,
   6099       SCHEDULE_PRE_NON_LIFO,
   6100       SCHEDULE_PRE_LIFO,
   6101    };
   6102 
   6103    bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
   6104 
   6105    /* Try each scheduling heuristic to see if it can successfully register
   6106     * allocate without spilling.  They should be ordered by decreasing
   6107     * performance but increasing likelihood of allocating.
   6108     */
   6109    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
   6110       schedule_instructions(pre_modes[i]);
   6111 
   6112       if (0) {
   6113          assign_regs_trivial();
   6114          allocated_without_spills = true;
   6115       } else {
   6116          allocated_without_spills = assign_regs(false, spill_all);
   6117       }
   6118       if (allocated_without_spills)
   6119          break;
   6120    }
   6121 
   6122    if (!allocated_without_spills) {
   6123       if (!allow_spilling)
   6124          fail("Failure to register allocate and spilling is not allowed.");
   6125 
   6126       /* We assume that any spilling is worse than just dropping back to
   6127        * SIMD8.  There's probably actually some intermediate point where
   6128        * SIMD16 with a couple of spills is still better.
   6129        */
   6130       if (dispatch_width > min_dispatch_width) {
   6131          fail("Failure to register allocate.  Reduce number of "
   6132               "live scalar values to avoid this.");
   6133       } else {
   6134          compiler->shader_perf_log(log_data,
   6135                                    "%s shader triggered register spilling.  "
   6136                                    "Try reducing the number of live scalar "
   6137                                    "values to improve performance.\n",
   6138                                    stage_name);
   6139       }
   6140 
   6141       /* Since we're out of heuristics, just go spill registers until we
   6142        * get an allocation.
   6143        */
   6144       while (!assign_regs(true, spill_all)) {
   6145          if (failed)
   6146             break;
   6147       }
   6148    }
   6149 
   6150    /* This must come after all optimization and register allocation, since
   6151     * it inserts dead code that happens to have side effects, and it does
   6152     * so based on the actual physical registers in use.
   6153     */
   6154    insert_gen4_send_dependency_workarounds();
   6155 
   6156    if (failed)
   6157       return;
   6158 
   6159    opt_bank_conflicts();
   6160 
   6161    schedule_instructions(SCHEDULE_POST);
   6162 
   6163    if (last_scratch > 0) {
   6164       MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024;
   6165 
   6166       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
   6167 
   6168       if (stage == MESA_SHADER_COMPUTE) {
   6169          if (devinfo->is_haswell) {
   6170             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
   6171              * field documentation, Haswell supports a minimum of 2kB of
   6172              * scratch space for compute shaders, unlike every other stage
   6173              * and platform.
   6174              */
   6175             prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
   6176          } else if (devinfo->gen <= 7) {
   6177             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
   6178              * field documentation, platforms prior to Haswell measure scratch
   6179              * size linearly with a range of [1kB, 12kB] and 1kB granularity.
   6180              */
   6181             prog_data->total_scratch = ALIGN(last_scratch, 1024);
   6182             max_scratch_size = 12 * 1024;
   6183          }
   6184       }
   6185 
   6186       /* We currently only support up to 2MB of scratch space.  If we
   6187        * need to support more eventually, the documentation suggests
   6188        * that we could allocate a larger buffer, and partition it out
   6189        * ourselves.  We'd just have to undo the hardware's address
   6190        * calculation by subtracting (FFTID * Per Thread Scratch Space)
   6191        * and then add FFTID * (Larger Per Thread Scratch Space).
   6192        *
   6193        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
   6194        * Thread Group Tracking > Local Memory/Scratch Space.
   6195        */
   6196       assert(prog_data->total_scratch < max_scratch_size);
   6197    }
   6198 }
   6199 
   6200 bool
   6201 fs_visitor::run_vs()
   6202 {
   6203    assert(stage == MESA_SHADER_VERTEX);
   6204 
   6205    setup_vs_payload();
   6206 
   6207    if (shader_time_index >= 0)
   6208       emit_shader_time_begin();
   6209 
   6210    emit_nir_code();
   6211 
   6212    if (failed)
   6213       return false;
   6214 
   6215    compute_clip_distance();
   6216 
   6217    emit_urb_writes();
   6218 
   6219    if (shader_time_index >= 0)
   6220       emit_shader_time_end();
   6221 
   6222    calculate_cfg();
   6223 
   6224    optimize();
   6225 
   6226    assign_curb_setup();
   6227    assign_vs_urb_setup();
   6228 
   6229    fixup_3src_null_dest();
   6230    allocate_registers(8, true);
   6231 
   6232    return !failed;
   6233 }
   6234 
   6235 bool
   6236 fs_visitor::run_tcs_single_patch()
   6237 {
   6238    assert(stage == MESA_SHADER_TESS_CTRL);
   6239 
   6240    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
   6241 
   6242    /* r1-r4 contain the ICP handles. */
   6243    payload.num_regs = 5;
   6244 
   6245    if (shader_time_index >= 0)
   6246       emit_shader_time_begin();
   6247 
   6248    /* Initialize gl_InvocationID */
   6249    fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
   6250    fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
   6251    bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
   6252    bld.MOV(channels_ud, channels_uw);
   6253 
   6254    if (tcs_prog_data->instances == 1) {
   6255       invocation_id = channels_ud;
   6256    } else {
   6257       invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
   6258 
   6259       /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
   6260       fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
   6261       fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
   6262       bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
   6263               brw_imm_ud(INTEL_MASK(23, 17)));
   6264       bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
   6265 
   6266       bld.ADD(invocation_id, instance_times_8, channels_ud);
   6267    }
   6268 
   6269    /* Fix the disptach mask */
   6270    if (nir->info.tess.tcs_vertices_out % 8) {
   6271       bld.CMP(bld.null_reg_ud(), invocation_id,
   6272               brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
   6273       bld.IF(BRW_PREDICATE_NORMAL);
   6274    }
   6275 
   6276    emit_nir_code();
   6277 
   6278    if (nir->info.tess.tcs_vertices_out % 8) {
   6279       bld.emit(BRW_OPCODE_ENDIF);
   6280    }
   6281 
   6282    /* Emit EOT write; set TR DS Cache bit */
   6283    fs_reg srcs[3] = {
   6284       fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
   6285       fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
   6286       fs_reg(brw_imm_ud(0)),
   6287    };
   6288    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
   6289    bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
   6290 
   6291    fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
   6292                             bld.null_reg_ud(), payload);
   6293    inst->mlen = 3;
   6294    inst->eot = true;
   6295 
   6296    if (shader_time_index >= 0)
   6297       emit_shader_time_end();
   6298 
   6299    if (failed)
   6300       return false;
   6301 
   6302    calculate_cfg();
   6303 
   6304    optimize();
   6305 
   6306    assign_curb_setup();
   6307    assign_tcs_single_patch_urb_setup();
   6308 
   6309    fixup_3src_null_dest();
   6310    allocate_registers(8, true);
   6311 
   6312    return !failed;
   6313 }
   6314 
   6315 bool
   6316 fs_visitor::run_tes()
   6317 {
   6318    assert(stage == MESA_SHADER_TESS_EVAL);
   6319 
   6320    /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
   6321    payload.num_regs = 5;
   6322 
   6323    if (shader_time_index >= 0)
   6324       emit_shader_time_begin();
   6325 
   6326    emit_nir_code();
   6327 
   6328    if (failed)
   6329       return false;
   6330 
   6331    emit_urb_writes();
   6332 
   6333    if (shader_time_index >= 0)
   6334       emit_shader_time_end();
   6335 
   6336    calculate_cfg();
   6337 
   6338    optimize();
   6339 
   6340    assign_curb_setup();
   6341    assign_tes_urb_setup();
   6342 
   6343    fixup_3src_null_dest();
   6344    allocate_registers(8, true);
   6345 
   6346    return !failed;
   6347 }
   6348 
   6349 bool
   6350 fs_visitor::run_gs()
   6351 {
   6352    assert(stage == MESA_SHADER_GEOMETRY);
   6353 
   6354    setup_gs_payload();
   6355 
   6356    this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
   6357 
   6358    if (gs_compile->control_data_header_size_bits > 0) {
   6359       /* Create a VGRF to store accumulated control data bits. */
   6360       this->control_data_bits = vgrf(glsl_type::uint_type);
   6361 
   6362       /* If we're outputting more than 32 control data bits, then EmitVertex()
   6363        * will set control_data_bits to 0 after emitting the first vertex.
   6364        * Otherwise, we need to initialize it to 0 here.
   6365        */
   6366       if (gs_compile->control_data_header_size_bits <= 32) {
   6367          const fs_builder abld = bld.annotate("initialize control data bits");
   6368          abld.MOV(this->control_data_bits, brw_imm_ud(0u));
   6369       }
   6370    }
   6371 
   6372    if (shader_time_index >= 0)
   6373       emit_shader_time_begin();
   6374 
   6375    emit_nir_code();
   6376 
   6377    emit_gs_thread_end();
   6378 
   6379    if (shader_time_index >= 0)
   6380       emit_shader_time_end();
   6381 
   6382    if (failed)
   6383       return false;
   6384 
   6385    calculate_cfg();
   6386 
   6387    optimize();
   6388 
   6389    assign_curb_setup();
   6390    assign_gs_urb_setup();
   6391 
   6392    fixup_3src_null_dest();
   6393    allocate_registers(8, true);
   6394 
   6395    return !failed;
   6396 }
   6397 
   6398 /* From the SKL PRM, Volume 16, Workarounds:
   6399  *
   6400  *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
   6401  *              only header phases (R0-R2)
   6402  *
   6403  *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
   6404  *       have been header only.
   6405  *
   6406  * Instead of enabling push constants one can alternatively enable one of the
   6407  * inputs. Here one simply chooses "layer" which shouldn't impose much
   6408  * overhead.
   6409  */
   6410 static void
   6411 gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
   6412 {
   6413    if (wm_prog_data->num_varying_inputs)
   6414       return;
   6415 
   6416    if (wm_prog_data->base.curb_read_length)
   6417       return;
   6418 
   6419    wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
   6420    wm_prog_data->num_varying_inputs = 1;
   6421 }
   6422 
   6423 bool
   6424 fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
   6425 {
   6426    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   6427    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
   6428 
   6429    assert(stage == MESA_SHADER_FRAGMENT);
   6430 
   6431    if (devinfo->gen >= 6)
   6432       setup_fs_payload_gen6();
   6433    else
   6434       setup_fs_payload_gen4();
   6435 
   6436    if (0) {
   6437       emit_dummy_fs();
   6438    } else if (do_rep_send) {
   6439       assert(dispatch_width == 16);
   6440       emit_repclear_shader();
   6441    } else {
   6442       if (shader_time_index >= 0)
   6443          emit_shader_time_begin();
   6444 
   6445       calculate_urb_setup();
   6446       if (nir->info.inputs_read > 0 ||
   6447           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
   6448          if (devinfo->gen < 6)
   6449             emit_interpolation_setup_gen4();
   6450          else
   6451             emit_interpolation_setup_gen6();
   6452       }
   6453 
   6454       /* We handle discards by keeping track of the still-live pixels in f0.1.
   6455        * Initialize it with the dispatched pixels.
   6456        */
   6457       if (wm_prog_data->uses_kill) {
   6458          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
   6459          discard_init->flag_subreg = 1;
   6460       }
   6461 
   6462       /* Generate FS IR for main().  (the visitor only descends into
   6463        * functions called "main").
   6464        */
   6465       emit_nir_code();
   6466 
   6467       if (failed)
   6468 	 return false;
   6469 
   6470       if (wm_prog_data->uses_kill)
   6471          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
   6472 
   6473       if (wm_key->alpha_test_func)
   6474          emit_alpha_test();
   6475 
   6476       emit_fb_writes();
   6477 
   6478       if (shader_time_index >= 0)
   6479          emit_shader_time_end();
   6480 
   6481       calculate_cfg();
   6482 
   6483       optimize();
   6484 
   6485       assign_curb_setup();
   6486 
   6487       if (devinfo->gen >= 9)
   6488          gen9_ps_header_only_workaround(wm_prog_data);
   6489 
   6490       assign_urb_setup();
   6491 
   6492       fixup_3src_null_dest();
   6493       allocate_registers(8, allow_spilling);
   6494 
   6495       if (failed)
   6496          return false;
   6497    }
   6498 
   6499    return !failed;
   6500 }
   6501 
   6502 bool
   6503 fs_visitor::run_cs(unsigned min_dispatch_width)
   6504 {
   6505    assert(stage == MESA_SHADER_COMPUTE);
   6506    assert(dispatch_width >= min_dispatch_width);
   6507 
   6508    setup_cs_payload();
   6509 
   6510    if (shader_time_index >= 0)
   6511       emit_shader_time_begin();
   6512 
   6513    if (devinfo->is_haswell && prog_data->total_shared > 0) {
   6514       /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
   6515       const fs_builder abld = bld.exec_all().group(1, 0);
   6516       abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
   6517                suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
   6518    }
   6519 
   6520    emit_nir_code();
   6521 
   6522    if (failed)
   6523       return false;
   6524 
   6525    emit_cs_terminate();
   6526 
   6527    if (shader_time_index >= 0)
   6528       emit_shader_time_end();
   6529 
   6530    calculate_cfg();
   6531 
   6532    optimize();
   6533 
   6534    assign_curb_setup();
   6535 
   6536    fixup_3src_null_dest();
   6537    allocate_registers(min_dispatch_width, true);
   6538 
   6539    if (failed)
   6540       return false;
   6541 
   6542    return !failed;
   6543 }
   6544 
   6545 /**
   6546  * Return a bitfield where bit n is set if barycentric interpolation mode n
   6547  * (see enum brw_barycentric_mode) is needed by the fragment shader.
   6548  *
   6549  * We examine the load_barycentric intrinsics rather than looking at input
   6550  * variables so that we catch interpolateAtCentroid() messages too, which
   6551  * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
   6552  */
   6553 static unsigned
   6554 brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
   6555                                      const nir_shader *shader)
   6556 {
   6557    unsigned barycentric_interp_modes = 0;
   6558 
   6559    nir_foreach_function(f, shader) {
   6560       if (!f->impl)
   6561          continue;
   6562 
   6563       nir_foreach_block(block, f->impl) {
   6564          nir_foreach_instr(instr, block) {
   6565             if (instr->type != nir_instr_type_intrinsic)
   6566                continue;
   6567 
   6568             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   6569             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
   6570                continue;
   6571 
   6572             /* Ignore WPOS; it doesn't require interpolation. */
   6573             if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
   6574                continue;
   6575 
   6576             intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
   6577             enum glsl_interp_mode interp = (enum glsl_interp_mode)
   6578                nir_intrinsic_interp_mode(intrin);
   6579             nir_intrinsic_op bary_op = intrin->intrinsic;
   6580             enum brw_barycentric_mode bary =
   6581                brw_barycentric_mode(interp, bary_op);
   6582 
   6583             barycentric_interp_modes |= 1 << bary;
   6584 
   6585             if (devinfo->needs_unlit_centroid_workaround &&
   6586                 bary_op == nir_intrinsic_load_barycentric_centroid)
   6587                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
   6588          }
   6589       }
   6590    }
   6591 
   6592    return barycentric_interp_modes;
   6593 }
   6594 
   6595 static void
   6596 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
   6597                         const nir_shader *shader)
   6598 {
   6599    prog_data->flat_inputs = 0;
   6600 
   6601    nir_foreach_variable(var, &shader->inputs) {
   6602       int input_index = prog_data->urb_setup[var->data.location];
   6603 
   6604       if (input_index < 0)
   6605 	 continue;
   6606 
   6607       /* flat shading */
   6608       if (var->data.interpolation == INTERP_MODE_FLAT)
   6609          prog_data->flat_inputs |= (1 << input_index);
   6610    }
   6611 }
   6612 
   6613 static uint8_t
   6614 computed_depth_mode(const nir_shader *shader)
   6615 {
   6616    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
   6617       switch (shader->info.fs.depth_layout) {
   6618       case FRAG_DEPTH_LAYOUT_NONE:
   6619       case FRAG_DEPTH_LAYOUT_ANY:
   6620          return BRW_PSCDEPTH_ON;
   6621       case FRAG_DEPTH_LAYOUT_GREATER:
   6622          return BRW_PSCDEPTH_ON_GE;
   6623       case FRAG_DEPTH_LAYOUT_LESS:
   6624          return BRW_PSCDEPTH_ON_LE;
   6625       case FRAG_DEPTH_LAYOUT_UNCHANGED:
   6626          return BRW_PSCDEPTH_OFF;
   6627       }
   6628    }
   6629    return BRW_PSCDEPTH_OFF;
   6630 }
   6631 
   6632 /**
   6633  * Move load_interpolated_input with simple (payload-based) barycentric modes
   6634  * to the top of the program so we don't emit multiple PLNs for the same input.
   6635  *
   6636  * This works around CSE not being able to handle non-dominating cases
   6637  * such as:
   6638  *
   6639  *    if (...) {
   6640  *       interpolate input
   6641  *    } else {
   6642  *       interpolate the same exact input
   6643  *    }
   6644  *
   6645  * This should be replaced by global value numbering someday.
   6646  */
   6647 static bool
   6648 move_interpolation_to_top(nir_shader *nir)
   6649 {
   6650    bool progress = false;
   6651 
   6652    nir_foreach_function(f, nir) {
   6653       if (!f->impl)
   6654          continue;
   6655 
   6656       nir_block *top = nir_start_block(f->impl);
   6657       exec_node *cursor_node = NULL;
   6658 
   6659       nir_foreach_block(block, f->impl) {
   6660          if (block == top)
   6661             continue;
   6662 
   6663          nir_foreach_instr_safe(instr, block) {
   6664             if (instr->type != nir_instr_type_intrinsic)
   6665                continue;
   6666 
   6667             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   6668             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
   6669                continue;
   6670             nir_intrinsic_instr *bary_intrinsic =
   6671                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
   6672             nir_intrinsic_op op = bary_intrinsic->intrinsic;
   6673 
   6674             /* Leave interpolateAtSample/Offset() where they are. */
   6675             if (op == nir_intrinsic_load_barycentric_at_sample ||
   6676                 op == nir_intrinsic_load_barycentric_at_offset)
   6677                continue;
   6678 
   6679             nir_instr *move[3] = {
   6680                &bary_intrinsic->instr,
   6681                intrin->src[1].ssa->parent_instr,
   6682                instr
   6683             };
   6684 
   6685             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
   6686                if (move[i]->block != top) {
   6687                   move[i]->block = top;
   6688                   exec_node_remove(&move[i]->node);
   6689                   if (cursor_node) {
   6690                      exec_node_insert_after(cursor_node, &move[i]->node);
   6691                   } else {
   6692                      exec_list_push_head(&top->instr_list, &move[i]->node);
   6693                   }
   6694                   cursor_node = &move[i]->node;
   6695                   progress = true;
   6696                }
   6697             }
   6698          }
   6699       }
   6700       nir_metadata_preserve(f->impl, (nir_metadata)
   6701                             ((unsigned) nir_metadata_block_index |
   6702                              (unsigned) nir_metadata_dominance));
   6703    }
   6704 
   6705    return progress;
   6706 }
   6707 
   6708 /**
   6709  * Demote per-sample barycentric intrinsics to centroid.
   6710  *
   6711  * Useful when rendering to a non-multisampled buffer.
   6712  */
   6713 static bool
   6714 demote_sample_qualifiers(nir_shader *nir)
   6715 {
   6716    bool progress = true;
   6717 
   6718    nir_foreach_function(f, nir) {
   6719       if (!f->impl)
   6720          continue;
   6721 
   6722       nir_builder b;
   6723       nir_builder_init(&b, f->impl);
   6724 
   6725       nir_foreach_block(block, f->impl) {
   6726          nir_foreach_instr_safe(instr, block) {
   6727             if (instr->type != nir_instr_type_intrinsic)
   6728                continue;
   6729 
   6730             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   6731             if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
   6732                 intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
   6733                continue;
   6734 
   6735             b.cursor = nir_before_instr(instr);
   6736             nir_ssa_def *centroid =
   6737                nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
   6738                                     nir_intrinsic_interp_mode(intrin));
   6739             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
   6740                                      nir_src_for_ssa(centroid));
   6741             nir_instr_remove(instr);
   6742             progress = true;
   6743          }
   6744       }
   6745 
   6746       nir_metadata_preserve(f->impl, (nir_metadata)
   6747                             ((unsigned) nir_metadata_block_index |
   6748                              (unsigned) nir_metadata_dominance));
   6749    }
   6750 
   6751    return progress;
   6752 }
   6753 
   6754 /**
   6755  * Pre-gen6, the register file of the EUs was shared between threads,
   6756  * and each thread used some subset allocated on a 16-register block
   6757  * granularity.  The unit states wanted these block counts.
   6758  */
   6759 static inline int
   6760 brw_register_blocks(int reg_count)
   6761 {
   6762    return ALIGN(reg_count, 16) / 16 - 1;
   6763 }
   6764 
   6765 const unsigned *
   6766 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
   6767                void *mem_ctx,
   6768                const struct brw_wm_prog_key *key,
   6769                struct brw_wm_prog_data *prog_data,
   6770                const nir_shader *src_shader,
   6771                struct gl_program *prog,
   6772                int shader_time_index8, int shader_time_index16,
   6773                bool allow_spilling,
   6774                bool use_rep_send, struct brw_vue_map *vue_map,
   6775                char **error_str)
   6776 {
   6777    const struct gen_device_info *devinfo = compiler->devinfo;
   6778 
   6779    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
   6780    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
   6781    brw_nir_lower_fs_inputs(shader, devinfo, key);
   6782    brw_nir_lower_fs_outputs(shader);
   6783 
   6784    if (devinfo->gen < 6) {
   6785       brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
   6786    }
   6787 
   6788    if (!key->multisample_fbo)
   6789       NIR_PASS_V(shader, demote_sample_qualifiers);
   6790    NIR_PASS_V(shader, move_interpolation_to_top);
   6791    shader = brw_postprocess_nir(shader, compiler, true);
   6792 
   6793    /* key->alpha_test_func means simulating alpha testing via discards,
   6794     * so the shader definitely kills pixels.
   6795     */
   6796    prog_data->uses_kill = shader->info.fs.uses_discard ||
   6797       key->alpha_test_func;
   6798    prog_data->uses_omask = key->multisample_fbo &&
   6799       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   6800    prog_data->computed_depth_mode = computed_depth_mode(shader);
   6801    prog_data->computed_stencil =
   6802       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
   6803 
   6804    prog_data->persample_dispatch =
   6805       key->multisample_fbo &&
   6806       (key->persample_interp ||
   6807        (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
   6808                                             SYSTEM_BIT_SAMPLE_POS)) ||
   6809        shader->info.fs.uses_sample_qualifier ||
   6810        shader->info.outputs_read);
   6811 
   6812    prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
   6813 
   6814    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
   6815    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
   6816    prog_data->inner_coverage = shader->info.fs.inner_coverage;
   6817 
   6818    prog_data->barycentric_interp_modes =
   6819       brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
   6820 
   6821    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
   6822    uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
   6823    unsigned simd8_grf_used = 0, simd16_grf_used = 0;
   6824 
   6825    fs_visitor v8(compiler, log_data, mem_ctx, key,
   6826                  &prog_data->base, prog, shader, 8,
   6827                  shader_time_index8);
   6828    if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
   6829       if (error_str)
   6830          *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
   6831 
   6832       return NULL;
   6833    } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
   6834       simd8_cfg = v8.cfg;
   6835       simd8_grf_start = v8.payload.num_regs;
   6836       simd8_grf_used = v8.grf_used;
   6837    }
   6838 
   6839    if (v8.max_dispatch_width >= 16 &&
   6840        likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
   6841       /* Try a SIMD16 compile */
   6842       fs_visitor v16(compiler, log_data, mem_ctx, key,
   6843                      &prog_data->base, prog, shader, 16,
   6844                      shader_time_index16);
   6845       v16.import_uniforms(&v8);
   6846       if (!v16.run_fs(allow_spilling, use_rep_send)) {
   6847          compiler->shader_perf_log(log_data,
   6848                                    "SIMD16 shader failed to compile: %s",
   6849                                    v16.fail_msg);
   6850       } else {
   6851          simd16_cfg = v16.cfg;
   6852          simd16_grf_start = v16.payload.num_regs;
   6853          simd16_grf_used = v16.grf_used;
   6854       }
   6855    }
   6856 
   6857    /* When the caller requests a repclear shader, they want SIMD16-only */
   6858    if (use_rep_send)
   6859       simd8_cfg = NULL;
   6860 
   6861    /* Prior to Iron Lake, the PS had a single shader offset with a jump table
   6862     * at the top to select the shader.  We've never implemented that.
   6863     * Instead, we just give them exactly one shader and we pick the widest one
   6864     * available.
   6865     */
   6866    if (compiler->devinfo->gen < 5 && simd16_cfg)
   6867       simd8_cfg = NULL;
   6868 
   6869    if (prog_data->persample_dispatch) {
   6870       /* Starting with SandyBridge (where we first get MSAA), the different
   6871        * pixel dispatch combinations are grouped into classifications A
   6872        * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
   6873        * generations, the only configurations supporting persample dispatch
   6874        * are are this in which only one dispatch width is enabled.
   6875        *
   6876        * If computed depth is enabled, SNB only allows SIMD8 while IVB+
   6877        * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
   6878        */
   6879       if (compiler->devinfo->gen == 6 &&
   6880           prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
   6881          simd16_cfg = NULL;
   6882       } else if (simd16_cfg) {
   6883          simd8_cfg = NULL;
   6884       }
   6885    }
   6886 
   6887    /* We have to compute the flat inputs after the visitor is finished running
   6888     * because it relies on prog_data->urb_setup which is computed in
   6889     * fs_visitor::calculate_urb_setup().
   6890     */
   6891    brw_compute_flat_inputs(prog_data, shader);
   6892 
   6893    fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
   6894                   v8.promoted_constants, v8.runtime_check_aads_emit,
   6895                   MESA_SHADER_FRAGMENT);
   6896 
   6897    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   6898       g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
   6899                                      shader->info.label ?
   6900                                         shader->info.label : "unnamed",
   6901                                      shader->info.name));
   6902    }
   6903 
   6904    if (simd8_cfg) {
   6905       prog_data->dispatch_8 = true;
   6906       g.generate_code(simd8_cfg, 8);
   6907       prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
   6908       prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
   6909 
   6910       if (simd16_cfg) {
   6911          prog_data->dispatch_16 = true;
   6912          prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
   6913          prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
   6914          prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
   6915       }
   6916    } else if (simd16_cfg) {
   6917       prog_data->dispatch_16 = true;
   6918       g.generate_code(simd16_cfg, 16);
   6919       prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
   6920       prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
   6921    }
   6922 
   6923    return g.get_assembly(&prog_data->base.program_size);
   6924 }
   6925 
   6926 fs_reg *
   6927 fs_visitor::emit_cs_work_group_id_setup()
   6928 {
   6929    assert(stage == MESA_SHADER_COMPUTE);
   6930 
   6931    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
   6932 
   6933    struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   6934    struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
   6935    struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
   6936 
   6937    bld.MOV(*reg, r0_1);
   6938    bld.MOV(offset(*reg, bld, 1), r0_6);
   6939    bld.MOV(offset(*reg, bld, 2), r0_7);
   6940 
   6941    return reg;
   6942 }
   6943 
   6944 static void
   6945 fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
   6946 {
   6947    block->dwords = dwords;
   6948    block->regs = DIV_ROUND_UP(dwords, 8);
   6949    block->size = block->regs * 32;
   6950 }
   6951 
   6952 static void
   6953 cs_fill_push_const_info(const struct gen_device_info *devinfo,
   6954                         struct brw_cs_prog_data *cs_prog_data)
   6955 {
   6956    const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   6957    int subgroup_id_index = get_subgroup_id_param_index(prog_data);
   6958    bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;
   6959 
   6960    /* The thread ID should be stored in the last param dword */
   6961    assert(subgroup_id_index == -1 ||
   6962           subgroup_id_index == (int)prog_data->nr_params - 1);
   6963 
   6964    unsigned cross_thread_dwords, per_thread_dwords;
   6965    if (!cross_thread_supported) {
   6966       cross_thread_dwords = 0u;
   6967       per_thread_dwords = prog_data->nr_params;
   6968    } else if (subgroup_id_index >= 0) {
   6969       /* Fill all but the last register with cross-thread payload */
   6970       cross_thread_dwords = 8 * (subgroup_id_index / 8);
   6971       per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
   6972       assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
   6973    } else {
   6974       /* Fill all data using cross-thread payload */
   6975       cross_thread_dwords = prog_data->nr_params;
   6976       per_thread_dwords = 0u;
   6977    }
   6978 
   6979    fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
   6980    fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
   6981 
   6982    unsigned total_dwords =
   6983       (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
   6984        cs_prog_data->push.cross_thread.size) / 4;
   6985    fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
   6986 
   6987    assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
   6988           cs_prog_data->push.per_thread.size == 0);
   6989    assert(cs_prog_data->push.cross_thread.dwords +
   6990           cs_prog_data->push.per_thread.dwords ==
   6991              prog_data->nr_params);
   6992 }
   6993 
   6994 static void
   6995 cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
   6996 {
   6997    cs_prog_data->simd_size = size;
   6998    unsigned group_size = cs_prog_data->local_size[0] *
   6999       cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
   7000    cs_prog_data->threads = (group_size + size - 1) / size;
   7001 }
   7002 
   7003 static nir_shader *
   7004 compile_cs_to_nir(const struct brw_compiler *compiler,
   7005                   void *mem_ctx,
   7006                   const struct brw_cs_prog_key *key,
   7007                   struct brw_cs_prog_data *prog_data,
   7008                   const nir_shader *src_shader,
   7009                   unsigned dispatch_width)
   7010 {
   7011    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
   7012    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
   7013    brw_nir_lower_cs_intrinsics(shader, dispatch_width);
   7014    return brw_postprocess_nir(shader, compiler, true);
   7015 }
   7016 
   7017 const unsigned *
   7018 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
   7019                void *mem_ctx,
   7020                const struct brw_cs_prog_key *key,
   7021                struct brw_cs_prog_data *prog_data,
   7022                const nir_shader *src_shader,
   7023                int shader_time_index,
   7024                char **error_str)
   7025 {
   7026    prog_data->local_size[0] = src_shader->info.cs.local_size[0];
   7027    prog_data->local_size[1] = src_shader->info.cs.local_size[1];
   7028    prog_data->local_size[2] = src_shader->info.cs.local_size[2];
   7029    unsigned local_workgroup_size =
   7030       src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
   7031       src_shader->info.cs.local_size[2];
   7032 
   7033    unsigned min_dispatch_width =
   7034       DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
   7035    min_dispatch_width = MAX2(8, min_dispatch_width);
   7036    min_dispatch_width = util_next_power_of_two(min_dispatch_width);
   7037    assert(min_dispatch_width <= 32);
   7038 
   7039    fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
   7040    cfg_t *cfg = NULL;
   7041    const char *fail_msg = NULL;
   7042    unsigned promoted_constants = 0;
   7043 
   7044    /* Now the main event: Visit the shader IR and generate our CS IR for it.
   7045     */
   7046    if (min_dispatch_width <= 8) {
   7047       nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
   7048                                            prog_data, src_shader, 8);
   7049       v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
   7050                           NULL, /* Never used in core profile */
   7051                           nir8, 8, shader_time_index);
   7052       if (!v8->run_cs(min_dispatch_width)) {
   7053          fail_msg = v8->fail_msg;
   7054       } else {
   7055          /* We should always be able to do SIMD32 for compute shaders */
   7056          assert(v8->max_dispatch_width >= 32);
   7057 
   7058          cfg = v8->cfg;
   7059          cs_set_simd_size(prog_data, 8);
   7060          cs_fill_push_const_info(compiler->devinfo, prog_data);
   7061          promoted_constants = v8->promoted_constants;
   7062       }
   7063    }
   7064 
   7065    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
   7066        !fail_msg && min_dispatch_width <= 16) {
   7067       /* Try a SIMD16 compile */
   7068       nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
   7069                                             prog_data, src_shader, 16);
   7070       v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
   7071                            NULL, /* Never used in core profile */
   7072                            nir16, 16, shader_time_index);
   7073       if (v8)
   7074          v16->import_uniforms(v8);
   7075 
   7076       if (!v16->run_cs(min_dispatch_width)) {
   7077          compiler->shader_perf_log(log_data,
   7078                                    "SIMD16 shader failed to compile: %s",
   7079                                    v16->fail_msg);
   7080          if (!cfg) {
   7081             fail_msg =
   7082                "Couldn't generate SIMD16 program and not "
   7083                "enough threads for SIMD8";
   7084          }
   7085       } else {
   7086          /* We should always be able to do SIMD32 for compute shaders */
   7087          assert(v16->max_dispatch_width >= 32);
   7088 
   7089          cfg = v16->cfg;
   7090          cs_set_simd_size(prog_data, 16);
   7091          cs_fill_push_const_info(compiler->devinfo, prog_data);
   7092          promoted_constants = v16->promoted_constants;
   7093       }
   7094    }
   7095 
   7096    /* We should always be able to do SIMD32 for compute shaders */
   7097    assert(!v16 || v16->max_dispatch_width >= 32);
   7098 
   7099    if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
   7100       /* Try a SIMD32 compile */
   7101       nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
   7102                                             prog_data, src_shader, 32);
   7103       v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
   7104                            NULL, /* Never used in core profile */
   7105                            nir32, 32, shader_time_index);
   7106       if (v8)
   7107          v32->import_uniforms(v8);
   7108       else if (v16)
   7109          v32->import_uniforms(v16);
   7110 
   7111       if (!v32->run_cs(min_dispatch_width)) {
   7112          compiler->shader_perf_log(log_data,
   7113                                    "SIMD32 shader failed to compile: %s",
   7114                                    v16->fail_msg);
   7115          if (!cfg) {
   7116             fail_msg =
   7117                "Couldn't generate SIMD32 program and not "
   7118                "enough threads for SIMD16";
   7119          }
   7120       } else {
   7121          cfg = v32->cfg;
   7122          cs_set_simd_size(prog_data, 32);
   7123          cs_fill_push_const_info(compiler->devinfo, prog_data);
   7124          promoted_constants = v32->promoted_constants;
   7125       }
   7126    }
   7127 
   7128    const unsigned *ret = NULL;
   7129    if (unlikely(cfg == NULL)) {
   7130       assert(fail_msg);
   7131       if (error_str)
   7132          *error_str = ralloc_strdup(mem_ctx, fail_msg);
   7133    } else {
   7134       fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
   7135                      promoted_constants, false, MESA_SHADER_COMPUTE);
   7136       if (INTEL_DEBUG & DEBUG_CS) {
   7137          char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
   7138                                       src_shader->info.label ?
   7139                                          src_shader->info.label : "unnamed",
   7140                                       src_shader->info.name);
   7141          g.enable_debug(name);
   7142       }
   7143 
   7144       g.generate_code(cfg, prog_data->simd_size);
   7145 
   7146       ret = g.get_assembly(&prog_data->base.program_size);
   7147    }
   7148 
   7149    delete v8;
   7150    delete v16;
   7151    delete v32;
   7152 
   7153    return ret;
   7154 }
   7155 
   7156 /**
   7157  * Test the dispatch mask packing assumptions of
   7158  * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
   7159  * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
   7160  * executed with an unexpected dispatch mask.
   7161  */
   7162 static UNUSED void
   7163 brw_fs_test_dispatch_packing(const fs_builder &bld)
   7164 {
   7165    const gl_shader_stage stage = bld.shader->stage;
   7166 
   7167    if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
   7168                                      bld.shader->stage_prog_data)) {
   7169       const fs_builder ubld = bld.exec_all().group(1, 0);
   7170       const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
   7171       const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
   7172                            brw_dmask_reg());
   7173 
   7174       ubld.ADD(tmp, mask, brw_imm_ud(1));
   7175       ubld.AND(tmp, mask, tmp);
   7176 
   7177       /* This will loop forever if the dispatch mask doesn't have the expected
   7178        * form '2^n-1', in which case tmp will be non-zero.
   7179        */
   7180       bld.emit(BRW_OPCODE_DO);
   7181       bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
   7182       set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   7183    }
   7184 }
   7185