      1 /*
      2  * Copyright © 2010 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 /** @file brw_fs.cpp
     25  *
     26  * This file drives the GLSL IR -> LIR translation, contains the
     27  * optimizations on the LIR, and drives the generation of native code
     28  * from the LIR.
     29  */
     30 
     31 #include "main/macros.h"
     32 #include "brw_context.h"
     33 #include "brw_eu.h"
     34 #include "brw_fs.h"
     35 #include "brw_cs.h"
     36 #include "brw_nir.h"
     37 #include "brw_vec4_gs_visitor.h"
     38 #include "brw_cfg.h"
     39 #include "brw_program.h"
     40 #include "brw_dead_control_flow.h"
     41 #include "compiler/glsl_types.h"
     42 #include "compiler/nir/nir_builder.h"
     43 #include "program/prog_parameter.h"
     44 
     45 using namespace brw;
     46 
     47 static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
     48                                        const fs_inst *inst);
     49 
     50 void
     51 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
     52               const fs_reg *src, unsigned sources)
     53 {
     54    memset(this, 0, sizeof(*this));
     55 
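            /* Always allocate at least three source slots; fs_inst::equals()
             * below unconditionally inspects src[0..2], even for instructions
             * with fewer sources.
             */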
     56    this->src = new fs_reg[MAX2(sources, 3)];
     57    for (unsigned i = 0; i < sources; i++)
     58       this->src[i] = src[i];
     59 
     60    this->opcode = opcode;
     61    this->dst = dst;
     62    this->sources = sources;
     63    this->exec_size = exec_size;
     64    this->base_mrf = -1;
     65 
     66    assert(dst.file != IMM && dst.file != UNIFORM);
     67 
     68    assert(this->exec_size != 0);
     69 
     70    this->conditional_mod = BRW_CONDITIONAL_NONE;
     71 
     72    /* This will be the case for almost all instructions. */
     73    switch (dst.file) {
     74    case VGRF:
     75    case ARF:
     76    case FIXED_GRF:
     77    case MRF:
     78    case ATTR:
     79       this->size_written = dst.component_size(exec_size);
     80       break;
     81    case BAD_FILE:
     82       this->size_written = 0;
     83       break;
     84    case IMM:
     85    case UNIFORM:
     86       unreachable("Invalid destination register file");
     87    }
     88 
     89    this->writes_accumulator = false;
     90 }
     91 
     92 fs_inst::fs_inst()
     93 {
     94    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
     95 }
     96 
     97 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
     98 {
     99    init(opcode, exec_size, reg_undef, NULL, 0);
    100 }
    101 
    102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
    103 {
    104    init(opcode, exec_size, dst, NULL, 0);
    105 }
    106 
    107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    108                  const fs_reg &src0)
    109 {
    110    const fs_reg src[1] = { src0 };
    111    init(opcode, exec_size, dst, src, 1);
    112 }
    113 
    114 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    115                  const fs_reg &src0, const fs_reg &src1)
    116 {
    117    const fs_reg src[2] = { src0, src1 };
    118    init(opcode, exec_size, dst, src, 2);
    119 }
    120 
    121 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    122                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
    123 {
    124    const fs_reg src[3] = { src0, src1, src2 };
    125    init(opcode, exec_size, dst, src, 3);
    126 }
    127 
    128 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
    129                  const fs_reg src[], unsigned sources)
    130 {
    131    init(opcode, exec_width, dst, src, sources);
    132 }
    133 
    134 fs_inst::fs_inst(const fs_inst &that)
    135 {
    136    memcpy(this, &that, sizeof(that));
    137 
    138    this->src = new fs_reg[MAX2(that.sources, 3)];
    139 
    140    for (unsigned i = 0; i < that.sources; i++)
    141       this->src[i] = that.src[i];
    142 }
    143 
    144 fs_inst::~fs_inst()
    145 {
    146    delete[] this->src;
    147 }
    148 
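         /**
          * Changes the number of sources of this instruction, preserving the
          * first MIN2(old, new) of them; a no-op if the count is unchanged.
          */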
    149 void
    150 fs_inst::resize_sources(uint8_t num_sources)
    151 {
    152    if (this->sources != num_sources) {
    153       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
    154 
    155       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
    156          src[i] = this->src[i];
    157 
    158       delete[] this->src;
    159       this->src = src;
    160       this->sources = num_sources;
    161    }
    162 }
    163 
    164 void
    165 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    166                                        const fs_reg &dst,
    167                                        const fs_reg &surf_index,
    168                                        const fs_reg &varying_offset,
    169                                        uint32_t const_offset)
    170 {
    171    /* We have our constant surface use a pitch of 4 bytes, so our index can
    172     * be any component of a vector, and then we load 4 contiguous
    173     * components starting from that.
    174     *
     175     * We break down the const_offset into a portion added to the variable offset
    176     * and a portion done using fs_reg::offset, which means that if you have
    177     * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
    178     * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
    179     * later notice that those loads are all the same and eliminate the
    180     * redundant ones.
    181     */
    182    fs_reg vec4_offset = vgrf(glsl_type::uint_type);
    183    bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
    184 
    185    /* The pull load message will load a vec4 (16 bytes). If we are loading
    186     * a double this means we are only loading 2 elements worth of data.
    187     * We also want to use a 32-bit data type for the dst of the load operation
    188     * so other parts of the driver don't get confused about the size of the
    189     * result.
    190     */
    191    fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
    192    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
    193                             vec4_result, surf_index, vec4_offset);
    194    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
    195 
    196    if (type_sz(dst.type) == 8) {
    197       shuffle_32bit_load_result_to_64bit_data(
    198          bld, retype(vec4_result, dst.type), vec4_result, 2);
    199    }
    200 
    201    vec4_result.type = dst.type;
    202    bld.MOV(dst, offset(vec4_result, bld,
    203                        (const_offset & 0xf) / type_sz(vec4_result.type)));
    204 }
    205 
    206 /**
    207  * A helper for MOV generation for fixing up broken hardware SEND dependency
    208  * handling.
    209  */
    210 void
    211 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
    212 {
     213    /* The caller always wants an uncompressed instruction, to emit the minimal
     214     * extra dependencies and to avoid having to deal with aligning its regs to 2.
    215     */
    216    const fs_builder ubld = bld.annotate("send dependency resolve")
    217                               .half(0);
    218 
    219    ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
    220 }
    221 
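         /**
          * Field-by-field equality check: opcode, destination, the first three
          * sources, and the execution/message controls compared below.
          */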
    222 bool
    223 fs_inst::equals(fs_inst *inst) const
    224 {
    225    return (opcode == inst->opcode &&
    226            dst.equals(inst->dst) &&
    227            src[0].equals(inst->src[0]) &&
    228            src[1].equals(inst->src[1]) &&
    229            src[2].equals(inst->src[2]) &&
    230            saturate == inst->saturate &&
    231            predicate == inst->predicate &&
    232            conditional_mod == inst->conditional_mod &&
    233            mlen == inst->mlen &&
    234            base_mrf == inst->base_mrf &&
    235            target == inst->target &&
    236            eot == inst->eot &&
    237            header_size == inst->header_size &&
    238            shadow_compare == inst->shadow_compare &&
    239            exec_size == inst->exec_size &&
    240            offset == inst->offset);
    241 }
    242 
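         /**
          * Returns true for opcodes that are sent as messages with a payload
          * sourced from the GRF rather than from MRFs.  Such instructions
          * cannot use source modifiers (see can_do_source_mods() below).
          */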
    243 bool
    244 fs_inst::is_send_from_grf() const
    245 {
    246    switch (opcode) {
    247    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
    248    case SHADER_OPCODE_SHADER_TIME_ADD:
    249    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    250    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
    251    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
    252    case SHADER_OPCODE_UNTYPED_ATOMIC:
    253    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    254    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
    255    case SHADER_OPCODE_TYPED_ATOMIC:
    256    case SHADER_OPCODE_TYPED_SURFACE_READ:
    257    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    258    case SHADER_OPCODE_URB_WRITE_SIMD8:
    259    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    260    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    261    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    262    case SHADER_OPCODE_URB_READ_SIMD8:
    263    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
    264       return true;
    265    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    266       return src[1].file == VGRF;
    267    case FS_OPCODE_FB_WRITE:
    268    case FS_OPCODE_FB_READ:
    269       return src[0].file == VGRF;
    270    default:
    271       if (is_tex())
    272          return src[0].file == VGRF;
    273 
    274       return false;
    275    }
    276 }
    277 
    278 /**
    279  * Returns true if this instruction's sources and destinations cannot
    280  * safely be the same register.
    281  *
    282  * In most cases, a register can be written over safely by the same
    283  * instruction that is its last use.  For a single instruction, the
    284  * sources are dereferenced before writing of the destination starts
    285  * (naturally).
    286  *
    287  * However, there are a few cases where this can be problematic:
    288  *
    289  * - Virtual opcodes that translate to multiple instructions in the
    290  *   code generator: if src == dst and one instruction writes the
    291  *   destination before a later instruction reads the source, then
    292  *   src will have been clobbered.
    293  *
    294  * - SIMD16 compressed instructions with certain regioning (see below).
    295  *
    296  * The register allocator uses this information to set up conflicts between
    297  * GRF sources and the destination.
    298  */
    299 bool
    300 fs_inst::has_source_and_destination_hazard() const
    301 {
    302    switch (opcode) {
    303    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
    304       /* Multiple partial writes to the destination */
    305       return true;
    306    default:
    307       /* The SIMD16 compressed instruction
    308        *
    309        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
    310        *
    311        * is actually decoded in hardware as:
    312        *
    313        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
    314        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
    315        *
    316        * Which is safe.  However, if we have uniform accesses
    317        * happening, we get into trouble:
    318        *
    319        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
    320        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
    321        *
    322        * Now our destination for the first instruction overwrote the
    323        * second instruction's src0, and we get garbage for those 8
    324        * pixels.  There's a similar issue for the pre-gen6
    325        * pixel_x/pixel_y, which are registers of 16-bit values and thus
    326        * would get stomped by the first decode as well.
    327        */
    328       if (exec_size == 16) {
    329          for (int i = 0; i < sources; i++) {
    330             if (src[i].file == VGRF && (src[i].stride == 0 ||
    331                                         src[i].type == BRW_REGISTER_TYPE_UW ||
    332                                         src[i].type == BRW_REGISTER_TYPE_W ||
    333                                         src[i].type == BRW_REGISTER_TYPE_UB ||
    334                                         src[i].type == BRW_REGISTER_TYPE_B)) {
    335                return true;
    336             }
    337          }
    338       }
    339       return false;
    340    }
    341 }
    342 
    343 bool
    344 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
    345 {
    346    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
    347       return false;
    348 
    349    fs_reg reg = this->src[0];
    350    if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1)
    351       return false;
    352 
    353    if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
    354       return false;
    355 
    356    for (int i = 0; i < this->sources; i++) {
    357       reg.type = this->src[i].type;
    358       if (!this->src[i].equals(reg))
    359          return false;
    360 
    361       if (i < this->header_size) {
    362          reg.offset += REG_SIZE;
    363       } else {
    364          reg = horiz_offset(reg, this->exec_size);
    365       }
    366    }
    367 
    368    return true;
    369 }
    370 
    371 bool
    372 fs_inst::can_do_source_mods(const struct gen_device_info *devinfo)
    373 {
    374    if (devinfo->gen == 6 && is_math())
    375       return false;
    376 
    377    if (is_send_from_grf())
    378       return false;
    379 
    380    if (!backend_instruction::can_do_source_mods())
    381       return false;
    382 
    383    return true;
    384 }
    385 
    386 bool
    387 fs_inst::can_change_types() const
    388 {
    389    return dst.type == src[0].type &&
    390           !src[0].abs && !src[0].negate && !saturate &&
    391           (opcode == BRW_OPCODE_MOV ||
    392            (opcode == BRW_OPCODE_SEL &&
    393             dst.type == src[1].type &&
    394             predicate != BRW_PREDICATE_NONE &&
    395             !src[1].abs && !src[1].negate));
    396 }
    397 
    398 bool
    399 fs_inst::has_side_effects() const
    400 {
    401    return this->eot || backend_instruction::has_side_effects();
    402 }
    403 
    404 void
    405 fs_reg::init()
    406 {
    407    memset(this, 0, sizeof(*this));
    408    stride = 1;
    409 }
    410 
    411 /** Generic unset register constructor. */
    412 fs_reg::fs_reg()
    413 {
    414    init();
    415    this->file = BAD_FILE;
    416 }
    417 
    418 fs_reg::fs_reg(struct ::brw_reg reg) :
    419    backend_reg(reg)
    420 {
    421    this->offset = 0;
    422    this->stride = 1;
    423    if (this->file == IMM &&
    424        (this->type != BRW_REGISTER_TYPE_V &&
    425         this->type != BRW_REGISTER_TYPE_UV &&
    426         this->type != BRW_REGISTER_TYPE_VF)) {
    427       this->stride = 0;
    428    }
    429 }
    430 
    431 bool
    432 fs_reg::equals(const fs_reg &r) const
    433 {
    434    return (this->backend_reg::equals(r) &&
    435            stride == r.stride);
    436 }
    437 
    438 bool
    439 fs_reg::is_contiguous() const
    440 {
    441    return stride == 1;
    442 }
    443 
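         /**
          * Returns the size in bytes spanned by a region of the given width (in
          * channels) of this register, taking its (possibly zero) stride into
          * account.
          */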
    444 unsigned
    445 fs_reg::component_size(unsigned width) const
    446 {
    447    const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
    448                             hstride == 0 ? 0 :
    449                             1 << (hstride - 1));
    450    return MAX2(width * stride, 1) * type_sz(type);
    451 }
    452 
    453 extern "C" int
    454 type_size_scalar(const struct glsl_type *type)
    455 {
    456    unsigned int size, i;
    457 
    458    switch (type->base_type) {
    459    case GLSL_TYPE_UINT:
    460    case GLSL_TYPE_INT:
    461    case GLSL_TYPE_FLOAT:
    462    case GLSL_TYPE_BOOL:
    463       return type->components();
    464    case GLSL_TYPE_DOUBLE:
    465       return type->components() * 2;
    466    case GLSL_TYPE_ARRAY:
    467       return type_size_scalar(type->fields.array) * type->length;
    468    case GLSL_TYPE_STRUCT:
    469       size = 0;
    470       for (i = 0; i < type->length; i++) {
    471 	 size += type_size_scalar(type->fields.structure[i].type);
    472       }
    473       return size;
    474    case GLSL_TYPE_SAMPLER:
    475       /* Samplers take up no register space, since they're baked in at
    476        * link time.
    477        */
    478       return 0;
    479    case GLSL_TYPE_ATOMIC_UINT:
    480       return 0;
    481    case GLSL_TYPE_SUBROUTINE:
    482       return 1;
    483    case GLSL_TYPE_IMAGE:
    484       return BRW_IMAGE_PARAM_SIZE;
    485    case GLSL_TYPE_VOID:
    486    case GLSL_TYPE_ERROR:
    487    case GLSL_TYPE_INTERFACE:
    488    case GLSL_TYPE_FUNCTION:
    489       unreachable("not reached");
    490    }
    491 
    492    return 0;
    493 }
    494 
    495 /**
    496  * Create a MOV to read the timestamp register.
    497  *
    498  * The caller is responsible for emitting the MOV.  The return value is
    499  * the destination of the MOV, with extra parameters set.
    500  */
    501 fs_reg
    502 fs_visitor::get_timestamp(const fs_builder &bld)
    503 {
    504    assert(devinfo->gen >= 7);
    505 
    506    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
    507                                           BRW_ARF_TIMESTAMP,
    508                                           0),
    509                              BRW_REGISTER_TYPE_UD));
    510 
    511    fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    512 
    513    /* We want to read the 3 fields we care about even if it's not enabled in
    514     * the dispatch.
    515     */
    516    bld.group(4, 0).exec_all().MOV(dst, ts);
    517 
    518    return dst;
    519 }
    520 
    521 void
    522 fs_visitor::emit_shader_time_begin()
    523 {
    524    /* We want only the low 32 bits of the timestamp.  Since it's running
     525     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    526     * which is plenty of time for our purposes.  It is identical across the
    527     * EUs, but since it's tracking GPU core speed it will increment at a
    528     * varying rate as render P-states change.
    529     */
    530    shader_start_time = component(
    531       get_timestamp(bld.annotate("shader time start")), 0);
    532 }
    533 
    534 void
    535 fs_visitor::emit_shader_time_end()
    536 {
    537    /* Insert our code just before the final SEND with EOT. */
    538    exec_node *end = this->instructions.get_tail();
    539    assert(end && ((fs_inst *) end)->eot);
    540    const fs_builder ibld = bld.annotate("shader time end")
    541                               .exec_all().at(NULL, end);
    542    const fs_reg timestamp = get_timestamp(ibld);
    543 
    544    /* We only use the low 32 bits of the timestamp - see
     545     * emit_shader_time_begin().
    546     *
    547     * We could also check if render P-states have changed (or anything
    548     * else that might disrupt timing) by setting smear to 2 and checking if
    549     * that field is != 0.
    550     */
    551    const fs_reg shader_end_time = component(timestamp, 0);
    552 
    553    /* Check that there weren't any timestamp reset events (assuming these
    554     * were the only two timestamp reads that happened).
    555     */
    556    const fs_reg reset = component(timestamp, 2);
    557    set_condmod(BRW_CONDITIONAL_Z,
    558                ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
    559    ibld.IF(BRW_PREDICATE_NORMAL);
    560 
    561    fs_reg start = shader_start_time;
    562    start.negate = true;
    563    const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
    564                                         BRW_REGISTER_TYPE_UD),
    565                                  0);
    566    const fs_builder cbld = ibld.group(1, 0);
    567    cbld.group(1, 0).ADD(diff, start, shader_end_time);
    568 
    569    /* If there were no instructions between the two timestamp gets, the diff
    570     * is 2 cycles.  Remove that overhead, so I can forget about that when
    571     * trying to determine the time taken for single instructions.
    572     */
    573    cbld.ADD(diff, diff, brw_imm_ud(-2u));
    574    SHADER_TIME_ADD(cbld, 0, diff);
    575    SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
    576    ibld.emit(BRW_OPCODE_ELSE);
    577    SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
    578    ibld.emit(BRW_OPCODE_ENDIF);
    579 }
    580 
    581 void
    582 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
    583                             int shader_time_subindex,
    584                             fs_reg value)
    585 {
    586    int index = shader_time_index * 3 + shader_time_subindex;
    587    struct brw_reg offset = brw_imm_d(index * SHADER_TIME_STRIDE);
    588 
    589    fs_reg payload;
    590    if (dispatch_width == 8)
    591       payload = vgrf(glsl_type::uvec2_type);
    592    else
    593       payload = vgrf(glsl_type::uint_type);
    594 
    595    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
    596 }
    597 
    598 void
    599 fs_visitor::vfail(const char *format, va_list va)
    600 {
    601    char *msg;
    602 
    603    if (failed)
    604       return;
    605 
    606    failed = true;
    607 
    608    msg = ralloc_vasprintf(mem_ctx, format, va);
    609    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
    610 
    611    this->fail_msg = msg;
    612 
    613    if (debug_enabled) {
    614       fprintf(stderr, "%s",  msg);
    615    }
    616 }
    617 
    618 void
    619 fs_visitor::fail(const char *format, ...)
    620 {
    621    va_list va;
    622 
    623    va_start(va, format);
    624    vfail(format, va);
    625    va_end(va);
    626 }
    627 
    628 /**
    629  * Mark this program as impossible to compile with dispatch width greater
    630  * than n.
    631  *
    632  * During the SIMD8 compile (which happens first), we can detect and flag
    633  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
    634  * SIMD16+ compile altogether.
    635  *
    636  * During a compile of dispatch width greater than n (if one happens anyway),
    637  * this just calls fail().
    638  */
    639 void
    640 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
    641 {
    642    if (dispatch_width > n) {
    643       fail("%s", msg);
    644    } else {
    645       max_dispatch_width = n;
    646       compiler->shader_perf_log(log_data,
    647                                 "Shader dispatch width limited to SIMD%d: %s",
    648                                 n, msg);
    649    }
    650 }
    651 
    652 /**
    653  * Returns true if the instruction has a flag that means it won't
    654  * update an entire destination register.
    655  *
    656  * For example, dead code elimination and live variable analysis want to know
    657  * when a write to a variable screens off any preceding values that were in
    658  * it.
    659  */
    660 bool
    661 fs_inst::is_partial_write() const
    662 {
    663    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
    664            (this->exec_size * type_sz(this->dst.type)) < 32 ||
    665            !this->dst.is_contiguous() ||
    666            this->dst.offset % REG_SIZE != 0);
    667 }
    668 
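         /**
          * Returns the number of components of source i read by this
          * instruction.  Most opcodes read a single component per source, but
          * the logical send-like opcodes carry per-source component counts in
          * immediate sources (e.g. the texture coordinate count below).
          */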
    669 unsigned
    670 fs_inst::components_read(unsigned i) const
    671 {
    672    /* Return zero if the source is not present. */
    673    if (src[i].file == BAD_FILE)
    674       return 0;
    675 
    676    switch (opcode) {
    677    case FS_OPCODE_LINTERP:
    678       if (i == 0)
    679          return 2;
    680       else
    681          return 1;
    682 
    683    case FS_OPCODE_PIXEL_X:
    684    case FS_OPCODE_PIXEL_Y:
    685       assert(i == 0);
    686       return 2;
    687 
    688    case FS_OPCODE_FB_WRITE_LOGICAL:
    689       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
    690       /* First/second FB write color. */
    691       if (i < 2)
    692          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
    693       else
    694          return 1;
    695 
    696    case SHADER_OPCODE_TEX_LOGICAL:
    697    case SHADER_OPCODE_TXD_LOGICAL:
    698    case SHADER_OPCODE_TXF_LOGICAL:
    699    case SHADER_OPCODE_TXL_LOGICAL:
    700    case SHADER_OPCODE_TXS_LOGICAL:
    701    case FS_OPCODE_TXB_LOGICAL:
    702    case SHADER_OPCODE_TXF_CMS_LOGICAL:
    703    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
    704    case SHADER_OPCODE_TXF_UMS_LOGICAL:
    705    case SHADER_OPCODE_TXF_MCS_LOGICAL:
    706    case SHADER_OPCODE_LOD_LOGICAL:
    707    case SHADER_OPCODE_TG4_LOGICAL:
    708    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
    709    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
    710       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
    711              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
    712       /* Texture coordinates. */
    713       if (i == TEX_LOGICAL_SRC_COORDINATE)
    714          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
    715       /* Texture derivatives. */
    716       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
    717                opcode == SHADER_OPCODE_TXD_LOGICAL)
    718          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
    719       /* Texture offset. */
    720       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
    721          return 2;
    722       /* MCS */
    723       else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
    724          return 2;
    725       else
    726          return 1;
    727 
    728    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
    729    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
    730       assert(src[3].file == IMM);
    731       /* Surface coordinates. */
    732       if (i == 0)
    733          return src[3].ud;
    734       /* Surface operation source (ignored for reads). */
    735       else if (i == 1)
    736          return 0;
    737       else
    738          return 1;
    739 
    740    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
    741    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    742       assert(src[3].file == IMM &&
    743              src[4].file == IMM);
    744       /* Surface coordinates. */
    745       if (i == 0)
    746          return src[3].ud;
    747       /* Surface operation source. */
    748       else if (i == 1)
    749          return src[4].ud;
    750       else
    751          return 1;
    752 
    753    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
    754    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
    755       assert(src[3].file == IMM &&
    756              src[4].file == IMM);
    757       const unsigned op = src[4].ud;
    758       /* Surface coordinates. */
    759       if (i == 0)
    760          return src[3].ud;
    761       /* Surface operation source. */
    762       else if (i == 1 && op == BRW_AOP_CMPWR)
    763          return 2;
    764       else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
    765                           op == BRW_AOP_PREDEC))
    766          return 0;
    767       else
    768          return 1;
    769    }
    770 
    771    default:
    772       return 1;
    773    }
    774 }
    775 
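         /**
          * Returns the number of bytes of source arg read by this instruction.
          * For send-like opcodes the payload size comes from the message length
          * (mlen) rather than from the source's register region.
          */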
    776 unsigned
    777 fs_inst::size_read(int arg) const
    778 {
    779    switch (opcode) {
    780    case FS_OPCODE_FB_WRITE:
    781    case FS_OPCODE_FB_READ:
    782    case SHADER_OPCODE_URB_WRITE_SIMD8:
    783    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    784    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    785    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    786    case SHADER_OPCODE_URB_READ_SIMD8:
    787    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
    788    case SHADER_OPCODE_UNTYPED_ATOMIC:
    789    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    790    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
    791    case SHADER_OPCODE_TYPED_ATOMIC:
    792    case SHADER_OPCODE_TYPED_SURFACE_READ:
    793    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    794    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
    795       if (arg == 0)
    796          return mlen * REG_SIZE;
    797       break;
    798 
    799    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
    800       /* The payload is actually stored in src1 */
    801       if (arg == 1)
    802          return mlen * REG_SIZE;
    803       break;
    804 
    805    case FS_OPCODE_LINTERP:
    806       if (arg == 1)
    807          return 16;
    808       break;
    809 
    810    case SHADER_OPCODE_LOAD_PAYLOAD:
    811       if (arg < this->header_size)
    812          return REG_SIZE;
    813       break;
    814 
    815    case CS_OPCODE_CS_TERMINATE:
    816    case SHADER_OPCODE_BARRIER:
    817       return REG_SIZE;
    818 
    819    case SHADER_OPCODE_MOV_INDIRECT:
    820       if (arg == 0) {
    821          assert(src[2].file == IMM);
    822          return src[2].ud;
    823       }
    824       break;
    825 
    826    default:
    827       if (is_tex() && arg == 0 && src[0].file == VGRF)
    828          return mlen * REG_SIZE;
    829       break;
    830    }
    831 
    832    switch (src[arg].file) {
    833    case UNIFORM:
    834    case IMM:
    835       return components_read(arg) * type_sz(src[arg].type);
    836    case BAD_FILE:
    837    case ARF:
    838    case FIXED_GRF:
    839    case VGRF:
    840    case ATTR:
    841       return components_read(arg) * src[arg].component_size(exec_size);
    842    case MRF:
    843       unreachable("MRF registers are not allowed as sources");
    844    }
    845    return 0;
    846 }
    847 
    848 namespace {
    849    /* Return the subset of flag registers that an instruction could
    850     * potentially read or write based on the execution controls and flag
    851     * subregister number of the instruction.
    852     */
    853    unsigned
    854    flag_mask(const fs_inst *inst)
    855    {
    856       const unsigned start = inst->flag_subreg * 16 + inst->group;
    857       const unsigned end = start + inst->exec_size;
    858       return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
    859    }
    860 }
    861 
    862 unsigned
    863 fs_inst::flags_read(const gen_device_info *devinfo) const
    864 {
    865    /* XXX - This doesn't consider explicit uses of the flag register as source
    866     *       region.
    867     */
    868    if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
    869        predicate == BRW_PREDICATE_ALIGN1_ALLV) {
    870       /* The vertical predication modes combine corresponding bits from
    871        * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
    872        */
    873       const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
    874       return flag_mask(this) << shift | flag_mask(this);
    875    } else if (predicate) {
    876       return flag_mask(this);
    877    } else {
    878       return 0;
    879    }
    880 }
    881 
    882 unsigned
    883 fs_inst::flags_written() const
    884 {
    885    /* XXX - This doesn't consider explicit uses of the flag register as
    886     *       destination region.
    887     */
    888    if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
    889                             opcode != BRW_OPCODE_IF &&
    890                             opcode != BRW_OPCODE_WHILE)) ||
    891        opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
    892       return flag_mask(this);
    893    } else {
    894       return 0;
    895    }
    896 }
    897 
    898 /**
    899  * Returns how many MRFs an FS opcode will write over.
    900  *
     901  * Note that this is not the same as the 0 or 1 implied writes of an actual
     902  * gen instruction -- the FS opcodes often generate MOVs in addition.
    903  */
    904 int
    905 fs_visitor::implied_mrf_writes(fs_inst *inst)
    906 {
    907    if (inst->mlen == 0)
    908       return 0;
    909 
    910    if (inst->base_mrf == -1)
    911       return 0;
    912 
    913    switch (inst->opcode) {
    914    case SHADER_OPCODE_RCP:
    915    case SHADER_OPCODE_RSQ:
    916    case SHADER_OPCODE_SQRT:
    917    case SHADER_OPCODE_EXP2:
    918    case SHADER_OPCODE_LOG2:
    919    case SHADER_OPCODE_SIN:
    920    case SHADER_OPCODE_COS:
    921       return 1 * dispatch_width / 8;
    922    case SHADER_OPCODE_POW:
    923    case SHADER_OPCODE_INT_QUOTIENT:
    924    case SHADER_OPCODE_INT_REMAINDER:
    925       return 2 * dispatch_width / 8;
    926    case SHADER_OPCODE_TEX:
    927    case FS_OPCODE_TXB:
    928    case SHADER_OPCODE_TXD:
    929    case SHADER_OPCODE_TXF:
    930    case SHADER_OPCODE_TXF_CMS:
    931    case SHADER_OPCODE_TXF_MCS:
    932    case SHADER_OPCODE_TG4:
    933    case SHADER_OPCODE_TG4_OFFSET:
    934    case SHADER_OPCODE_TXL:
    935    case SHADER_OPCODE_TXS:
    936    case SHADER_OPCODE_LOD:
    937    case SHADER_OPCODE_SAMPLEINFO:
    938       return 1;
    939    case FS_OPCODE_FB_WRITE:
    940       return 2;
    941    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    942    case SHADER_OPCODE_GEN4_SCRATCH_READ:
    943       return 1;
    944    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
    945       return inst->mlen;
    946    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
    947       return inst->mlen;
    948    default:
    949       unreachable("not reached");
    950    }
    951 }
    952 
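         /**
          * Allocates a new virtual GRF big enough to hold one value of the given
          * GLSL type per channel at the current dispatch width.
          */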
    953 fs_reg
    954 fs_visitor::vgrf(const glsl_type *const type)
    955 {
    956    int reg_width = dispatch_width / 8;
    957    return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
    958                  brw_type_for_base_type(type));
    959 }
    960 
    961 fs_reg::fs_reg(enum brw_reg_file file, int nr)
    962 {
    963    init();
    964    this->file = file;
    965    this->nr = nr;
    966    this->type = BRW_REGISTER_TYPE_F;
    967    this->stride = (file == UNIFORM ? 0 : 1);
    968 }
    969 
    970 fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
    971 {
    972    init();
    973    this->file = file;
    974    this->nr = nr;
    975    this->type = type;
    976    this->stride = (file == UNIFORM ? 0 : 1);
    977 }
    978 
     979 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
     980  * This brings in those uniform definitions.
    981  */
    982 void
    983 fs_visitor::import_uniforms(fs_visitor *v)
    984 {
    985    this->push_constant_loc = v->push_constant_loc;
    986    this->pull_constant_loc = v->pull_constant_loc;
    987    this->uniforms = v->uniforms;
    988 }
    989 
    990 void
    991 fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
    992 {
    993    assert(stage == MESA_SHADER_FRAGMENT);
    994 
    995    /* gl_FragCoord.x */
    996    bld.MOV(wpos, this->pixel_x);
    997    wpos = offset(wpos, bld, 1);
    998 
    999    /* gl_FragCoord.y */
   1000    bld.MOV(wpos, this->pixel_y);
   1001    wpos = offset(wpos, bld, 1);
   1002 
   1003    /* gl_FragCoord.z */
   1004    if (devinfo->gen >= 6) {
   1005       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
   1006    } else {
   1007       bld.emit(FS_OPCODE_LINTERP, wpos,
   1008            this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
   1009            interp_reg(VARYING_SLOT_POS, 2));
   1010    }
   1011    wpos = offset(wpos, bld, 1);
   1012 
   1013    /* gl_FragCoord.w: Already set up in emit_interpolation */
   1014    bld.MOV(wpos, this->wpos_w);
   1015 }
   1016 
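         /**
          * Maps a NIR barycentric load intrinsic and interpolation mode to the
          * corresponding BRW_BARYCENTRIC_* mode.  The non-perspective modes
          * directly follow their perspective counterparts in the enum, hence the
          * "bary += 3" below.
          */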
   1017 enum brw_barycentric_mode
   1018 brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
   1019 {
   1020    /* Barycentric modes don't make sense for flat inputs. */
   1021    assert(mode != INTERP_MODE_FLAT);
   1022 
   1023    unsigned bary;
   1024    switch (op) {
   1025    case nir_intrinsic_load_barycentric_pixel:
   1026    case nir_intrinsic_load_barycentric_at_offset:
   1027       bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
   1028       break;
   1029    case nir_intrinsic_load_barycentric_centroid:
   1030       bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
   1031       break;
   1032    case nir_intrinsic_load_barycentric_sample:
   1033    case nir_intrinsic_load_barycentric_at_sample:
   1034       bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
   1035       break;
   1036    default:
   1037       unreachable("invalid intrinsic");
   1038    }
   1039 
   1040    if (mode == INTERP_MODE_NOPERSPECTIVE)
   1041       bary += 3;
   1042 
   1043    return (enum brw_barycentric_mode) bary;
   1044 }
   1045 
   1046 /**
   1047  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
   1048  */
   1049 static enum brw_barycentric_mode
   1050 centroid_to_pixel(enum brw_barycentric_mode bary)
   1051 {
   1052    assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
   1053           bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
   1054    return (enum brw_barycentric_mode) ((unsigned) bary - 1);
   1055 }
   1056 
   1057 fs_reg *
   1058 fs_visitor::emit_frontfacing_interpolation()
   1059 {
   1060    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
   1061 
   1062    if (devinfo->gen >= 6) {
   1063       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
   1064        * a boolean result from this (~0/true or 0/false).
   1065        *
   1066        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
   1067        * this task in only one instruction:
   1068        *    - a negation source modifier will flip the bit; and
   1069        *    - a W -> D type conversion will sign extend the bit into the high
   1070        *      word of the destination.
   1071        *
   1072        * An ASR 15 fills the low word of the destination.
   1073        */
   1074       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
   1075       g0.negate = true;
   1076 
   1077       bld.ASR(*reg, g0, brw_imm_d(15));
   1078    } else {
   1079       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
   1080        * a boolean result from this (1/true or 0/false).
   1081        *
   1082        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
   1083        * the negation source modifier to flip it. Unfortunately the SHR
   1084        * instruction only operates on UD (or D with an abs source modifier)
   1085        * sources without negation.
   1086        *
   1087        * Instead, use ASR (which will give ~0/true or 0/false).
   1088        */
   1089       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
   1090       g1_6.negate = true;
   1091 
   1092       bld.ASR(*reg, g1_6, brw_imm_d(31));
   1093    }
   1094 
   1095    return reg;
   1096 }
   1097 
   1098 void
   1099 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
   1100 {
   1101    assert(stage == MESA_SHADER_FRAGMENT);
   1102    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   1103    assert(dst.type == BRW_REGISTER_TYPE_F);
   1104 
   1105    if (wm_prog_data->persample_dispatch) {
   1106       /* Convert int_sample_pos to floating point */
   1107       bld.MOV(dst, int_sample_pos);
   1108       /* Scale to the range [0, 1] */
   1109       bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
   1110    }
   1111    else {
   1112       /* From ARB_sample_shading specification:
   1113        * "When rendering to a non-multisample buffer, or if multisample
   1114        *  rasterization is disabled, gl_SamplePosition will always be
    1115        *  (0.5, 0.5)."
   1116        */
   1117       bld.MOV(dst, brw_imm_f(0.5f));
   1118    }
   1119 }
   1120 
   1121 fs_reg *
   1122 fs_visitor::emit_samplepos_setup()
   1123 {
   1124    assert(devinfo->gen >= 6);
   1125 
   1126    const fs_builder abld = bld.annotate("compute sample position");
   1127    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   1128    fs_reg pos = *reg;
   1129    fs_reg int_sample_x = vgrf(glsl_type::int_type);
   1130    fs_reg int_sample_y = vgrf(glsl_type::int_type);
   1131 
   1132    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
   1133     * mode will be enabled.
   1134     *
   1135     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
   1136     * R31.1:0         Position Offset X/Y for Slot[3:0]
   1137     * R31.3:2         Position Offset X/Y for Slot[7:4]
   1138     * .....
   1139     *
    1140     * The X, Y sample positions come in as bytes in the thread payload. So, read
   1141     * the positions using vstride=16, width=8, hstride=2.
   1142     */
   1143    struct brw_reg sample_pos_reg =
   1144       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
   1145                     BRW_REGISTER_TYPE_B), 16, 8, 2);
   1146 
   1147    if (dispatch_width == 8) {
   1148       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
   1149    } else {
   1150       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
   1151       abld.half(1).MOV(half(int_sample_x, 1),
   1152                        fs_reg(suboffset(sample_pos_reg, 16)));
   1153    }
   1154    /* Compute gl_SamplePosition.x */
   1155    compute_sample_position(pos, int_sample_x);
   1156    pos = offset(pos, abld, 1);
   1157    if (dispatch_width == 8) {
   1158       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
   1159    } else {
   1160       abld.half(0).MOV(half(int_sample_y, 0),
   1161                        fs_reg(suboffset(sample_pos_reg, 1)));
   1162       abld.half(1).MOV(half(int_sample_y, 1),
   1163                        fs_reg(suboffset(sample_pos_reg, 17)));
   1164    }
   1165    /* Compute gl_SamplePosition.y */
   1166    compute_sample_position(pos, int_sample_y);
   1167    return reg;
   1168 }
   1169 
   1170 fs_reg *
   1171 fs_visitor::emit_sampleid_setup()
   1172 {
   1173    assert(stage == MESA_SHADER_FRAGMENT);
   1174    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   1175    assert(devinfo->gen >= 6);
   1176 
   1177    const fs_builder abld = bld.annotate("compute sample id");
   1178    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
   1179 
   1180    if (!key->multisample_fbo) {
   1181       /* As per GL_ARB_sample_shading specification:
   1182        * "When rendering to a non-multisample buffer, or if multisample
   1183        *  rasterization is disabled, gl_SampleID will always be zero."
   1184        */
   1185       abld.MOV(*reg, brw_imm_d(0));
   1186    } else if (devinfo->gen >= 8) {
   1187       /* Sample ID comes in as 4-bit numbers in g1.0:
   1188        *
   1189        *    15:12 Slot 3 SampleID (only used in SIMD16)
   1190        *     11:8 Slot 2 SampleID (only used in SIMD16)
   1191        *      7:4 Slot 1 SampleID
   1192        *      3:0 Slot 0 SampleID
   1193        *
   1194        * Each slot corresponds to four channels, so we want to replicate each
   1195        * half-byte value to 4 channels in a row:
   1196        *
   1197        *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
   1198        *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
   1199        *
   1200        *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
   1201        *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
   1202        *
   1203        * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
   1204        * channels to read the first byte (7:0), and the second group of 8
   1205        * channels to read the second byte (15:8).  Then, we shift right by
   1206        * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
   1207        * values into place.  Finally, we AND with 0xf to keep the low nibble.
   1208        *
   1209        *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
   1210        *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
   1211        *
   1212        * TODO: These payload bits exist on Gen7 too, but they appear to always
   1213        *       be zero, so this code fails to work.  We should find out why.
   1214        */
   1215       fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
   1216 
   1217       abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
   1218                                          BRW_REGISTER_TYPE_B), 1, 8, 0)),
   1219                     brw_imm_v(0x44440000));
   1220       abld.AND(*reg, tmp, brw_imm_w(0xf));
   1221    } else {
   1222       const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
   1223                                          BRW_REGISTER_TYPE_D), 0);
   1224       const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
   1225 
   1226       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
   1227        * 8x multisampling, subspan 0 will represent sample N (where N
   1228        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
   1229        * 7. We can find the value of N by looking at R0.0 bits 7:6
   1230        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
   1231        * (since samples are always delivered in pairs). That is, we
   1232        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
   1233        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
   1234        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
   1235        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
   1236        * populating a temporary variable with the sequence (0, 1, 2, 3),
   1237        * and then reading from it using vstride=1, width=4, hstride=0.
   1238        * These computations hold good for 4x multisampling as well.
   1239        *
   1240        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
   1241        * the first four slots are sample 0 of subspan 0; the next four
   1242        * are sample 1 of subspan 0; the third group is sample 0 of
   1243        * subspan 1, and finally sample 1 of subspan 1.
   1244        */
   1245 
   1246       /* SKL+ has an extra bit for the Starting Sample Pair Index to
    1247        * accommodate 16x MSAA.
   1248        */
   1249       abld.exec_all().group(1, 0)
   1250           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
   1251                brw_imm_ud(0xc0));
   1252       abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
   1253 
   1254       /* This works for both SIMD8 and SIMD16 */
   1255       abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
   1256 
   1257       /* This special instruction takes care of setting vstride=1,
   1258        * width=4, hstride=0 of t2 during an ADD instruction.
   1259        */
   1260       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   1261    }
   1262 
   1263    return reg;
   1264 }
   1265 
   1266 fs_reg *
   1267 fs_visitor::emit_samplemaskin_setup()
   1268 {
   1269    assert(stage == MESA_SHADER_FRAGMENT);
   1270    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   1271    assert(devinfo->gen >= 6);
   1272 
   1273    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
   1274 
   1275    fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
   1276                                BRW_REGISTER_TYPE_D));
   1277 
   1278    if (wm_prog_data->persample_dispatch) {
   1279       /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
   1280        * and a mask representing which sample is being processed by the
   1281        * current shader invocation.
   1282        *
   1283        * From the OES_sample_variables specification:
   1284        * "When per-sample shading is active due to the use of a fragment input
   1285        *  qualified by "sample" or due to the use of the gl_SampleID or
   1286        *  gl_SamplePosition variables, only the bit for the current sample is
   1287        *  set in gl_SampleMaskIn."
   1288        */
   1289       const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
   1290 
   1291       if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
   1292          nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
   1293 
   1294       fs_reg one = vgrf(glsl_type::int_type);
   1295       fs_reg enabled_mask = vgrf(glsl_type::int_type);
   1296       abld.MOV(one, brw_imm_d(1));
   1297       abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
   1298       abld.AND(*reg, enabled_mask, coverage_mask);
   1299    } else {
   1300       /* In per-pixel mode, the coverage mask is sufficient. */
   1301       *reg = coverage_mask;
   1302    }
   1303    return reg;
   1304 }
   1305 
   1306 fs_reg
   1307 fs_visitor::resolve_source_modifiers(const fs_reg &src)
   1308 {
   1309    if (!src.abs && !src.negate)
   1310       return src;
   1311 
   1312    fs_reg temp = bld.vgrf(src.type);
   1313    bld.MOV(temp, src);
   1314 
   1315    return temp;
   1316 }
   1317 
   1318 void
   1319 fs_visitor::emit_discard_jump()
   1320 {
   1321    assert(brw_wm_prog_data(this->prog_data)->uses_kill);
   1322 
   1323    /* For performance, after a discard, jump to the end of the
   1324     * shader if all relevant channels have been discarded.
   1325     */
   1326    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
   1327    discard_jump->flag_subreg = 1;
   1328 
   1329    discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
   1330    discard_jump->predicate_inverse = true;
   1331 }
   1332 
   1333 void
   1334 fs_visitor::emit_gs_thread_end()
   1335 {
   1336    assert(stage == MESA_SHADER_GEOMETRY);
   1337 
   1338    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   1339 
   1340    if (gs_compile->control_data_header_size_bits > 0) {
   1341       emit_gs_control_data_bits(this->final_gs_vertex_count);
   1342    }
   1343 
   1344    const fs_builder abld = bld.annotate("thread end");
   1345    fs_inst *inst;
   1346 
   1347    if (gs_prog_data->static_vertex_count != -1) {
   1348       foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
   1349          if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
   1350              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
   1351              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
   1352              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
   1353             prev->eot = true;
   1354 
   1355             /* Delete now dead instructions. */
   1356             foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
   1357                if (dead == prev)
   1358                   break;
   1359                dead->remove();
   1360             }
   1361             return;
   1362          } else if (prev->is_control_flow() || prev->has_side_effects()) {
   1363             break;
   1364          }
   1365       }
   1366       fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
   1367       abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
   1368       inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
   1369       inst->mlen = 1;
   1370    } else {
   1371       fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   1372       fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
   1373       sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
   1374       sources[1] = this->final_gs_vertex_count;
   1375       abld.LOAD_PAYLOAD(payload, sources, 2, 2);
   1376       inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
   1377       inst->mlen = 2;
   1378    }
   1379    inst->eot = true;
   1380    inst->offset = 0;
   1381 }
   1382 
   1383 void
   1384 fs_visitor::assign_curb_setup()
   1385 {
   1386    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
   1387 
   1388    /* Map the offsets in the UNIFORM file to fixed HW regs. */
   1389    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1390       for (unsigned int i = 0; i < inst->sources; i++) {
   1391 	 if (inst->src[i].file == UNIFORM) {
   1392             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
   1393             int constant_nr;
   1394             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
   1395                constant_nr = push_constant_loc[uniform_nr];
   1396             } else {
   1397                /* Section 5.11 of the OpenGL 4.1 spec says:
   1398                 * "Out-of-bounds reads return undefined values, which include
   1399                 *  values from other variables of the active program or zero."
   1400                 * Just return the first push constant.
   1401                 */
   1402                constant_nr = 0;
   1403             }
   1404 
   1405 	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
   1406 						  constant_nr / 8,
   1407 						  constant_nr % 8);
   1408             brw_reg.abs = inst->src[i].abs;
   1409             brw_reg.negate = inst->src[i].negate;
   1410 
   1411             assert(inst->src[i].stride == 0);
   1412             inst->src[i] = byte_offset(
   1413                retype(brw_reg, inst->src[i].type),
   1414                inst->src[i].offset % 4);
   1415 	 }
   1416       }
   1417    }
   1418 
   1419    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   1420    this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
   1421 }
   1422 
   1423 void
   1424 fs_visitor::calculate_urb_setup()
   1425 {
   1426    assert(stage == MESA_SHADER_FRAGMENT);
   1427    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   1428    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   1429 
   1430    memset(prog_data->urb_setup, -1,
   1431           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
   1432 
   1433    int urb_next = 0;
   1434    /* Figure out where each of the incoming setup attributes lands. */
   1435    if (devinfo->gen >= 6) {
   1436       if (_mesa_bitcount_64(nir->info->inputs_read &
   1437                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
   1438          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
   1439           * first 16 varying inputs, so we can put them wherever we want.
   1440           * Just put them in order.
   1441           *
   1442           * This is useful because it means that (a) inputs not used by the
   1443           * fragment shader won't take up valuable register space, and (b) we
   1444           * won't have to recompile the fragment shader if it gets paired with
   1445           * a different vertex (or geometry) shader.
   1446           */
   1447          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
   1448             if (nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK &
   1449                 BITFIELD64_BIT(i)) {
   1450                prog_data->urb_setup[i] = urb_next++;
   1451             }
   1452          }
   1453       } else {
   1454          bool include_vue_header =
   1455             nir->info->inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
   1456 
   1457          /* We have enough input varyings that the SF/SBE pipeline stage can't
   1458           * arbitrarily rearrange them to suit our whim; we have to put them
   1459           * in an order that matches the output of the previous pipeline stage
   1460           * (geometry or vertex shader).
   1461           */
   1462          struct brw_vue_map prev_stage_vue_map;
   1463          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
   1464                              key->input_slots_valid,
   1465                              nir->info->separate_shader);
   1466          int first_slot =
   1467             include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
   1468 
   1469          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
   1470          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
   1471               slot++) {
   1472             int varying = prev_stage_vue_map.slot_to_varying[slot];
   1473             if (varying != BRW_VARYING_SLOT_PAD &&
   1474                 (nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK &
   1475                  BITFIELD64_BIT(varying))) {
   1476                prog_data->urb_setup[varying] = slot - first_slot;
   1477             }
   1478          }
   1479          urb_next = prev_stage_vue_map.num_slots - first_slot;
   1480       }
   1481    } else {
   1482       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
   1483       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
   1484          /* Point size is packed into the header, not as a general attribute */
   1485          if (i == VARYING_SLOT_PSIZ)
   1486             continue;
   1487 
   1488 	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
   1489 	    /* The back color slot is skipped when the front color is
   1490 	     * also written to.  In addition, some slots can be
   1491 	     * written in the vertex shader and not read in the
   1492 	     * fragment shader.  So the register number must always be
   1493 	     * incremented, mapped or not.
   1494 	     */
   1495 	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
   1496 	       prog_data->urb_setup[i] = urb_next;
   1497             urb_next++;
   1498 	 }
   1499       }
   1500 
   1501       /*
   1502        * It's an FS-only attribute, and we did the interpolation for it in
   1503        * the SF thread, so count it here, too.
   1504        *
   1505        * See compile_sf_prog() for more info.
   1506        */
   1507       if (nir->info->inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
   1508          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   1509    }
   1510 
   1511    prog_data->num_varying_inputs = urb_next;
   1512 }
   1513 
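        /* For illustration of calculate_urb_setup() above (hypothetical
         * inputs): if the fragment shader reads only, say, VARYING_SLOT_COL0
         * and VARYING_SLOT_TEX0 on gen6+, they fall under the 16-input limit
         * and get packed in slot order: urb_setup[VARYING_SLOT_COL0] = 0,
         * urb_setup[VARYING_SLOT_TEX0] = 1, num_varying_inputs = 2, and every
         * other slot stays at -1.  Past 16 inputs the layout instead mirrors
         * the previous stage's VUE map, so unread slots still occupy
         * positions.
         */
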
   1514 void
   1515 fs_visitor::assign_urb_setup()
   1516 {
   1517    assert(stage == MESA_SHADER_FRAGMENT);
   1518    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   1519 
   1520    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
   1521 
   1522    /* Offset all the urb_setup[] indices by the actual position of the
   1523     * setup regs, now that the location of the constants has been chosen.
   1524     */
   1525    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1526       if (inst->opcode == FS_OPCODE_LINTERP) {
   1527 	 assert(inst->src[1].file == FIXED_GRF);
   1528          inst->src[1].nr += urb_start;
   1529       }
   1530 
   1531       if (inst->opcode == FS_OPCODE_CINTERP) {
   1532 	 assert(inst->src[0].file == FIXED_GRF);
   1533          inst->src[0].nr += urb_start;
   1534       }
   1535    }
   1536 
   1537    /* Each attribute is 4 setup channels, each of which is half a reg. */
   1538    this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
   1539 }
   1540 
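        /* For illustration, with made-up numbers: if payload.num_regs plus the
         * CURBE length places urb_start at g10 and num_varying_inputs == 3,
         * the per-attribute setup data occupies g10..g15 (4 setup channels at
         * half a GRF each, i.e. 2 GRFs per attribute), and
         * first_non_payload_grf advances by 3 * 2 = 6 registers.
         */
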
   1541 void
   1542 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
   1543 {
   1544    for (int i = 0; i < inst->sources; i++) {
   1545       if (inst->src[i].file == ATTR) {
   1546          int grf = payload.num_regs +
   1547                    prog_data->curb_read_length +
   1548                    inst->src[i].nr +
   1549                    inst->src[i].offset / REG_SIZE;
   1550 
   1551          /* As explained in brw_reg_from_fs_reg(), the Haswell PRM says:
   1552           *
   1553           *    "VertStride must be used to cross GRF register boundaries.
   1554           *     This rule implies that elements within a 'Width' cannot
   1555           *     cross GRF boundaries."
   1556           *
   1557           * So, for registers that are large enough, we have to split the exec
   1558           * size in two and trust the compression state to sort it out.
   1559           */
   1560          unsigned total_size = inst->exec_size *
   1561                                inst->src[i].stride *
   1562                                type_sz(inst->src[i].type);
   1563 
   1564          assert(total_size <= 2 * REG_SIZE);
   1565          const unsigned exec_size =
   1566             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
   1567 
   1568          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
   1569          struct brw_reg reg =
   1570             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
   1571                                inst->src[i].offset % REG_SIZE),
   1572                    exec_size * inst->src[i].stride,
   1573                    width, inst->src[i].stride);
   1574          reg.abs = inst->src[i].abs;
   1575          reg.negate = inst->src[i].negate;
   1576 
   1577          inst->src[i] = reg;
   1578       }
   1579    }
   1580 }
   1581 
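        /* Example of the exec size split in convert_attr_sources_to_hw_regs()
         * above, assuming 32-byte GRFs: a SIMD16 read of a 32-bit ATTR source
         * with stride 1 spans 16 * 1 * 4 = 64 bytes = 2 GRFs, so the region is
         * built with exec_size / 2 = 8 and the compression state covers the
         * second register.  A stride-0 source instead gets width 1, i.e. a
         * scalar replicated across all channels.
         */
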
   1582 void
   1583 fs_visitor::assign_vs_urb_setup()
   1584 {
   1585    struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
   1586 
   1587    assert(stage == MESA_SHADER_VERTEX);
   1588 
   1589    /* Each attribute is 4 regs. */
   1590    this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
   1591 
   1592    assert(vs_prog_data->base.urb_read_length <= 15);
   1593 
   1594    /* Rewrite all ATTR file references to the hw grf that they land in. */
   1595    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1596       convert_attr_sources_to_hw_regs(inst);
   1597    }
   1598 }
   1599 
   1600 void
   1601 fs_visitor::assign_tcs_single_patch_urb_setup()
   1602 {
   1603    assert(stage == MESA_SHADER_TESS_CTRL);
   1604 
   1605    /* Rewrite all ATTR file references to HW_REGs. */
   1606    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1607       convert_attr_sources_to_hw_regs(inst);
   1608    }
   1609 }
   1610 
   1611 void
   1612 fs_visitor::assign_tes_urb_setup()
   1613 {
   1614    assert(stage == MESA_SHADER_TESS_EVAL);
   1615 
   1616    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   1617 
   1618    first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
   1619 
   1620    /* Rewrite all ATTR file references to HW_REGs. */
   1621    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1622       convert_attr_sources_to_hw_regs(inst);
   1623    }
   1624 }
   1625 
   1626 void
   1627 fs_visitor::assign_gs_urb_setup()
   1628 {
   1629    assert(stage == MESA_SHADER_GEOMETRY);
   1630 
   1631    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   1632 
   1633    first_non_payload_grf +=
   1634       8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in;
   1635 
   1636    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1637       /* Rewrite all ATTR file references to GRFs. */
   1638       convert_attr_sources_to_hw_regs(inst);
   1639    }
   1640 }
   1641 
   1642 
   1643 /**
   1644  * Split large virtual GRFs into separate components if we can.
   1645  *
   1646  * This mostly duplicates what brw_fs_vector_splitting does, but that
   1647  * pass is really conservative because it's afraid of doing
   1648  * splitting that doesn't result in real progress after the rest of
   1649  * the optimization phases, which would cause infinite looping in
   1650  * optimization.  We can do it once here, safely.  This also has the
   1651  * opportunity to split interpolated values, or maybe even uniforms,
   1652  * which we don't have at the IR level.
   1653  *
   1654  * We want to split, because virtual GRFs are what we register
   1655  * allocate and spill (due to contiguousness requirements for some
   1656  * instructions), and they're what we naturally generate in the
   1657  * codegen process, but most virtual GRFs don't actually need to be
   1658  * contiguous sets of GRFs.  If we split, we'll end up with reduced
   1659  * live intervals and better dead code elimination and coalescing.
   1660  */
   1661 void
   1662 fs_visitor::split_virtual_grfs()
   1663 {
   1664    /* Compact the register file so we eliminate dead vgrfs.  The splitting
   1665     * below only defines split points for live registers, so overly large
   1666     * dead registers would hit assertions later.
   1667     */
   1668    compact_virtual_grfs();
   1669 
   1670    int num_vars = this->alloc.count;
   1671 
   1672    /* Count the total number of registers */
   1673    int reg_count = 0;
   1674    int vgrf_to_reg[num_vars];
   1675    for (int i = 0; i < num_vars; i++) {
   1676       vgrf_to_reg[i] = reg_count;
   1677       reg_count += alloc.sizes[i];
   1678    }
   1679 
   1680    /* An array of "split points".  For each register slot, this indicates
   1681     * if this slot can be separated from the previous slot.  Every time an
   1682     * instruction uses multiple elements of a register (as a source or
   1683     * destination), we mark the used slots as inseparable.  Then we go
   1684     * through and split the registers into the smallest pieces we can.
   1685     */
   1686    bool split_points[reg_count];
   1687    memset(split_points, 0, sizeof(split_points));
   1688 
   1689    /* Mark all used registers as fully splittable */
   1690    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1691       if (inst->dst.file == VGRF) {
   1692          int reg = vgrf_to_reg[inst->dst.nr];
   1693          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
   1694             split_points[reg + j] = true;
   1695       }
   1696 
   1697       for (int i = 0; i < inst->sources; i++) {
   1698          if (inst->src[i].file == VGRF) {
   1699             int reg = vgrf_to_reg[inst->src[i].nr];
   1700             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
   1701                split_points[reg + j] = true;
   1702          }
   1703       }
   1704    }
   1705 
   1706    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1707       if (inst->dst.file == VGRF) {
   1708          int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
   1709          for (unsigned j = 1; j < regs_written(inst); j++)
   1710             split_points[reg + j] = false;
   1711       }
   1712       for (int i = 0; i < inst->sources; i++) {
   1713          if (inst->src[i].file == VGRF) {
   1714             int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
   1715             for (unsigned j = 1; j < regs_read(inst, i); j++)
   1716                split_points[reg + j] = false;
   1717          }
   1718       }
   1719    }
   1720 
   1721    int new_virtual_grf[reg_count];
   1722    int new_reg_offset[reg_count];
   1723 
   1724    int reg = 0;
   1725    for (int i = 0; i < num_vars; i++) {
   1726       /* The first one should always be 0 as a quick sanity check. */
   1727       assert(split_points[reg] == false);
   1728 
   1729       /* j = 0 case */
   1730       new_reg_offset[reg] = 0;
   1731       reg++;
   1732       int offset = 1;
   1733 
   1734       /* j > 0 case */
   1735       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
   1736          /* If this is a split point, reset the offset to 0 and allocate a
   1737           * new virtual GRF for the offset registers accumulated so far.
   1738           */
   1739          if (split_points[reg]) {
   1740             assert(offset <= MAX_VGRF_SIZE);
   1741             int grf = alloc.allocate(offset);
   1742             for (int k = reg - offset; k < reg; k++)
   1743                new_virtual_grf[k] = grf;
   1744             offset = 0;
   1745          }
   1746          new_reg_offset[reg] = offset;
   1747          offset++;
   1748          reg++;
   1749       }
   1750 
   1751       /* The last one gets the original register number */
   1752       assert(offset <= MAX_VGRF_SIZE);
   1753       alloc.sizes[i] = offset;
   1754       for (int k = reg - offset; k < reg; k++)
   1755          new_virtual_grf[k] = i;
   1756    }
   1757    assert(reg == reg_count);
   1758 
   1759    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1760       if (inst->dst.file == VGRF) {
   1761          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
   1762          inst->dst.nr = new_virtual_grf[reg];
   1763          inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
   1764                             inst->dst.offset % REG_SIZE;
   1765          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
   1766       }
   1767       for (int i = 0; i < inst->sources; i++) {
   1768 	 if (inst->src[i].file == VGRF) {
   1769             reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
   1770             inst->src[i].nr = new_virtual_grf[reg];
   1771             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
   1772                                   inst->src[i].offset % REG_SIZE;
   1773             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
   1774          }
   1775       }
   1776    }
   1777    invalidate_live_intervals();
   1778 }
   1779 
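        /* Worked example of the pass above, for a hypothetical 4-GRF virtual
         * register: suppose one instruction writes slots 0-1 as a unit and
         * another writes slots 2-3 as a unit.  The first walk marks
         * split_points[1..3], the second walk clears split_points[1] and
         * split_points[3] because those slots are accessed together with their
         * predecessor, and the remaining split point at slot 2 cuts the VGRF
         * into two independent 2-GRF registers with shorter live intervals.
         */
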
   1780 /**
   1781  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
   1782  *
   1783  * During code generation, we create tons of temporary variables, many of
   1784  * which get immediately killed and are never used again.  Yet, in later
   1785  * optimization and analysis passes, such as compute_live_intervals, we need
   1786  * to loop over all the virtual GRFs.  Compacting them can save a lot of
   1787  * overhead.
   1788  */
   1789 bool
   1790 fs_visitor::compact_virtual_grfs()
   1791 {
   1792    bool progress = false;
   1793    int remap_table[this->alloc.count];
   1794    memset(remap_table, -1, sizeof(remap_table));
   1795 
   1796    /* Mark which virtual GRFs are used. */
   1797    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
   1798       if (inst->dst.file == VGRF)
   1799          remap_table[inst->dst.nr] = 0;
   1800 
   1801       for (int i = 0; i < inst->sources; i++) {
   1802          if (inst->src[i].file == VGRF)
   1803             remap_table[inst->src[i].nr] = 0;
   1804       }
   1805    }
   1806 
   1807    /* Compact the GRF arrays. */
   1808    int new_index = 0;
   1809    for (unsigned i = 0; i < this->alloc.count; i++) {
   1810       if (remap_table[i] == -1) {
   1811          /* We just found an unused register.  This means that we are
   1812           * actually going to compact something.
   1813           */
   1814          progress = true;
   1815       } else {
   1816          remap_table[i] = new_index;
   1817          alloc.sizes[new_index] = alloc.sizes[i];
   1818          invalidate_live_intervals();
   1819          ++new_index;
   1820       }
   1821    }
   1822 
   1823    this->alloc.count = new_index;
   1824 
   1825    /* Patch all the instructions to use the newly renumbered registers */
   1826    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   1827       if (inst->dst.file == VGRF)
   1828          inst->dst.nr = remap_table[inst->dst.nr];
   1829 
   1830       for (int i = 0; i < inst->sources; i++) {
   1831          if (inst->src[i].file == VGRF)
   1832             inst->src[i].nr = remap_table[inst->src[i].nr];
   1833       }
   1834    }
   1835 
   1836    /* Patch all the references to delta_xy, since they're used in register
   1837     * allocation.  If they're unused, switch them to BAD_FILE so we don't
   1838     * think some random VGRF is delta_xy.
   1839     */
   1840    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
   1841       if (delta_xy[i].file == VGRF) {
   1842          if (remap_table[delta_xy[i].nr] != -1) {
   1843             delta_xy[i].nr = remap_table[delta_xy[i].nr];
   1844          } else {
   1845             delta_xy[i].file = BAD_FILE;
   1846          }
   1847       }
   1848    }
   1849 
   1850    return progress;
   1851 }
   1852 
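        /* Example of the remapping above, with made-up counts: if
         * alloc.count == 5 and VGRFs 1 and 3 are never referenced, remap_table
         * ends up { 0, -1, 1, -1, 2 }, alloc.count drops to 3, and every VGRF
         * reference (including delta_xy) is renumbered through the table.
         */
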
   1853 static void
   1854 set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
   1855                            unsigned *max_chunk_bitsize,
   1856                            bool contiguous, unsigned bitsize,
   1857                            const unsigned target_bitsize,
   1858                            int *push_constant_loc, int *pull_constant_loc,
   1859                            unsigned *num_push_constants,
   1860                            unsigned *num_pull_constants,
   1861                            const unsigned max_push_components,
   1862                            const unsigned max_chunk_size,
   1863                            struct brw_stage_prog_data *stage_prog_data)
   1864 {
   1865    /* This is the first live uniform in the chunk */
   1866    if (*chunk_start < 0)
   1867       *chunk_start = uniform;
   1868 
   1869    /* Keep track of the maximum bit size access in contiguous uniforms */
   1870    *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
   1871 
   1872    /* If this element does not need to be contiguous with the next, we
   1873     * split at this point and everything between chunk_start and uniform
   1874     * forms a single chunk.
   1875     */
   1876    if (!contiguous) {
   1877       /* If bitsize doesn't match the target one, skip it */
   1878       if (*max_chunk_bitsize != target_bitsize) {
   1879          /* FIXME: right now we only support 32 and 64-bit accesses */
   1880          assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
   1881          *max_chunk_bitsize = 0;
   1882          *chunk_start = -1;
   1883          return;
   1884       }
   1885 
   1886       unsigned chunk_size = uniform - *chunk_start + 1;
   1887 
   1888       /* Decide whether we should push or pull this parameter.  In the
   1889        * Vulkan driver, push constants are explicitly exposed via the API
   1890        * so we push everything.  In GL, we only push small arrays.
   1891        */
   1892       if (stage_prog_data->pull_param == NULL ||
   1893           (*num_push_constants + chunk_size <= max_push_components &&
   1894            chunk_size <= max_chunk_size)) {
   1895          assert(*num_push_constants + chunk_size <= max_push_components);
   1896          for (unsigned j = *chunk_start; j <= uniform; j++)
   1897             push_constant_loc[j] = (*num_push_constants)++;
   1898       } else {
   1899          for (unsigned j = *chunk_start; j <= uniform; j++)
   1900             pull_constant_loc[j] = (*num_pull_constants)++;
   1901       }
   1902 
   1903       *max_chunk_bitsize = 0;
   1904       *chunk_start = -1;
   1905    }
   1906 }
   1907 
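        /* For illustration of set_push_pull_constant_loc() above, with the GL
         * limits chosen below (128 push components, 16-component chunks):
         * loose floats and vectors form small chunks and get push slots, while
         * an indirectly addressed float[64] is marked contiguous, forms a
         * single 64-component chunk, exceeds max_chunk_size, and is demoted to
         * the pull constant buffer.  In the Vulkan driver pull_param is NULL,
         * so everything is pushed regardless of chunk size.
         */
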
   1908 /**
   1909  * Assign UNIFORM file registers to either push constants or pull constants.
   1910  *
   1911  * We allow a fragment shader to have more than the specified minimum
   1912  * maximum number of fragment shader uniform components (64).  If
   1913  * there are too many of these, they'd fill up all of register space.
   1914  * So, this will push some of them out to the pull constant buffer and
   1915  * update the program to load them.
   1916  */
   1917 void
   1918 fs_visitor::assign_constant_locations()
   1919 {
   1920    /* Only the first compile gets to decide on locations. */
   1921    if (dispatch_width != min_dispatch_width)
   1922       return;
   1923 
   1924    bool is_live[uniforms];
   1925    memset(is_live, 0, sizeof(is_live));
   1926    unsigned bitsize_access[uniforms];
   1927    memset(bitsize_access, 0, sizeof(bitsize_access));
   1928 
   1929    /* For each uniform slot, a value of true indicates that the given slot and
   1930     * the next slot must remain contiguous.  This is used to keep us from
   1931     * splitting arrays apart.
   1932     */
   1933    bool contiguous[uniforms];
   1934    memset(contiguous, 0, sizeof(contiguous));
   1935 
   1936    int thread_local_id_index =
   1937       (stage == MESA_SHADER_COMPUTE) ?
   1938       brw_cs_prog_data(stage_prog_data)->thread_local_id_index : -1;
   1939 
   1940    /* First, we walk through the instructions and do two things:
   1941     *
   1942     *  1) Figure out which uniforms are live.
   1943     *
   1944     *  2) Mark any indirectly used ranges of registers as contiguous.
   1945     *
   1946     * Note that we don't move constant-indexed accesses to arrays.  No
   1947     * testing has been done of the performance impact of this choice.
   1948     */
   1949    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   1950       for (int i = 0 ; i < inst->sources; i++) {
   1951          if (inst->src[i].file != UNIFORM)
   1952             continue;
   1953 
   1954          int constant_nr = inst->src[i].nr + inst->src[i].offset / 4;
   1955 
   1956          if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
   1957             assert(inst->src[2].ud % 4 == 0);
   1958             unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
   1959             assert(last < uniforms);
   1960 
   1961             for (unsigned j = constant_nr; j < last; j++) {
   1962                is_live[j] = true;
   1963                contiguous[j] = true;
   1964                bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
   1965             }
   1966             is_live[last] = true;
   1967             bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
   1968          } else {
   1969             if (constant_nr >= 0 && constant_nr < (int) uniforms) {
   1970                int regs_read = inst->components_read(i) *
   1971                   type_sz(inst->src[i].type) / 4;
   1972                for (int j = 0; j < regs_read; j++) {
   1973                   is_live[constant_nr + j] = true;
   1974                   bitsize_access[constant_nr + j] =
   1975                      MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
   1976                }
   1977             }
   1978          }
   1979       }
   1980    }
   1981 
   1982    if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
   1983       thread_local_id_index = -1;
   1984 
   1985    /* Only allow 16 registers (128 uniform components) as push constants.
   1986     *
   1987     * Just demote the end of the list.  We could probably do better
   1988     * here, demoting things that are rarely used in the program first.
   1989     *
   1990     * If changing this value, note the limitation about total_regs in
   1991     * brw_curbe.c.
   1992     */
   1993    unsigned int max_push_components = 16 * 8;
   1994    if (thread_local_id_index >= 0)
   1995       max_push_components--; /* Save a slot for the thread ID */
   1996 
   1997    /* We push small arrays, but no bigger than 16 floats.  This is big enough
   1998     * for a vec4 but hopefully not large enough to push out other stuff.  We
   1999     * should probably use a better heuristic at some point.
   2000     */
   2001    const unsigned int max_chunk_size = 16;
   2002 
   2003    unsigned int num_push_constants = 0;
   2004    unsigned int num_pull_constants = 0;
   2005 
   2006    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   2007    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   2008 
   2009    /* Default to -1 meaning no location */
   2010    memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
   2011    memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
   2012 
   2013    int chunk_start = -1;
   2014    unsigned max_chunk_bitsize = 0;
   2015 
   2016    /* First push 64-bit uniforms to ensure they are properly aligned */
   2017    const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
   2018    for (unsigned u = 0; u < uniforms; u++) {
   2019       if (!is_live[u])
   2020          continue;
   2021 
   2022       set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
   2023                                  contiguous[u], bitsize_access[u],
   2024                                  uniform_64_bit_size,
   2025                                  push_constant_loc, pull_constant_loc,
   2026                                  &num_push_constants, &num_pull_constants,
   2027                                  max_push_components, max_chunk_size,
   2028                                  stage_prog_data);
   2029 
   2030    }
   2031 
   2032    /* Then push the rest of uniforms */
   2033    const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
   2034    for (unsigned u = 0; u < uniforms; u++) {
   2035       if (!is_live[u])
   2036          continue;
   2037 
   2038       /* Skip thread_local_id_index to put it in the last push register. */
   2039       if (thread_local_id_index == (int)u)
   2040          continue;
   2041 
   2042       set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
   2043                                  contiguous[u], bitsize_access[u],
   2044                                  uniform_32_bit_size,
   2045                                  push_constant_loc, pull_constant_loc,
   2046                                  &num_push_constants, &num_pull_constants,
   2047                                  max_push_components, max_chunk_size,
   2048                                  stage_prog_data);
   2049    }
   2050 
   2051    /* Add the CS local thread ID uniform at the end of the push constants */
   2052    if (thread_local_id_index >= 0)
   2053       push_constant_loc[thread_local_id_index] = num_push_constants++;
   2054 
   2055    /* As the uniforms are going to be reordered, take the data from a temporary
   2056     * copy of the original param[].
   2057     */
   2058    gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
   2059                                             stage_prog_data->nr_params);
   2060    memcpy(param, stage_prog_data->param,
   2061           sizeof(gl_constant_value*) * stage_prog_data->nr_params);
   2062    stage_prog_data->nr_params = num_push_constants;
   2063    stage_prog_data->nr_pull_params = num_pull_constants;
   2064 
   2065    /* Up until now, the param[] array has been indexed by reg + offset
   2066     * of UNIFORM registers.  Move pull constants into pull_param[] and
   2067     * condense param[] to only contain the uniforms we chose to push.
   2068     *
   2069     * NOTE: Because we are condensing the param[] array, we know that
   2070     * push_constant_loc[i] <= i, so a single forward pass over the
   2071     * temporary copy made above is enough.
   2072     */
   2073    int new_thread_local_id_index = -1;
   2074    for (unsigned int i = 0; i < uniforms; i++) {
   2075       const gl_constant_value *value = param[i];
   2076 
   2077       if (pull_constant_loc[i] != -1) {
   2078          stage_prog_data->pull_param[pull_constant_loc[i]] = value;
   2079       } else if (push_constant_loc[i] != -1) {
   2080          stage_prog_data->param[push_constant_loc[i]] = value;
   2081          if (thread_local_id_index == (int)i)
   2082             new_thread_local_id_index = push_constant_loc[i];
   2083       }
   2084    }
   2085    ralloc_free(param);
   2086 
   2087    if (stage == MESA_SHADER_COMPUTE)
   2088       brw_cs_prog_data(stage_prog_data)->thread_local_id_index =
   2089          new_thread_local_id_index;
   2090 }
   2091 
   2092 /**
   2093  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
   2094  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
   2095  */
   2096 void
   2097 fs_visitor::lower_constant_loads()
   2098 {
   2099    const unsigned index = stage_prog_data->binding_table.pull_constants_start;
   2100 
   2101    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   2102       /* Set up the annotation tracking for newly generated instructions. */
   2103       const fs_builder ibld(this, block, inst);
   2104 
   2105       for (int i = 0; i < inst->sources; i++) {
   2106 	 if (inst->src[i].file != UNIFORM)
   2107 	    continue;
   2108 
   2109          /* We'll handle this case later */
   2110          if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
   2111             continue;
   2112 
   2113          unsigned location = inst->src[i].nr + inst->src[i].offset / 4;
   2114          if (location >= uniforms)
   2115             continue; /* Out of bounds access */
   2116 
   2117          int pull_index = pull_constant_loc[location];
   2118 
   2119          if (pull_index == -1)
   2120 	    continue;
   2121 
   2122          assert(inst->src[i].stride == 0);
   2123 
   2124          const unsigned index = stage_prog_data->binding_table.pull_constants_start;
   2125          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
   2126          const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
   2127          const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   2128          const unsigned base = pull_index * 4;
   2129 
   2130          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
   2131                    dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
   2132 
   2133          /* Rewrite the instruction to use the temporary VGRF. */
   2134          inst->src[i].file = VGRF;
   2135          inst->src[i].nr = dst.nr;
   2136          inst->src[i].offset = (base & (block_sz - 1)) +
   2137                                inst->src[i].offset % 4;
   2138 
   2139          brw_mark_surface_used(prog_data, index);
   2140       }
   2141 
   2142       if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
   2143           inst->src[0].file == UNIFORM) {
   2144 
   2145          unsigned location = inst->src[0].nr + inst->src[0].offset / 4;
   2146          if (location >= uniforms)
   2147             continue; /* Out of bounds access */
   2148 
   2149          int pull_index = pull_constant_loc[location];
   2150 
   2151          if (pull_index == -1)
   2152 	    continue;
   2153 
   2154          VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
   2155                                     brw_imm_ud(index),
   2156                                     inst->src[1],
   2157                                     pull_index * 4);
   2158          inst->remove(block);
   2159 
   2160          brw_mark_surface_used(prog_data, index);
   2161       }
   2162    }
   2163    invalidate_live_intervals();
   2164 }
   2165 
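        /* Worked example of the cacheline-aligned fetch above, with a made-up
         * index: pull_index == 21 gives base = 84 bytes, so the
         * UNIFORM_PULL_CONSTANT_LOAD reads the 64-byte block starting at
         * 84 & ~63 = 64 and the rewritten source points into the temporary
         * VGRF at byte 84 & 63 = 20, plus whatever sub-dword offset the
         * original UNIFORM access had.
         */
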
   2166 bool
   2167 fs_visitor::opt_algebraic()
   2168 {
   2169    bool progress = false;
   2170 
   2171    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   2172       switch (inst->opcode) {
   2173       case BRW_OPCODE_MOV:
   2174          if (inst->src[0].file != IMM)
   2175             break;
   2176 
   2177          if (inst->saturate) {
   2178             if (inst->dst.type != inst->src[0].type)
   2179                assert(!"unimplemented: saturate mixed types");
   2180 
   2181             if (brw_saturate_immediate(inst->dst.type,
   2182                                        &inst->src[0].as_brw_reg())) {
   2183                inst->saturate = false;
   2184                progress = true;
   2185             }
   2186          }
   2187          break;
   2188 
   2189       case BRW_OPCODE_MUL:
   2190 	 if (inst->src[1].file != IMM)
   2191 	    continue;
   2192 
   2193 	 /* a * 1.0 = a */
   2194 	 if (inst->src[1].is_one()) {
   2195 	    inst->opcode = BRW_OPCODE_MOV;
   2196 	    inst->src[1] = reg_undef;
   2197 	    progress = true;
   2198 	    break;
   2199 	 }
   2200 
   2201          /* a * -1.0 = -a */
   2202          if (inst->src[1].is_negative_one()) {
   2203             inst->opcode = BRW_OPCODE_MOV;
   2204             inst->src[0].negate = !inst->src[0].negate;
   2205             inst->src[1] = reg_undef;
   2206             progress = true;
   2207             break;
   2208          }
   2209 
   2210          /* a * 0.0 = 0.0 */
   2211          if (inst->src[1].is_zero()) {
   2212             inst->opcode = BRW_OPCODE_MOV;
   2213             inst->src[0] = inst->src[1];
   2214             inst->src[1] = reg_undef;
   2215             progress = true;
   2216             break;
   2217          }
   2218 
   2219          if (inst->src[0].file == IMM) {
   2220             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
   2221             inst->opcode = BRW_OPCODE_MOV;
   2222             inst->src[0].f *= inst->src[1].f;
   2223             inst->src[1] = reg_undef;
   2224             progress = true;
   2225             break;
   2226          }
   2227 	 break;
   2228       case BRW_OPCODE_ADD:
   2229          if (inst->src[1].file != IMM)
   2230             continue;
   2231 
   2232          /* a + 0.0 = a */
   2233          if (inst->src[1].is_zero()) {
   2234             inst->opcode = BRW_OPCODE_MOV;
   2235             inst->src[1] = reg_undef;
   2236             progress = true;
   2237             break;
   2238          }
   2239 
   2240          if (inst->src[0].file == IMM) {
   2241             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
   2242             inst->opcode = BRW_OPCODE_MOV;
   2243             inst->src[0].f += inst->src[1].f;
   2244             inst->src[1] = reg_undef;
   2245             progress = true;
   2246             break;
   2247          }
   2248          break;
   2249       case BRW_OPCODE_OR:
   2250          if (inst->src[0].equals(inst->src[1])) {
   2251             inst->opcode = BRW_OPCODE_MOV;
   2252             inst->src[1] = reg_undef;
   2253             progress = true;
   2254             break;
   2255          }
   2256          break;
   2257       case BRW_OPCODE_LRP:
   2258          if (inst->src[1].equals(inst->src[2])) {
   2259             inst->opcode = BRW_OPCODE_MOV;
   2260             inst->src[0] = inst->src[1];
   2261             inst->src[1] = reg_undef;
   2262             inst->src[2] = reg_undef;
   2263             progress = true;
   2264             break;
   2265          }
   2266          break;
   2267       case BRW_OPCODE_CMP:
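                 /* -|a| >= 0 can only hold when a == 0, so the source
                  * modifiers can be dropped and the comparison turned into a
                  * test for equality with zero.
                  */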
   2268          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
   2269              inst->src[0].abs &&
   2270              inst->src[0].negate &&
   2271              inst->src[1].is_zero()) {
   2272             inst->src[0].abs = false;
   2273             inst->src[0].negate = false;
   2274             inst->conditional_mod = BRW_CONDITIONAL_Z;
   2275             progress = true;
   2276             break;
   2277          }
   2278          break;
   2279       case BRW_OPCODE_SEL:
   2280          if (inst->src[0].equals(inst->src[1])) {
   2281             inst->opcode = BRW_OPCODE_MOV;
   2282             inst->src[1] = reg_undef;
   2283             inst->predicate = BRW_PREDICATE_NONE;
   2284             inst->predicate_inverse = false;
   2285             progress = true;
   2286          } else if (inst->saturate && inst->src[1].file == IMM) {
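                    /* With saturate the result is clamped to [0, 1], so a
                     * SEL.L/SEL.LE (MIN-style select) against an immediate
                     * >= 1.0 or a SEL.G/SEL.GE (MAX-style select) against an
                     * immediate <= 0.0 has no effect and can become a plain
                     * MOV.
                     */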
   2287             switch (inst->conditional_mod) {
   2288             case BRW_CONDITIONAL_LE:
   2289             case BRW_CONDITIONAL_L:
   2290                switch (inst->src[1].type) {
   2291                case BRW_REGISTER_TYPE_F:
   2292                   if (inst->src[1].f >= 1.0f) {
   2293                      inst->opcode = BRW_OPCODE_MOV;
   2294                      inst->src[1] = reg_undef;
   2295                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
   2296                      progress = true;
   2297                   }
   2298                   break;
   2299                default:
   2300                   break;
   2301                }
   2302                break;
   2303             case BRW_CONDITIONAL_GE:
   2304             case BRW_CONDITIONAL_G:
   2305                switch (inst->src[1].type) {
   2306                case BRW_REGISTER_TYPE_F:
   2307                   if (inst->src[1].f <= 0.0f) {
   2308                      inst->opcode = BRW_OPCODE_MOV;
   2309                      inst->src[1] = reg_undef;
   2310                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
   2311                      progress = true;
   2312                   }
   2313                   break;
   2314                default:
   2315                   break;
   2316                }
   2317             default:
   2318                break;
   2319             }
   2320          }
   2321          break;
   2322       case BRW_OPCODE_MAD:
   2323          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
   2324             inst->opcode = BRW_OPCODE_MOV;
   2325             inst->src[1] = reg_undef;
   2326             inst->src[2] = reg_undef;
   2327             progress = true;
   2328          } else if (inst->src[0].is_zero()) {
   2329             inst->opcode = BRW_OPCODE_MUL;
   2330             inst->src[0] = inst->src[2];
   2331             inst->src[2] = reg_undef;
   2332             progress = true;
   2333          } else if (inst->src[1].is_one()) {
   2334             inst->opcode = BRW_OPCODE_ADD;
   2335             inst->src[1] = inst->src[2];
   2336             inst->src[2] = reg_undef;
   2337             progress = true;
   2338          } else if (inst->src[2].is_one()) {
   2339             inst->opcode = BRW_OPCODE_ADD;
   2340             inst->src[2] = reg_undef;
   2341             progress = true;
   2342          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
   2343             inst->opcode = BRW_OPCODE_ADD;
   2344             inst->src[1].f *= inst->src[2].f;
   2345             inst->src[2] = reg_undef;
   2346             progress = true;
   2347          }
   2348          break;
   2349       case SHADER_OPCODE_BROADCAST:
   2350          if (is_uniform(inst->src[0])) {
   2351             inst->opcode = BRW_OPCODE_MOV;
   2352             inst->sources = 1;
   2353             inst->force_writemask_all = true;
   2354             progress = true;
   2355          } else if (inst->src[1].file == IMM) {
   2356             inst->opcode = BRW_OPCODE_MOV;
   2357             inst->src[0] = component(inst->src[0],
   2358                                      inst->src[1].ud);
   2359             inst->sources = 1;
   2360             inst->force_writemask_all = true;
   2361             progress = true;
   2362          }
   2363          break;
   2364 
   2365       default:
   2366 	 break;
   2367       }
   2368 
   2369       /* Swap if src[0] is immediate. */
   2370       if (progress && inst->is_commutative()) {
   2371          if (inst->src[0].file == IMM) {
   2372             fs_reg tmp = inst->src[1];
   2373             inst->src[1] = inst->src[0];
   2374             inst->src[0] = tmp;
   2375          }
   2376       }
   2377    }
   2378    return progress;
   2379 }
   2380 
   2381 /**
   2382  * Optimize sample messages that have constant zero values for the trailing
   2383  * texture coordinates. We can just reduce the message length for these
   2384  * instructions instead of reserving a register for it. Trailing parameters
   2385  * that aren't sent default to zero anyway. This will cause the dead code
   2386  * eliminator to remove the MOV instruction that would otherwise be emitted to
   2387  * set up the zero value.
   2388  */
   2389 bool
   2390 fs_visitor::opt_zero_samples()
   2391 {
   2392    /* Gen4 infers the texturing opcode based on the message length so we can't
   2393     * change it.
   2394     */
   2395    if (devinfo->gen < 5)
   2396       return false;
   2397 
   2398    bool progress = false;
   2399 
   2400    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   2401       if (!inst->is_tex())
   2402          continue;
   2403 
   2404       fs_inst *load_payload = (fs_inst *) inst->prev;
   2405 
   2406       if (load_payload->is_head_sentinel() ||
   2407           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
   2408          continue;
   2409 
   2410       /* We don't want to remove the message header or the first parameter.
   2411        * Removing the first parameter is not allowed; see the Haswell PRM
   2412        * volume 7, page 149:
   2413        *
   2414        *     "Parameter 0 is required except for the sampleinfo message, which
   2415        *      has no parameter 0"
   2416        */
   2417       while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
   2418              load_payload->src[(inst->mlen - inst->header_size) /
   2419                                (inst->exec_size / 8) +
   2420                                inst->header_size - 1].is_zero()) {
   2421          inst->mlen -= inst->exec_size / 8;
   2422          progress = true;
   2423       }
   2424    }
   2425 
   2426    if (progress)
   2427       invalidate_live_intervals();
   2428 
   2429    return progress;
   2430 }
   2431 
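        /* Example of the trimming above for a SIMD8 message, where each
         * parameter occupies one GRF: a sample with mlen == 5 (header plus
         * four parameters) whose last two coordinates are immediate zero in
         * the LOAD_PAYLOAD has its mlen reduced to 3; the now-unread payload
         * MOVs are left for dead code elimination to clean up.
         */
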
   2432 /**
   2433  * Optimize sample messages which are followed by the final RT write.
   2434  *
   2435  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
   2436  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
   2437  * final texturing results copied to the framebuffer write payload and modify
   2438  * them to write to the framebuffer directly.
   2439  */
   2440 bool
   2441 fs_visitor::opt_sampler_eot()
   2442 {
   2443    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   2444 
   2445    if (stage != MESA_SHADER_FRAGMENT)
   2446       return false;
   2447 
   2448    if (devinfo->gen < 9 && !devinfo->is_cherryview)
   2449       return false;
   2450 
   2451    /* FINISHME: It should be possible to implement this optimization when there
   2452     * are multiple drawbuffers.
   2453     */
   2454    if (key->nr_color_regions != 1)
   2455       return false;
   2456 
   2457    /* Requires emitting a bunch of saturating MOV instructions during logical
   2458     * send lowering to clamp the color payload, which the sampler unit isn't
   2459     * going to do for us.
   2460     */
   2461    if (key->clamp_fragment_color)
   2462       return false;
   2463 
   2464    /* Look for a texturing instruction immediately before the final FB_WRITE. */
   2465    bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
   2466    fs_inst *fb_write = (fs_inst *)block->end();
   2467    assert(fb_write->eot);
   2468    assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);
   2469 
   2470    /* There wasn't one; nothing to do. */
   2471    if (unlikely(fb_write->prev->is_head_sentinel()))
   2472       return false;
   2473 
   2474    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
   2475 
   2476    /* 3D Sampler Messages - Message Format:
   2477     *
   2478     *    "Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
   2479     *     messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*"
   2480     */
   2481    if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
   2482        tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL &&
   2483        tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
   2484        tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL &&
   2485        tex_inst->opcode != FS_OPCODE_TXB_LOGICAL &&
   2486        tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL &&
   2487        tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL &&
   2488        tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL)
   2489       return false;
   2490 
   2491    /* XXX - This shouldn't be necessary. */
   2492    if (tex_inst->prev->is_head_sentinel())
   2493       return false;
   2494 
   2495    /* Check that the FB write sources are fully initialized by the single
   2496     * texturing instruction.
   2497     */
   2498    for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
   2499       if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
   2500          if (!fb_write->src[i].equals(tex_inst->dst) ||
   2501              fb_write->size_read(i) != tex_inst->size_written)
   2502             return false;
   2503       } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
   2504          if (fb_write->src[i].file != BAD_FILE)
   2505             return false;
   2506       }
   2507    }
   2508 
   2509    assert(!tex_inst->eot); /* We can't get here twice */
   2510    assert((tex_inst->offset & (0xff << 24)) == 0);
   2511 
   2512    const fs_builder ibld(this, block, tex_inst);
   2513 
   2514    tex_inst->offset |= fb_write->target << 24;
   2515    tex_inst->eot = true;
   2516    tex_inst->dst = ibld.null_reg_ud();
   2517    tex_inst->size_written = 0;
   2518    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
   2519 
   2520    /* Marking EOT is sufficient; lower_logical_sends() will notice the EOT
   2521     * flag and submit a header together with the sampler message as required
   2522     * by the hardware.
   2523     */
   2524    invalidate_live_intervals();
   2525    return true;
   2526 }
   2527 
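        /**
         * Give each complete redefinition of a virtual GRF outside of control
         * flow its own register number and rewrite later uses to match.
         * Splitting reused temporaries into independent registers this way
         * shortens live intervals and removes false dependencies between
         * unrelated values.
         */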
   2528 bool
   2529 fs_visitor::opt_register_renaming()
   2530 {
   2531    bool progress = false;
   2532    int depth = 0;
   2533 
   2534    int remap[alloc.count];
   2535    memset(remap, -1, sizeof(int) * alloc.count);
   2536 
   2537    foreach_block_and_inst(block, fs_inst, inst, cfg) {
   2538       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
   2539          depth++;
   2540       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
   2541                  inst->opcode == BRW_OPCODE_WHILE) {
   2542          depth--;
   2543       }
   2544 
   2545       /* Rewrite instruction sources. */
   2546       for (int i = 0; i < inst->sources; i++) {
   2547          if (inst->src[i].file == VGRF &&
   2548              remap[inst->src[i].nr] != -1 &&
   2549              remap[inst->src[i].nr] != inst->src[i].nr) {
   2550             inst->src[i].nr = remap[inst->src[i].nr];
   2551             progress = true;
   2552          }
   2553       }
   2554 
   2555       const int dst = inst->dst.nr;
   2556 
   2557       if (depth == 0 &&
   2558           inst->dst.file == VGRF &&
   2559           alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
   2560           !inst->is_partial_write()) {
   2561          if (remap[dst] == -1) {
   2562             remap[dst] = dst;
   2563          } else {
   2564             remap[dst] = alloc.allocate(regs_written(inst));
   2565             inst->dst.nr = remap[dst];
   2566             progress = true;
   2567          }
   2568       } else if (inst->dst.file == VGRF &&
   2569                  remap[dst] != -1 &&
   2570                  remap[dst] != dst) {
   2571          inst->dst.nr = remap[dst];
   2572          progress = true;
   2573       }
   2574    }
   2575 
   2576    if (progress) {
   2577       invalidate_live_intervals();
   2578 
   2579       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
   2580          if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
   2581             delta_xy[i].nr = remap[delta_xy[i].nr];
   2582          }
   2583       }
   2584    }
   2585 
   2586    return progress;
   2587 }
   2588 
   2589 /**
   2590  * Remove redundant or useless discard jumps.
   2591  *
   2592  * For example, we can eliminate jumps in the following sequence:
   2593  *
   2594  * discard-jump       (redundant with the next jump)
   2595  * discard-jump       (useless; jumps to the next instruction)
   2596  * placeholder-halt
   2597  */
   2598 bool
   2599 fs_visitor::opt_redundant_discard_jumps()
   2600 {
   2601    bool progress = false;
   2602 
   2603    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
   2604 
   2605    fs_inst *placeholder_halt = NULL;
   2606    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
   2607       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
   2608          placeholder_halt = inst;
   2609          break;
   2610       }
   2611    }
   2612 
   2613    if (!placeholder_halt)
   2614       return false;
   2615 
   2616    /* Delete any discard jumps immediately before the placeholder halt. */
   2617    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
   2618         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
   2619         prev = (fs_inst *) placeholder_halt->prev) {
   2620       prev->remove(last_bblock);
   2621       progress = true;
   2622    }
   2623 
   2624    if (progress)
   2625       invalidate_live_intervals();
   2626 
   2627    return progress;
   2628 }
   2629 
   2630 /**
   2631  * Compute a GRF-granularity bitmask with a bit set for each GRF, starting
   2632  * from \p r.offset, that overlaps the region starting at \p s.offset and
   2633  * spanning \p ds bytes.
   2634  */
   2635 static inline unsigned
   2636 mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
   2637 {
   2638    const int rel_offset = reg_offset(s) - reg_offset(r);
   2639    const int shift = rel_offset / REG_SIZE;
   2640    const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
   2641    assert(reg_space(r) == reg_space(s) &&
   2642           shift >= 0 && shift < int(8 * sizeof(unsigned)));
   2643    return ((1 << n) - 1) << shift;
   2644 }
   2645 
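        /* Worked example, assuming 32-byte GRFs: if the region s starts 40
         * bytes past r and spans ds == 48 bytes, then rel_offset == 40,
         * shift == 1 and n == DIV_ROUND_UP(8 + 48, 32) == 2, giving the mask
         * 0b110, i.e. the region overlaps the second and third GRF counted
         * from r.
         */
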
   2646 bool
   2647 fs_visitor::compute_to_mrf()
   2648 {
   2649    bool progress = false;
   2650    int next_ip = 0;
   2651 
   2652    /* No MRFs on Gen >= 7. */
   2653    if (devinfo->gen >= 7)
   2654       return false;
   2655 
   2656    calculate_live_intervals();
   2657 
   2658    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   2659       int ip = next_ip;
   2660       next_ip++;
   2661 
   2662       if (inst->opcode != BRW_OPCODE_MOV ||
   2663 	  inst->is_partial_write() ||
   2664 	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
   2665 	  inst->dst.type != inst->src[0].type ||
   2666 	  inst->src[0].abs || inst->src[0].negate ||
   2667           !inst->src[0].is_contiguous() ||
   2668           inst->src[0].offset % REG_SIZE != 0)
   2669 	 continue;
   2670 
   2671       /* Can't compute-to-MRF this GRF if someone else was going to
   2672        * read it later.
   2673        */
   2674       if (this->virtual_grf_end[inst->src[0].nr] > ip)
   2675 	 continue;
   2676 
   2677       /* Found a move of a GRF to an MRF.  Let's see if we can go rewrite the
   2678        * things that computed the value of all GRFs of the source region.  The
   2679        * regs_left bitset keeps track of the registers we haven't yet found a
   2680        * generating instruction for.
   2681        */
   2682       unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
   2683 
   2684       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
   2685          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
   2686                              inst->src[0], inst->size_read(0))) {
   2687 	    /* Found the last instruction to write the register we want to
   2688 	     * turn into a compute-to-MRF.
   2689 	     */
   2690 
   2691 	    /* If this one instruction didn't populate all the
   2692 	     * channels, bail.  We might be able to rewrite everything
   2693 	     * that writes that reg, but it would require smarter
   2694 	     * tracking.
   2695 	     */
   2696 	    if (scan_inst->is_partial_write())
   2697 	       break;
   2698 
   2699             /* Handling things not fully contained in the source of the copy
   2700              * would need us to understand coalescing out more than one MOV at
   2701              * a time.
   2702              */
   2703             if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
   2704                                      inst->src[0], inst->size_read(0)))
   2705                break;
   2706 
   2707 	    /* SEND instructions can't have MRF as a destination. */
   2708 	    if (scan_inst->mlen)
   2709 	       break;
   2710 
   2711 	    if (devinfo->gen == 6) {
   2712 	       /* gen6 math instructions must have the destination be
   2713 		* GRF, so no compute-to-MRF for them.
   2714 		*/
   2715 	       if (scan_inst->is_math()) {
   2716 		  break;
   2717 	       }
   2718 	    }
   2719 
   2720             /* Clear the bits for any registers this instruction overwrites. */
   2721             regs_left &= ~mask_relative_to(
   2722                inst->src[0], scan_inst->dst, scan_inst->size_written);
   2723             if (!regs_left)
   2724                break;
   2725 	 }
   2726 
   2727 	 /* We don't handle control flow here.  Most computation of
   2728 	  * values that end up in MRFs happens shortly before the MRF
   2729 	  * write anyway.
   2730 	  */
   2731 	 if (block->start() == scan_inst)
   2732 	    break;
   2733 
   2734 	 /* You can't read from an MRF, so if someone else reads our
   2735 	  * MRF's source GRF that we wanted to rewrite, that stops us.
   2736 	  */
   2737 	 bool interfered = false;
   2738 	 for (int i = 0; i < scan_inst->sources; i++) {
   2739             if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
   2740                                 inst->src[0], inst->size_read(0))) {
   2741 	       interfered = true;
   2742 	    }
   2743 	 }
   2744 	 if (interfered)
   2745 	    break;
   2746 
   2747          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
   2748                              inst->dst, inst->size_written)) {
   2749 	    /* If somebody else writes our MRF here, we can't
   2750 	     * compute-to-MRF before that.
   2751 	     */
   2752             break;
   2753          }
   2754 
   2755          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
   2756              regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
   2757                              inst->dst, inst->size_written)) {
   2758 	    /* Found a SEND instruction, which means that there are
   2759 	     * live values in MRFs from base_mrf to base_mrf +
   2760 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
   2761 	     * above it.
   2762 	     */
   2763             break;
   2764          }
   2765       }
   2766 
   2767       if (regs_left)
   2768          continue;
   2769 
   2770       /* Found all generating instructions of our MRF's source value, so it
   2771        * should be safe to rewrite them to point to the MRF directly.
   2772        */
   2773       regs_left = (1 << regs_read(inst, 0)) - 1;
   2774 
   2775       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
   2776          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
   2777                              inst->src[0], inst->size_read(0))) {
   2778             /* Clear the bits for any registers this instruction overwrites. */
   2779             regs_left &= ~mask_relative_to(
   2780                inst->src[0], scan_inst->dst, scan_inst->size_written);
   2781 
   2782             const unsigned rel_offset = reg_offset(scan_inst->dst) -
   2783                                         reg_offset(inst->src[0]);
   2784 
   2785             if (inst->dst.nr & BRW_MRF_COMPR4) {
   2786                /* Apply the same address transformation done by the hardware
   2787                 * for COMPR4 MRF writes.
   2788                 */
   2789                assert(rel_offset < 2 * REG_SIZE);
   2790                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
   2791 
   2792                /* Clear the COMPR4 bit if the generating instruction is not
   2793                 * compressed.
   2794                 */
   2795                if (scan_inst->size_written < 2 * REG_SIZE)
   2796                   scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
   2797 
   2798             } else {
   2799                /* Calculate the MRF number the result of this instruction is
   2800                 * ultimately written to.
   2801                 */
   2802                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
   2803             }
   2804 
   2805             scan_inst->dst.file = MRF;
   2806             scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
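            /* The saturate flag of the MOV being removed must survive on the
             * generating instruction, since the MOV itself goes away.
             */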
   2807             scan_inst->saturate |= inst->saturate;
   2808             if (!regs_left)
   2809                break;
   2810          }
   2811       }
   2812 
   2813       assert(!regs_left);
   2814       inst->remove(block);
   2815       progress = true;
   2816    }
   2817 
   2818    if (progress)
   2819       invalidate_live_intervals();
   2820 
   2821    return progress;
   2822 }
   2823 
   2824 /**
   2825  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
   2826  * flow.  We could probably do better here with some form of divergence
   2827  * analysis.
   2828  */
   2829 bool
   2830 fs_visitor::eliminate_find_live_channel()
   2831 {
   2832    bool progress = false;
   2833    unsigned depth = 0;
   2834 
   2835    if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
   2836       /* The optimization below assumes that channel zero is live on thread
   2837        * dispatch, which may not be the case if the fixed function dispatches
   2838        * threads sparsely.
   2839        */
   2840       return false;
   2841    }
   2842 
   2843    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   2844       switch (inst->opcode) {
   2845       case BRW_OPCODE_IF:
   2846       case BRW_OPCODE_DO:
   2847          depth++;
   2848          break;
   2849 
   2850       case BRW_OPCODE_ENDIF:
   2851       case BRW_OPCODE_WHILE:
   2852          depth--;
   2853          break;
   2854 
   2855       case FS_OPCODE_DISCARD_JUMP:
   2856          /* This can potentially make control flow non-uniform until the end
   2857           * of the program.
   2858           */
   2859          return progress;
   2860 
   2861       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
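         /* Outside of control flow (depth == 0) execution is still uniform
          * and, given the packed-dispatch check above, channel 0 is known to
          * be live, so the result is always 0.
          */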
   2862          if (depth == 0) {
   2863             inst->opcode = BRW_OPCODE_MOV;
   2864             inst->src[0] = brw_imm_ud(0u);
   2865             inst->sources = 1;
   2866             inst->force_writemask_all = true;
   2867             progress = true;
   2868          }
   2869          break;
   2870 
   2871       default:
   2872          break;
   2873       }
   2874    }
   2875 
   2876    return progress;
   2877 }
   2878 
   2879 /**
   2880  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
   2881  * instructions to FS_OPCODE_REP_FB_WRITE.
   2882  */
   2883 void
   2884 fs_visitor::emit_repclear_shader()
   2885 {
   2886    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   2887    int base_mrf = 0;
   2888    int color_mrf = base_mrf + 2;
   2889    fs_inst *mov;
   2890 
   2891    if (uniforms > 0) {
   2892       mov = bld.exec_all().group(4, 0)
   2893                .MOV(brw_message_reg(color_mrf),
   2894                     fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
   2895    } else {
   2896       struct brw_reg reg =
   2897          brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
   2898                  BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
   2899                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
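      /* When there are no uniforms the clear color is pulled straight from
       * the thread payload (a strided region over g2/g3) rather than from a
       * push constant.
       */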
   2900 
   2901       mov = bld.exec_all().group(4, 0)
   2902                .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
   2903    }
   2904 
   2905    fs_inst *write;
   2906    if (key->nr_color_regions == 1) {
   2907       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
   2908       write->saturate = key->clamp_fragment_color;
   2909       write->base_mrf = color_mrf;
   2910       write->target = 0;
   2911       write->header_size = 0;
   2912       write->mlen = 1;
   2913    } else {
   2914       assume(key->nr_color_regions > 0);
   2915       for (int i = 0; i < key->nr_color_regions; ++i) {
   2916          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
   2917          write->saturate = key->clamp_fragment_color;
   2918          write->base_mrf = base_mrf;
   2919          write->target = i;
   2920          write->header_size = 2;
   2921          write->mlen = 3;
   2922       }
   2923    }
   2924    write->eot = true;
   2925 
   2926    calculate_cfg();
   2927 
   2928    assign_constant_locations();
   2929    assign_curb_setup();
   2930 
   2931    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   2932    if (uniforms > 0) {
   2933       assert(mov->src[0].file == FIXED_GRF);
   2934       mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
   2935    }
   2936 }
   2937 
   2938 /**
   2939  * Walks through basic blocks, looking for repeated MRF writes and
   2940  * removing the later ones.
   2941  */
   2942 bool
   2943 fs_visitor::remove_duplicate_mrf_writes()
   2944 {
   2945    fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
   2946    bool progress = false;
   2947 
    2948    /* The MRF tracking below doesn't handle compressed (SIMD16) instructions. */
   2949    if (dispatch_width >= 16)
   2950       return false;
   2951 
   2952    memset(last_mrf_move, 0, sizeof(last_mrf_move));
   2953 
   2954    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   2955       if (inst->is_control_flow()) {
   2956 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
   2957       }
   2958 
   2959       if (inst->opcode == BRW_OPCODE_MOV &&
   2960 	  inst->dst.file == MRF) {
   2961          fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
   2962 	 if (prev_inst && inst->equals(prev_inst)) {
   2963 	    inst->remove(block);
   2964 	    progress = true;
   2965 	    continue;
   2966 	 }
   2967       }
   2968 
   2969       /* Clear out the last-write records for MRFs that were overwritten. */
   2970       if (inst->dst.file == MRF) {
   2971          last_mrf_move[inst->dst.nr] = NULL;
   2972       }
   2973 
   2974       if (inst->mlen > 0 && inst->base_mrf != -1) {
   2975 	 /* Found a SEND instruction, which will include two or fewer
   2976 	  * implied MRF writes.  We could do better here.
   2977 	  */
   2978 	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
   2979 	    last_mrf_move[inst->base_mrf + i] = NULL;
   2980 	 }
   2981       }
   2982 
   2983       /* Clear out any MRF move records whose sources got overwritten. */
   2984       for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
   2985          if (last_mrf_move[i] &&
   2986              regions_overlap(inst->dst, inst->size_written,
   2987                              last_mrf_move[i]->src[0],
   2988                              last_mrf_move[i]->size_read(0))) {
   2989             last_mrf_move[i] = NULL;
   2990          }
   2991       }
   2992 
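      /* Record this MOV as the most recent write of its MRF, but only if it is
       * a full write from a non-ARF source, since only such MOVs can be safely
       * matched and removed on a later iteration.
       */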
   2993       if (inst->opcode == BRW_OPCODE_MOV &&
   2994 	  inst->dst.file == MRF &&
   2995 	  inst->src[0].file != ARF &&
   2996 	  !inst->is_partial_write()) {
   2997          last_mrf_move[inst->dst.nr] = inst;
   2998       }
   2999    }
   3000 
   3001    if (progress)
   3002       invalidate_live_intervals();
   3003 
   3004    return progress;
   3005 }
   3006 
   3007 static void
   3008 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
   3009 {
   3010    /* Clear the flag for registers that actually got read (as expected). */
   3011    for (int i = 0; i < inst->sources; i++) {
   3012       int grf;
   3013       if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
   3014          grf = inst->src[i].nr;
   3015       } else {
   3016          continue;
   3017       }
   3018 
   3019       if (grf >= first_grf &&
   3020           grf < first_grf + grf_len) {
   3021          deps[grf - first_grf] = false;
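         /* Compressed (SIMD16) instructions read a pair of registers per
          * source, so clear the dependency on the second register too.
          */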
   3022          if (inst->exec_size == 16)
   3023             deps[grf - first_grf + 1] = false;
   3024       }
   3025    }
   3026 }
   3027 
   3028 /**
   3029  * Implements this workaround for the original 965:
   3030  *
   3031  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
   3032  *      check for post destination dependencies on this instruction, software
   3033  *      must ensure that there is no destination hazard for the case of write
   3034  *      followed by a posted write shown in the following example.
   3035  *
   3036  *      1. mov r3 0
   3037  *      2. send r3.xy <rest of send instruction>
   3038  *      3. mov r2 r3
   3039  *
   3040  *      Due to no post-destination dependency check on the send, the above
   3041  *      code sequence could have two instructions (1 and 2) in flight at the
    3042  *      same time that both consider r3 as the target of their final writes."
   3043  */
   3044 void
   3045 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
   3046                                                         fs_inst *inst)
   3047 {
   3048    int write_len = regs_written(inst);
   3049    int first_write_grf = inst->dst.nr;
   3050    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   3051    assert(write_len < (int)sizeof(needs_dep) - 1);
   3052 
   3053    memset(needs_dep, false, sizeof(needs_dep));
   3054    memset(needs_dep, true, write_len);
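   /* needs_dep[i] tracks whether GRF (first_write_grf + i) still needs a
    * dependency-resolving MOV inserted before inst.
    */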
   3055 
   3056    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
   3057 
   3058    /* Walk backwards looking for writes to registers we're writing which
   3059     * aren't read since being written.  If we hit the start of the program,
   3060     * we assume that there are no outstanding dependencies on entry to the
   3061     * program.
   3062     */
   3063    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
   3064       /* If we hit control flow, assume that there *are* outstanding
   3065        * dependencies, and force their cleanup before our instruction.
   3066        */
   3067       if (block->start() == scan_inst && block->num != 0) {
   3068          for (int i = 0; i < write_len; i++) {
   3069             if (needs_dep[i])
   3070                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
   3071                                first_write_grf + i);
   3072          }
   3073          return;
   3074       }
   3075 
   3076       /* We insert our reads as late as possible on the assumption that any
   3077        * instruction but a MOV that might have left us an outstanding
   3078        * dependency has more latency than a MOV.
   3079        */
   3080       if (scan_inst->dst.file == VGRF) {
   3081          for (unsigned i = 0; i < regs_written(scan_inst); i++) {
   3082             int reg = scan_inst->dst.nr + i;
   3083 
   3084             if (reg >= first_write_grf &&
   3085                 reg < first_write_grf + write_len &&
   3086                 needs_dep[reg - first_write_grf]) {
   3087                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
   3088                needs_dep[reg - first_write_grf] = false;
   3089                if (scan_inst->exec_size == 16)
   3090                   needs_dep[reg - first_write_grf + 1] = false;
   3091             }
   3092          }
   3093       }
   3094 
   3095       /* Clear the flag for registers that actually got read (as expected). */
   3096       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
   3097 
   3098       /* Continue the loop only if we haven't resolved all the dependencies */
   3099       int i;
   3100       for (i = 0; i < write_len; i++) {
   3101          if (needs_dep[i])
   3102             break;
   3103       }
   3104       if (i == write_len)
   3105          return;
   3106    }
   3107 }
   3108 
   3109 /**
   3110  * Implements this workaround for the original 965:
   3111  *
   3112  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
   3113  *      used as a destination register until after it has been sourced by an
    3114  *      instruction with a different destination register."
   3115  */
   3116 void
   3117 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
   3118 {
   3119    int write_len = regs_written(inst);
   3120    int first_write_grf = inst->dst.nr;
   3121    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   3122    assert(write_len < (int)sizeof(needs_dep) - 1);
   3123 
   3124    memset(needs_dep, false, sizeof(needs_dep));
   3125    memset(needs_dep, true, write_len);
   3126    /* Walk forwards looking for writes to registers we're writing which aren't
   3127     * read before being written.
   3128     */
   3129    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
   3130       /* If we hit control flow, force resolve all remaining dependencies. */
   3131       if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
   3132          for (int i = 0; i < write_len; i++) {
   3133             if (needs_dep[i])
   3134                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
   3135                                first_write_grf + i);
   3136          }
   3137          return;
   3138       }
   3139 
   3140       /* Clear the flag for registers that actually got read (as expected). */
   3141       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
   3142 
   3143       /* We insert our reads as late as possible since they're reading the
   3144        * result of a SEND, which has massive latency.
   3145        */
   3146       if (scan_inst->dst.file == VGRF &&
   3147           scan_inst->dst.nr >= first_write_grf &&
   3148           scan_inst->dst.nr < first_write_grf + write_len &&
   3149           needs_dep[scan_inst->dst.nr - first_write_grf]) {
   3150          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
   3151                          scan_inst->dst.nr);
   3152          needs_dep[scan_inst->dst.nr - first_write_grf] = false;
   3153       }
   3154 
   3155       /* Continue the loop only if we haven't resolved all the dependencies */
   3156       int i;
   3157       for (i = 0; i < write_len; i++) {
   3158          if (needs_dep[i])
   3159             break;
   3160       }
   3161       if (i == write_len)
   3162          return;
   3163    }
   3164 }
   3165 
   3166 void
   3167 fs_visitor::insert_gen4_send_dependency_workarounds()
   3168 {
   3169    if (devinfo->gen != 4 || devinfo->is_g4x)
   3170       return;
   3171 
   3172    bool progress = false;
   3173 
   3174    foreach_block_and_inst(block, fs_inst, inst, cfg) {
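      /* Only SEND-like instructions (anything with a message length) that
       * write a GRF are affected by the two errata described above.
       */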
   3175       if (inst->mlen != 0 && inst->dst.file == VGRF) {
   3176          insert_gen4_pre_send_dependency_workarounds(block, inst);
   3177          insert_gen4_post_send_dependency_workarounds(block, inst);
   3178          progress = true;
   3179       }
   3180    }
   3181 
   3182    if (progress)
   3183       invalidate_live_intervals();
   3184 }
   3185 
   3186 /**
   3187  * Turns the generic expression-style uniform pull constant load instruction
   3188  * into a hardware-specific series of instructions for loading a pull
   3189  * constant.
   3190  *
   3191  * The expression style allows the CSE pass before this to optimize out
   3192  * repeated loads from the same offset, and gives the pre-register-allocation
   3193  * scheduling full flexibility, while the conversion to native instructions
   3194  * allows the post-register-allocation scheduler the best information
   3195  * possible.
   3196  *
   3197  * Note that execution masking for setting up pull constant loads is special:
   3198  * the channels that need to be written are unrelated to the current execution
   3199  * mask, since a later instruction will use one of the result channels as a
   3200  * source operand for all 8 or 16 of its channels.
   3201  */
   3202 void
   3203 fs_visitor::lower_uniform_pull_constant_loads()
   3204 {
   3205    foreach_block_and_inst (block, fs_inst, inst, cfg) {
   3206       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
   3207          continue;
   3208 
   3209       if (devinfo->gen >= 7) {
   3210          const fs_builder ubld = fs_builder(this, block, inst).exec_all();
   3211          const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
   3212 
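         /* The payload is a copy of g0 with the constant offset, divided by
          * 16 (i.e. expressed in owords), placed in dword 2 of the header.
          */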
   3213          ubld.group(8, 0).MOV(payload,
   3214                               retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   3215          ubld.group(1, 0).MOV(component(payload, 2),
   3216                               brw_imm_ud(inst->src[1].ud / 16));
   3217 
   3218          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
   3219          inst->src[1] = payload;
   3220          inst->header_size = 1;
   3221          inst->mlen = 1;
   3222 
   3223          invalidate_live_intervals();
   3224       } else {
   3225          /* Before register allocation, we didn't tell the scheduler about the
   3226           * MRF we use.  We know it's safe to use this MRF because nothing
   3227           * else does except for register spill/unspill, which generates and
   3228           * uses its MRF within a single IR instruction.
   3229           */
   3230          inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
   3231          inst->mlen = 1;
   3232       }
   3233    }
   3234 }
   3235 
   3236 bool
   3237 fs_visitor::lower_load_payload()
   3238 {
   3239    bool progress = false;
   3240 
   3241    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   3242       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
   3243          continue;
   3244 
   3245       assert(inst->dst.file == MRF || inst->dst.file == VGRF);
   3246       assert(inst->saturate == false);
   3247       fs_reg dst = inst->dst;
   3248 
   3249       /* Get rid of COMPR4.  We'll add it back in if we need it */
   3250       if (dst.file == MRF)
   3251          dst.nr = dst.nr & ~BRW_MRF_COMPR4;
   3252 
   3253       const fs_builder ibld(this, block, inst);
   3254       const fs_builder hbld = ibld.exec_all().group(8, 0);
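      /* Header sources are copied one GRF at a time with force_writemask_all
       * SIMD8 MOVs of UD data, independent of the instruction's execution size.
       */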
   3255 
   3256       for (uint8_t i = 0; i < inst->header_size; i++) {
   3257          if (inst->src[i].file != BAD_FILE) {
   3258             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
   3259             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
   3260             hbld.MOV(mov_dst, mov_src);
   3261          }
   3262          dst = offset(dst, hbld, 1);
   3263       }
   3264 
   3265       if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
   3266           inst->exec_size > 8) {
   3267          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
   3268           * a straightforward copy.  Instead, the result of the
   3269           * LOAD_PAYLOAD is treated as interleaved and the first four
   3270           * non-header sources are unpacked as:
   3271           *
   3272           * m + 0: r0
   3273           * m + 1: g0
   3274           * m + 2: b0
   3275           * m + 3: a0
   3276           * m + 4: r1
   3277           * m + 5: g1
   3278           * m + 6: b1
   3279           * m + 7: a1
   3280           *
   3281           * This is used for gen <= 5 fb writes.
   3282           */
   3283          assert(inst->exec_size == 16);
   3284          assert(inst->header_size + 4 <= inst->sources);
   3285          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
   3286             if (inst->src[i].file != BAD_FILE) {
   3287                if (devinfo->has_compr4) {
   3288                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
   3289                   compr4_dst.nr |= BRW_MRF_COMPR4;
   3290                   ibld.MOV(compr4_dst, inst->src[i]);
   3291                } else {
   3292                   /* Platform doesn't have COMPR4.  We have to fake it */
   3293                   fs_reg mov_dst = retype(dst, inst->src[i].type);
   3294                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
   3295                   mov_dst.nr += 4;
   3296                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
   3297                }
   3298             }
   3299 
   3300             dst.nr++;
   3301          }
   3302 
   3303          /* The loop above only ever incremented us through the first set
   3304           * of 4 registers.  However, thanks to the magic of COMPR4, we
   3305           * actually wrote to the first 8 registers, so we need to take
   3306           * that into account now.
   3307           */
   3308          dst.nr += 4;
   3309 
   3310          /* The COMPR4 code took care of the first 4 sources.  We'll let
   3311           * the regular path handle any remaining sources.  Yes, we are
   3312           * modifying the instruction but we're about to delete it so
   3313           * this really doesn't hurt anything.
   3314           */
   3315          inst->header_size += 4;
   3316       }
   3317 
   3318       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
   3319          if (inst->src[i].file != BAD_FILE)
   3320             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
   3321          dst = offset(dst, ibld, 1);
   3322       }
   3323 
   3324       inst->remove(block);
   3325       progress = true;
   3326    }
   3327 
   3328    if (progress)
   3329       invalidate_live_intervals();
   3330 
   3331    return progress;
   3332 }
   3333 
   3334 bool
   3335 fs_visitor::lower_integer_multiplication()
   3336 {
   3337    bool progress = false;
   3338 
   3339    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   3340       const fs_builder ibld(this, block, inst);
   3341 
   3342       if (inst->opcode == BRW_OPCODE_MUL) {
   3343          if (inst->dst.is_accumulator() ||
   3344              (inst->dst.type != BRW_REGISTER_TYPE_D &&
   3345               inst->dst.type != BRW_REGISTER_TYPE_UD))
   3346             continue;
   3347 
   3348          /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
   3349           * operation directly, but CHV/BXT cannot.
   3350           */
   3351          if (devinfo->gen >= 8 &&
   3352              !devinfo->is_cherryview && !devinfo->is_broxton)
   3353             continue;
   3354 
   3355          if (inst->src[1].file == IMM &&
   3356              inst->src[1].ud < (1 << 16)) {
   3357             /* The MUL instruction isn't commutative. On Gen <= 6, only the low
    3358              * 16 bits of src0 are read, and on Gen >= 7 only the low 16 bits of
    3359              * src1 are used.
    3360              *
    3361              * If multiplying by an immediate value that fits in 16 bits, do a
   3362              * single MUL instruction with that value in the proper location.
   3363              */
   3364             if (devinfo->gen < 7) {
   3365                fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
   3366                           inst->dst.type);
   3367                ibld.MOV(imm, inst->src[1]);
   3368                ibld.MUL(inst->dst, imm, inst->src[0]);
   3369             } else {
   3370                const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
   3371                ibld.MUL(inst->dst, inst->src[0],
   3372                         ud ? brw_imm_uw(inst->src[1].ud)
   3373                            : brw_imm_w(inst->src[1].d));
   3374             }
   3375          } else {
   3376             /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
   3377              * do 32-bit integer multiplication in one instruction, but instead
   3378              * must do a sequence (which actually calculates a 64-bit result):
   3379              *
   3380              *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
   3381              *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
   3382              *    mov(8)  g2<1>D     acc0<8,8,1>D
   3383              *
    3384              * But on Gen > 6, the ability to use the second accumulator register
   3385              * (acc1) for non-float data types was removed, preventing a simple
   3386              * implementation in SIMD16. A 16-channel result can be calculated by
   3387              * executing the three instructions twice in SIMD8, once with quarter
   3388              * control of 1Q for the first eight channels and again with 2Q for
   3389              * the second eight channels.
   3390              *
   3391              * Which accumulator register is implicitly accessed (by AccWrEnable
   3392              * for instance) is determined by the quarter control. Unfortunately
   3393              * Ivybridge (and presumably Baytrail) has a hardware bug in which an
   3394              * implicit accumulator access by an instruction with 2Q will access
   3395              * acc1 regardless of whether the data type is usable in acc1.
   3396              *
   3397              * Specifically, the 2Q mach(8) writes acc1 which does not exist for
   3398              * integer data types.
   3399              *
   3400              * Since we only want the low 32-bits of the result, we can do two
   3401              * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
   3402              * adjust the high result and add them (like the mach is doing):
   3403              *
   3404              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
   3405              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
   3406              *    shl(8)  g9<1>D     g8<8,8,1>D      16D
   3407              *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
   3408              *
   3409              * We avoid the shl instruction by realizing that we only want to add
   3410              * the low 16-bits of the "high" result to the high 16-bits of the
   3411              * "low" result and using proper regioning on the add:
   3412              *
   3413              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
   3414              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
   3415              *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
   3416              *
   3417              * Since it does not use the (single) accumulator register, we can
   3418              * schedule multi-component multiplications much better.
   3419              */
   3420 
   3421             fs_reg orig_dst = inst->dst;
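            /* The low half of the result is re-read as a source of the final
             * ADD, so the destination can't be the null register or an MRF.
             * Compute into a temporary VGRF in those cases and copy the
             * result out at the end.
             */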
   3422             if (orig_dst.is_null() || orig_dst.file == MRF) {
   3423                inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
   3424                                   inst->dst.type);
   3425             }
   3426             fs_reg low = inst->dst;
   3427             fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
   3428                         inst->dst.type);
   3429 
   3430             if (devinfo->gen >= 7) {
   3431                if (inst->src[1].file == IMM) {
   3432                   ibld.MUL(low, inst->src[0],
   3433                            brw_imm_uw(inst->src[1].ud & 0xffff));
   3434                   ibld.MUL(high, inst->src[0],
   3435                            brw_imm_uw(inst->src[1].ud >> 16));
   3436                } else {
   3437                   ibld.MUL(low, inst->src[0],
   3438                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
   3439                   ibld.MUL(high, inst->src[0],
   3440                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
   3441                }
   3442             } else {
   3443                ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
   3444                         inst->src[1]);
   3445                ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
   3446                         inst->src[1]);
   3447             }
   3448 
   3449             ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1),
   3450                      subscript(low, BRW_REGISTER_TYPE_UW, 1),
   3451                      subscript(high, BRW_REGISTER_TYPE_UW, 0));
   3452 
   3453             if (inst->conditional_mod || orig_dst.file == MRF) {
   3454                set_condmod(inst->conditional_mod,
   3455                            ibld.MOV(orig_dst, inst->dst));
   3456             }
   3457          }
   3458 
   3459       } else if (inst->opcode == SHADER_OPCODE_MULH) {
   3460          /* Should have been lowered to 8-wide. */
   3461          assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
   3462          const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
   3463                                    inst->dst.type);
   3464          fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
   3465          fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
   3466 
   3467          if (devinfo->gen >= 8) {
    3468             /* Until Gen8, integer multiplies read 32 bits from one source
    3469              * and 16 bits from the other, relying on the MACH instruction
    3470              * to generate the high bits of the result.
   3471              *
   3472              * On Gen8, the multiply instruction does a full 32x32-bit
   3473              * multiply, but in order to do a 64-bit multiply we can simulate
   3474              * the previous behavior and then use a MACH instruction.
   3475              *
   3476              * FINISHME: Don't use source modifiers on src1.
   3477              */
   3478             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
   3479                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
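            /* Re-typing src1 to UW with doubled stride makes the MUL read only
             * the low 16 bits of each 32-bit channel, emulating the pre-Gen8
             * multiply that MACH pairs with.
             */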
   3480             mul->src[1].type = BRW_REGISTER_TYPE_UW;
   3481             mul->src[1].stride *= 2;
   3482 
   3483          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
   3484                     inst->group > 0) {
   3485             /* Among other things the quarter control bits influence which
   3486              * accumulator register is used by the hardware for instructions
   3487              * that access the accumulator implicitly (e.g. MACH).  A
   3488              * second-half instruction would normally map to acc1, which
   3489              * doesn't exist on Gen7 and up (the hardware does emulate it for
   3490              * floating-point instructions *only* by taking advantage of the
   3491              * extra precision of acc0 not normally used for floating point
   3492              * arithmetic).
   3493              *
   3494              * HSW and up are careful enough not to try to access an
   3495              * accumulator register that doesn't exist, but on earlier Gen7
   3496              * hardware we need to make sure that the quarter control bits are
   3497              * zero to avoid non-deterministic behaviour and emit an extra MOV
   3498              * to get the result masked correctly according to the current
   3499              * channel enables.
   3500              */
   3501             mach->group = 0;
   3502             mach->force_writemask_all = true;
   3503             mach->dst = ibld.vgrf(inst->dst.type);
   3504             ibld.MOV(inst->dst, mach->dst);
   3505          }
   3506       } else {
   3507          continue;
   3508       }
   3509 
   3510       inst->remove(block);
   3511       progress = true;
   3512    }
   3513 
   3514    if (progress)
   3515       invalidate_live_intervals();
   3516 
   3517    return progress;
   3518 }
   3519 
   3520 bool
   3521 fs_visitor::lower_minmax()
   3522 {
   3523    assert(devinfo->gen < 6);
   3524 
   3525    bool progress = false;
   3526 
   3527    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   3528       const fs_builder ibld(this, block, inst);
   3529 
   3530       if (inst->opcode == BRW_OPCODE_SEL &&
   3531           inst->predicate == BRW_PREDICATE_NONE) {
   3532          /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
   3533           *        the original SEL.L/GE instruction
   3534           */
   3535          ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
   3536                   inst->conditional_mod);
   3537          inst->predicate = BRW_PREDICATE_NORMAL;
   3538          inst->conditional_mod = BRW_CONDITIONAL_NONE;
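         /* The SEL keeps its operands but now selects on the flag register
          * written by the CMP above instead of using its own conditional mod.
          */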
   3539 
   3540          progress = true;
   3541       }
   3542    }
   3543 
   3544    if (progress)
   3545       invalidate_live_intervals();
   3546 
   3547    return progress;
   3548 }
   3549 
   3550 static void
   3551 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
   3552                     fs_reg *dst, fs_reg color, unsigned components)
   3553 {
   3554    if (key->clamp_fragment_color) {
   3555       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
   3556       assert(color.type == BRW_REGISTER_TYPE_F);
   3557 
   3558       for (unsigned i = 0; i < components; i++)
   3559          set_saturate(true,
   3560                       bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
   3561 
   3562       color = tmp;
   3563    }
   3564 
   3565    for (unsigned i = 0; i < components; i++)
   3566       dst[i] = offset(color, bld, i);
   3567 }
   3568 
   3569 static void
   3570 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
   3571                             const struct brw_wm_prog_data *prog_data,
   3572                             const brw_wm_prog_key *key,
   3573                             const fs_visitor::thread_payload &payload)
   3574 {
   3575    assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   3576    const gen_device_info *devinfo = bld.shader->devinfo;
   3577    const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   3578    const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   3579    const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   3580    const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   3581    const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   3582    const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   3583    fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   3584    const unsigned components =
   3585       inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
   3586 
   3587    /* We can potentially have a message length of up to 15, so we have to set
   3588     * base_mrf to either 0 or 1 in order to fit in m0..m15.
   3589     */
   3590    fs_reg sources[15];
   3591    int header_size = 2, payload_header_size;
   3592    unsigned length = 0;
   3593 
   3594    /* From the Sandy Bridge PRM, volume 4, page 198:
   3595     *
   3596     *     "Dispatched Pixel Enables. One bit per pixel indicating
   3597     *      which pixels were originally enabled when the thread was
   3598     *      dispatched. This field is only required for the end-of-
   3599     *      thread message and on all dual-source messages."
   3600     */
   3601    if (devinfo->gen >= 6 &&
   3602        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
   3603        color1.file == BAD_FILE &&
   3604        key->nr_color_regions == 1) {
   3605       header_size = 0;
   3606    }
   3607 
   3608    if (header_size != 0) {
   3609       assert(header_size == 2);
   3610       /* Allocate 2 registers for a header */
   3611       length += 2;
   3612    }
   3613 
   3614    if (payload.aa_dest_stencil_reg) {
   3615       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
   3616       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
   3617          .MOV(sources[length],
   3618               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
   3619       length++;
   3620    }
   3621 
   3622    if (sample_mask.file != BAD_FILE) {
   3623       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
   3624                                BRW_REGISTER_TYPE_UD);
   3625 
    3626       /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
    3627        * relevant.  Since it's stored as unsigned single words, one VGRF is
    3628        * always 16-wide, but only the lower or higher 8 channels will be used
    3629        * by the hardware when doing a SIMD8 write, depending on whether we
    3630        * have selected the subspans for the first or second half respectively.
    3631        */
   3632       assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
   3633       sample_mask.type = BRW_REGISTER_TYPE_UW;
   3634       sample_mask.stride *= 2;
   3635 
   3636       bld.exec_all().annotate("FB write oMask")
   3637          .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
   3638                            inst->group),
   3639               sample_mask);
   3640       length++;
   3641    }
   3642 
   3643    payload_header_size = length;
   3644 
   3645    if (src0_alpha.file != BAD_FILE) {
   3646       /* FIXME: This is being passed at the wrong location in the payload and
   3647        * doesn't work when gl_SampleMask and MRTs are used simultaneously.
   3648        * It's supposed to be immediately before oMask but there seems to be no
   3649        * reasonable way to pass them in the correct order because LOAD_PAYLOAD
   3650        * requires header sources to form a contiguous segment at the beginning
   3651        * of the message and src0_alpha has per-channel semantics.
   3652        */
   3653       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
   3654       length++;
   3655    } else if (key->replicate_alpha && inst->target != 0) {
    3656       /* Handle the case where the fragment shader doesn't write to draw
    3657        * buffer zero.  There is no need to call setup_color_payload() for
    3658        * src0_alpha because the alpha value will be undefined.
    3659        */
   3660       length++;
   3661    }
   3662 
   3663    setup_color_payload(bld, key, &sources[length], color0, components);
   3664    length += 4;
   3665 
   3666    if (color1.file != BAD_FILE) {
   3667       setup_color_payload(bld, key, &sources[length], color1, components);
   3668       length += 4;
   3669    }
   3670 
   3671    if (src_depth.file != BAD_FILE) {
   3672       sources[length] = src_depth;
   3673       length++;
   3674    }
   3675 
   3676    if (dst_depth.file != BAD_FILE) {
   3677       sources[length] = dst_depth;
   3678       length++;
   3679    }
   3680 
   3681    if (src_stencil.file != BAD_FILE) {
   3682       assert(devinfo->gen >= 9);
   3683       assert(bld.dispatch_width() != 16);
   3684 
   3685       /* XXX: src_stencil is only available on gen9+. dst_depth is never
   3686        * available on gen9+. As such it's impossible to have both enabled at the
   3687        * same time and therefore length cannot overrun the array.
   3688        */
   3689       assert(length < 15);
   3690 
   3691       sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
   3692       bld.exec_all().annotate("FB write OS")
   3693          .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
   3694               subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
   3695       length++;
   3696    }
   3697 
   3698    fs_inst *load;
   3699    if (devinfo->gen >= 7) {
   3700       /* Send from the GRF */
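      /* Build the LOAD_PAYLOAD with a placeholder VGRF first; the real
       * register is only allocated once we know how many GRFs the payload
       * actually occupies (regs_written(load)).
       */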
   3701       fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
   3702       load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
   3703       payload.nr = bld.shader->alloc.allocate(regs_written(load));
   3704       load->dst = payload;
   3705 
   3706       inst->src[0] = payload;
   3707       inst->resize_sources(1);
   3708    } else {
   3709       /* Send from the MRF */
   3710       load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
   3711                               sources, length, payload_header_size);
   3712 
   3713       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
   3714        * will do this for us if we just give it a COMPR4 destination.
   3715        */
   3716       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
   3717          load->dst.nr |= BRW_MRF_COMPR4;
   3718 
   3719       inst->resize_sources(0);
   3720       inst->base_mrf = 1;
   3721    }
   3722 
   3723    inst->opcode = FS_OPCODE_FB_WRITE;
   3724    inst->mlen = regs_written(load);
   3725    inst->header_size = header_size;
   3726 }
   3727 
   3728 static void
   3729 lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
   3730 {
   3731    const fs_builder &ubld = bld.exec_all();
   3732    const unsigned length = 2;
   3733    const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length);
   3734 
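   /* A single SIMD16 MOV of UD data copies both g0 and g1 into the
    * two-register header used by the FB read message.
    */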
   3735    ubld.group(16, 0)
   3736        .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   3737 
   3738    inst->resize_sources(1);
   3739    inst->src[0] = header;
   3740    inst->opcode = FS_OPCODE_FB_READ;
   3741    inst->mlen = length;
   3742    inst->header_size = length;
   3743 }
   3744 
   3745 static void
   3746 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
   3747                                 const fs_reg &coordinate,
   3748                                 const fs_reg &shadow_c,
   3749                                 const fs_reg &lod, const fs_reg &lod2,
   3750                                 const fs_reg &surface,
   3751                                 const fs_reg &sampler,
   3752                                 unsigned coord_components,
   3753                                 unsigned grad_components)
   3754 {
   3755    const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
   3756                          op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
   3757    fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
   3758    fs_reg msg_end = msg_begin;
   3759 
   3760    /* g0 header. */
   3761    msg_end = offset(msg_end, bld.group(8, 0), 1);
   3762 
   3763    for (unsigned i = 0; i < coord_components; i++)
   3764       bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
   3765               offset(coordinate, bld, i));
   3766 
   3767    msg_end = offset(msg_end, bld, coord_components);
   3768 
   3769    /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
   3770     * require all three components to be present and zero if they are unused.
   3771     */
   3772    if (coord_components > 0 &&
   3773        (has_lod || shadow_c.file != BAD_FILE ||
   3774         (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
   3775       for (unsigned i = coord_components; i < 3; i++)
   3776          bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
   3777 
   3778       msg_end = offset(msg_end, bld, 3 - coord_components);
   3779    }
   3780 
   3781    if (op == SHADER_OPCODE_TXD) {
   3782       /* TXD unsupported in SIMD16 mode. */
   3783       assert(bld.dispatch_width() == 8);
   3784 
   3785       /* the slots for u and v are always present, but r is optional */
   3786       if (coord_components < 2)
   3787          msg_end = offset(msg_end, bld, 2 - coord_components);
   3788 
   3789       /*  P   = u, v, r
   3790        * dPdx = dudx, dvdx, drdx
   3791        * dPdy = dudy, dvdy, drdy
   3792        *
   3793        * 1-arg: Does not exist.
   3794        *
   3795        * 2-arg: dudx   dvdx   dudy   dvdy
   3796        *        dPdx.x dPdx.y dPdy.x dPdy.y
   3797        *        m4     m5     m6     m7
   3798        *
   3799        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
   3800        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
   3801        *        m5     m6     m7     m8     m9     m10
   3802        */
   3803       for (unsigned i = 0; i < grad_components; i++)
   3804          bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
   3805 
   3806       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   3807 
   3808       for (unsigned i = 0; i < grad_components; i++)
   3809          bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
   3810 
   3811       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   3812    }
   3813 
   3814    if (has_lod) {
   3815       /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
   3816        * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
   3817        */
   3818       assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
   3819              bld.dispatch_width() == 16);
   3820 
   3821       const brw_reg_type type =
   3822          (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
   3823           BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
   3824       bld.MOV(retype(msg_end, type), lod);
   3825       msg_end = offset(msg_end, bld, 1);
   3826    }
   3827 
   3828    if (shadow_c.file != BAD_FILE) {
   3829       if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
   3830          /* There's no plain shadow compare message, so we use shadow
   3831           * compare with a bias of 0.0.
   3832           */
   3833          bld.MOV(msg_end, brw_imm_f(0.0f));
   3834          msg_end = offset(msg_end, bld, 1);
   3835       }
   3836 
   3837       bld.MOV(msg_end, shadow_c);
   3838       msg_end = offset(msg_end, bld, 1);
   3839    }
   3840 
   3841    inst->opcode = op;
   3842    inst->src[0] = reg_undef;
   3843    inst->src[1] = surface;
   3844    inst->src[2] = sampler;
   3845    inst->resize_sources(3);
   3846    inst->base_mrf = msg_begin.nr;
   3847    inst->mlen = msg_end.nr - msg_begin.nr;
   3848    inst->header_size = 1;
   3849 }
   3850 
   3851 static void
   3852 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
   3853                                 const fs_reg &coordinate,
   3854                                 const fs_reg &shadow_c,
   3855                                 const fs_reg &lod, const fs_reg &lod2,
   3856                                 const fs_reg &sample_index,
   3857                                 const fs_reg &surface,
   3858                                 const fs_reg &sampler,
   3859                                 unsigned coord_components,
   3860                                 unsigned grad_components)
   3861 {
   3862    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   3863    fs_reg msg_coords = message;
   3864    unsigned header_size = 0;
   3865 
   3866    if (inst->offset != 0) {
   3867       /* The offsets set up by the visitor are in the m1 header, so we can't
   3868        * go headerless.
   3869        */
   3870       header_size = 1;
   3871       message.nr--;
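      /* Backing the message start up by one register makes m1, where the
       * visitor stored the texel offsets, part of this message as its header.
       */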
   3872    }
   3873 
   3874    for (unsigned i = 0; i < coord_components; i++)
   3875       bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
   3876               offset(coordinate, bld, i));
   3877 
   3878    fs_reg msg_end = offset(msg_coords, bld, coord_components);
   3879    fs_reg msg_lod = offset(msg_coords, bld, 4);
   3880 
   3881    if (shadow_c.file != BAD_FILE) {
   3882       fs_reg msg_shadow = msg_lod;
   3883       bld.MOV(msg_shadow, shadow_c);
   3884       msg_lod = offset(msg_shadow, bld, 1);
   3885       msg_end = msg_lod;
   3886    }
   3887 
   3888    switch (op) {
   3889    case SHADER_OPCODE_TXL:
   3890    case FS_OPCODE_TXB:
   3891       bld.MOV(msg_lod, lod);
   3892       msg_end = offset(msg_lod, bld, 1);
   3893       break;
   3894    case SHADER_OPCODE_TXD:
   3895       /**
   3896        *  P   =  u,    v,    r
   3897        * dPdx = dudx, dvdx, drdx
   3898        * dPdy = dudy, dvdy, drdy
   3899        *
   3900        * Load up these values:
   3901        * - dudx   dudy   dvdx   dvdy   drdx   drdy
   3902        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
   3903        */
   3904       msg_end = msg_lod;
   3905       for (unsigned i = 0; i < grad_components; i++) {
   3906          bld.MOV(msg_end, offset(lod, bld, i));
   3907          msg_end = offset(msg_end, bld, 1);
   3908 
   3909          bld.MOV(msg_end, offset(lod2, bld, i));
   3910          msg_end = offset(msg_end, bld, 1);
   3911       }
   3912       break;
   3913    case SHADER_OPCODE_TXS:
   3914       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
   3915       bld.MOV(msg_lod, lod);
   3916       msg_end = offset(msg_lod, bld, 1);
   3917       break;
   3918    case SHADER_OPCODE_TXF:
   3919       msg_lod = offset(msg_coords, bld, 3);
   3920       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
   3921       msg_end = offset(msg_lod, bld, 1);
   3922       break;
   3923    case SHADER_OPCODE_TXF_CMS:
   3924       msg_lod = offset(msg_coords, bld, 3);
   3925       /* lod */
   3926       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
   3927       /* sample index */
   3928       bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
   3929       msg_end = offset(msg_lod, bld, 2);
   3930       break;
   3931    default:
   3932       break;
   3933    }
   3934 
   3935    inst->opcode = op;
   3936    inst->src[0] = reg_undef;
   3937    inst->src[1] = surface;
   3938    inst->src[2] = sampler;
   3939    inst->resize_sources(3);
   3940    inst->base_mrf = message.nr;
   3941    inst->mlen = msg_end.nr - message.nr;
   3942    inst->header_size = header_size;
   3943 
   3944    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   3945    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
   3946 }
   3947 
   3948 static bool
   3949 is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
   3950 {
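   /* Only Haswell and Gen8+ can offset the Sampler State Pointer in the
    * message header to address samplers beyond the 4-bit descriptor field.
    */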
   3951    if (devinfo->gen < 8 && !devinfo->is_haswell)
   3952       return false;
   3953 
   3954    return sampler.file != IMM || sampler.ud >= 16;
   3955 }
   3956 
   3957 static void
   3958 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
   3959                                 const fs_reg &coordinate,
   3960                                 const fs_reg &shadow_c,
   3961                                 fs_reg lod, const fs_reg &lod2,
   3962                                 const fs_reg &sample_index,
   3963                                 const fs_reg &mcs,
   3964                                 const fs_reg &surface,
   3965                                 const fs_reg &sampler,
   3966                                 const fs_reg &tg4_offset,
   3967                                 unsigned coord_components,
   3968                                 unsigned grad_components)
   3969 {
   3970    const gen_device_info *devinfo = bld.shader->devinfo;
   3971    unsigned reg_width = bld.dispatch_width() / 8;
   3972    unsigned header_size = 0, length = 0;
   3973    fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   3974    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
   3975       sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
   3976 
   3977    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
   3978        inst->offset != 0 || inst->eot ||
   3979        op == SHADER_OPCODE_SAMPLEINFO ||
   3980        is_high_sampler(devinfo, sampler)) {
   3981       /* For general texture offsets (no txf workaround), we need a header to
   3982        * put them in.  Note that we're only reserving space for it in the
   3983        * message payload as it will be initialized implicitly by the
   3984        * generator.
   3985        *
   3986        * TG4 needs to place its channel select in the header, for interaction
   3987        * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
   3988        * larger sampler numbers we need to offset the Sampler State Pointer in
   3989        * the header.
   3990        */
   3991       header_size = 1;
   3992       sources[0] = fs_reg();
   3993       length++;
   3994 
   3995       /* If we're requesting fewer than four channels worth of response,
   3996        * and we have an explicit header, we need to set up the sampler
   3997        * writemask.  It's reversed from normal: 1 means "don't write".
   3998        */
   3999       if (!inst->eot && regs_written(inst) != 4 * reg_width) {
   4000          assert(regs_written(inst) % reg_width == 0);
   4001          unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
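         /* Pack the writemask into bits 15:12 of inst->offset, alongside any
          * texel offsets already stored there; the generator copies this value
          * into the message header.
          */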
   4002          inst->offset |= mask << 12;
   4003       }
   4004    }
   4005 
   4006    if (shadow_c.file != BAD_FILE) {
   4007       bld.MOV(sources[length], shadow_c);
   4008       length++;
   4009    }
   4010 
   4011    bool coordinate_done = false;
   4012 
   4013    /* Set up the LOD info */
   4014    switch (op) {
   4015    case FS_OPCODE_TXB:
   4016    case SHADER_OPCODE_TXL:
   4017       if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
   4018          op = SHADER_OPCODE_TXL_LZ;
   4019          break;
   4020       }
   4021       bld.MOV(sources[length], lod);
   4022       length++;
   4023       break;
   4024    case SHADER_OPCODE_TXD:
   4025       /* TXD should have been lowered in SIMD16 mode. */
   4026       assert(bld.dispatch_width() == 8);
   4027 
   4028       /* Load dPdx and the coordinate together:
   4029        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
   4030        */
   4031       for (unsigned i = 0; i < coord_components; i++) {
   4032          bld.MOV(sources[length++], offset(coordinate, bld, i));
   4033 
   4034          /* For cube map array, the coordinate is (u,v,r,ai) but there are
   4035           * only derivatives for (u, v, r).
   4036           */
   4037          if (i < grad_components) {
   4038             bld.MOV(sources[length++], offset(lod, bld, i));
   4039             bld.MOV(sources[length++], offset(lod2, bld, i));
   4040          }
   4041       }
   4042 
   4043       coordinate_done = true;
   4044       break;
   4045    case SHADER_OPCODE_TXS:
   4046       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
   4047       length++;
   4048       break;
   4049    case SHADER_OPCODE_TXF:
   4050       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
    4051        * On Gen9 they are u, v, lod, r.
   4052        */
   4053       bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
   4054 
   4055       if (devinfo->gen >= 9) {
   4056          if (coord_components >= 2) {
   4057             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
   4058                     offset(coordinate, bld, 1));
   4059          } else {
   4060             sources[length] = brw_imm_d(0);
   4061          }
   4062          length++;
   4063       }
   4064 
   4065       if (devinfo->gen >= 9 && lod.is_zero()) {
   4066          op = SHADER_OPCODE_TXF_LZ;
   4067       } else {
   4068          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
   4069          length++;
   4070       }
   4071 
   4072       for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
   4073          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
   4074                  offset(coordinate, bld, i));
   4075 
   4076       coordinate_done = true;
   4077       break;
   4078 
   4079    case SHADER_OPCODE_TXF_CMS:
   4080    case SHADER_OPCODE_TXF_CMS_W:
   4081    case SHADER_OPCODE_TXF_UMS:
   4082    case SHADER_OPCODE_TXF_MCS:
   4083       if (op == SHADER_OPCODE_TXF_UMS ||
   4084           op == SHADER_OPCODE_TXF_CMS ||
   4085           op == SHADER_OPCODE_TXF_CMS_W) {
   4086          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
   4087          length++;
   4088       }
   4089 
   4090       if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
   4091          /* Data from the multisample control surface. */
   4092          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
   4093          length++;
   4094 
   4095          /* On Gen9+ we'll use ld2dms_w instead which has two registers for
   4096           * the MCS data.
   4097           */
   4098          if (op == SHADER_OPCODE_TXF_CMS_W) {
   4099             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
   4100                     mcs.file == IMM ?
   4101                     mcs :
   4102                     offset(mcs, bld, 1));
   4103             length++;
   4104          }
   4105       }
   4106 
   4107       /* There is no offsetting for this message; just copy in the integer
   4108        * texture coordinates.
   4109        */
   4110       for (unsigned i = 0; i < coord_components; i++)
   4111          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
   4112                  offset(coordinate, bld, i));
   4113 
   4114       coordinate_done = true;
   4115       break;
   4116    case SHADER_OPCODE_TG4_OFFSET:
   4117       /* More crazy intermixing */
   4118       for (unsigned i = 0; i < 2; i++) /* u, v */
   4119          bld.MOV(sources[length++], offset(coordinate, bld, i));
   4120 
   4121       for (unsigned i = 0; i < 2; i++) /* offu, offv */
   4122          bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
   4123                  offset(tg4_offset, bld, i));
   4124 
   4125       if (coord_components == 3) /* r if present */
   4126          bld.MOV(sources[length++], offset(coordinate, bld, 2));
   4127 
   4128       coordinate_done = true;
   4129       break;
   4130    default:
   4131       break;
   4132    }
   4133 
   4134    /* Set up the coordinate (except for cases where it was done above) */
   4135    if (!coordinate_done) {
   4136       for (unsigned i = 0; i < coord_components; i++)
   4137          bld.MOV(sources[length++], offset(coordinate, bld, i));
   4138    }
   4139 
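           /* The (optional) header occupies a single GRF regardless of the
            * execution width, while every other payload component written above
            * takes reg_width GRFs, hence the header_size adjustment in the
            * SIMD16 case below.
            */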
   4140    int mlen;
   4141    if (reg_width == 2)
   4142       mlen = length * reg_width - header_size;
   4143    else
   4144       mlen = length * reg_width;
   4145 
   4146    const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
   4147                                      BRW_REGISTER_TYPE_F);
   4148    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
   4149 
   4150    /* Generate the SEND. */
   4151    inst->opcode = op;
   4152    inst->src[0] = src_payload;
   4153    inst->src[1] = surface;
   4154    inst->src[2] = sampler;
   4155    inst->resize_sources(3);
   4156    inst->mlen = mlen;
   4157    inst->header_size = header_size;
   4158 
   4159    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   4160    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
   4161 }
   4162 
   4163 static void
   4164 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
   4165 {
   4166    const gen_device_info *devinfo = bld.shader->devinfo;
   4167    const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   4168    const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   4169    const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
   4170    const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   4171    const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   4172    const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   4173    const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   4174    const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   4175    const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   4176    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   4177    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   4178    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   4179    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   4180 
   4181    if (devinfo->gen >= 7) {
   4182       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
   4183                                       shadow_c, lod, lod2, sample_index,
   4184                                       mcs, surface, sampler, tg4_offset,
   4185                                       coord_components, grad_components);
   4186    } else if (devinfo->gen >= 5) {
   4187       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
   4188                                       shadow_c, lod, lod2, sample_index,
   4189                                       surface, sampler,
   4190                                       coord_components, grad_components);
   4191    } else {
   4192       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
   4193                                       shadow_c, lod, lod2,
   4194                                       surface, sampler,
   4195                                       coord_components, grad_components);
   4196    }
   4197 }
   4198 
   4199 /**
   4200  * Initialize the header present in some typed and untyped surface
   4201  * messages.
   4202  */
   4203 static fs_reg
   4204 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
   4205 {
   4206    fs_builder ubld = bld.exec_all().group(8, 0);
   4207    const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   4208    ubld.MOV(dst, brw_imm_d(0));
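           /* The sample mask goes in the last DWord of the single-GRF header,
            * the rest of which was just zero-initialized above.
            */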
   4209    ubld.MOV(component(dst, 7), sample_mask);
   4210    return dst;
   4211 }
   4212 
   4213 static void
   4214 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
   4215                            const fs_reg &sample_mask)
   4216 {
   4217    /* Get the logical send arguments. */
   4218    const fs_reg &addr = inst->src[0];
   4219    const fs_reg &src = inst->src[1];
   4220    const fs_reg &surface = inst->src[2];
   4221    const UNUSED fs_reg &dims = inst->src[3];
   4222    const fs_reg &arg = inst->src[4];
   4223 
   4224    /* Calculate the total number of components of the payload. */
   4225    const unsigned addr_sz = inst->components_read(0);
   4226    const unsigned src_sz = inst->components_read(1);
   4227    const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
   4228    const unsigned sz = header_sz + addr_sz + src_sz;
   4229 
   4230    /* Allocate space for the payload. */
   4231    fs_reg *const components = new fs_reg[sz];
   4232    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
   4233    unsigned n = 0;
   4234 
   4235    /* Construct the payload. */
   4236    if (header_sz)
   4237       components[n++] = emit_surface_header(bld, sample_mask);
   4238 
   4239    for (unsigned i = 0; i < addr_sz; i++)
   4240       components[n++] = offset(addr, bld, i);
   4241 
   4242    for (unsigned i = 0; i < src_sz; i++)
   4243       components[n++] = offset(src, bld, i);
   4244 
   4245    bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
   4246 
   4247    /* Update the original instruction. */
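           /* The header (if any) occupies a single GRF, while each address and
            * source data component occupies exec_size / 8 GRFs, which gives the
            * message length computed below.
            */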
   4248    inst->opcode = op;
   4249    inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
   4250    inst->header_size = header_sz;
   4251 
   4252    inst->src[0] = payload;
   4253    inst->src[1] = surface;
   4254    inst->src[2] = arg;
   4255    inst->resize_sources(3);
   4256 
   4257    delete[] components;
   4258 }
   4259 
   4260 static void
   4261 lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
   4262 {
   4263    const gen_device_info *devinfo = bld.shader->devinfo;
   4264 
   4265    if (devinfo->gen >= 7) {
   4266       /* We are switching from an ALU-like instruction to a
   4267        * send-from-grf instruction.  Since sends can't handle strides or
   4268        * source modifiers, we have to make a copy of the offset source.
   4269        */
   4270       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
   4271       bld.MOV(tmp, inst->src[1]);
   4272       inst->src[1] = tmp;
   4273 
   4274       inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   4275 
   4276    } else {
   4277       const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
   4278                            BRW_REGISTER_TYPE_UD);
   4279 
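              /* The offset goes into the MRF right after the single-register
               * header.
               */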
   4280       bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
   4281 
   4282       inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
   4283       inst->resize_sources(1);
   4284       inst->base_mrf = payload.nr;
   4285       inst->header_size = 1;
   4286       inst->mlen = 1 + inst->exec_size / 8;
   4287    }
   4288 }
   4289 
   4290 static void
   4291 lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
   4292 {
   4293    assert(bld.shader->devinfo->gen < 6);
   4294 
   4295    inst->base_mrf = 2;
   4296    inst->mlen = inst->sources * inst->exec_size / 8;
   4297 
   4298    if (inst->sources > 1) {
   4299       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
   4300        * "Message Payload":
   4301        *
   4302        * "Operand0[7].  For the INT DIV functions, this operand is the
   4303        *  denominator."
   4304        *  ...
   4305        * "Operand1[7].  For the INT DIV functions, this operand is the
   4306        *  numerator."
   4307        */
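              /* Hence for the INT DIV opcodes the two logical sources are
               * swapped below so that the denominator ends up in Operand0.
               */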
   4308       const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   4309       const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
   4310       const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
   4311 
   4312       inst->resize_sources(1);
   4313       inst->src[0] = src0;
   4314 
   4315       assert(inst->exec_size == 8);
   4316       bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   4317    }
   4318 }
   4319 
   4320 bool
   4321 fs_visitor::lower_logical_sends()
   4322 {
   4323    bool progress = false;
   4324 
   4325    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   4326       const fs_builder ibld(this, block, inst);
   4327 
   4328       switch (inst->opcode) {
   4329       case FS_OPCODE_FB_WRITE_LOGICAL:
   4330          assert(stage == MESA_SHADER_FRAGMENT);
   4331          lower_fb_write_logical_send(ibld, inst,
   4332                                      brw_wm_prog_data(prog_data),
   4333                                      (const brw_wm_prog_key *)key,
   4334                                      payload);
   4335          break;
   4336 
   4337       case FS_OPCODE_FB_READ_LOGICAL:
   4338          lower_fb_read_logical_send(ibld, inst);
   4339          break;
   4340 
   4341       case SHADER_OPCODE_TEX_LOGICAL:
   4342          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
   4343          break;
   4344 
   4345       case SHADER_OPCODE_TXD_LOGICAL:
   4346          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
   4347          break;
   4348 
   4349       case SHADER_OPCODE_TXF_LOGICAL:
   4350          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
   4351          break;
   4352 
   4353       case SHADER_OPCODE_TXL_LOGICAL:
   4354          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
   4355          break;
   4356 
   4357       case SHADER_OPCODE_TXS_LOGICAL:
   4358          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
   4359          break;
   4360 
   4361       case FS_OPCODE_TXB_LOGICAL:
   4362          lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
   4363          break;
   4364 
   4365       case SHADER_OPCODE_TXF_CMS_LOGICAL:
   4366          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
   4367          break;
   4368 
   4369       case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   4370          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
   4371          break;
   4372 
   4373       case SHADER_OPCODE_TXF_UMS_LOGICAL:
   4374          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
   4375          break;
   4376 
   4377       case SHADER_OPCODE_TXF_MCS_LOGICAL:
   4378          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
   4379          break;
   4380 
   4381       case SHADER_OPCODE_LOD_LOGICAL:
   4382          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
   4383          break;
   4384 
   4385       case SHADER_OPCODE_TG4_LOGICAL:
   4386          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
   4387          break;
   4388 
   4389       case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   4390          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
   4391          break;
   4392 
   4393       case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   4394          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
   4395          break;
   4396 
   4397       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   4398          lower_surface_logical_send(ibld, inst,
   4399                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
   4400                                     fs_reg());
   4401          break;
   4402 
   4403       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   4404          lower_surface_logical_send(ibld, inst,
   4405                                     SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
   4406                                     ibld.sample_mask_reg());
   4407          break;
   4408 
   4409       case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   4410          lower_surface_logical_send(ibld, inst,
   4411                                     SHADER_OPCODE_UNTYPED_ATOMIC,
   4412                                     ibld.sample_mask_reg());
   4413          break;
   4414 
   4415       case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   4416          lower_surface_logical_send(ibld, inst,
   4417                                     SHADER_OPCODE_TYPED_SURFACE_READ,
   4418                                     brw_imm_d(0xffff));
   4419          break;
   4420 
   4421       case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   4422          lower_surface_logical_send(ibld, inst,
   4423                                     SHADER_OPCODE_TYPED_SURFACE_WRITE,
   4424                                     ibld.sample_mask_reg());
   4425          break;
   4426 
   4427       case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   4428          lower_surface_logical_send(ibld, inst,
   4429                                     SHADER_OPCODE_TYPED_ATOMIC,
   4430                                     ibld.sample_mask_reg());
   4431          break;
   4432 
   4433       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   4434          lower_varying_pull_constant_logical_send(ibld, inst);
   4435          break;
   4436 
   4437       case SHADER_OPCODE_RCP:
   4438       case SHADER_OPCODE_RSQ:
   4439       case SHADER_OPCODE_SQRT:
   4440       case SHADER_OPCODE_EXP2:
   4441       case SHADER_OPCODE_LOG2:
   4442       case SHADER_OPCODE_SIN:
   4443       case SHADER_OPCODE_COS:
   4444       case SHADER_OPCODE_POW:
   4445       case SHADER_OPCODE_INT_QUOTIENT:
   4446       case SHADER_OPCODE_INT_REMAINDER:
   4447          /* The math opcodes are overloaded for the send-like and
   4448           * expression-like instructions, which seems kind of icky.  Gen6+ has
   4449           * a native (but rather quirky) MATH instruction so we don't need to
   4450           * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
   4451           * logical instructions (which we can easily recognize because they
   4452           * have mlen = 0) into send-like virtual instructions.
   4453           */
   4454          if (devinfo->gen < 6 && inst->mlen == 0) {
   4455             lower_math_logical_send(ibld, inst);
   4456             break;
   4457 
   4458          } else {
   4459             continue;
   4460          }
   4461 
   4462       default:
   4463          continue;
   4464       }
   4465 
   4466       progress = true;
   4467    }
   4468 
   4469    if (progress)
   4470       invalidate_live_intervals();
   4471 
   4472    return progress;
   4473 }
   4474 
   4475 /**
   4476  * Get the closest allowed SIMD width for instruction \p inst accounting for
   4477  * some common regioning and execution control restrictions that apply to FPU
   4478  * instructions.  These restrictions don't necessarily have any relevance to
   4479  * instructions not executed by the FPU pipeline like extended math, control
   4480  * flow or send message instructions.
   4481  *
   4482  * For virtual opcodes it's really up to the instruction -- in some cases
   4483  * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
   4484  * instructions) it may simplify virtual instruction lowering if we can
   4485  * enforce FPU-like regioning restrictions already on the virtual instruction,
   4486  * in other cases (e.g. virtual send-like instructions) this may be
   4487  * excessively restrictive.
   4488  */
   4489 static unsigned
   4490 get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
   4491                            const fs_inst *inst)
   4492 {
   4493    /* Maximum execution size representable in the instruction controls. */
   4494    unsigned max_width = MIN2(32, inst->exec_size);
   4495 
   4496    /* According to the PRMs:
   4497     *  "A. In Direct Addressing mode, a source cannot span more than 2
   4498     *      adjacent GRF registers.
   4499     *   B. A destination cannot span more than 2 adjacent GRF registers."
   4500     *
   4501     * Look for the source or destination with the largest register region
   4502     * which is the one that is going to limit the overall execution size of
   4503     * the instruction due to this rule.
   4504     */
   4505    unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
   4506 
   4507    for (unsigned i = 0; i < inst->sources; i++)
   4508       reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
   4509 
   4510    /* Calculate the maximum execution size of the instruction based on the
   4511     * factor by which it goes over the hardware limit of 2 GRFs.
   4512     */
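           /* For instance, a SIMD16 region with 64-bit channels spans four GRFs
            * and gets limited to SIMD8 here.
            */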
   4513    if (reg_count > 2)
   4514       max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
   4515 
   4516    /* According to the IVB PRMs:
   4517     *  "When destination spans two registers, the source MUST span two
   4518     *   registers. The exception to the above rule:
   4519     *
   4520     *    - When source is scalar, the source registers are not incremented.
   4521     *    - When source is packed integer Word and destination is packed
   4522     *      integer DWord, the source register is not incremented but the
   4523     *      source sub register is incremented."
   4524     *
   4525     * The hardware specs from Gen4 to Gen7.5 mention similar regioning
   4526     * restrictions.  The code below intentionally doesn't check whether the
   4527     * destination type is integer because empirically the hardware doesn't
   4528     * seem to care what the actual type is as long as it's dword-aligned.
   4529     */
   4530    if (devinfo->gen < 8) {
   4531       for (unsigned i = 0; i < inst->sources; i++) {
   4532          if (inst->size_written > REG_SIZE &&
   4533              inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE &&
   4534              !is_uniform(inst->src[i]) &&
   4535              !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
   4536                type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) {
   4537             const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
   4538             max_width = MIN2(max_width, inst->exec_size / reg_count);
   4539          }
   4540       }
   4541    }
   4542 
   4543    /* From the IVB PRMs:
   4544     *  "When an instruction is SIMD32, the low 16 bits of the execution mask
   4545     *   are applied for both halves of the SIMD32 instruction. If different
   4546     *   execution mask channels are required, split the instruction into two
   4547     *   SIMD16 instructions."
   4548     *
   4549     * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
   4550     * 32-wide control flow support in hardware and will behave similarly.
   4551     */
   4552    if (devinfo->gen < 8 && !inst->force_writemask_all)
   4553       max_width = MIN2(max_width, 16);
   4554 
   4555    /* From the IVB PRMs (applies to HSW too):
   4556     *  "Instructions with condition modifiers must not use SIMD32."
   4557     *
   4558     * From the BDW PRMs (applies to later hardware too):
   4559     *  "Ternary instruction with condition modifiers must not use SIMD32."
   4560     */
   4561    if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
   4562       max_width = MIN2(max_width, 16);
   4563 
   4564    /* From the IVB PRMs (applies to other devices that don't have the
   4565     * gen_device_info::supports_simd16_3src flag set):
   4566     *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
   4567     *   SIMD8 is not allowed for DF operations."
   4568     */
   4569    if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
   4570       max_width = MIN2(max_width, inst->exec_size / reg_count);
   4571 
   4572    /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
   4573     * the 8-bit quarter of the execution mask signals specified in the
   4574     * instruction control fields) for the second compressed half of any
   4575     * single-precision instruction (for double-precision instructions
   4576     * it's hardwired to use NibCtrl+1, at least on HSW), which means that
   4577     * the EU will apply the wrong execution controls for the second
   4578     * sequential GRF write if the number of channels per GRF is not exactly
   4579     * eight in single-precision mode (or four in double-float mode).
   4580     *
   4581     * In this situation we calculate the maximum size of the split
   4582     * instructions so they only ever write to a single register.
   4583     */
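           /* For instance, a SIMD8 instruction writing a 64-bit-per-channel
            * destination from 32-bit sources has only four channels per GRF and
            * ends up split into SIMD4 chunks.
            */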
   4584    if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
   4585        !inst->force_writemask_all) {
   4586       const unsigned channels_per_grf = inst->exec_size /
   4587          DIV_ROUND_UP(inst->size_written, REG_SIZE);
   4588       unsigned exec_type_size = 0;
   4589       for (int i = 0; i < inst->sources; i++) {
   4590          if (inst->src[i].file != BAD_FILE)
   4591             exec_type_size = MAX2(exec_type_size, type_sz(inst->src[i].type));
   4592       }
   4593       assert(exec_type_size);
   4594 
   4595       /* The hardware shifts exactly 8 channels per compressed half of the
   4596        * instruction in single-precision mode and exactly 4 in double-precision.
   4597        */
   4598       if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
   4599          max_width = MIN2(max_width, channels_per_grf);
   4600    }
   4601 
   4602    /* Only power-of-two execution sizes are representable in the instruction
   4603     * control fields.
   4604     */
   4605    return 1 << _mesa_logbase2(max_width);
   4606 }
   4607 
   4608 /**
   4609  * Get the maximum allowed SIMD width for instruction \p inst accounting for
   4610  * various payload size restrictions that apply to sampler message
   4611  * instructions.
   4612  *
   4613  * This is only intended to provide a maximum theoretical bound for the
   4614  * execution size of the message based on the number of argument components
   4615  * alone, which in most cases will determine whether the SIMD8 or SIMD16
   4616  * variant of the message can be used, though some messages may have
   4617  * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
   4618  * the message length to determine the exact SIMD width and argument count,
   4619  * which makes a number of sampler message combinations impossible to
   4620  * represent).
   4621  */
   4622 static unsigned
   4623 get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
   4624                                const fs_inst *inst)
   4625 {
   4626    /* Calculate the number of coordinate components that have to be present
   4627     * assuming that additional arguments follow the texel coordinates in the
   4628     * message payload.  On IVB+ there is no need for padding; on ILK-SNB we
   4629     * need to pad to four or three components depending on the message, and
   4630     * pre-ILK we need to pad to at most three components.
   4631     */
   4632    const unsigned req_coord_components =
   4633       (devinfo->gen >= 7 ||
   4634        !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
   4635       (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
   4636                             inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
   4637       3;
   4638 
   4639    /* On Gen9+ the LOD argument is for free if we're able to use the LZ
   4640     * variant of the TXL or TXF message.
   4641     */
   4642    const bool implicit_lod = devinfo->gen >= 9 &&
   4643                              (inst->opcode == SHADER_OPCODE_TXL ||
   4644                               inst->opcode == SHADER_OPCODE_TXF) &&
   4645                              inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
   4646 
   4647    /* Calculate the total number of argument components that need to be passed
   4648     * to the sampler unit.
   4649     */
   4650    const unsigned num_payload_components =
   4651       MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
   4652            req_coord_components) +
   4653       inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
   4654       (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
   4655       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
   4656       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
   4657       (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
   4658        inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
   4659       inst->components_read(TEX_LOGICAL_SRC_MCS);
   4660 
   4661    /* SIMD16 messages with more than five arguments exceed the maximum message
   4662     * size supported by the sampler, regardless of whether a header is
   4663     * provided or not.
   4664     */
   4665    return MIN2(inst->exec_size,
   4666                num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
   4667 }
   4668 
   4669 /**
   4670  * Get the closest native SIMD width supported by the hardware for instruction
   4671  * \p inst.  The instruction will be left untouched by
   4672  * fs_visitor::lower_simd_width() if the returned value is equal to the
   4673  * original execution size.
   4674  */
   4675 static unsigned
   4676 get_lowered_simd_width(const struct gen_device_info *devinfo,
   4677                        const fs_inst *inst)
   4678 {
   4679    switch (inst->opcode) {
   4680    case BRW_OPCODE_MOV:
   4681    case BRW_OPCODE_SEL:
   4682    case BRW_OPCODE_NOT:
   4683    case BRW_OPCODE_AND:
   4684    case BRW_OPCODE_OR:
   4685    case BRW_OPCODE_XOR:
   4686    case BRW_OPCODE_SHR:
   4687    case BRW_OPCODE_SHL:
   4688    case BRW_OPCODE_ASR:
   4689    case BRW_OPCODE_CMPN:
   4690    case BRW_OPCODE_CSEL:
   4691    case BRW_OPCODE_F32TO16:
   4692    case BRW_OPCODE_F16TO32:
   4693    case BRW_OPCODE_BFREV:
   4694    case BRW_OPCODE_BFE:
   4695    case BRW_OPCODE_ADD:
   4696    case BRW_OPCODE_MUL:
   4697    case BRW_OPCODE_AVG:
   4698    case BRW_OPCODE_FRC:
   4699    case BRW_OPCODE_RNDU:
   4700    case BRW_OPCODE_RNDD:
   4701    case BRW_OPCODE_RNDE:
   4702    case BRW_OPCODE_RNDZ:
   4703    case BRW_OPCODE_LZD:
   4704    case BRW_OPCODE_FBH:
   4705    case BRW_OPCODE_FBL:
   4706    case BRW_OPCODE_CBIT:
   4707    case BRW_OPCODE_SAD2:
   4708    case BRW_OPCODE_MAD:
   4709    case BRW_OPCODE_LRP:
   4710    case FS_OPCODE_PACK:
   4711       return get_fpu_lowered_simd_width(devinfo, inst);
   4712 
   4713    case BRW_OPCODE_CMP: {
   4714       /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
   4715        * when the destination is a GRF the dependency-clear bit on the flag
   4716        * register is cleared early.
   4717        *
   4718        * Suggested workarounds are to disable coissuing CMP instructions
   4719        * or to split CMP(16) instructions into two CMP(8) instructions.
   4720        *
   4721        * We choose to split into CMP(8) instructions since disabling
   4722        * coissuing would affect CMP instructions not otherwise affected by
   4723        * the errata.
   4724        */
   4725       const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
   4726                                   !inst->dst.is_null() ? 8 : ~0);
   4727       return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
   4728    }
   4729    case BRW_OPCODE_BFI1:
   4730    case BRW_OPCODE_BFI2:
   4731       /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
   4732        * should
   4733        *  "Force BFI instructions to be executed always in SIMD8."
   4734        */
   4735       return MIN2(devinfo->is_haswell ? 8 : ~0u,
   4736                   get_fpu_lowered_simd_width(devinfo, inst));
   4737 
   4738    case BRW_OPCODE_IF:
   4739       assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
   4740       return inst->exec_size;
   4741 
   4742    case SHADER_OPCODE_RCP:
   4743    case SHADER_OPCODE_RSQ:
   4744    case SHADER_OPCODE_SQRT:
   4745    case SHADER_OPCODE_EXP2:
   4746    case SHADER_OPCODE_LOG2:
   4747    case SHADER_OPCODE_SIN:
   4748    case SHADER_OPCODE_COS:
   4749       /* Unary extended math instructions are limited to SIMD8 on Gen4 and
   4750        * Gen6.
   4751        */
   4752       return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
   4753               devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
   4754               MIN2(8, inst->exec_size));
   4755 
   4756    case SHADER_OPCODE_POW:
   4757       /* SIMD16 is only allowed on Gen7+. */
   4758       return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
   4759               MIN2(8, inst->exec_size));
   4760 
   4761    case SHADER_OPCODE_INT_QUOTIENT:
   4762    case SHADER_OPCODE_INT_REMAINDER:
   4763       /* Integer division is limited to SIMD8 on all generations. */
   4764       return MIN2(8, inst->exec_size);
   4765 
   4766    case FS_OPCODE_LINTERP:
   4767    case FS_OPCODE_GET_BUFFER_SIZE:
   4768    case FS_OPCODE_DDX_COARSE:
   4769    case FS_OPCODE_DDX_FINE:
   4770    case FS_OPCODE_DDY_COARSE:
   4771    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   4772    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   4773    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   4774    case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
   4775    case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
   4776    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   4777    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   4778    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   4779       return MIN2(16, inst->exec_size);
   4780 
   4781    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   4782       /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
   4783        * message used to implement varying pull constant loads, so expand it
   4784        * to SIMD16.  An alternative with longer message payload length but
   4785        * shorter return payload would be to use the SIMD8 sampler message that
   4786        * takes (header, u, v, r) as parameters instead of (header, u).
   4787        */
   4788       return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
   4789 
   4790    case FS_OPCODE_DDY_FINE:
   4791       /* The implementation of this virtual opcode may require emitting
   4792        * compressed Align16 instructions, which are severely limited on some
   4793        * generations.
   4794        *
   4795        * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
   4796        * Region Restrictions):
   4797        *
   4798        *  "In Align16 access mode, SIMD16 is not allowed for DW operations
   4799        *   and SIMD8 is not allowed for DF operations."
   4800        *
   4801        * In this context, "DW operations" means "operations acting on 32-bit
   4802        * values", so it includes operations on floats.
   4803        *
   4804        * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
   4805        * (Instruction Compression -> Rules and Restrictions):
   4806        *
   4807        *  "A compressed instruction must be in Align1 access mode. Align16
   4808        *   mode instructions cannot be compressed."
   4809        *
   4810        * Similar text exists in the g45 PRM.
   4811        *
   4812        * Empirically, compressed align16 instructions using odd register
   4813        * numbers don't appear to work on Sandybridge either.
   4814        */
   4815       return (devinfo->gen == 4 || devinfo->gen == 6 ||
   4816               (devinfo->gen == 7 && !devinfo->is_haswell) ?
   4817               MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
   4818 
   4819    case SHADER_OPCODE_MULH:
   4820       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
   4821        * is 8-wide on Gen7+.
   4822        */
   4823       return (devinfo->gen >= 7 ? 8 :
   4824               get_fpu_lowered_simd_width(devinfo, inst));
   4825 
   4826    case FS_OPCODE_FB_WRITE_LOGICAL:
   4827       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
   4828        * here.
   4829        */
   4830       assert(devinfo->gen != 6 ||
   4831              inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
   4832              inst->exec_size == 8);
   4833       /* Dual-source FB writes are unsupported in SIMD16 mode. */
   4834       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
   4835               8 : MIN2(16, inst->exec_size));
   4836 
   4837    case FS_OPCODE_FB_READ_LOGICAL:
   4838       return MIN2(16, inst->exec_size);
   4839 
   4840    case SHADER_OPCODE_TEX_LOGICAL:
   4841    case SHADER_OPCODE_TXF_CMS_LOGICAL:
   4842    case SHADER_OPCODE_TXF_UMS_LOGICAL:
   4843    case SHADER_OPCODE_TXF_MCS_LOGICAL:
   4844    case SHADER_OPCODE_LOD_LOGICAL:
   4845    case SHADER_OPCODE_TG4_LOGICAL:
   4846    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   4847    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   4848    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   4849       return get_sampler_lowered_simd_width(devinfo, inst);
   4850 
   4851    case SHADER_OPCODE_TXD_LOGICAL:
   4852       /* TXD is unsupported in SIMD16 mode. */
   4853       return 8;
   4854 
   4855    case SHADER_OPCODE_TXL_LOGICAL:
   4856    case FS_OPCODE_TXB_LOGICAL:
   4857       /* Only one execution size is representable pre-ILK depending on whether
   4858        * the shadow reference argument is present.
   4859        */
   4860       if (devinfo->gen == 4)
   4861          return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
   4862       else
   4863          return get_sampler_lowered_simd_width(devinfo, inst);
   4864 
   4865    case SHADER_OPCODE_TXF_LOGICAL:
   4866    case SHADER_OPCODE_TXS_LOGICAL:
   4867       /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
   4868        * messages.  Use SIMD16 instead.
   4869        */
   4870       if (devinfo->gen == 4)
   4871          return 16;
   4872       else
   4873          return get_sampler_lowered_simd_width(devinfo, inst);
   4874 
   4875    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   4876    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   4877    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   4878       return 8;
   4879 
   4880    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   4881    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   4882    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   4883       return MIN2(16, inst->exec_size);
   4884 
   4885    case SHADER_OPCODE_URB_READ_SIMD8:
   4886    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   4887    case SHADER_OPCODE_URB_WRITE_SIMD8:
   4888    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   4889    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   4890    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   4891       return MIN2(8, inst->exec_size);
   4892 
   4893    case SHADER_OPCODE_MOV_INDIRECT:
   4894       /* Prior to Broadwell, we only have 8 address subregisters */
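              /* The second term below additionally limits the width so that the
               * destination region fits in two adjacent GRFs.
               */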
   4895       return MIN3(devinfo->gen >= 8 ? 16 : 8,
   4896                   2 * REG_SIZE / (inst->dst.stride * type_sz(inst->dst.type)),
   4897                   inst->exec_size);
   4898 
   4899    case SHADER_OPCODE_LOAD_PAYLOAD: {
   4900       const unsigned reg_count =
   4901          DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
   4902 
   4903       if (reg_count > 2) {
   4904          /* Only LOAD_PAYLOAD instructions with per-channel destination region
   4905           * can be easily lowered (which excludes headers and heterogeneous
   4906           * types).
   4907           */
   4908          assert(!inst->header_size);
   4909          for (unsigned i = 0; i < inst->sources; i++)
   4910             assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
   4911                    inst->src[i].file == BAD_FILE);
   4912 
   4913          return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
   4914       } else {
   4915          return inst->exec_size;
   4916       }
   4917    }
   4918    default:
   4919       return inst->exec_size;
   4920    }
   4921 }
   4922 
   4923 /**
   4924  * Return true if splitting out the group of channels of instruction \p inst
   4925  * given by lbld.group() requires allocating a temporary for the i-th source
   4926  * of the lowered instruction.
   4927  */
   4928 static inline bool
   4929 needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
   4930 {
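           /* No copy is needed if the source is invariant for every channel
            * group of the given width (periodic), or if it reads a single
            * component and the lowered width doesn't exceed the original
            * execution size, in which case the lowered instruction can simply
            * point at the right channel group of the original region.
            */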
   4931    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
   4932             (inst->components_read(i) == 1 &&
   4933              lbld.dispatch_width() <= inst->exec_size));
   4934 }
   4935 
   4936 /**
   4937  * Extract the data that would be consumed by the channel group given by
   4938  * lbld.group() from the i-th source region of instruction \p inst and return
   4939  * it as result in packed form.  If any copy instructions are required they
   4940  * will be emitted before the given \p inst in \p block.
   4941  */
   4942 static fs_reg
   4943 emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
   4944            unsigned i)
   4945 {
   4946    /* Specified channel group from the source region. */
   4947    const fs_reg src = horiz_offset(inst->src[i], lbld.group());
   4948 
   4949    if (needs_src_copy(lbld, inst, i)) {
   4950       /* Builder of the right width to perform the copy avoiding uninitialized
   4951        * data if the lowered execution size is greater than the original
   4952        * execution size of the instruction.
   4953        */
   4954       const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
   4955                                               inst->exec_size), 0);
   4956       const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
   4957 
   4958       for (unsigned k = 0; k < inst->components_read(i); ++k)
   4959          cbld.at(block, inst)
   4960              .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
   4961 
   4962       return tmp;
   4963 
   4964    } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
   4965       /* The source is invariant for all dispatch_width-wide groups of the
   4966        * original region.
   4967        */
   4968       return inst->src[i];
   4969 
   4970    } else {
   4971       /* We can just point the lowered instruction at the right channel group
   4972        * from the original region.
   4973        */
   4974       return src;
   4975    }
   4976 }
   4977 
   4978 /**
   4979  * Return true if splitting out the group of channels of instruction \p inst
   4980  * given by lbld.group() requires allocating a temporary for the destination
   4981  * of the lowered instruction and copying the data back to the original
   4982  * destination region.
   4983  */
   4984 static inline bool
   4985 needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
   4986 {
   4987    /* If the instruction writes more than one component we'll have to shuffle
   4988     * the results of multiple lowered instructions in order to make sure that
   4989     * they end up arranged correctly in the original destination region.
   4990     */
   4991    if (inst->size_written > inst->dst.component_size(inst->exec_size))
   4992       return true;
   4993 
   4994    /* If the lowered execution size is larger than the original the result of
   4995     * the instruction won't fit in the original destination, so we'll have to
   4996     * allocate a temporary in any case.
   4997     */
   4998    if (lbld.dispatch_width() > inst->exec_size)
   4999       return true;
   5000 
   5001    for (unsigned i = 0; i < inst->sources; i++) {
   5002       /* If we already made a copy of the source for other reasons there won't
   5003        * be any overlap with the destination.
   5004        */
   5005       if (needs_src_copy(lbld, inst, i))
   5006          continue;
   5007 
   5008       /* In order to keep the logic simple we emit a copy whenever the
   5009        * destination region doesn't exactly match an overlapping source.  A
   5010        * mismatch may mean the source and destination aren't aligned group by
   5011        * group, in which case one of the lowered instructions could overwrite
   5012        * data read from the same source by the other lowered instructions.
   5013        */
   5014       if (regions_overlap(inst->dst, inst->size_written,
   5015                           inst->src[i], inst->size_read(i)) &&
   5016           !inst->dst.equals(inst->src[i]))
   5017         return true;
   5018    }
   5019 
   5020    return false;
   5021 }
   5022 
   5023 /**
   5024  * Insert data from a packed temporary into the channel group given by
   5025  * lbld.group() of the destination region of instruction \p inst and return
   5026  * the temporary as result.  If any copy instructions are required they will
   5027  * be emitted around the given \p inst in \p block.
   5028  */
   5029 static fs_reg
   5030 emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
   5031 {
   5032    /* Builder of the right width to perform the copy avoiding uninitialized
   5033     * data if the lowered execution size is greater than the original
   5034     * execution size of the instruction.
   5035     */
   5036    const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
   5037                                            inst->exec_size), 0);
   5038 
   5039    /* Specified channel group from the destination region. */
   5040    const fs_reg dst = horiz_offset(inst->dst, lbld.group());
   5041    const unsigned dst_size = inst->size_written /
   5042       inst->dst.component_size(inst->exec_size);
   5043 
   5044    if (needs_dst_copy(lbld, inst)) {
   5045       const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
   5046 
   5047       if (inst->predicate) {
   5048          /* Handle predication by copying the original contents of
   5049           * the destination into the temporary before emitting the
   5050           * lowered instruction.
   5051           */
   5052          for (unsigned k = 0; k < dst_size; ++k)
   5053             cbld.at(block, inst)
   5054                 .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k));
   5055       }
   5056 
   5057       for (unsigned k = 0; k < dst_size; ++k)
   5058          cbld.at(block, inst->next)
   5059              .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
   5060 
   5061       return tmp;
   5062 
   5063    } else {
   5064       /* No need to allocate a temporary for the lowered instruction, just
   5065        * take the right group of channels from the original region.
   5066        */
   5067       return dst;
   5068    }
   5069 }
   5070 
   5071 bool
   5072 fs_visitor::lower_simd_width()
   5073 {
   5074    bool progress = false;
   5075 
   5076    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   5077       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
   5078 
   5079       if (lower_width != inst->exec_size) {
   5080          /* Builder matching the original instruction.  We may also need to
   5081           * emit an instruction of width larger than the original, so set the
   5082           * execution size of the builder to the larger of the two for now to
   5083           * make sure that both cases can be handled.
   5084           */
   5085          const unsigned max_width = MAX2(inst->exec_size, lower_width);
   5086          const fs_builder ibld = bld.at(block, inst)
   5087                                     .exec_all(inst->force_writemask_all)
   5088                                     .group(max_width, inst->group / max_width);
   5089 
   5090          /* Split the copies in chunks of the execution width of either the
   5091           * original or the lowered instruction, whichever is lower.
   5092           */
   5093          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
   5094          const unsigned dst_size = inst->size_written /
   5095             inst->dst.component_size(inst->exec_size);
   5096 
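                 /* Splitting isn't supported for instructions that write the
                  * accumulator or that carry an already-constructed message
                  * payload.
                  */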
   5097          assert(!inst->writes_accumulator && !inst->mlen);
   5098 
   5099          for (unsigned i = 0; i < n; i++) {
   5100             /* Emit a copy of the original instruction with the lowered width.
   5101              * If the EOT flag was set, throw it away except for the last
   5102              * instruction to avoid killing the thread prematurely.
   5103              */
   5104             fs_inst split_inst = *inst;
   5105             split_inst.exec_size = lower_width;
   5106             split_inst.eot = inst->eot && i == n - 1;
   5107 
   5108             /* Select the correct channel enables for the i-th group, then
   5109              * transform the sources and destination and emit the lowered
   5110              * instruction.
   5111              */
   5112             const fs_builder lbld = ibld.group(lower_width, i);
   5113 
   5114             for (unsigned j = 0; j < inst->sources; j++)
   5115                split_inst.src[j] = emit_unzip(lbld, block, inst, j);
   5116 
   5117             split_inst.dst = emit_zip(lbld, block, inst);
   5118             split_inst.size_written =
   5119                split_inst.dst.component_size(lower_width) * dst_size;
   5120 
   5121             lbld.emit(split_inst);
   5122          }
   5123 
   5124          inst->remove(block);
   5125          progress = true;
   5126       }
   5127    }
   5128 
   5129    if (progress)
   5130       invalidate_live_intervals();
   5131 
   5132    return progress;
   5133 }
   5134 
   5135 void
   5136 fs_visitor::dump_instructions()
   5137 {
   5138    dump_instructions(NULL);
   5139 }
   5140 
   5141 void
   5142 fs_visitor::dump_instructions(const char *name)
   5143 {
   5144    FILE *file = stderr;
   5145    if (name && geteuid() != 0) {
   5146       file = fopen(name, "w");
   5147       if (!file)
   5148          file = stderr;
   5149    }
   5150 
   5151    if (cfg) {
   5152       calculate_register_pressure();
   5153       int ip = 0, max_pressure = 0;
   5154       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
   5155          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
   5156          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
   5157          dump_instruction(inst, file);
   5158          ip++;
   5159       }
   5160       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   5161    } else {
   5162       int ip = 0;
   5163       foreach_in_list(backend_instruction, inst, &instructions) {
   5164          fprintf(file, "%4d: ", ip++);
   5165          dump_instruction(inst, file);
   5166       }
   5167    }
   5168 
   5169    if (file != stderr) {
   5170       fclose(file);
   5171    }
   5172 }
   5173 
   5174 void
   5175 fs_visitor::dump_instruction(backend_instruction *be_inst)
   5176 {
   5177    dump_instruction(be_inst, stderr);
   5178 }
   5179 
   5180 void
   5181 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
   5182 {
   5183    fs_inst *inst = (fs_inst *)be_inst;
   5184 
   5185    if (inst->predicate) {
   5186       fprintf(file, "(%cf0.%d) ",
   5187              inst->predicate_inverse ? '-' : '+',
   5188              inst->flag_subreg);
   5189    }
   5190 
   5191    fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
   5192    if (inst->saturate)
   5193       fprintf(file, ".sat");
   5194    if (inst->conditional_mod) {
   5195       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
   5196       if (!inst->predicate &&
   5197           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
   5198                               inst->opcode != BRW_OPCODE_IF &&
   5199                               inst->opcode != BRW_OPCODE_WHILE))) {
   5200          fprintf(file, ".f0.%d", inst->flag_subreg);
   5201       }
   5202    }
   5203    fprintf(file, "(%d) ", inst->exec_size);
   5204 
   5205    if (inst->mlen) {
   5206       fprintf(file, "(mlen: %d) ", inst->mlen);
   5207    }
   5208 
   5209    if (inst->eot) {
   5210       fprintf(file, "(EOT) ");
   5211    }
   5212 
   5213    switch (inst->dst.file) {
   5214    case VGRF:
   5215       fprintf(file, "vgrf%d", inst->dst.nr);
   5216       break;
   5217    case FIXED_GRF:
   5218       fprintf(file, "g%d", inst->dst.nr);
   5219       break;
   5220    case MRF:
   5221       fprintf(file, "m%d", inst->dst.nr);
   5222       break;
   5223    case BAD_FILE:
   5224       fprintf(file, "(null)");
   5225       break;
   5226    case UNIFORM:
   5227       fprintf(file, "***u%d***", inst->dst.nr);
   5228       break;
   5229    case ATTR:
   5230       fprintf(file, "***attr%d***", inst->dst.nr);
   5231       break;
   5232    case ARF:
   5233       switch (inst->dst.nr) {
   5234       case BRW_ARF_NULL:
   5235          fprintf(file, "null");
   5236          break;
   5237       case BRW_ARF_ADDRESS:
   5238          fprintf(file, "a0.%d", inst->dst.subnr);
   5239          break;
   5240       case BRW_ARF_ACCUMULATOR:
   5241          fprintf(file, "acc%d", inst->dst.subnr);
   5242          break;
   5243       case BRW_ARF_FLAG:
   5244          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
   5245          break;
   5246       default:
   5247          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
   5248          break;
   5249       }
   5250       break;
   5251    case IMM:
   5252       unreachable("not reached");
   5253    }
   5254 
   5255    if (inst->dst.offset ||
   5256        (inst->dst.file == VGRF &&
   5257         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
   5258       const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
   5259       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
   5260               inst->dst.offset % reg_size);
   5261    }
   5262 
   5263    if (inst->dst.stride != 1)
   5264       fprintf(file, "<%u>", inst->dst.stride);
   5265    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
   5266 
   5267    for (int i = 0; i < inst->sources; i++) {
   5268       if (inst->src[i].negate)
   5269          fprintf(file, "-");
   5270       if (inst->src[i].abs)
   5271          fprintf(file, "|");
   5272       switch (inst->src[i].file) {
   5273       case VGRF:
   5274          fprintf(file, "vgrf%d", inst->src[i].nr);
   5275          break;
   5276       case FIXED_GRF:
   5277          fprintf(file, "g%d", inst->src[i].nr);
   5278          break;
   5279       case MRF:
   5280          fprintf(file, "***m%d***", inst->src[i].nr);
   5281          break;
   5282       case ATTR:
   5283          fprintf(file, "attr%d", inst->src[i].nr);
   5284          break;
   5285       case UNIFORM:
   5286          fprintf(file, "u%d", inst->src[i].nr);
   5287          break;
   5288       case BAD_FILE:
   5289          fprintf(file, "(null)");
   5290          break;
   5291       case IMM:
   5292          switch (inst->src[i].type) {
   5293          case BRW_REGISTER_TYPE_F:
   5294             fprintf(file, "%-gf", inst->src[i].f);
   5295             break;
   5296          case BRW_REGISTER_TYPE_DF:
   5297             fprintf(file, "%fdf", inst->src[i].df);
   5298             break;
   5299          case BRW_REGISTER_TYPE_W:
   5300          case BRW_REGISTER_TYPE_D:
   5301             fprintf(file, "%dd", inst->src[i].d);
   5302             break;
   5303          case BRW_REGISTER_TYPE_UW:
   5304          case BRW_REGISTER_TYPE_UD:
   5305             fprintf(file, "%uu", inst->src[i].ud);
   5306             break;
   5307          case BRW_REGISTER_TYPE_VF:
   5308             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
   5309                     brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
   5310                     brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
   5311                     brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
   5312                     brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
   5313             break;
   5314          default:
   5315             fprintf(file, "???");
   5316             break;
   5317          }
   5318          break;
   5319       case ARF:
   5320          switch (inst->src[i].nr) {
   5321          case BRW_ARF_NULL:
   5322             fprintf(file, "null");
   5323             break;
   5324          case BRW_ARF_ADDRESS:
   5325             fprintf(file, "a0.%d", inst->src[i].subnr);
   5326             break;
   5327          case BRW_ARF_ACCUMULATOR:
   5328             fprintf(file, "acc%d", inst->src[i].subnr);
   5329             break;
   5330          case BRW_ARF_FLAG:
   5331             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
   5332             break;
   5333          default:
   5334             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
   5335             break;
   5336          }
   5337          break;
   5338       }
   5339 
   5340       if (inst->src[i].offset ||
   5341           (inst->src[i].file == VGRF &&
   5342            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
   5343          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
   5344          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
   5345                  inst->src[i].offset % reg_size);
   5346       }
   5347 
   5348       if (inst->src[i].abs)
   5349          fprintf(file, "|");
   5350 
   5351       if (inst->src[i].file != IMM) {
   5352          unsigned stride;
   5353          if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
   5354             unsigned hstride = inst->src[i].hstride;
   5355             stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
   5356          } else {
   5357             stride = inst->src[i].stride;
   5358          }
   5359          if (stride != 1)
   5360             fprintf(file, "<%u>", stride);
   5361 
   5362          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
   5363       }
   5364 
   5365       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
   5366          fprintf(file, ", ");
   5367    }
   5368 
   5369    fprintf(file, " ");
   5370 
   5371    if (inst->force_writemask_all)
   5372       fprintf(file, "NoMask ");
   5373 
   5374    if (inst->exec_size != dispatch_width)
   5375       fprintf(file, "group%d ", inst->group);
   5376 
   5377    fprintf(file, "\n");
   5378 }
   5379 
   5380 /**
   5381  * Possibly returns an instruction that set up @param reg.
   5382  *
   5383  * Sometimes we want to take the result of some expression/variable
   5384  * dereference tree and rewrite the instruction generating the result
   5385  * of the tree.  When processing the tree, we know that the
   5386  * instructions generated are all writing temporaries that are dead
   5387  * outside of this tree.  So, if we have some instructions that write
   5388  * a temporary, we're free to point that temp write somewhere else.
   5389  *
    5390  * Note that this doesn't guarantee that the returned instruction wrote
    5391  * only reg -- it might be the size=4 destination of a texture instruction.
   5392  */
   5393 fs_inst *
   5394 fs_visitor::get_instruction_generating_reg(fs_inst *start,
   5395 					   fs_inst *end,
   5396 					   const fs_reg &reg)
   5397 {
   5398    if (end == start ||
   5399        end->is_partial_write() ||
   5400        !reg.equals(end->dst)) {
   5401       return NULL;
   5402    } else {
   5403       return end;
   5404    }
   5405 }
   5406 
   5407 void
   5408 fs_visitor::setup_fs_payload_gen6()
   5409 {
   5410    assert(stage == MESA_SHADER_FRAGMENT);
   5411    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   5412 
   5413    assert(devinfo->gen >= 6);
   5414 
   5415    /* R0-1: masks, pixel X/Y coordinates. */
   5416    payload.num_regs = 2;
   5417    /* R2: only for 32-pixel dispatch.*/
   5418 
   5419    /* R3-26: barycentric interpolation coordinates.  These appear in the
   5420     * same order that they appear in the brw_barycentric_mode
   5421     * enum.  Each set of coordinates occupies 2 registers if dispatch width
   5422     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
   5423     * appear if they were enabled using the "Barycentric Interpolation
   5424     * Mode" bits in WM_STATE.
   5425     */
   5426    for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
   5427       if (prog_data->barycentric_interp_modes & (1 << i)) {
   5428          payload.barycentric_coord_reg[i] = payload.num_regs;
   5429          payload.num_regs += 2;
   5430          if (dispatch_width == 16) {
   5431             payload.num_regs += 2;
   5432          }
   5433       }
   5434    }
   5435 
   5436    /* R27: interpolated depth if uses source depth */
   5437    prog_data->uses_src_depth =
   5438       (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
   5439    if (prog_data->uses_src_depth) {
   5440       payload.source_depth_reg = payload.num_regs;
   5441       payload.num_regs++;
   5442       if (dispatch_width == 16) {
   5443          /* R28: interpolated depth if not SIMD8. */
   5444          payload.num_regs++;
   5445       }
   5446    }
   5447 
   5448    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   5449    prog_data->uses_src_w =
   5450       (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
   5451    if (prog_data->uses_src_w) {
   5452       payload.source_w_reg = payload.num_regs;
   5453       payload.num_regs++;
   5454       if (dispatch_width == 16) {
   5455          /* R30: interpolated W if not SIMD8. */
   5456          payload.num_regs++;
   5457       }
   5458    }
   5459 
   5460    /* R31: MSAA position offsets. */
   5461    if (prog_data->persample_dispatch &&
   5462        (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
   5463       /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
   5464        *
   5465        *    "MSDISPMODE_PERSAMPLE is required in order to select
   5466        *    POSOFFSET_SAMPLE"
   5467        *
   5468        * So we can only really get sample positions if we are doing real
   5469        * per-sample dispatch.  If we need gl_SamplePosition and we don't have
   5470        * persample dispatch, we hard-code it to 0.5.
   5471        */
   5472       prog_data->uses_pos_offset = true;
   5473       payload.sample_pos_reg = payload.num_regs;
   5474       payload.num_regs++;
   5475    }
   5476 
   5477    /* R32: MSAA input coverage mask */
   5478    prog_data->uses_sample_mask =
   5479       (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
   5480    if (prog_data->uses_sample_mask) {
   5481       assert(devinfo->gen >= 7);
   5482       payload.sample_mask_in_reg = payload.num_regs;
   5483       payload.num_regs++;
   5484       if (dispatch_width == 16) {
   5485          /* R33: input coverage mask if not SIMD8. */
   5486          payload.num_regs++;
   5487       }
   5488    }
   5489 
   5490    /* R34-: bary for 32-pixel. */
   5491    /* R58-59: interp W for 32-pixel. */
   5492 
   5493    if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
   5494       source_depth_to_render_target = true;
   5495    }
   5496 }
   5497 
   5498 void
   5499 fs_visitor::setup_vs_payload()
   5500 {
   5501    /* R0: thread header, R1: urb handles */
   5502    payload.num_regs = 2;
   5503 }
   5504 
   5505 void
   5506 fs_visitor::setup_gs_payload()
   5507 {
   5508    assert(stage == MESA_SHADER_GEOMETRY);
   5509 
   5510    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   5511    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   5512 
   5513    /* R0: thread header, R1: output URB handles */
   5514    payload.num_regs = 2;
   5515 
   5516    if (gs_prog_data->include_primitive_id) {
   5517       /* R2: Primitive ID 0..7 */
   5518       payload.num_regs++;
   5519    }
   5520 
   5521    /* Use a maximum of 24 registers for push-model inputs. */
   5522    const unsigned max_push_components = 24;
   5523 
   5524    /* If pushing our inputs would take too many registers, reduce the URB read
   5525     * length (which is in HWords, or 8 registers), and resort to pulling.
   5526     *
   5527     * Note that the GS reads <URB Read Length> HWords for every vertex - so we
   5528     * have to multiply by VerticesIn to obtain the total storage requirement.
   5529     */
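            /* Informal example: with 3 incoming vertices and urb_read_length == 2,
             * the estimate is 8 * 2 * 3 = 48 > 24, so we include the VUE handles
             * and clamp urb_read_length to ROUND_DOWN_TO(24 / 3, 8) / 8 = 1.
             */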
   5530    if (8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in >
   5531        max_push_components || gs_prog_data->invocations > 1) {
   5532       gs_prog_data->base.include_vue_handles = true;
   5533 
   5534       /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   5535       payload.num_regs += nir->info->gs.vertices_in;
   5536 
   5537       vue_prog_data->urb_read_length =
   5538          ROUND_DOWN_TO(max_push_components / nir->info->gs.vertices_in, 8) / 8;
   5539    }
   5540 }
   5541 
   5542 void
   5543 fs_visitor::setup_cs_payload()
   5544 {
   5545    assert(devinfo->gen >= 7);
   5546    payload.num_regs = 1;
   5547 }
   5548 
   5549 void
   5550 fs_visitor::calculate_register_pressure()
   5551 {
   5552    invalidate_live_intervals();
   5553    calculate_live_intervals();
   5554 
   5555    unsigned num_instructions = 0;
   5556    foreach_block(block, cfg)
   5557       num_instructions += block->instructions.length();
   5558 
   5559    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
   5560 
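            /* For each VGRF, add its full allocation size (in registers) to the
             * pressure at every instruction within its live range.
             */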
   5561    for (unsigned reg = 0; reg < alloc.count; reg++) {
   5562       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
   5563          regs_live_at_ip[ip] += alloc.sizes[reg];
   5564    }
   5565 }
   5566 
   5567 /**
   5568  * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
   5569  *
   5570  * The needs_unlit_centroid_workaround ends up producing one of these per
   5571  * channel of centroid input, so it's good to clean them up.
   5572  *
   5573  * An assumption here is that nothing ever modifies the dispatched pixels
   5574  * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
   5575  * dictates that anyway.
   5576  */
   5577 bool
   5578 fs_visitor::opt_drop_redundant_mov_to_flags()
   5579 {
   5580    bool flag_mov_found[2] = {false};
   5581    bool progress = false;
   5582 
   5583    /* Instructions removed by this pass can only be added if this were true */
   5584    if (!devinfo->needs_unlit_centroid_workaround)
   5585       return false;
   5586 
   5587    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
   5588       if (inst->is_control_flow()) {
   5589          memset(flag_mov_found, 0, sizeof(flag_mov_found));
   5590       } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
   5591          if (!flag_mov_found[inst->flag_subreg]) {
   5592             flag_mov_found[inst->flag_subreg] = true;
   5593          } else {
   5594             inst->remove(block);
   5595             progress = true;
   5596          }
   5597       } else if (inst->flags_written()) {
   5598          flag_mov_found[inst->flag_subreg] = false;
   5599       }
   5600    }
   5601 
   5602    return progress;
   5603 }
   5604 
   5605 void
   5606 fs_visitor::optimize()
   5607 {
   5608    /* Start by validating the shader we currently have. */
   5609    validate();
   5610 
   5611    /* bld is the common builder object pointing at the end of the program we
   5612     * used to translate it into i965 IR.  For the optimization and lowering
   5613     * passes coming next, any code added after the end of the program without
   5614     * having explicitly called fs_builder::at() clearly points at a mistake.
   5615     * Ideally optimization passes wouldn't be part of the visitor so they
   5616     * wouldn't have access to bld at all, but they do, so just in case some
    5617     * pass forgets to ask for a location, explicitly set it to NULL here to
   5618     * make it trip.  The dispatch width is initialized to a bogus value to
   5619     * make sure that optimizations set the execution controls explicitly to
   5620     * match the code they are manipulating instead of relying on the defaults.
   5621     */
   5622    bld = fs_builder(this, 64);
   5623 
   5624    assign_constant_locations();
   5625    lower_constant_loads();
   5626 
   5627    validate();
   5628 
   5629    split_virtual_grfs();
   5630    validate();
   5631 
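         /* Run a single optimization pass: dump the IR afterwards when
          * DEBUG_OPTIMIZER is enabled and the pass made progress, validate the
          * result, and accumulate progress across passes.
          */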
   5632 #define OPT(pass, args...) ({                                           \
   5633       pass_num++;                                                       \
   5634       bool this_progress = pass(args);                                  \
   5635                                                                         \
   5636       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
   5637          char filename[64];                                             \
   5638          snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
   5639                   stage_abbrev, dispatch_width, nir->info->name, iteration, pass_num); \
   5640                                                                         \
   5641          backend_shader::dump_instructions(filename);                   \
   5642       }                                                                 \
   5643                                                                         \
   5644       validate();                                                       \
   5645                                                                         \
   5646       progress = progress || this_progress;                             \
   5647       this_progress;                                                    \
   5648    })
   5649 
   5650    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
   5651       char filename[64];
   5652       snprintf(filename, 64, "%s%d-%s-00-00-start",
   5653                stage_abbrev, dispatch_width, nir->info->name);
   5654 
   5655       backend_shader::dump_instructions(filename);
   5656    }
   5657 
   5658    bool progress = false;
   5659    int iteration = 0;
   5660    int pass_num = 0;
   5661 
   5662    OPT(opt_drop_redundant_mov_to_flags);
   5663 
   5664    do {
   5665       progress = false;
   5666       pass_num = 0;
   5667       iteration++;
   5668 
   5669       OPT(remove_duplicate_mrf_writes);
   5670 
   5671       OPT(opt_algebraic);
   5672       OPT(opt_cse);
   5673       OPT(opt_copy_propagation);
   5674       OPT(opt_predicated_break, this);
   5675       OPT(opt_cmod_propagation);
   5676       OPT(dead_code_eliminate);
   5677       OPT(opt_peephole_sel);
   5678       OPT(dead_control_flow_eliminate, this);
   5679       OPT(opt_register_renaming);
   5680       OPT(opt_saturate_propagation);
   5681       OPT(register_coalesce);
   5682       OPT(compute_to_mrf);
   5683       OPT(eliminate_find_live_channel);
   5684 
   5685       OPT(compact_virtual_grfs);
   5686    } while (progress);
   5687 
   5688    progress = false;
   5689    pass_num = 0;
   5690 
   5691    if (OPT(lower_pack)) {
   5692       OPT(register_coalesce);
   5693       OPT(dead_code_eliminate);
   5694    }
   5695 
   5696    if (OPT(lower_d2x)) {
   5697       OPT(opt_copy_propagation);
   5698       OPT(dead_code_eliminate);
   5699    }
   5700 
   5701    OPT(lower_simd_width);
   5702 
   5703    /* After SIMD lowering just in case we had to unroll the EOT send. */
   5704    OPT(opt_sampler_eot);
   5705 
   5706    OPT(lower_logical_sends);
   5707 
   5708    if (progress) {
   5709       OPT(opt_copy_propagation);
   5710       /* Only run after logical send lowering because it's easier to implement
   5711        * in terms of physical sends.
   5712        */
   5713       if (OPT(opt_zero_samples))
   5714          OPT(opt_copy_propagation);
   5715       /* Run after logical send lowering to give it a chance to CSE the
   5716        * LOAD_PAYLOAD instructions created to construct the payloads of
   5717        * e.g. texturing messages in cases where it wasn't possible to CSE the
   5718        * whole logical instruction.
   5719        */
   5720       OPT(opt_cse);
   5721       OPT(register_coalesce);
   5722       OPT(compute_to_mrf);
   5723       OPT(dead_code_eliminate);
   5724       OPT(remove_duplicate_mrf_writes);
   5725       OPT(opt_peephole_sel);
   5726    }
   5727 
   5728    OPT(opt_redundant_discard_jumps);
   5729 
   5730    if (OPT(lower_load_payload)) {
   5731       split_virtual_grfs();
   5732       OPT(register_coalesce);
   5733       OPT(compute_to_mrf);
   5734       OPT(dead_code_eliminate);
   5735    }
   5736 
   5737    OPT(opt_combine_constants);
   5738    OPT(lower_integer_multiplication);
   5739 
   5740    if (devinfo->gen <= 5 && OPT(lower_minmax)) {
   5741       OPT(opt_cmod_propagation);
   5742       OPT(opt_cse);
   5743       OPT(opt_copy_propagation);
   5744       OPT(dead_code_eliminate);
   5745    }
   5746 
   5747    lower_uniform_pull_constant_loads();
   5748 
   5749    validate();
   5750 }
   5751 
   5752 /**
    5753  * Three-source instructions must have a GRF/MRF destination register.
   5754  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
   5755  */
   5756 void
   5757 fs_visitor::fixup_3src_null_dest()
   5758 {
   5759    bool progress = false;
   5760 
   5761    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
   5762       if (inst->is_3src(devinfo) && inst->dst.is_null()) {
   5763          inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
   5764                             inst->dst.type);
   5765          progress = true;
   5766       }
   5767    }
   5768 
   5769    if (progress)
   5770       invalidate_live_intervals();
   5771 }
   5772 
   5773 void
   5774 fs_visitor::allocate_registers(bool allow_spilling)
   5775 {
   5776    bool allocated_without_spills;
   5777 
   5778    static const enum instruction_scheduler_mode pre_modes[] = {
   5779       SCHEDULE_PRE,
   5780       SCHEDULE_PRE_NON_LIFO,
   5781       SCHEDULE_PRE_LIFO,
   5782    };
   5783 
   5784    bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
   5785 
   5786    /* Try each scheduling heuristic to see if it can successfully register
   5787     * allocate without spilling.  They should be ordered by decreasing
   5788     * performance but increasing likelihood of allocating.
   5789     */
   5790    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
   5791       schedule_instructions(pre_modes[i]);
   5792 
   5793       if (0) {
   5794          assign_regs_trivial();
   5795          allocated_without_spills = true;
   5796       } else {
   5797          allocated_without_spills = assign_regs(false, spill_all);
   5798       }
   5799       if (allocated_without_spills)
   5800          break;
   5801    }
   5802 
   5803    if (!allocated_without_spills) {
   5804       if (!allow_spilling)
   5805          fail("Failure to register allocate and spilling is not allowed.");
   5806 
   5807       /* We assume that any spilling is worse than just dropping back to
   5808        * SIMD8.  There's probably actually some intermediate point where
   5809        * SIMD16 with a couple of spills is still better.
   5810        */
   5811       if (dispatch_width > min_dispatch_width) {
   5812          fail("Failure to register allocate.  Reduce number of "
   5813               "live scalar values to avoid this.");
   5814       } else {
   5815          compiler->shader_perf_log(log_data,
   5816                                    "%s shader triggered register spilling.  "
   5817                                    "Try reducing the number of live scalar "
   5818                                    "values to improve performance.\n",
   5819                                    stage_name);
   5820       }
   5821 
   5822       /* Since we're out of heuristics, just go spill registers until we
   5823        * get an allocation.
   5824        */
   5825       while (!assign_regs(true, spill_all)) {
   5826          if (failed)
   5827             break;
   5828       }
   5829    }
   5830 
   5831    /* This must come after all optimization and register allocation, since
   5832     * it inserts dead code that happens to have side effects, and it does
   5833     * so based on the actual physical registers in use.
   5834     */
   5835    insert_gen4_send_dependency_workarounds();
   5836 
   5837    if (failed)
   5838       return;
   5839 
   5840    schedule_instructions(SCHEDULE_POST);
   5841 
   5842    if (last_scratch > 0) {
   5843       MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024;
   5844 
   5845       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
   5846 
   5847       if (stage == MESA_SHADER_COMPUTE) {
   5848          if (devinfo->is_haswell) {
   5849             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
   5850              * field documentation, Haswell supports a minimum of 2kB of
   5851              * scratch space for compute shaders, unlike every other stage
   5852              * and platform.
   5853              */
   5854             prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
   5855          } else if (devinfo->gen <= 7) {
   5856             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
   5857              * field documentation, platforms prior to Haswell measure scratch
   5858              * size linearly with a range of [1kB, 12kB] and 1kB granularity.
   5859              */
   5860             prog_data->total_scratch = ALIGN(last_scratch, 1024);
   5861             max_scratch_size = 12 * 1024;
   5862          }
   5863       }
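               /* Informal example: a gen7 (non-Haswell) compute shader with
                * last_scratch == 1536 bytes ends up with
                * total_scratch = ALIGN(1536, 1024) = 2048 and a 12kB cap.
                */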
   5864 
   5865       /* We currently only support up to 2MB of scratch space.  If we
   5866        * need to support more eventually, the documentation suggests
   5867        * that we could allocate a larger buffer, and partition it out
   5868        * ourselves.  We'd just have to undo the hardware's address
   5869        * calculation by subtracting (FFTID * Per Thread Scratch Space)
   5870        * and then add FFTID * (Larger Per Thread Scratch Space).
   5871        *
   5872        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
   5873        * Thread Group Tracking > Local Memory/Scratch Space.
   5874        */
   5875       assert(prog_data->total_scratch < max_scratch_size);
   5876    }
   5877 }
   5878 
   5879 bool
   5880 fs_visitor::run_vs(gl_clip_plane *clip_planes)
   5881 {
   5882    assert(stage == MESA_SHADER_VERTEX);
   5883 
   5884    setup_vs_payload();
   5885 
   5886    if (shader_time_index >= 0)
   5887       emit_shader_time_begin();
   5888 
   5889    emit_nir_code();
   5890 
   5891    if (failed)
   5892       return false;
   5893 
   5894    compute_clip_distance(clip_planes);
   5895 
   5896    emit_urb_writes();
   5897 
   5898    if (shader_time_index >= 0)
   5899       emit_shader_time_end();
   5900 
   5901    calculate_cfg();
   5902 
   5903    optimize();
   5904 
   5905    assign_curb_setup();
   5906    assign_vs_urb_setup();
   5907 
   5908    fixup_3src_null_dest();
   5909    allocate_registers(true);
   5910 
   5911    return !failed;
   5912 }
   5913 
   5914 bool
   5915 fs_visitor::run_tcs_single_patch()
   5916 {
   5917    assert(stage == MESA_SHADER_TESS_CTRL);
   5918 
   5919    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
   5920 
   5921    /* r1-r4 contain the ICP handles. */
   5922    payload.num_regs = 5;
   5923 
   5924    if (shader_time_index >= 0)
   5925       emit_shader_time_begin();
   5926 
   5927    /* Initialize gl_InvocationID */
   5928    fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
   5929    fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
   5930    bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
   5931    bld.MOV(channels_ud, channels_uw);
   5932 
   5933    if (tcs_prog_data->instances == 1) {
   5934       invocation_id = channels_ud;
   5935    } else {
   5936       invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
   5937 
   5938       /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
   5939       fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
   5940       fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
   5941       bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
   5942               brw_imm_ud(INTEL_MASK(23, 17)));
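               /* Shifting right by 17 would isolate the instance number and shifting
                * left by 3 multiplies it by 8, so a single SHR by (17 - 3) does both.
                */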
   5943       bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
   5944 
   5945       bld.ADD(invocation_id, instance_times_8, channels_ud);
   5946    }
   5947 
   5948    /* Fix the disptach mask */
   5949    if (nir->info->tess.tcs_vertices_out % 8) {
   5950       bld.CMP(bld.null_reg_ud(), invocation_id,
   5951               brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
   5952       bld.IF(BRW_PREDICATE_NORMAL);
   5953    }
   5954 
   5955    emit_nir_code();
   5956 
   5957    if (nir->info->tess.tcs_vertices_out % 8) {
   5958       bld.emit(BRW_OPCODE_ENDIF);
   5959    }
   5960 
   5961    /* Emit EOT write; set TR DS Cache bit */
   5962    fs_reg srcs[3] = {
   5963       fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
   5964       fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
   5965       fs_reg(brw_imm_ud(0)),
   5966    };
   5967    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
   5968    bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
   5969 
   5970    fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
   5971                             bld.null_reg_ud(), payload);
   5972    inst->mlen = 3;
   5973    inst->eot = true;
   5974 
   5975    if (shader_time_index >= 0)
   5976       emit_shader_time_end();
   5977 
   5978    if (failed)
   5979       return false;
   5980 
   5981    calculate_cfg();
   5982 
   5983    optimize();
   5984 
   5985    assign_curb_setup();
   5986    assign_tcs_single_patch_urb_setup();
   5987 
   5988    fixup_3src_null_dest();
   5989    allocate_registers(true);
   5990 
   5991    return !failed;
   5992 }
   5993 
   5994 bool
   5995 fs_visitor::run_tes()
   5996 {
   5997    assert(stage == MESA_SHADER_TESS_EVAL);
   5998 
   5999    /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
   6000    payload.num_regs = 5;
   6001 
   6002    if (shader_time_index >= 0)
   6003       emit_shader_time_begin();
   6004 
   6005    emit_nir_code();
   6006 
   6007    if (failed)
   6008       return false;
   6009 
   6010    emit_urb_writes();
   6011 
   6012    if (shader_time_index >= 0)
   6013       emit_shader_time_end();
   6014 
   6015    calculate_cfg();
   6016 
   6017    optimize();
   6018 
   6019    assign_curb_setup();
   6020    assign_tes_urb_setup();
   6021 
   6022    fixup_3src_null_dest();
   6023    allocate_registers(true);
   6024 
   6025    return !failed;
   6026 }
   6027 
   6028 bool
   6029 fs_visitor::run_gs()
   6030 {
   6031    assert(stage == MESA_SHADER_GEOMETRY);
   6032 
   6033    setup_gs_payload();
   6034 
   6035    this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
   6036 
   6037    if (gs_compile->control_data_header_size_bits > 0) {
   6038       /* Create a VGRF to store accumulated control data bits. */
   6039       this->control_data_bits = vgrf(glsl_type::uint_type);
   6040 
   6041       /* If we're outputting more than 32 control data bits, then EmitVertex()
   6042        * will set control_data_bits to 0 after emitting the first vertex.
   6043        * Otherwise, we need to initialize it to 0 here.
   6044        */
   6045       if (gs_compile->control_data_header_size_bits <= 32) {
   6046          const fs_builder abld = bld.annotate("initialize control data bits");
   6047          abld.MOV(this->control_data_bits, brw_imm_ud(0u));
   6048       }
   6049    }
   6050 
   6051    if (shader_time_index >= 0)
   6052       emit_shader_time_begin();
   6053 
   6054    emit_nir_code();
   6055 
   6056    emit_gs_thread_end();
   6057 
   6058    if (shader_time_index >= 0)
   6059       emit_shader_time_end();
   6060 
   6061    if (failed)
   6062       return false;
   6063 
   6064    calculate_cfg();
   6065 
   6066    optimize();
   6067 
   6068    assign_curb_setup();
   6069    assign_gs_urb_setup();
   6070 
   6071    fixup_3src_null_dest();
   6072    allocate_registers(true);
   6073 
   6074    return !failed;
   6075 }
   6076 
   6077 bool
   6078 fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
   6079 {
   6080    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   6081    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
   6082 
   6083    assert(stage == MESA_SHADER_FRAGMENT);
   6084 
   6085    if (devinfo->gen >= 6)
   6086       setup_fs_payload_gen6();
   6087    else
   6088       setup_fs_payload_gen4();
   6089 
   6090    if (0) {
   6091       emit_dummy_fs();
   6092    } else if (do_rep_send) {
   6093       assert(dispatch_width == 16);
   6094       emit_repclear_shader();
   6095    } else {
   6096       if (shader_time_index >= 0)
   6097          emit_shader_time_begin();
   6098 
   6099       calculate_urb_setup();
   6100       if (nir->info->inputs_read > 0 ||
   6101           (nir->info->outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
   6102          if (devinfo->gen < 6)
   6103             emit_interpolation_setup_gen4();
   6104          else
   6105             emit_interpolation_setup_gen6();
   6106       }
   6107 
   6108       /* We handle discards by keeping track of the still-live pixels in f0.1.
   6109        * Initialize it with the dispatched pixels.
   6110        */
   6111       if (wm_prog_data->uses_kill) {
   6112          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
   6113          discard_init->flag_subreg = 1;
   6114       }
   6115 
   6116       /* Generate FS IR for main().  (the visitor only descends into
   6117        * functions called "main").
   6118        */
   6119       emit_nir_code();
   6120 
   6121       if (failed)
   6122 	 return false;
   6123 
   6124       if (wm_prog_data->uses_kill)
   6125          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
   6126 
   6127       if (wm_key->alpha_test_func)
   6128          emit_alpha_test();
   6129 
   6130       emit_fb_writes();
   6131 
   6132       if (shader_time_index >= 0)
   6133          emit_shader_time_end();
   6134 
   6135       calculate_cfg();
   6136 
   6137       optimize();
   6138 
   6139       assign_curb_setup();
   6140       assign_urb_setup();
   6141 
   6142       fixup_3src_null_dest();
   6143       allocate_registers(allow_spilling);
   6144 
   6145       if (failed)
   6146          return false;
   6147    }
   6148 
   6149    return !failed;
   6150 }
   6151 
   6152 bool
   6153 fs_visitor::run_cs()
   6154 {
   6155    assert(stage == MESA_SHADER_COMPUTE);
   6156 
   6157    setup_cs_payload();
   6158 
   6159    if (shader_time_index >= 0)
   6160       emit_shader_time_begin();
   6161 
   6162    if (devinfo->is_haswell && prog_data->total_shared > 0) {
   6163       /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
   6164       const fs_builder abld = bld.exec_all().group(1, 0);
   6165       abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
   6166                suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
   6167    }
   6168 
   6169    emit_nir_code();
   6170 
   6171    if (failed)
   6172       return false;
   6173 
   6174    emit_cs_terminate();
   6175 
   6176    if (shader_time_index >= 0)
   6177       emit_shader_time_end();
   6178 
   6179    calculate_cfg();
   6180 
   6181    optimize();
   6182 
   6183    assign_curb_setup();
   6184 
   6185    fixup_3src_null_dest();
   6186    allocate_registers(true);
   6187 
   6188    if (failed)
   6189       return false;
   6190 
   6191    return !failed;
   6192 }
   6193 
   6194 /**
   6195  * Return a bitfield where bit n is set if barycentric interpolation mode n
   6196  * (see enum brw_barycentric_mode) is needed by the fragment shader.
   6197  *
   6198  * We examine the load_barycentric intrinsics rather than looking at input
   6199  * variables so that we catch interpolateAtCentroid() messages too, which
   6200  * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
   6201  */
   6202 static unsigned
   6203 brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
   6204                                      const nir_shader *shader)
   6205 {
   6206    unsigned barycentric_interp_modes = 0;
   6207 
   6208    nir_foreach_function(f, shader) {
   6209       if (!f->impl)
   6210          continue;
   6211 
   6212       nir_foreach_block(block, f->impl) {
   6213          nir_foreach_instr(instr, block) {
   6214             if (instr->type != nir_instr_type_intrinsic)
   6215                continue;
   6216 
   6217             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   6218             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
   6219                continue;
   6220 
   6221             /* Ignore WPOS; it doesn't require interpolation. */
   6222             if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
   6223                continue;
   6224 
   6225             intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
   6226             enum glsl_interp_mode interp = (enum glsl_interp_mode)
   6227                nir_intrinsic_interp_mode(intrin);
   6228             nir_intrinsic_op bary_op = intrin->intrinsic;
   6229             enum brw_barycentric_mode bary =
   6230                brw_barycentric_mode(interp, bary_op);
   6231 
   6232             barycentric_interp_modes |= 1 << bary;
   6233 
   6234             if (devinfo->needs_unlit_centroid_workaround &&
   6235                 bary_op == nir_intrinsic_load_barycentric_centroid)
   6236                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
   6237          }
   6238       }
   6239    }
   6240 
   6241    return barycentric_interp_modes;
   6242 }
   6243 
   6244 static void
   6245 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
   6246                         const nir_shader *shader)
   6247 {
   6248    prog_data->flat_inputs = 0;
   6249 
   6250    nir_foreach_variable(var, &shader->inputs) {
   6251       int input_index = prog_data->urb_setup[var->data.location];
   6252 
   6253       if (input_index < 0)
   6254 	 continue;
   6255 
   6256       /* flat shading */
   6257       if (var->data.interpolation == INTERP_MODE_FLAT)
   6258          prog_data->flat_inputs |= (1 << input_index);
   6259    }
   6260 }
   6261 
   6262 static uint8_t
   6263 computed_depth_mode(const nir_shader *shader)
   6264 {
   6265    if (shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
   6266       switch (shader->info->fs.depth_layout) {
   6267       case FRAG_DEPTH_LAYOUT_NONE:
   6268       case FRAG_DEPTH_LAYOUT_ANY:
   6269          return BRW_PSCDEPTH_ON;
   6270       case FRAG_DEPTH_LAYOUT_GREATER:
   6271          return BRW_PSCDEPTH_ON_GE;
   6272       case FRAG_DEPTH_LAYOUT_LESS:
   6273          return BRW_PSCDEPTH_ON_LE;
   6274       case FRAG_DEPTH_LAYOUT_UNCHANGED:
   6275          return BRW_PSCDEPTH_OFF;
   6276       }
   6277    }
   6278    return BRW_PSCDEPTH_OFF;
   6279 }
   6280 
   6281 /**
   6282  * Move load_interpolated_input with simple (payload-based) barycentric modes
   6283  * to the top of the program so we don't emit multiple PLNs for the same input.
   6284  *
   6285  * This works around CSE not being able to handle non-dominating cases
   6286  * such as:
   6287  *
   6288  *    if (...) {
   6289  *       interpolate input
   6290  *    } else {
   6291  *       interpolate the same exact input
   6292  *    }
   6293  *
   6294  * This should be replaced by global value numbering someday.
   6295  */
   6296 void
   6297 move_interpolation_to_top(nir_shader *nir)
   6298 {
   6299    nir_foreach_function(f, nir) {
   6300       if (!f->impl)
   6301          continue;
   6302 
   6303       nir_block *top = nir_start_block(f->impl);
   6304       exec_node *cursor_node = NULL;
   6305 
   6306       nir_foreach_block(block, f->impl) {
   6307          if (block == top)
   6308             continue;
   6309 
   6310          nir_foreach_instr_safe(instr, block) {
   6311             if (instr->type != nir_instr_type_intrinsic)
   6312                continue;
   6313 
   6314             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   6315             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
   6316                continue;
   6317             nir_intrinsic_instr *bary_intrinsic =
   6318                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
   6319             nir_intrinsic_op op = bary_intrinsic->intrinsic;
   6320 
   6321             /* Leave interpolateAtSample/Offset() where they are. */
   6322             if (op == nir_intrinsic_load_barycentric_at_sample ||
   6323                 op == nir_intrinsic_load_barycentric_at_offset)
   6324                continue;
   6325 
   6326             nir_instr *move[3] = {
   6327                &bary_intrinsic->instr,
   6328                intrin->src[1].ssa->parent_instr,
   6329                instr
   6330             };
   6331 
   6332             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
   6333                if (move[i]->block != top) {
   6334                   move[i]->block = top;
   6335                   exec_node_remove(&move[i]->node);
   6336                   if (cursor_node) {
   6337                      exec_node_insert_after(cursor_node, &move[i]->node);
   6338                   } else {
   6339                      exec_list_push_head(&top->instr_list, &move[i]->node);
   6340                   }
   6341                   cursor_node = &move[i]->node;
   6342                }
   6343             }
   6344          }
   6345       }
   6346       nir_metadata_preserve(f->impl, (nir_metadata)
   6347                             ((unsigned) nir_metadata_block_index |
   6348                              (unsigned) nir_metadata_dominance));
   6349    }
   6350 }
   6351 
   6352 /**
   6353  * Demote per-sample barycentric intrinsics to centroid.
   6354  *
   6355  * Useful when rendering to a non-multisampled buffer.
   6356  */
   6357 static void
   6358 demote_sample_qualifiers(nir_shader *nir)
   6359 {
   6360    nir_foreach_function(f, nir) {
   6361       if (!f->impl)
   6362          continue;
   6363 
   6364       nir_builder b;
   6365       nir_builder_init(&b, f->impl);
   6366 
   6367       nir_foreach_block(block, f->impl) {
   6368          nir_foreach_instr_safe(instr, block) {
   6369             if (instr->type != nir_instr_type_intrinsic)
   6370                continue;
   6371 
   6372             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   6373             if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
   6374                 intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
   6375                continue;
   6376 
   6377             b.cursor = nir_before_instr(instr);
   6378             nir_ssa_def *centroid =
   6379                nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
   6380                                     nir_intrinsic_interp_mode(intrin));
   6381             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
   6382                                      nir_src_for_ssa(centroid));
   6383             nir_instr_remove(instr);
   6384          }
   6385       }
   6386 
   6387       nir_metadata_preserve(f->impl, (nir_metadata)
   6388                             ((unsigned) nir_metadata_block_index |
   6389                              (unsigned) nir_metadata_dominance));
   6390    }
   6391 }
   6392 
   6393 const unsigned *
   6394 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
   6395                void *mem_ctx,
   6396                const struct brw_wm_prog_key *key,
   6397                struct brw_wm_prog_data *prog_data,
   6398                const nir_shader *src_shader,
   6399                struct gl_program *prog,
   6400                int shader_time_index8, int shader_time_index16,
   6401                bool allow_spilling,
   6402                bool use_rep_send, struct brw_vue_map *vue_map,
   6403                unsigned *final_assembly_size,
   6404                char **error_str)
   6405 {
   6406    const struct gen_device_info *devinfo = compiler->devinfo;
   6407 
   6408    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
   6409    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
   6410    brw_nir_lower_fs_inputs(shader, devinfo, key);
   6411    brw_nir_lower_fs_outputs(shader);
   6412 
   6413    if (devinfo->gen < 6) {
   6414       brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
   6415    }
   6416 
   6417    if (!key->multisample_fbo)
   6418       NIR_PASS_V(shader, demote_sample_qualifiers);
   6419    NIR_PASS_V(shader, move_interpolation_to_top);
   6420    shader = brw_postprocess_nir(shader, compiler, true);
   6421 
   6422    /* key->alpha_test_func means simulating alpha testing via discards,
   6423     * so the shader definitely kills pixels.
   6424     */
   6425    prog_data->uses_kill = shader->info->fs.uses_discard ||
   6426       key->alpha_test_func;
   6427    prog_data->uses_omask = key->multisample_fbo &&
   6428       shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   6429    prog_data->computed_depth_mode = computed_depth_mode(shader);
   6430    prog_data->computed_stencil =
   6431       shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
   6432 
   6433    prog_data->persample_dispatch =
   6434       key->multisample_fbo &&
   6435       (key->persample_interp ||
   6436        (shader->info->system_values_read & (SYSTEM_BIT_SAMPLE_ID |
   6437                                             SYSTEM_BIT_SAMPLE_POS)) ||
   6438        shader->info->fs.uses_sample_qualifier ||
   6439        shader->info->outputs_read);
   6440 
   6441    prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
   6442    prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
   6443    prog_data->inner_coverage = shader->info->fs.inner_coverage;
   6444 
   6445    prog_data->barycentric_interp_modes =
   6446       brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
   6447 
   6448    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
   6449    uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
   6450    unsigned simd8_grf_used = 0, simd16_grf_used = 0;
   6451 
   6452    fs_visitor v8(compiler, log_data, mem_ctx, key,
   6453                  &prog_data->base, prog, shader, 8,
   6454                  shader_time_index8);
   6455    if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
   6456       if (error_str)
   6457          *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
   6458 
   6459       return NULL;
   6460    } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
   6461       simd8_cfg = v8.cfg;
   6462       simd8_grf_start = v8.payload.num_regs;
   6463       simd8_grf_used = v8.grf_used;
   6464    }
   6465 
   6466    if (v8.max_dispatch_width >= 16 &&
   6467        likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
   6468       /* Try a SIMD16 compile */
   6469       fs_visitor v16(compiler, log_data, mem_ctx, key,
   6470                      &prog_data->base, prog, shader, 16,
   6471                      shader_time_index16);
   6472       v16.import_uniforms(&v8);
   6473       if (!v16.run_fs(allow_spilling, use_rep_send)) {
   6474          compiler->shader_perf_log(log_data,
   6475                                    "SIMD16 shader failed to compile: %s",
   6476                                    v16.fail_msg);
   6477       } else {
   6478          simd16_cfg = v16.cfg;
   6479          simd16_grf_start = v16.payload.num_regs;
   6480          simd16_grf_used = v16.grf_used;
   6481       }
   6482    }
   6483 
   6484    /* When the caller requests a repclear shader, they want SIMD16-only */
   6485    if (use_rep_send)
   6486       simd8_cfg = NULL;
   6487 
   6488    /* Prior to Iron Lake, the PS had a single shader offset with a jump table
   6489     * at the top to select the shader.  We've never implemented that.
   6490     * Instead, we just give them exactly one shader and we pick the widest one
   6491     * available.
   6492     */
   6493    if (compiler->devinfo->gen < 5 && simd16_cfg)
   6494       simd8_cfg = NULL;
   6495 
   6496    if (prog_data->persample_dispatch) {
   6497       /* Starting with SandyBridge (where we first get MSAA), the different
   6498        * pixel dispatch combinations are grouped into classifications A
   6499        * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
   6500        * generations, the only configurations supporting persample dispatch
    6501        * are those in which only one dispatch width is enabled.
   6502        *
   6503        * If computed depth is enabled, SNB only allows SIMD8 while IVB+
   6504        * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
   6505        */
   6506       if (compiler->devinfo->gen == 6 &&
   6507           prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
   6508          simd16_cfg = NULL;
   6509       } else if (simd16_cfg) {
   6510          simd8_cfg = NULL;
   6511       }
   6512    }
   6513 
   6514    /* We have to compute the flat inputs after the visitor is finished running
   6515     * because it relies on prog_data->urb_setup which is computed in
   6516     * fs_visitor::calculate_urb_setup().
   6517     */
   6518    brw_compute_flat_inputs(prog_data, shader);
   6519 
   6520    fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
   6521                   v8.promoted_constants, v8.runtime_check_aads_emit,
   6522                   MESA_SHADER_FRAGMENT);
   6523 
   6524    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
   6525       g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
   6526                                      shader->info->label ?
   6527                                         shader->info->label : "unnamed",
   6528                                      shader->info->name));
   6529    }
   6530 
   6531    if (simd8_cfg) {
   6532       prog_data->dispatch_8 = true;
   6533       g.generate_code(simd8_cfg, 8);
   6534       prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
   6535       prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
   6536 
   6537       if (simd16_cfg) {
   6538          prog_data->dispatch_16 = true;
   6539          prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
   6540          prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
   6541          prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
   6542       }
   6543    } else if (simd16_cfg) {
   6544       prog_data->dispatch_16 = true;
   6545       g.generate_code(simd16_cfg, 16);
   6546       prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
   6547       prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
   6548    }
   6549 
   6550    return g.get_assembly(final_assembly_size);
   6551 }
   6552 
   6553 fs_reg *
   6554 fs_visitor::emit_cs_work_group_id_setup()
   6555 {
   6556    assert(stage == MESA_SHADER_COMPUTE);
   6557 
   6558    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
   6559 
   6560    struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   6561    struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
   6562    struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
   6563 
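            /* The compute thread payload delivers the workgroup ID in r0: r0.1 holds
             * gl_WorkGroupID.x and r0.6/r0.7 hold .y/.z, hence the three MOVs below.
             */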
   6564    bld.MOV(*reg, r0_1);
   6565    bld.MOV(offset(*reg, bld, 1), r0_6);
   6566    bld.MOV(offset(*reg, bld, 2), r0_7);
   6567 
   6568    return reg;
   6569 }
   6570 
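         /* Push constant blocks are sized in whole registers of 8 dwords (32 bytes):
          * e.g. 20 dwords round up to 3 registers, i.e. 96 bytes.
          */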
   6571 static void
   6572 fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
   6573 {
   6574    block->dwords = dwords;
   6575    block->regs = DIV_ROUND_UP(dwords, 8);
   6576    block->size = block->regs * 32;
   6577 }
   6578 
   6579 static void
   6580 cs_fill_push_const_info(const struct gen_device_info *devinfo,
   6581                         struct brw_cs_prog_data *cs_prog_data)
   6582 {
   6583    const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   6584    bool fill_thread_id =
   6585       cs_prog_data->thread_local_id_index >= 0 &&
   6586       cs_prog_data->thread_local_id_index < (int)prog_data->nr_params;
   6587    bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;
   6588 
   6589    /* The thread ID should be stored in the last param dword */
   6590    assert(prog_data->nr_params > 0 || !fill_thread_id);
   6591    assert(!fill_thread_id ||
   6592           cs_prog_data->thread_local_id_index ==
   6593              (int)prog_data->nr_params - 1);
   6594 
   6595    unsigned cross_thread_dwords, per_thread_dwords;
   6596    if (!cross_thread_supported) {
   6597       cross_thread_dwords = 0u;
   6598       per_thread_dwords = prog_data->nr_params;
   6599    } else if (fill_thread_id) {
   6600       /* Fill all but the last register with cross-thread payload */
   6601       cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8);
   6602       per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
   6603       assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
   6604    } else {
   6605       /* Fill all data using cross-thread payload */
   6606       cross_thread_dwords = prog_data->nr_params;
   6607       per_thread_dwords = 0u;
   6608    }
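            /* Informal example: with nr_params == 10 and thread_local_id_index == 9
             * on a cross-thread capable part, cross_thread_dwords = 8 * (9 / 8) = 8
             * and per_thread_dwords = 2, so only the last register is per-thread.
             */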
   6609 
   6610    fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
   6611    fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
   6612 
   6613    unsigned total_dwords =
   6614       (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
   6615        cs_prog_data->push.cross_thread.size) / 4;
   6616    fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
   6617 
   6618    assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
   6619           cs_prog_data->push.per_thread.size == 0);
   6620    assert(cs_prog_data->push.cross_thread.dwords +
   6621           cs_prog_data->push.per_thread.dwords ==
   6622              prog_data->nr_params);
   6623 }
   6624 
   6625 static void
   6626 cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
   6627 {
   6628    cs_prog_data->simd_size = size;
   6629    unsigned group_size = cs_prog_data->local_size[0] *
   6630       cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
   6631    cs_prog_data->threads = (group_size + size - 1) / size;
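            /* Round up: e.g. an 8x8x1 workgroup (64 invocations) at SIMD16 needs
             * (64 + 16 - 1) / 16 = 4 threads.
             */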
   6632 }
   6633 
   6634 const unsigned *
   6635 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
   6636                void *mem_ctx,
   6637                const struct brw_cs_prog_key *key,
   6638                struct brw_cs_prog_data *prog_data,
   6639                const nir_shader *src_shader,
   6640                int shader_time_index,
   6641                unsigned *final_assembly_size,
   6642                char **error_str)
   6643 {
   6644    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
   6645    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
   6646    brw_nir_lower_cs_shared(shader);
   6647    prog_data->base.total_shared += shader->num_shared;
   6648 
   6649    /* Now that we cloned the nir_shader, we can update num_uniforms based on
   6650     * the thread_local_id_index.
   6651     */
   6652    assert(prog_data->thread_local_id_index >= 0);
   6653    shader->num_uniforms =
   6654       MAX2(shader->num_uniforms,
   6655            (unsigned)4 * (prog_data->thread_local_id_index + 1));
   6656 
   6657    brw_nir_lower_intrinsics(shader, &prog_data->base);
   6658    shader = brw_postprocess_nir(shader, compiler, true);
   6659 
   6660    prog_data->local_size[0] = shader->info->cs.local_size[0];
   6661    prog_data->local_size[1] = shader->info->cs.local_size[1];
   6662    prog_data->local_size[2] = shader->info->cs.local_size[2];
   6663    unsigned local_workgroup_size =
   6664       shader->info->cs.local_size[0] * shader->info->cs.local_size[1] *
   6665       shader->info->cs.local_size[2];
   6666 
   6667    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
   6668    unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
   6669 
   6670    cfg_t *cfg = NULL;
   6671    const char *fail_msg = NULL;
   6672 
   6673    /* Now the main event: Visit the shader IR and generate our CS IR for it.
   6674     */
   6675    fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
   6676                  NULL, /* Never used in core profile */
   6677                  shader, 8, shader_time_index);
   6678    if (simd_required <= 8) {
   6679       if (!v8.run_cs()) {
   6680          fail_msg = v8.fail_msg;
   6681       } else {
   6682          cfg = v8.cfg;
   6683          cs_set_simd_size(prog_data, 8);
   6684          cs_fill_push_const_info(compiler->devinfo, prog_data);
   6685          prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
   6686       }
   6687    }
   6688 
   6689    fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
   6690                  NULL, /* Never used in core profile */
   6691                  shader, 16, shader_time_index);
   6692    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
   6693        !fail_msg && v8.max_dispatch_width >= 16 &&
   6694        simd_required <= 16) {
   6695       /* Try a SIMD16 compile */
   6696       if (simd_required <= 8)
   6697          v16.import_uniforms(&v8);
   6698       if (!v16.run_cs()) {
   6699          compiler->shader_perf_log(log_data,
   6700                                    "SIMD16 shader failed to compile: %s",
   6701                                    v16.fail_msg);
   6702          if (!cfg) {
   6703             fail_msg =
   6704                "Couldn't generate SIMD16 program and not "
   6705                "enough threads for SIMD8";
   6706          }
   6707       } else {
   6708          cfg = v16.cfg;
   6709          cs_set_simd_size(prog_data, 16);
   6710          cs_fill_push_const_info(compiler->devinfo, prog_data);
   6711          prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
   6712       }
   6713    }
   6714 
   6715    fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base,
   6716                  NULL, /* Never used in core profile */
   6717                  shader, 32, shader_time_index);
   6718    if (!fail_msg && v8.max_dispatch_width >= 32 &&
   6719        (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
   6720       /* Try a SIMD32 compile */
   6721       if (simd_required <= 8)
   6722          v32.import_uniforms(&v8);
   6723       else if (simd_required <= 16)
   6724          v32.import_uniforms(&v16);
   6725 
   6726       if (!v32.run_cs()) {
   6727          compiler->shader_perf_log(log_data,
   6728                                    "SIMD32 shader failed to compile: %s",
    6729                                    v32.fail_msg);
   6730          if (!cfg) {
   6731             fail_msg =
   6732                "Couldn't generate SIMD32 program and not "
   6733                "enough threads for SIMD16";
   6734          }
   6735       } else {
   6736          cfg = v32.cfg;
   6737          cs_set_simd_size(prog_data, 32);
   6738          cs_fill_push_const_info(compiler->devinfo, prog_data);
   6739       }
   6740    }
   6741 
   6742    if (unlikely(cfg == NULL)) {
   6743       assert(fail_msg);
   6744       if (error_str)
   6745          *error_str = ralloc_strdup(mem_ctx, fail_msg);
   6746 
   6747       return NULL;
   6748    }
   6749 
   6750    fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
   6751                   v8.promoted_constants, v8.runtime_check_aads_emit,
   6752                   MESA_SHADER_COMPUTE);
   6753    if (INTEL_DEBUG & DEBUG_CS) {
   6754       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
   6755                                    shader->info->label ? shader->info->label :
   6756                                                         "unnamed",
   6757                                    shader->info->name);
   6758       g.enable_debug(name);
   6759    }
   6760 
   6761    g.generate_code(cfg, prog_data->simd_size);
   6762 
   6763    return g.get_assembly(final_assembly_size);
   6764 }
   6765 
   6766 /**
   6767  * Test the dispatch mask packing assumptions of
   6768  * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
   6769  * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
   6770  * executed with an unexpected dispatch mask.
   6771  */
   6772 static UNUSED void
   6773 brw_fs_test_dispatch_packing(const fs_builder &bld)
   6774 {
   6775    const gl_shader_stage stage = bld.shader->stage;
   6776 
   6777    if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
   6778                                      bld.shader->stage_prog_data)) {
   6779       const fs_builder ubld = bld.exec_all().group(1, 0);
   6780       const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
   6781       const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
   6782                            brw_dmask_reg());
   6783 
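               /* tmp = mask & (mask + 1), which is zero exactly when the mask has the
                * expected packed form 2^n - 1 (all enabled channels in the low bits).
                */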
   6784       ubld.ADD(tmp, mask, brw_imm_ud(1));
   6785       ubld.AND(tmp, mask, tmp);
   6786 
   6787       /* This will loop forever if the dispatch mask doesn't have the expected
   6788        * form '2^n-1', in which case tmp will be non-zero.
   6789        */
   6790       bld.emit(BRW_OPCODE_DO);
   6791       bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
   6792       set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   6793    }
   6794 }
   6795