/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_program.h"

static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

static struct brw_reg
brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         const unsigned width = MIN2(reg_width, phys_width);
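         /* For example (assuming the usual 32-byte GRF): a 32-bit float
          * source with stride 2 gives reg_width = 32 / (2 * 4) = 4.  In an
          * uncompressed SIMD8 instruction phys_width is 8, so width becomes
          * MIN2(4, 8) = 4 and the region built below is <8;4,2>: each row of
          * four elements fills exactly one GRF and the next row starts in
          * the following GRF.
          */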
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->group)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is 2 regs; g0 and g1 are its contents.  g0 is handled by
    * the implied move, here we set up g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Set computes stencil to render target */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                               struct brw_reg payload)
{
   assert(inst->size_written % REG_SIZE == 0);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   const unsigned surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   gen9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size == 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
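      /* With the usual little-endian DWord layout, viewing the UD offsets
       * as UW with a horizontal stride of 2 reads only the low 16 bits of
       * each DWord, which is sufficient because a byte offset into the
       * register file always fits in 16 bits.
       */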

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless.  We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset.  Instead, for the sake of consistency, we'll just do the
       * add ourselves.  This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work.  It's just not worth it.
       */
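      /* As an illustration of the restriction quoted above: an address
       * register pointing at byte 30 of a GRF plus an AddressImmediate of 4
       * sums to 34 in the low 5 bits; the carry into the register address is
       * dropped, so the access would stay in the same GRF rather than move
       * to the next one.  Doing the ADD below sidesteps that entirely.
       */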
      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
      struct brw_reg ind_src = brw_VxH_indirect(0, 0);

      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

      if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
          !inst->get_next()->is_tail_sentinel() &&
          ((fs_inst *)inst->get_next())->mlen > 0) {
         /* From the Sandybridge PRM:
          *
          *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
          *    instruction that indexed/indirect source AND is followed by a
          *    send, the instruction requires a Switch. This is to avoid
          *    race condition where send may dispatch before MRF is updated."
          */
         brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
      }
   }
}

void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];

   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
         if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered in NIR */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch width
          * from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         } else if (stage != MESA_SHADER_VERTEX &&
                    stage != MESA_SHADER_FRAGMENT) {
            /* The vertex and fragment stages have g0.2 set to 0, so
             * header0.2 is 0 when g0 is copied. Other stages may not, so we
             * must set it to 0 to avoid setting undesirable bits in the
             * message.
             */
            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->size_written / REG_SIZE,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
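      /* The sampler message descriptor keeps the binding table index in
       * bits 7:0 and the sampler state index in bits 11:8, so a shared
       * index can be placed in both fields with a single multiply by 0x101
       * (0x101 * i == (i << 8) | i for i < 256); otherwise the two indices
       * are shifted and ORed together below.
       */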
      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->size_written / REG_SIZE,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                     DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

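   /* To make the regioning concrete: the fine case yields <2;2,0> regions,
    * so for the first pair of pixels src0 reads (ss0.tr, ss0.tr) and src1
    * reads (ss0.tl, ss0.tl), producing tr - tl replicated across the pair;
    * the coarse case yields <4;4,0>, replicating the top-left derivative
    * across the whole subspan.
    */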
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
   gen6_HALT(p);
}

void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   /* The 32-wide messages only respect the first 16-wide half of the channel
    * enable signals which are replicated identically for the second group of
    * 16 channels, so we cannot use them unless the write is marked
    * force_writemask_all.
    */
   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
                               MIN2(16, inst->exec_size);
   const unsigned block_size = 4 * lower_size / REG_SIZE;
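   /* For instance, a SIMD16 write of 32-bit channels has lower_size = 16,
    * so each message moves 4 * 16 = 64 bytes (block_size == 2 GRFs), and a
    * wider instruction is split into multiple such messages by the loop
    * below.
    */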
   1090    assert(inst->mlen != 0);
   1091 
   1092    brw_push_insn_state(p);
   1093    brw_set_default_exec_size(p, cvt(lower_size) - 1);
   1094    brw_set_default_compression(p, lower_size > 8);
   1095 
   1096    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
   1097       brw_set_default_group(p, inst->group + lower_size * i);
   1098 
   1099       brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
   1100               retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
   1101 
   1102       brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
   1103                                     block_size,
   1104                                     inst->offset + block_size * REG_SIZE * i);
   1105    }
   1106 
   1107    brw_pop_insn_state(p);
   1108 }
   1109 
   1110 void
   1111 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
   1112 {
   1113    assert(inst->exec_size <= 16 || inst->force_writemask_all);
   1114    assert(inst->mlen != 0);
   1115 
   1116    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
   1117                                 inst->exec_size / 8, inst->offset);
   1118 }
   1119 
   1120 void
   1121 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
   1122 {
   1123    assert(inst->exec_size <= 16 || inst->force_writemask_all);
   1124 
   1125    gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
   1126 }
   1127 
   1128 void
   1129 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
   1130                                                   struct brw_reg dst,
   1131                                                   struct brw_reg index,
   1132                                                   struct brw_reg offset)
   1133 {
   1134    assert(type_sz(dst.type) == 4);
   1135    assert(inst->mlen != 0);
   1136 
   1137    assert(index.file == BRW_IMMEDIATE_VALUE &&
   1138 	  index.type == BRW_REGISTER_TYPE_UD);
   1139    uint32_t surf_index = index.ud;
   1140 
   1141    assert(offset.file == BRW_IMMEDIATE_VALUE &&
   1142 	  offset.type == BRW_REGISTER_TYPE_UD);
   1143    uint32_t read_offset = offset.ud;
   1144 
   1145    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
   1146 			read_offset, surf_index);
   1147 }
   1148 
   1149 void
   1150 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
   1151                                                        struct brw_reg dst,
   1152                                                        struct brw_reg index,
   1153                                                        struct brw_reg payload)
   1154 {
   1155    assert(index.type == BRW_REGISTER_TYPE_UD);
   1156    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   1157    assert(type_sz(dst.type) == 4);
   1158 
   1159    if (index.file == BRW_IMMEDIATE_VALUE) {
   1160       const uint32_t surf_index = index.ud;
   1161 
   1162       brw_push_insn_state(p);
   1163       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1164       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   1165       brw_pop_insn_state(p);
   1166 
   1167       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   1168       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   1169       brw_set_dp_read_message(p, send, surf_index,
   1170                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
   1171                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
   1172                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
   1173                               1, /* mlen */
   1174                               true, /* header */
   1175                               DIV_ROUND_UP(inst->size_written, REG_SIZE));
   1176 
   1177    } else {
   1178       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
   1179 
   1180       brw_push_insn_state(p);
   1181       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1182 
   1183       /* a0.0 = surf_index & 0xff */
   1184       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
   1185       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
   1186       brw_set_dest(p, insn_and, addr);
   1187       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
   1188       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
   1189 
   1190       /* dst = send(payload, a0.0 | <descriptor>) */
   1191       brw_inst *insn = brw_send_indirect_message(
   1192          p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
   1193          retype(dst, BRW_REGISTER_TYPE_UD),
   1194          retype(payload, BRW_REGISTER_TYPE_UD), addr);
   1195       brw_set_dp_read_message(p, insn, 0 /* surface */,
   1196                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
   1197                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
   1198                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
   1199                               1, /* mlen */
   1200                               true, /* header */
   1201                               DIV_ROUND_UP(inst->size_written, REG_SIZE));
   1202 
   1203       brw_pop_insn_state(p);
   1204    }
   1205 }
   1206 
   1207 void
   1208 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
   1209                                                        struct brw_reg dst,
   1210                                                        struct brw_reg index)
   1211 {
   1212    assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   1213    assert(inst->header_size != 0);
   1214    assert(inst->mlen);
   1215 
   1216    assert(index.file == BRW_IMMEDIATE_VALUE &&
   1217 	  index.type == BRW_REGISTER_TYPE_UD);
   1218    uint32_t surf_index = index.ud;
   1219 
   1220    uint32_t simd_mode, rlen, msg_type;
   1221    if (inst->exec_size == 16) {
   1222       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   1223       rlen = 8;
   1224    } else {
   1225       assert(inst->exec_size == 8);
   1226       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   1227       rlen = 4;
   1228    }
   1229 
   1230    if (devinfo->gen >= 5)
   1231       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   1232    else {
   1233       /* We always use the SIMD16 message so that we only have to load U, and
   1234        * not V or R.
   1235        */
   1236       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
   1237       assert(inst->mlen == 3);
   1238       assert(inst->size_written == 8 * REG_SIZE);
   1239       rlen = 8;
   1240       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   1241    }
   1242 
   1243    struct brw_reg header = brw_vec8_grf(0, 0);
   1244    gen6_resolve_implied_move(p, &header, inst->base_mrf);
   1245 
   1246    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   1247    brw_inst_set_compression(devinfo, send, false);
   1248    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   1249    brw_set_src0(p, send, header);
   1250    if (devinfo->gen < 6)
   1251       brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
   1252 
   1253    /* Our surface is set up as floats, regardless of what actual data is
   1254     * stored in it.
   1255     */
   1256    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   1257    brw_set_sampler_message(p, send,
   1258                            surf_index,
   1259                            0, /* sampler (unused) */
   1260                            msg_type,
   1261                            rlen,
   1262                            inst->mlen,
   1263                            inst->header_size != 0,
   1264                            simd_mode,
   1265                            return_format);
   1266 }
   1267 
   1268 void
   1269 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
   1270                                                        struct brw_reg dst,
   1271                                                        struct brw_reg index,
   1272                                                        struct brw_reg offset)
   1273 {
   1274    assert(devinfo->gen >= 7);
   1275    /* Varying-offset pull constant loads are treated as a normal expression on
   1276     * gen7, so the fact that it's a send message is hidden at the IR level.
   1277     */
   1278    assert(inst->header_size == 0);
   1279    assert(!inst->mlen);
   1280    assert(index.type == BRW_REGISTER_TYPE_UD);
   1281 
   1282    uint32_t simd_mode, rlen, mlen;
   1283    if (inst->exec_size == 16) {
   1284       mlen = 2;
   1285       rlen = 8;
   1286       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   1287    } else {
   1288       assert(inst->exec_size == 8);
   1289       mlen = 1;
   1290       rlen = 4;
   1291       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   1292    }
   1293 
   1294    if (index.file == BRW_IMMEDIATE_VALUE) {
   1295 
   1296       uint32_t surf_index = index.ud;
   1297 
   1298       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   1299       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   1300       brw_set_src0(p, send, offset);
   1301       brw_set_sampler_message(p, send,
   1302                               surf_index,
   1303                               0, /* LD message ignores sampler unit */
   1304                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
   1305                               rlen,
   1306                               mlen,
   1307                               false, /* no header */
   1308                               simd_mode,
   1309                               0);
   1310 
   1311    } else {
   1312 
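               /* Non-constant surface index: build the message descriptor in
                * the address register instead.  The binding table index lives
                * in the low eight bits of the descriptor, hence the AND with
                * 0xff below.
                */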
   1313       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
   1314 
   1315       brw_push_insn_state(p);
   1316       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1317 
   1318       /* a0.0 = surf_index & 0xff */
   1319       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
   1320       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
   1321       brw_set_dest(p, insn_and, addr);
   1322       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
   1323       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
   1324 
   1325       brw_pop_insn_state(p);
   1326 
   1327       /* dst = send(offset, a0.0 | <descriptor>) */
   1328       brw_inst *insn = brw_send_indirect_message(
   1329          p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
   1330          offset, addr);
   1331       brw_set_sampler_message(p, insn,
   1332                               0 /* surface */,
   1333                               0 /* sampler */,
   1334                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
   1335                               rlen /* rlen */,
   1336                               mlen /* mlen */,
   1337                               false /* header */,
   1338                               simd_mode,
   1339                               0);
   1340    }
   1341 }
   1342 
    1343 /**
    1344  * Transfer the current pixel/sample dispatch mask into the flag register
    1345  * selected by inst->flag_subreg.
    1346  *
    1347  * On Gen6+ the mask comes from R1.7 bits 15:0; earlier gens read it from g0.
    1348  */
   1349 void
   1350 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
   1351 {
   1352    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   1353    struct brw_reg dispatch_mask;
   1354 
   1355    if (devinfo->gen >= 6)
   1356       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   1357    else
   1358       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   1359 
   1360    brw_push_insn_state(p);
   1361    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1362    brw_MOV(p, flags, dispatch_mask);
   1363    brw_pop_insn_state(p);
   1364 }
   1365 
   1366 void
   1367 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
   1368                                                 struct brw_reg dst,
   1369                                                 struct brw_reg src,
   1370                                                 struct brw_reg msg_data,
   1371                                                 unsigned msg_type)
   1372 {
   1373    assert(inst->size_written % REG_SIZE == 0);
   1374    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
   1375 
   1376    brw_pixel_interpolator_query(p,
   1377          retype(dst, BRW_REGISTER_TYPE_UW),
   1378          src,
   1379          inst->pi_noperspective,
   1380          msg_type,
   1381          msg_data,
   1382          inst->mlen,
   1383          inst->size_written / REG_SIZE);
   1384 }
   1385 
   1386 /* Sets vstride=1, width=4, hstride=0 of register src1 during
   1387  * the ADD instruction.
   1388  */
   1389 void
   1390 fs_generator::generate_set_sample_id(fs_inst *inst,
   1391                                      struct brw_reg dst,
   1392                                      struct brw_reg src0,
   1393                                      struct brw_reg src1)
   1394 {
   1395    assert(dst.type == BRW_REGISTER_TYPE_D ||
   1396           dst.type == BRW_REGISTER_TYPE_UD);
   1397    assert(src0.type == BRW_REGISTER_TYPE_D ||
   1398           src0.type == BRW_REGISTER_TYPE_UD);
   1399 
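            /* With <1;4,0> regioning each group of four channels (one 2x2
             * subspan) replicates a single element of src1 and successive
             * groups step to the next element, so the second SIMD8 half below
             * continues at element 2 via suboffset().
             */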
   1400    struct brw_reg reg = stride(src1, 1, 4, 0);
   1401    if (devinfo->gen >= 8 || inst->exec_size == 8) {
   1402       brw_ADD(p, dst, src0, reg);
   1403    } else if (inst->exec_size == 16) {
   1404       brw_push_insn_state(p);
   1405       brw_set_default_exec_size(p, BRW_EXECUTE_8);
   1406       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   1407       brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
   1408       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
   1409       brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
   1410       brw_pop_insn_state(p);
   1411    }
   1412 }
   1413 
   1414 void
   1415 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
   1416                                             struct brw_reg dst,
   1417                                             struct brw_reg x,
   1418                                             struct brw_reg y)
   1419 {
   1420    assert(devinfo->gen >= 7);
   1421    assert(dst.type == BRW_REGISTER_TYPE_UD);
   1422    assert(x.type == BRW_REGISTER_TYPE_F);
   1423    assert(y.type == BRW_REGISTER_TYPE_F);
   1424 
   1425    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
   1426     *
   1427     *   Because this instruction does not have a 16-bit floating-point type,
   1428     *   the destination data type must be Word (W).
   1429     *
   1430     *   The destination must be DWord-aligned and specify a horizontal stride
   1431     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
   1432     *   each destination channel and the upper word is not modified.
   1433     */
   1434    struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   1435 
   1436    /* Give each 32-bit channel of dst the form below, where "." means
   1437     * unchanged.
   1438     *   0x....hhhh
   1439     */
   1440    brw_F32TO16(p, dst_w, y);
   1441 
   1442    /* Now the form:
   1443     *   0xhhhh0000
   1444     */
   1445    brw_SHL(p, dst, dst, brw_imm_ud(16u));
   1446 
   1447    /* And, finally the form of packHalf2x16's output:
   1448     *   0xhhhhllll
   1449     */
   1450    brw_F32TO16(p, dst_w, x);
   1451 }
   1452 
   1453 void
   1454 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
   1455                                               struct brw_reg dst,
   1456                                               struct brw_reg src)
   1457 {
   1458    assert(devinfo->gen >= 7);
   1459    assert(dst.type == BRW_REGISTER_TYPE_F);
   1460    assert(src.type == BRW_REGISTER_TYPE_UD);
   1461 
   1462    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
   1463     *
   1464     *   Because this instruction does not have a 16-bit floating-point type,
   1465     *   the source data type must be Word (W). The destination type must be
   1466     *   F (Float).
   1467     */
   1468    struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
   1469 
   1470    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
   1471     * For the Y case, we wish to access only the upper word; therefore
   1472     * a 16-bit subregister offset is needed.
   1473     */
   1474    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
   1475           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   1476    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
   1477       src_w.subnr += 2;
   1478 
   1479    brw_F16TO32(p, dst, src_w);
   1480 }
   1481 
   1482 void
   1483 fs_generator::generate_shader_time_add(fs_inst *inst,
   1484                                        struct brw_reg payload,
   1485                                        struct brw_reg offset,
   1486                                        struct brw_reg value)
   1487 {
   1488    assert(devinfo->gen >= 7);
   1489    brw_push_insn_state(p);
    1490    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   1491 
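            /* The message payload is two GRFs: the first carries the
             * shader_time buffer offset and the second the value to add.
             */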
   1492    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   1493    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
   1494                                           offset.type);
   1495    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
   1496                                          value.type);
   1497 
   1498    assert(offset.file == BRW_IMMEDIATE_VALUE);
   1499    if (value.file == BRW_GENERAL_REGISTER_FILE) {
   1500       value.width = BRW_WIDTH_1;
   1501       value.hstride = BRW_HORIZONTAL_STRIDE_0;
   1502       value.vstride = BRW_VERTICAL_STRIDE_0;
   1503    } else {
   1504       assert(value.file == BRW_IMMEDIATE_VALUE);
   1505    }
   1506 
   1507    /* Trying to deal with setup of the params from the IR is crazy in the FS8
   1508     * case, and we don't really care about squeezing every bit of performance
   1509     * out of this path, so we just emit the MOVs from here.
   1510     */
   1511    brw_MOV(p, payload_offset, offset);
   1512    brw_MOV(p, payload_value, value);
   1513    brw_shader_time_add(p, payload,
   1514                        prog_data->binding_table.shader_time_start);
   1515    brw_pop_insn_state(p);
   1516 
   1517    brw_mark_surface_used(prog_data,
   1518                          prog_data->binding_table.shader_time_start);
   1519 }
   1520 
   1521 void
   1522 fs_generator::enable_debug(const char *shader_name)
   1523 {
   1524    debug_flag = true;
   1525    this->shader_name = shader_name;
   1526 }
   1527 
   1528 int
   1529 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
   1530 {
    1531    /* Align to a 64-byte boundary. */
   1532    while (p->next_insn_offset % 64)
   1533       brw_NOP(p);
   1534 
   1535    this->dispatch_width = dispatch_width;
   1536 
   1537    int start_offset = p->next_insn_offset;
   1538    int spill_count = 0, fill_count = 0;
   1539    int loop_count = 0;
   1540 
   1541    struct annotation_info annotation;
   1542    memset(&annotation, 0, sizeof(annotation));
   1543 
   1544    foreach_block_and_inst (block, fs_inst, inst, cfg) {
   1545       struct brw_reg src[3], dst;
   1546       unsigned int last_insn_offset = p->next_insn_offset;
   1547       bool multiple_instructions_emitted = false;
   1548 
   1549       /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
   1550        * "Register Region Restrictions" section: for BDW, SKL:
   1551        *
   1552        *    "A POW/FDIV operation must not be followed by an instruction
   1553        *     that requires two destination registers."
   1554        *
   1555        * The documentation is often lacking annotations for Atom parts,
   1556        * and empirically this affects CHV as well.
   1557        */
   1558       if (devinfo->gen >= 8 &&
   1559           p->nr_insn > 1 &&
   1560           brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
   1561           brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
   1562           inst->dst.component_size(inst->exec_size) > REG_SIZE) {
   1563          brw_NOP(p);
   1564          last_insn_offset = p->next_insn_offset;
   1565       }
   1566 
   1567       if (unlikely(debug_flag))
   1568          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
   1569 
   1570       /* If the instruction writes to more than one register, it needs to be
   1571        * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
   1572        * hardware figures out by itself what the right compression mode is,
   1573        * but we still need to know whether the instruction is compressed to
   1574        * set up the source register regions appropriately.
   1575        *
   1576        * XXX - This is wrong for instructions that write a single register but
   1577        *       read more than one which should strictly speaking be treated as
   1578        *       compressed.  For instructions that don't write any registers it
   1579        *       relies on the destination being a null register of the correct
   1580        *       type and regioning so the instruction is considered compressed
   1581        *       or not accordingly.
   1582        */
   1583       const bool compressed =
   1584            inst->dst.component_size(inst->exec_size) > REG_SIZE;
   1585       brw_set_default_compression(p, compressed);
   1586       brw_set_default_group(p, inst->group);
   1587 
   1588       for (unsigned int i = 0; i < inst->sources; i++) {
   1589          src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
   1590                                       compressed);
   1591 
   1592 	 /* The accumulator result appears to get used for the
   1593 	  * conditional modifier generation.  When negating a UD
   1594 	  * value, there is a 33rd bit generated for the sign in the
   1595 	  * accumulator value, so now you can't check, for example,
   1596 	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
   1597 	  */
   1598 	 assert(!inst->conditional_mod ||
   1599 		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
   1600 		!inst->src[i].negate);
   1601       }
   1602       dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);
   1603 
   1604       brw_set_default_access_mode(p, BRW_ALIGN_1);
   1605       brw_set_default_predicate_control(p, inst->predicate);
   1606       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
   1607       brw_set_default_flag_reg(p, 0, inst->flag_subreg);
   1608       brw_set_default_saturate(p, inst->saturate);
   1609       brw_set_default_mask_control(p, inst->force_writemask_all);
   1610       brw_set_default_acc_write_control(p, inst->writes_accumulator);
   1611       brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
   1612 
   1613       assert(inst->force_writemask_all || inst->exec_size >= 4);
   1614       assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
   1615       assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
   1616       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
   1617 
   1618       switch (inst->opcode) {
   1619       case BRW_OPCODE_MOV:
   1620 	 brw_MOV(p, dst, src[0]);
   1621 	 break;
   1622       case BRW_OPCODE_ADD:
   1623 	 brw_ADD(p, dst, src[0], src[1]);
   1624 	 break;
   1625       case BRW_OPCODE_MUL:
   1626 	 brw_MUL(p, dst, src[0], src[1]);
   1627 	 break;
   1628       case BRW_OPCODE_AVG:
   1629 	 brw_AVG(p, dst, src[0], src[1]);
   1630 	 break;
   1631       case BRW_OPCODE_MACH:
   1632 	 brw_MACH(p, dst, src[0], src[1]);
   1633 	 break;
   1634 
   1635       case BRW_OPCODE_LINE:
   1636          brw_LINE(p, dst, src[0], src[1]);
   1637          break;
   1638 
   1639       case BRW_OPCODE_MAD:
   1640          assert(devinfo->gen >= 6);
   1641 	 brw_set_default_access_mode(p, BRW_ALIGN_16);
   1642          brw_MAD(p, dst, src[0], src[1], src[2]);
   1643 	 break;
   1644 
   1645       case BRW_OPCODE_LRP:
   1646          assert(devinfo->gen >= 6);
   1647 	 brw_set_default_access_mode(p, BRW_ALIGN_16);
   1648          brw_LRP(p, dst, src[0], src[1], src[2]);
   1649 	 break;
   1650 
   1651       case BRW_OPCODE_FRC:
   1652 	 brw_FRC(p, dst, src[0]);
   1653 	 break;
   1654       case BRW_OPCODE_RNDD:
   1655 	 brw_RNDD(p, dst, src[0]);
   1656 	 break;
   1657       case BRW_OPCODE_RNDE:
   1658 	 brw_RNDE(p, dst, src[0]);
   1659 	 break;
   1660       case BRW_OPCODE_RNDZ:
   1661 	 brw_RNDZ(p, dst, src[0]);
   1662 	 break;
   1663 
   1664       case BRW_OPCODE_AND:
   1665 	 brw_AND(p, dst, src[0], src[1]);
   1666 	 break;
   1667       case BRW_OPCODE_OR:
   1668 	 brw_OR(p, dst, src[0], src[1]);
   1669 	 break;
   1670       case BRW_OPCODE_XOR:
   1671 	 brw_XOR(p, dst, src[0], src[1]);
   1672 	 break;
   1673       case BRW_OPCODE_NOT:
   1674 	 brw_NOT(p, dst, src[0]);
   1675 	 break;
   1676       case BRW_OPCODE_ASR:
   1677 	 brw_ASR(p, dst, src[0], src[1]);
   1678 	 break;
   1679       case BRW_OPCODE_SHR:
   1680 	 brw_SHR(p, dst, src[0], src[1]);
   1681 	 break;
   1682       case BRW_OPCODE_SHL:
   1683 	 brw_SHL(p, dst, src[0], src[1]);
   1684 	 break;
   1685       case BRW_OPCODE_F32TO16:
   1686          assert(devinfo->gen >= 7);
   1687          brw_F32TO16(p, dst, src[0]);
   1688          break;
   1689       case BRW_OPCODE_F16TO32:
   1690          assert(devinfo->gen >= 7);
   1691          brw_F16TO32(p, dst, src[0]);
   1692          break;
   1693       case BRW_OPCODE_CMP:
   1694          if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
   1695              dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
   1696             /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
   1697              * implemented in the compiler is not sufficient. Overriding the
   1698              * type when the destination is the null register is necessary but
   1699              * not sufficient by itself.
   1700              */
   1701             assert(dst.nr == BRW_ARF_NULL);
   1702             dst.type = BRW_REGISTER_TYPE_D;
   1703          }
   1704          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
   1705 	 break;
   1706       case BRW_OPCODE_SEL:
   1707 	 brw_SEL(p, dst, src[0], src[1]);
   1708 	 break;
   1709       case BRW_OPCODE_BFREV:
   1710          assert(devinfo->gen >= 7);
   1711          /* BFREV only supports UD type for src and dst. */
   1712          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
   1713                       retype(src[0], BRW_REGISTER_TYPE_UD));
   1714          break;
   1715       case BRW_OPCODE_FBH:
   1716          assert(devinfo->gen >= 7);
   1717          /* FBH only supports UD type for dst. */
   1718          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
   1719          break;
   1720       case BRW_OPCODE_FBL:
   1721          assert(devinfo->gen >= 7);
   1722          /* FBL only supports UD type for dst. */
   1723          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
   1724          break;
   1725       case BRW_OPCODE_LZD:
   1726          brw_LZD(p, dst, src[0]);
   1727          break;
   1728       case BRW_OPCODE_CBIT:
   1729          assert(devinfo->gen >= 7);
   1730          /* CBIT only supports UD type for dst. */
   1731          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
   1732          break;
   1733       case BRW_OPCODE_ADDC:
   1734          assert(devinfo->gen >= 7);
   1735          brw_ADDC(p, dst, src[0], src[1]);
   1736          break;
   1737       case BRW_OPCODE_SUBB:
   1738          assert(devinfo->gen >= 7);
   1739          brw_SUBB(p, dst, src[0], src[1]);
   1740          break;
   1741       case BRW_OPCODE_MAC:
   1742          brw_MAC(p, dst, src[0], src[1]);
   1743          break;
   1744 
   1745       case BRW_OPCODE_BFE:
   1746          assert(devinfo->gen >= 7);
   1747          brw_set_default_access_mode(p, BRW_ALIGN_16);
   1748          brw_BFE(p, dst, src[0], src[1], src[2]);
   1749          break;
   1750 
   1751       case BRW_OPCODE_BFI1:
   1752          assert(devinfo->gen >= 7);
   1753          brw_BFI1(p, dst, src[0], src[1]);
   1754          break;
   1755       case BRW_OPCODE_BFI2:
   1756          assert(devinfo->gen >= 7);
   1757          brw_set_default_access_mode(p, BRW_ALIGN_16);
   1758          brw_BFI2(p, dst, src[0], src[1], src[2]);
   1759          break;
   1760 
   1761       case BRW_OPCODE_IF:
   1762 	 if (inst->src[0].file != BAD_FILE) {
   1763 	    /* The instruction has an embedded compare (only allowed on gen6) */
   1764 	    assert(devinfo->gen == 6);
   1765 	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
   1766 	 } else {
   1767 	    brw_IF(p, brw_inst_exec_size(devinfo, p->current));
   1768 	 }
   1769 	 break;
   1770 
   1771       case BRW_OPCODE_ELSE:
   1772 	 brw_ELSE(p);
   1773 	 break;
   1774       case BRW_OPCODE_ENDIF:
   1775 	 brw_ENDIF(p);
   1776 	 break;
   1777 
   1778       case BRW_OPCODE_DO:
   1779 	 brw_DO(p, brw_inst_exec_size(devinfo, p->current));
   1780 	 break;
   1781 
   1782       case BRW_OPCODE_BREAK:
   1783 	 brw_BREAK(p);
   1784 	 break;
   1785       case BRW_OPCODE_CONTINUE:
   1786          brw_CONT(p);
   1787 	 break;
   1788 
   1789       case BRW_OPCODE_WHILE:
   1790 	 brw_WHILE(p);
   1791          loop_count++;
   1792 	 break;
   1793 
   1794       case SHADER_OPCODE_RCP:
   1795       case SHADER_OPCODE_RSQ:
   1796       case SHADER_OPCODE_SQRT:
   1797       case SHADER_OPCODE_EXP2:
   1798       case SHADER_OPCODE_LOG2:
   1799       case SHADER_OPCODE_SIN:
   1800       case SHADER_OPCODE_COS:
   1801          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
   1802 	 if (devinfo->gen >= 6) {
   1803             assert(inst->mlen == 0);
   1804             assert(devinfo->gen >= 7 || inst->exec_size == 8);
   1805             gen6_math(p, dst, brw_math_function(inst->opcode),
   1806                       src[0], brw_null_reg());
   1807 	 } else {
   1808             assert(inst->mlen >= 1);
   1809             assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
   1810             gen4_math(p, dst,
   1811                       brw_math_function(inst->opcode),
   1812                       inst->base_mrf, src[0],
   1813                       BRW_MATH_PRECISION_FULL);
   1814 	 }
   1815 	 break;
   1816       case SHADER_OPCODE_INT_QUOTIENT:
   1817       case SHADER_OPCODE_INT_REMAINDER:
   1818       case SHADER_OPCODE_POW:
   1819          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
   1820          if (devinfo->gen >= 6) {
   1821             assert(inst->mlen == 0);
   1822             assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
   1823                    inst->exec_size == 8);
   1824             gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
   1825          } else {
   1826             assert(inst->mlen >= 1);
   1827             assert(inst->exec_size == 8);
   1828             gen4_math(p, dst, brw_math_function(inst->opcode),
   1829                       inst->base_mrf, src[0],
   1830                       BRW_MATH_PRECISION_FULL);
   1831 	 }
   1832 	 break;
   1833       case FS_OPCODE_CINTERP:
   1834 	 brw_MOV(p, dst, src[0]);
   1835 	 break;
   1836       case FS_OPCODE_LINTERP:
   1837 	 generate_linterp(inst, dst, src);
   1838 	 break;
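               /* The per-pixel X/Y payload packs four X words followed by four
                * Y words for each subspan, so a <8;4,1> UW region with a 0- or
                * 8-byte subregister offset selects the X or Y halves.
                */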
   1839       case FS_OPCODE_PIXEL_X:
   1840          assert(src[0].type == BRW_REGISTER_TYPE_UW);
   1841          src[0].subnr = 0 * type_sz(src[0].type);
   1842          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
   1843          break;
   1844       case FS_OPCODE_PIXEL_Y:
   1845          assert(src[0].type == BRW_REGISTER_TYPE_UW);
   1846          src[0].subnr = 4 * type_sz(src[0].type);
   1847          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
   1848          break;
   1849       case FS_OPCODE_GET_BUFFER_SIZE:
   1850          generate_get_buffer_size(inst, dst, src[0], src[1]);
   1851          break;
   1852       case SHADER_OPCODE_TEX:
   1853       case FS_OPCODE_TXB:
   1854       case SHADER_OPCODE_TXD:
   1855       case SHADER_OPCODE_TXF:
   1856       case SHADER_OPCODE_TXF_LZ:
   1857       case SHADER_OPCODE_TXF_CMS:
   1858       case SHADER_OPCODE_TXF_CMS_W:
   1859       case SHADER_OPCODE_TXF_UMS:
   1860       case SHADER_OPCODE_TXF_MCS:
   1861       case SHADER_OPCODE_TXL:
   1862       case SHADER_OPCODE_TXL_LZ:
   1863       case SHADER_OPCODE_TXS:
   1864       case SHADER_OPCODE_LOD:
   1865       case SHADER_OPCODE_TG4:
   1866       case SHADER_OPCODE_TG4_OFFSET:
   1867       case SHADER_OPCODE_SAMPLEINFO:
   1868 	 generate_tex(inst, dst, src[0], src[1], src[2]);
   1869 	 break;
   1870       case FS_OPCODE_DDX_COARSE:
   1871       case FS_OPCODE_DDX_FINE:
   1872          generate_ddx(inst->opcode, dst, src[0]);
   1873          break;
   1874       case FS_OPCODE_DDY_COARSE:
   1875       case FS_OPCODE_DDY_FINE:
   1876          generate_ddy(inst->opcode, dst, src[0]);
   1877 	 break;
   1878 
   1879       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
   1880 	 generate_scratch_write(inst, src[0]);
   1881          spill_count++;
   1882 	 break;
   1883 
   1884       case SHADER_OPCODE_GEN4_SCRATCH_READ:
   1885 	 generate_scratch_read(inst, dst);
   1886          fill_count++;
   1887 	 break;
   1888 
   1889       case SHADER_OPCODE_GEN7_SCRATCH_READ:
   1890 	 generate_scratch_read_gen7(inst, dst);
   1891          fill_count++;
   1892 	 break;
   1893 
   1894       case SHADER_OPCODE_MOV_INDIRECT:
   1895          generate_mov_indirect(inst, dst, src[0], src[1]);
   1896          break;
   1897 
   1898       case SHADER_OPCODE_URB_READ_SIMD8:
   1899       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   1900          generate_urb_read(inst, dst, src[0]);
   1901          break;
   1902 
   1903       case SHADER_OPCODE_URB_WRITE_SIMD8:
   1904       case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   1905       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   1906       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   1907 	 generate_urb_write(inst, src[0]);
   1908 	 break;
   1909 
   1910       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   1911          assert(inst->force_writemask_all);
   1912 	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
   1913 	 break;
   1914 
   1915       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
   1916          assert(inst->force_writemask_all);
   1917 	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
   1918 	 break;
   1919 
   1920       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
   1921 	 generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
   1922 	 break;
   1923 
   1924       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   1925 	 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
   1926 	 break;
   1927 
   1928       case FS_OPCODE_REP_FB_WRITE:
   1929       case FS_OPCODE_FB_WRITE:
   1930 	 generate_fb_write(inst, src[0]);
   1931 	 break;
   1932 
   1933       case FS_OPCODE_FB_READ:
   1934          generate_fb_read(inst, dst, src[0]);
   1935          break;
   1936 
   1937       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
   1938          generate_mov_dispatch_to_flags(inst);
   1939          break;
   1940 
   1941       case FS_OPCODE_DISCARD_JUMP:
   1942          generate_discard_jump(inst);
   1943          break;
   1944 
   1945       case SHADER_OPCODE_SHADER_TIME_ADD:
   1946          generate_shader_time_add(inst, src[0], src[1], src[2]);
   1947          break;
   1948 
   1949       case SHADER_OPCODE_UNTYPED_ATOMIC:
   1950          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1951          brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
   1952                             inst->mlen, !inst->dst.is_null());
   1953          break;
   1954 
   1955       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   1956          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1957          brw_untyped_surface_read(p, dst, src[0], src[1],
   1958                                   inst->mlen, src[2].ud);
   1959          break;
   1960 
   1961       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   1962          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1963          brw_untyped_surface_write(p, src[0], src[1],
   1964                                    inst->mlen, src[2].ud);
   1965          break;
   1966 
   1967       case SHADER_OPCODE_TYPED_ATOMIC:
   1968          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1969          brw_typed_atomic(p, dst, src[0], src[1],
   1970                           src[2].ud, inst->mlen, !inst->dst.is_null());
   1971          break;
   1972 
   1973       case SHADER_OPCODE_TYPED_SURFACE_READ:
   1974          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1975          brw_typed_surface_read(p, dst, src[0], src[1],
   1976                                 inst->mlen, src[2].ud);
   1977          break;
   1978 
   1979       case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   1980          assert(src[2].file == BRW_IMMEDIATE_VALUE);
   1981          brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
   1982          break;
   1983 
   1984       case SHADER_OPCODE_MEMORY_FENCE:
   1985          brw_memory_fence(p, dst);
   1986          break;
   1987 
   1988       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
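                  /* Pick which execution mask to scan: stages with packed
                   * dispatch have every channel enabled, fragment shaders use
                   * the vector mask register, and other stages use the
                   * dispatch mask register.
                   */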
   1989          const struct brw_reg mask =
   1990             brw_stage_has_packed_dispatch(devinfo, stage,
   1991                                           prog_data) ? brw_imm_ud(~0u) :
   1992             stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
   1993             brw_dmask_reg();
   1994          brw_find_live_channel(p, dst, mask);
   1995          break;
   1996       }
   1997 
   1998       case SHADER_OPCODE_BROADCAST:
   1999          assert(inst->force_writemask_all);
   2000          brw_broadcast(p, dst, src[0], src[1]);
   2001          break;
   2002 
   2003       case FS_OPCODE_SET_SAMPLE_ID:
   2004          generate_set_sample_id(inst, dst, src[0], src[1]);
   2005          break;
   2006 
   2007       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
    2008          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
    2009          break;
   2010 
   2011       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
   2012       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
   2013          generate_unpack_half_2x16_split(inst, dst, src[0]);
   2014          break;
   2015 
   2016       case FS_OPCODE_PLACEHOLDER_HALT:
   2017          /* This is the place where the final HALT needs to be inserted if
   2018           * we've emitted any discards.  If not, this will emit no code.
   2019           */
   2020          if (!patch_discard_jumps_to_fb_writes()) {
   2021             if (unlikely(debug_flag)) {
   2022                annotation.ann_count--;
   2023             }
   2024          }
   2025          break;
   2026 
   2027       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   2028          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
   2029                                            GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
   2030          break;
   2031 
   2032       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   2033          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
   2034                                            GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
   2035          break;
   2036 
   2037       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   2038          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
   2039                                            GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
   2040          break;
   2041 
   2042       case CS_OPCODE_CS_TERMINATE:
   2043          generate_cs_terminate(inst, src[0]);
   2044          break;
   2045 
   2046       case SHADER_OPCODE_BARRIER:
   2047 	 generate_barrier(inst, src[0]);
   2048 	 break;
   2049 
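               /* DIM only exists on Haswell and is used to load DF immediates;
                * the source carries double-float data but must be encoded with
                * the F register type, hence the retype below.
                */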
   2050       case BRW_OPCODE_DIM:
   2051          assert(devinfo->is_haswell);
   2052          assert(src[0].type == BRW_REGISTER_TYPE_DF);
   2053          assert(dst.type == BRW_REGISTER_TYPE_DF);
   2054          brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
   2055          break;
   2056 
   2057       default:
   2058          unreachable("Unsupported opcode");
   2059 
   2060       case SHADER_OPCODE_LOAD_PAYLOAD:
   2061          unreachable("Should be lowered by lower_load_payload()");
   2062       }
   2063 
   2064       if (multiple_instructions_emitted)
   2065          continue;
   2066 
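               /* A full-size Gen instruction is 16 bytes, so this checks that
                * the IR instruction above expanded to exactly one native
                * instruction before patching its conditional mod and
                * dependency-control bits into it.
                */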
   2067       if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
   2068          assert(p->next_insn_offset == last_insn_offset + 16 ||
   2069                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
   2070                  "emitting more than 1 instruction");
   2071 
   2072          brw_inst *last = &p->store[last_insn_offset / 16];
   2073 
   2074          if (inst->conditional_mod)
   2075             brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
   2076          brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
   2077          brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
   2078       }
   2079    }
   2080 
   2081    brw_set_uip_jip(p, start_offset);
   2082    annotation_finalize(&annotation, p->next_insn_offset);
   2083 
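            /* Run the EU validator over everything we just emitted.  In
             * release (NDEBUG) builds it only runs when debug output is
             * enabled and its result is ignored, since the assert below
             * compiles away.
             */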
   2084 #ifndef NDEBUG
   2085    bool validated = brw_validate_instructions(p, start_offset, &annotation);
   2086 #else
   2087    if (unlikely(debug_flag))
   2088       brw_validate_instructions(p, start_offset, &annotation);
   2089 #endif
   2090 
   2091    int before_size = p->next_insn_offset - start_offset;
   2092    brw_compact_instructions(p, start_offset, annotation.ann_count,
   2093                             annotation.ann);
   2094    int after_size = p->next_insn_offset - start_offset;
   2095 
   2096    if (unlikely(debug_flag)) {
   2097       fprintf(stderr, "Native code for %s\n"
   2098               "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
   2099               " bytes (%.0f%%)\n",
   2100               shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
   2101               spill_count, fill_count, promoted_constants, before_size, after_size,
   2102               100.0f * (before_size - after_size) / before_size);
   2103 
   2104       dump_assembly(p->store, annotation.ann_count, annotation.ann,
   2105                     p->devinfo);
   2106       ralloc_free(annotation.mem_ctx);
   2107    }
   2108    assert(validated);
   2109 
   2110    compiler->shader_debug_log(log_data,
   2111                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
   2112                               "%d:%d spills:fills, Promoted %u constants, "
   2113                               "compacted %d to %d bytes.",
   2114                               _mesa_shader_stage_to_abbrev(stage),
   2115                               dispatch_width, before_size / 16,
   2116                               loop_count, cfg->cycle_count, spill_count,
   2117                               fill_count, promoted_constants, before_size,
   2118                               after_size);
   2119 
   2120    return start_offset;
   2121 }
   2122 
   2123 const unsigned *
   2124 fs_generator::get_assembly(unsigned int *assembly_size)
   2125 {
   2126    return brw_get_program(p, assembly_size);
   2127 }
   2128