      1 /*
       2  * Copyright © 2011 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "brw_vec4.h"
     25 #include "brw_cfg.h"
     26 #include "brw_eu.h"
     27 #include "brw_program.h"
     28 
     29 namespace brw {
     30 
     31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
     32                                    const src_reg &src0, const src_reg &src1,
     33                                    const src_reg &src2)
     34 {
     35    this->opcode = opcode;
     36    this->dst = dst;
     37    this->src[0] = src0;
     38    this->src[1] = src1;
     39    this->src[2] = src2;
     40    this->saturate = false;
     41    this->force_writemask_all = false;
     42    this->no_dd_clear = false;
     43    this->no_dd_check = false;
     44    this->writes_accumulator = false;
     45    this->conditional_mod = BRW_CONDITIONAL_NONE;
     46    this->predicate = BRW_PREDICATE_NONE;
     47    this->predicate_inverse = false;
     48    this->target = 0;
     49    this->shadow_compare = false;
     50    this->ir = NULL;
     51    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
     52    this->header_size = 0;
     53    this->flag_subreg = 0;
     54    this->mlen = 0;
     55    this->base_mrf = 0;
     56    this->offset = 0;
     57    this->exec_size = 8;
     58    this->group = 0;
     59    this->size_written = (dst.file == BAD_FILE ?
     60                          0 : this->exec_size * type_sz(dst.type));
     61    this->annotation = NULL;
     62 }
     63 
     64 vec4_instruction *
     65 vec4_visitor::emit(vec4_instruction *inst)
     66 {
     67    inst->ir = this->base_ir;
     68    inst->annotation = this->current_annotation;
     69 
     70    this->instructions.push_tail(inst);
     71 
     72    return inst;
     73 }
     74 
     75 vec4_instruction *
     76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
     77                           vec4_instruction *new_inst)
     78 {
     79    new_inst->ir = inst->ir;
     80    new_inst->annotation = inst->annotation;
     81 
     82    inst->insert_before(block, new_inst);
     83 
     84    return inst;
     85 }
     86 
     87 vec4_instruction *
     88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
     89                    const src_reg &src1, const src_reg &src2)
     90 {
     91    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
     92 }
     93 
     94 
     95 vec4_instruction *
     96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
     97                    const src_reg &src1)
     98 {
     99    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
    100 }
    101 
    102 vec4_instruction *
    103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
    104 {
    105    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
    106 }
    107 
    108 vec4_instruction *
    109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
    110 {
    111    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
    112 }
    113 
    114 vec4_instruction *
    115 vec4_visitor::emit(enum opcode opcode)
    116 {
    117    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
    118 }
    119 
    120 #define ALU1(op)							\
    121    vec4_instruction *							\
    122    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)		\
    123    {									\
    124       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
    125    }
    126 
    127 #define ALU2(op)							\
    128    vec4_instruction *							\
    129    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
    130                     const src_reg &src1)				\
    131    {									\
    132       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
    133                                            src0, src1);                 \
    134    }
    135 
    136 #define ALU2_ACC(op)							\
    137    vec4_instruction *							\
    138    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
    139                     const src_reg &src1)				\
    140    {									\
    141       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
    142                        BRW_OPCODE_##op, dst, src0, src1);		\
    143       inst->writes_accumulator = true;                                  \
    144       return inst;                                                      \
    145    }
    146 
    147 #define ALU3(op)							\
    148    vec4_instruction *							\
    149    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
    150                     const src_reg &src1, const src_reg &src2)		\
    151    {									\
    152       assert(devinfo->gen >= 6);						\
    153       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,	\
    154 					   src0, src1, src2);		\
    155    }
    156 
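         /* For reference, each ALU2(op) line below expands to a builder method of
          * this shape (shown for ADD; the ALU1, ALU3 and ALU2_ACC variants are
          * analogous):
          *
          *    vec4_instruction *
          *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
          *                      const src_reg &src1)
          *    {
          *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
          *    }
          *
          * These helpers only construct the instruction; callers still hand the
          * result to emit() or emit_before() to append it to the IR stream.
          */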
    157 ALU1(NOT)
    158 ALU1(MOV)
    159 ALU1(FRC)
    160 ALU1(RNDD)
    161 ALU1(RNDE)
    162 ALU1(RNDZ)
    163 ALU1(F32TO16)
    164 ALU1(F16TO32)
    165 ALU2(ADD)
    166 ALU2(MUL)
    167 ALU2_ACC(MACH)
    168 ALU2(AND)
    169 ALU2(OR)
    170 ALU2(XOR)
    171 ALU2(DP3)
    172 ALU2(DP4)
    173 ALU2(DPH)
    174 ALU2(SHL)
    175 ALU2(SHR)
    176 ALU2(ASR)
    177 ALU3(LRP)
    178 ALU1(BFREV)
    179 ALU3(BFE)
    180 ALU2(BFI1)
    181 ALU3(BFI2)
    182 ALU1(FBH)
    183 ALU1(FBL)
    184 ALU1(CBIT)
    185 ALU3(MAD)
    186 ALU2_ACC(ADDC)
    187 ALU2_ACC(SUBB)
    188 ALU2(MAC)
    189 ALU1(DIM)
    190 
    191 /** Gen4 predicated IF. */
    192 vec4_instruction *
    193 vec4_visitor::IF(enum brw_predicate predicate)
    194 {
    195    vec4_instruction *inst;
    196 
    197    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
    198    inst->predicate = predicate;
    199 
    200    return inst;
    201 }
    202 
    203 /** Gen6 IF with embedded comparison. */
    204 vec4_instruction *
    205 vec4_visitor::IF(src_reg src0, src_reg src1,
    206                  enum brw_conditional_mod condition)
    207 {
    208    assert(devinfo->gen == 6);
    209 
    210    vec4_instruction *inst;
    211 
    212    resolve_ud_negate(&src0);
    213    resolve_ud_negate(&src1);
    214 
    215    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
    216 					src0, src1);
    217    inst->conditional_mod = condition;
    218 
    219    return inst;
    220 }
    221 
    222 /**
    223  * CMP: Sets the low bit of the destination channels with the result
    224  * of the comparison, while the upper bits are undefined, and updates
    225  * the flag register with the packed 16 bits of the result.
    226  */
    227 vec4_instruction *
    228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
    229                   enum brw_conditional_mod condition)
    230 {
    231    vec4_instruction *inst;
    232 
    233    /* Take the instruction:
    234     *
    235     * CMP null<d> src0<f> src1<f>
    236     *
    237     * Original gen4 does type conversion to the destination type before
    238     * comparison, producing garbage results for floating point comparisons.
    239     *
    240     * The destination type doesn't matter on newer generations, so we set the
    241     * type to match src0 so we can compact the instruction.
    242     */
    243    dst.type = src0.type;
    244 
    245    resolve_ud_negate(&src0);
    246    resolve_ud_negate(&src1);
    247 
    248    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
    249    inst->conditional_mod = condition;
    250 
    251    return inst;
    252 }
    253 
    254 vec4_instruction *
    255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
    256 {
    257    vec4_instruction *inst;
    258 
    259    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
    260 					dst, index);
    261    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
    262    inst->mlen = 2;
    263 
    264    return inst;
    265 }
    266 
    267 vec4_instruction *
    268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
    269                             const src_reg &index)
    270 {
    271    vec4_instruction *inst;
    272 
    273    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
    274 					dst, src, index);
    275    inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
    276    inst->mlen = 3;
    277 
    278    return inst;
    279 }
    280 
    281 src_reg
    282 vec4_visitor::fix_3src_operand(const src_reg &src)
    283 {
    284    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    285     * able to use vertical stride of zero to replicate the vec4 uniform, like
    286     *
    287     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    288     *
    289     * But you can't, since vertical stride is always four in three-source
    290     * instructions. Instead, insert a MOV instruction to do the replication so
    291     * that the three-source instruction can consume it.
    292     */
    293 
    294    /* The MOV is only needed if the source is a uniform or immediate. */
    295    if (src.file != UNIFORM && src.file != IMM)
    296       return src;
    297 
    298    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
    299       return src;
    300 
    301    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
    302    expanded.type = src.type;
    303    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
    304    return src_reg(expanded);
    305 }
    306 
    307 src_reg
    308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
    309 {
    310    if (!src.abs && !src.negate)
    311       return src;
    312 
    313    dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
    314    resolved.type = src.type;
    315    emit(MOV(resolved, src));
    316 
    317    return src_reg(resolved);
    318 }
    319 
    320 src_reg
    321 vec4_visitor::fix_math_operand(const src_reg &src)
    322 {
    323    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
    324       return src;
    325 
    326    /* The gen6 math instruction ignores the source modifiers --
    327     * swizzle, abs, negate, and at least some parts of the register
    328     * region description.
    329     *
    330     * Rather than trying to enumerate all these cases, *always* expand the
    331     * operand to a temp GRF for gen6.
    332     *
    333     * For gen7, keep the operand as-is, except if immediate, which gen7 still
    334     * can't use.
    335     */
    336 
    337    if (devinfo->gen == 7 && src.file != IMM)
    338       return src;
    339 
    340    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
    341    expanded.type = src.type;
    342    emit(MOV(expanded, src));
    343    return src_reg(expanded);
    344 }
    345 
    346 vec4_instruction *
    347 vec4_visitor::emit_math(enum opcode opcode,
    348                         const dst_reg &dst,
    349                         const src_reg &src0, const src_reg &src1)
    350 {
    351    vec4_instruction *math =
    352       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
    353 
    354    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
    355       /* MATH on Gen6 must be align1, so we can't do writemasks. */
    356       math->dst = dst_reg(this, glsl_type::vec4_type);
    357       math->dst.type = dst.type;
    358       math = emit(MOV(dst, src_reg(math->dst)));
    359    } else if (devinfo->gen < 6) {
    360       math->base_mrf = 1;
    361       math->mlen = src1.file == BAD_FILE ? 1 : 2;
    362    }
    363 
    364    return math;
    365 }
    366 
    367 void
    368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
    369 {
    370    if (devinfo->gen < 7) {
    371       unreachable("ir_unop_pack_half_2x16 should be lowered");
    372    }
    373 
    374    assert(dst.type == BRW_REGISTER_TYPE_UD);
    375    assert(src0.type == BRW_REGISTER_TYPE_F);
    376 
    377    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    378     *
    379     *   Because this instruction does not have a 16-bit floating-point type,
    380     *   the destination data type must be Word (W).
    381     *
    382     *   The destination must be DWord-aligned and specify a horizontal stride
    383     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    384     *   each destination channel and the upper word is not modified.
    385     *
    386     * The above restriction implies that the f32to16 instruction must use
    387     * align1 mode, because only in align1 mode is it possible to specify
    388     * horizontal stride.  We choose here to defy the hardware docs and emit
    389     * align16 instructions.
    390     *
    391     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    392     * instructions. I was partially successful in that the code passed all
    393     * tests.  However, the code was dubiously correct and fragile, and the
    394     * tests were not harsh enough to probe that frailty. Not trusting the
    395     * code, I chose instead to remain in align16 mode in defiance of the hw
    396     * docs).
    397     *
    398     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    399     * simulator, emitting a f32to16 in align16 mode with UD as destination
    400     * data type is safe. The behavior differs from that specified in the PRM
    401     * in that the upper word of each destination channel is cleared to 0.
    402     */
    403 
    404    dst_reg tmp_dst(this, glsl_type::uvec2_type);
    405    src_reg tmp_src(tmp_dst);
    406 
    407 #if 0
    408    /* Verify the undocumented behavior on which the following instructions
    409     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    410     * then the result of the bit-or instruction below will be incorrect.
    411     *
    412     * You should inspect the disasm output in order to verify that the MOV is
    413     * not optimized away.
    414     */
    415    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
    416 #endif
    417 
    418    /* Give tmp the form below, where "." means untouched.
    419     *
    420     *     w z          y          x w z          y          x
    421     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    422     *
    423     * That the upper word of each write-channel be 0 is required for the
    424     * following bit-shift and bit-or instructions to work. Note that this
    425     * relies on the undocumented hardware behavior mentioned above.
    426     */
    427    tmp_dst.writemask = WRITEMASK_XY;
    428    emit(F32TO16(tmp_dst, src0));
    429 
    430    /* Give the write-channels of dst the form:
    431     *   0xhhhh0000
    432     */
    433    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
    434    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
    435 
    436    /* Finally, give the write-channels of dst the form of packHalf2x16's
    437     * output:
    438     *   0xhhhhllll
    439     */
    440    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
    441    emit(OR(dst, src_reg(dst), tmp_src));
    442 }
    443 
    444 void
    445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
    446 {
    447    if (devinfo->gen < 7) {
    448       unreachable("ir_unop_unpack_half_2x16 should be lowered");
    449    }
    450 
    451    assert(dst.type == BRW_REGISTER_TYPE_F);
    452    assert(src0.type == BRW_REGISTER_TYPE_UD);
    453 
    454    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    455     *
    456     *   Because this instruction does not have a 16-bit floating-point type,
    457     *   the source data type must be Word (W). The destination type must be
    458     *   F (Float).
    459     *
    460     * To use W as the source data type, we must adjust horizontal strides,
    461     * which is only possible in align1 mode. All my [chadv] attempts at
    462     * emitting align1 instructions for unpackHalf2x16 failed to pass the
    463     * Piglit tests, so I gave up.
    464     *
    465     * I've verified that, on gen7 hardware and the simulator, it is safe to
    466     * emit f16to32 in align16 mode with UD as source data type.
    467     */
    468 
    469    dst_reg tmp_dst(this, glsl_type::uvec2_type);
    470    src_reg tmp_src(tmp_dst);
    471 
    472    tmp_dst.writemask = WRITEMASK_X;
    473    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
    474 
    475    tmp_dst.writemask = WRITEMASK_Y;
    476    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
    477 
    478    dst.writemask = WRITEMASK_XY;
    479    emit(F16TO32(dst, tmp_src));
    480 }
    481 
    482 void
    483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
    484 {
    485    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    486     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    487     * is not suitable to generate the shift values, but we can use the packed
    488     * vector float and a type-converting MOV.
    489     */
    490    dst_reg shift(this, glsl_type::uvec4_type);
    491    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
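            /* The restricted 8-bit vector-float encodings above decode to
             * <0.0, 8.0, 16.0, 24.0>; the type-converting MOV into the UD
             * register gives the integer shift counts <0, 8, 16, 24> consumed
             * by the SHR below.
             */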
    492 
    493    dst_reg shifted(this, glsl_type::uvec4_type);
    494    src0.swizzle = BRW_SWIZZLE_XXXX;
    495    emit(SHR(shifted, src0, src_reg(shift)));
    496 
    497    shifted.type = BRW_REGISTER_TYPE_UB;
    498    dst_reg f(this, glsl_type::vec4_type);
    499    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
    500 
    501    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
    502 }
    503 
    504 void
    505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
    506 {
    507    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    508     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    509     * is not suitable to generate the shift values, but we can use the packed
    510     * vector float and a type-converting MOV.
    511     */
    512    dst_reg shift(this, glsl_type::uvec4_type);
    513    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
    514 
    515    dst_reg shifted(this, glsl_type::uvec4_type);
    516    src0.swizzle = BRW_SWIZZLE_XXXX;
    517    emit(SHR(shifted, src0, src_reg(shift)));
    518 
    519    shifted.type = BRW_REGISTER_TYPE_B;
    520    dst_reg f(this, glsl_type::vec4_type);
    521    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
    522 
    523    dst_reg scaled(this, glsl_type::vec4_type);
    524    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
    525 
    526    dst_reg max(this, glsl_type::vec4_type);
    527    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
    528    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
    529 }
    530 
    531 void
    532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
    533 {
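            /* Rough outline of packUnorm4x8 as emitted below: clamp each component
             * to [0, 1], scale by 255, round to even, convert to unsigned, and pack
             * the four bytes into a single UD.  E.g. an input of 0.5f becomes 127.5f,
             * rounds to 128, and ends up as 0x80 in its byte of dst.
             */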
    534    dst_reg saturated(this, glsl_type::vec4_type);
    535    vec4_instruction *inst = emit(MOV(saturated, src0));
    536    inst->saturate = true;
    537 
    538    dst_reg scaled(this, glsl_type::vec4_type);
    539    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
    540 
    541    dst_reg rounded(this, glsl_type::vec4_type);
    542    emit(RNDE(rounded, src_reg(scaled)));
    543 
    544    dst_reg u(this, glsl_type::uvec4_type);
    545    emit(MOV(u, src_reg(rounded)));
    546 
    547    src_reg bytes(u);
    548    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
    549 }
    550 
    551 void
    552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
    553 {
    554    dst_reg max(this, glsl_type::vec4_type);
    555    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
    556 
    557    dst_reg min(this, glsl_type::vec4_type);
    558    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
    559 
    560    dst_reg scaled(this, glsl_type::vec4_type);
    561    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
    562 
    563    dst_reg rounded(this, glsl_type::vec4_type);
    564    emit(RNDE(rounded, src_reg(scaled)));
    565 
    566    dst_reg i(this, glsl_type::ivec4_type);
    567    emit(MOV(i, src_reg(rounded)));
    568 
    569    src_reg bytes(i);
    570    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
    571 }
    572 
    573 /*
    574  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
    575  * false) elements needed to pack a type.
    576  */
    577 static int
    578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
    579 {
    580    unsigned int i;
    581    int size;
    582 
    583    switch (type->base_type) {
    584    case GLSL_TYPE_UINT:
    585    case GLSL_TYPE_INT:
    586    case GLSL_TYPE_FLOAT:
    587    case GLSL_TYPE_BOOL:
    588    case GLSL_TYPE_DOUBLE:
    589       if (type->is_matrix()) {
    590          const glsl_type *col_type = type->column_type();
    591          unsigned col_slots =
    592             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
    593          return type->matrix_columns * col_slots;
    594       } else {
    595          /* Regardless of size of vector, it gets a vec4. This is bad
    596           * packing for things like floats, but otherwise arrays become a
    597           * mess.  Hopefully a later pass over the code can pack scalars
    598           * down if appropriate.
    599           */
    600          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
    601       }
    602    case GLSL_TYPE_ARRAY:
    603       assert(type->length > 0);
    604       return type_size_xvec4(type->fields.array, as_vec4) * type->length;
    605    case GLSL_TYPE_STRUCT:
    606       size = 0;
    607       for (i = 0; i < type->length; i++) {
    608 	 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
    609       }
    610       return size;
    611    case GLSL_TYPE_SUBROUTINE:
    612       return 1;
    613 
    614    case GLSL_TYPE_SAMPLER:
    615       /* Samplers take up no register space, since they're baked in at
    616        * link time.
    617        */
    618       return 0;
    619    case GLSL_TYPE_ATOMIC_UINT:
    620       return 0;
    621    case GLSL_TYPE_IMAGE:
    622       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
    623    case GLSL_TYPE_VOID:
    624    case GLSL_TYPE_ERROR:
    625    case GLSL_TYPE_INTERFACE:
    626    case GLSL_TYPE_FUNCTION:
    627       unreachable("not reached");
    628    }
    629 
    630    return 0;
    631 }
    632 
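         /* Some examples for type_size_xvec4(): with as_vec4 == true, a float or a
          * vec4 takes one slot, a mat3 takes three (one per column), a vec4[8] takes
          * eight, and a dvec4 takes two because it is a dual-slot type; with
          * as_vec4 == false the same dvec4 needs only a single dvec4 slot.
          */
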
    633 /**
    634  * Returns the minimum number of vec4 elements needed to pack a type.
    635  *
    636  * For simple types, it will return 1 (a single vec4); for matrices, the
    637  * number of columns; for array and struct, the sum of the vec4_size of
    638  * each of its elements; and for sampler and atomic, zero.
    639  *
    640  * This method is useful to calculate how much register space is needed to
    641  * store a particular type.
    642  */
    643 extern "C" int
    644 type_size_vec4(const struct glsl_type *type)
    645 {
    646    return type_size_xvec4(type, true);
    647 }
    648 
    649 /**
    650  * Returns the minimum number of dvec4 elements needed to pack a type.
    651  *
    652  * For simple types, it will return 1 (a single dvec4); for matrices, the
    653  * number of columns; for array and struct, the sum of the dvec4_size of
    654  * each of its elements; and for sampler and atomic, zero.
    655  *
    656  * This method is useful to calculate how much register space is needed to
    657  * store a particular type.
    658  *
    659  * Measuring double-precision vertex inputs as dvec4 is required because
     660  * ARB_vertex_attrib_64bit states that they use the same number of locations
     661  * as the single-precision version. That is, two consecutive dvec4s would be
     662  * located in location "x" and location "x+1", not "x+2".
     663  *
     664  * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
     665  * remap_vs_attrs() takes into account both the location and whether the
    666  * type fits in one or two vec4 slots.
    667  */
    668 extern "C" int
    669 type_size_dvec4(const struct glsl_type *type)
    670 {
    671    return type_size_xvec4(type, false);
    672 }
    673 
    674 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
    675 {
    676    init();
    677 
    678    this->file = VGRF;
    679    this->nr = v->alloc.allocate(type_size_vec4(type));
    680 
    681    if (type->is_array() || type->is_record()) {
    682       this->swizzle = BRW_SWIZZLE_NOOP;
    683    } else {
    684       this->swizzle = brw_swizzle_for_size(type->vector_elements);
    685    }
    686 
    687    this->type = brw_type_for_base_type(type);
    688 }
    689 
    690 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
    691 {
    692    assert(size > 0);
    693 
    694    init();
    695 
    696    this->file = VGRF;
    697    this->nr = v->alloc.allocate(type_size_vec4(type) * size);
    698 
    699    this->swizzle = BRW_SWIZZLE_NOOP;
    700 
    701    this->type = brw_type_for_base_type(type);
    702 }
    703 
    704 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    705 {
    706    init();
    707 
    708    this->file = VGRF;
    709    this->nr = v->alloc.allocate(type_size_vec4(type));
    710 
    711    if (type->is_array() || type->is_record()) {
    712       this->writemask = WRITEMASK_XYZW;
    713    } else {
    714       this->writemask = (1 << type->vector_elements) - 1;
    715    }
    716 
    717    this->type = brw_type_for_base_type(type);
    718 }
    719 
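         /* Emits a SEL with the given conditional mod.  BRW_CONDITIONAL_GE keeps the
          * larger of the two sources (a componentwise max), BRW_CONDITIONAL_L keeps
          * the smaller (a min); the pack/unpack helpers above use the pair to clamp
          * values to [-1, 1].
          */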
    720 vec4_instruction *
    721 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
    722                           src_reg src0, src_reg src1)
    723 {
    724    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
    725    inst->conditional_mod = conditionalmod;
    726    return inst;
    727 }
    728 
    729 vec4_instruction *
    730 vec4_visitor::emit_lrp(const dst_reg &dst,
    731                        const src_reg &x, const src_reg &y, const src_reg &a)
    732 {
    733    if (devinfo->gen >= 6) {
    734       /* Note that the instruction's argument order is reversed from GLSL
    735        * and the IR.
    736        */
    737      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
    738                      fix_3src_operand(x)));
    739    } else {
    740       /* Earlier generations don't support three source operations, so we
    741        * need to emit x*(1-a) + y*a.
    742        */
    743       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
    744       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
    745       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
    746       y_times_a.writemask           = dst.writemask;
    747       one_minus_a.writemask         = dst.writemask;
    748       x_times_one_minus_a.writemask = dst.writemask;
    749 
    750       emit(MUL(y_times_a, y, a));
    751       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
    752       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
    753       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
    754    }
    755 }
    756 
    757 /**
    758  * Emits the instructions needed to perform a pull constant load. before_block
    759  * and before_inst can be NULL in which case the instruction will be appended
    760  * to the end of the instruction list.
    761  */
    762 void
    763 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
    764                                           src_reg surf_index,
    765                                           src_reg offset_reg,
    766                                           bblock_t *before_block,
    767                                           vec4_instruction *before_inst)
    768 {
    769    assert((before_inst == NULL && before_block == NULL) ||
    770           (before_inst && before_block));
    771 
    772    vec4_instruction *pull;
    773 
    774    if (devinfo->gen >= 9) {
    775       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
    776       src_reg header(this, glsl_type::uvec4_type, 2);
    777 
    778       pull = new(mem_ctx)
    779          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
    780                           dst_reg(header));
    781 
    782       if (before_inst)
    783          emit_before(before_block, before_inst, pull);
    784       else
    785          emit(pull);
    786 
    787       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
    788                                  offset_reg.type);
    789       pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
    790 
    791       if (before_inst)
    792          emit_before(before_block, before_inst, pull);
    793       else
    794          emit(pull);
    795 
    796       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
    797                                            dst,
    798                                            surf_index,
    799                                            header);
    800       pull->mlen = 2;
    801       pull->header_size = 1;
    802    } else if (devinfo->gen >= 7) {
    803       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
    804 
    805       grf_offset.type = offset_reg.type;
    806 
    807       pull = MOV(grf_offset, offset_reg);
    808 
    809       if (before_inst)
    810          emit_before(before_block, before_inst, pull);
    811       else
    812          emit(pull);
    813 
    814       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
    815                                            dst,
    816                                            surf_index,
    817                                            src_reg(grf_offset));
    818       pull->mlen = 1;
    819    } else {
    820       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
    821                                            dst,
    822                                            surf_index,
    823                                            offset_reg);
    824       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
    825       pull->mlen = 1;
    826    }
    827 
    828    if (before_inst)
    829       emit_before(before_block, before_inst, pull);
    830    else
    831       emit(pull);
    832 }
    833 
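         /* Returns a register holding src's value taken from a single live channel,
          * so the result is uniform across the SIMD4x2 group.  Roughly:
          * SHADER_OPCODE_FIND_LIVE_CHANNEL picks the index of an enabled channel and
          * SHADER_OPCODE_BROADCAST copies that channel's value of src into every
          * channel of dst; see the opcode implementations for the exact semantics.
          */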
    834 src_reg
    835 vec4_visitor::emit_uniformize(const src_reg &src)
    836 {
    837    const src_reg chan_index(this, glsl_type::uint_type);
    838    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
    839                               src.type);
    840 
    841    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
    842       ->force_writemask_all = true;
    843    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
    844       ->force_writemask_all = true;
    845 
    846    return src_reg(dst);
    847 }
    848 
    849 src_reg
    850 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
    851                              src_reg coordinate, src_reg surface)
    852 {
    853    vec4_instruction *inst =
    854       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
    855                                     dst_reg(this, glsl_type::uvec4_type));
    856    inst->base_mrf = 2;
    857    inst->src[1] = surface;
    858    inst->src[2] = surface;
    859 
    860    int param_base;
    861 
    862    if (devinfo->gen >= 9) {
    863       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
    864       vec4_instruction *header_inst = new(mem_ctx)
    865          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
    866                           dst_reg(MRF, inst->base_mrf));
    867 
    868       emit(header_inst);
    869 
    870       inst->mlen = 2;
    871       inst->header_size = 1;
    872       param_base = inst->base_mrf + 1;
    873    } else {
    874       inst->mlen = 1;
    875       param_base = inst->base_mrf;
    876    }
    877 
     878    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
    879    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
    880    int zero_mask = 0xf & ~coord_mask;
    881 
    882    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
    883             coordinate));
    884 
    885    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
    886             brw_imm_d(0)));
    887 
    888    emit(inst);
    889    return src_reg(inst->dst);
    890 }
    891 
    892 bool
    893 vec4_visitor::is_high_sampler(src_reg sampler)
    894 {
    895    if (devinfo->gen < 8 && !devinfo->is_haswell)
    896       return false;
    897 
    898    return sampler.file != IMM || sampler.ud >= 16;
    899 }
    900 
    901 void
    902 vec4_visitor::emit_texture(ir_texture_opcode op,
    903                            dst_reg dest,
    904                            const glsl_type *dest_type,
    905                            src_reg coordinate,
    906                            int coord_components,
    907                            src_reg shadow_comparator,
    908                            src_reg lod, src_reg lod2,
    909                            src_reg sample_index,
    910                            uint32_t constant_offset,
    911                            src_reg offset_value,
    912                            src_reg mcs,
    913                            uint32_t surface,
    914                            src_reg surface_reg,
    915                            src_reg sampler_reg)
    916 {
    917    /* The sampler can only meaningfully compute LOD for fragment shader
    918     * messages. For all other stages, we change the opcode to TXL and hardcode
    919     * the LOD to 0.
    920     *
    921     * textureQueryLevels() is implemented in terms of TXS so we need to pass a
    922     * valid LOD argument.
    923     */
    924    if (op == ir_tex || op == ir_query_levels) {
    925       assert(lod.file == BAD_FILE);
    926       lod = brw_imm_f(0.0f);
    927    }
    928 
    929    enum opcode opcode;
    930    switch (op) {
    931    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
    932    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
    933    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
    934    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
    935    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
    936                              SHADER_OPCODE_TXF_CMS); break;
    937    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
    938    case ir_tg4: opcode = offset_value.file != BAD_FILE
    939                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
    940    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
    941    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
    942    case ir_txb:
    943       unreachable("TXB is not valid for vertex shaders.");
    944    case ir_lod:
    945       unreachable("LOD is not valid for vertex shaders.");
    946    case ir_samples_identical: {
    947       /* There are some challenges implementing this for vec4, and it seems
    948        * unlikely to be used anyway.  For now, just return false ways.
    949        */
    950       emit(MOV(dest, brw_imm_ud(0u)));
    951       return;
    952    }
    953    default:
    954       unreachable("Unrecognized tex op");
    955    }
    956 
    957    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
    958 
    959    inst->offset = constant_offset;
    960 
    961    /* The message header is necessary for:
    962     * - Gen4 (always)
    963     * - Gen9+ for selecting SIMD4x2
    964     * - Texel offsets
    965     * - Gather channel selection
    966     * - Sampler indices too large to fit in a 4-bit value.
    967     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    968     */
    969    inst->header_size =
    970       (devinfo->gen < 5 || devinfo->gen >= 9 ||
    971        inst->offset != 0 || op == ir_tg4 ||
    972        op == ir_texture_samples ||
    973        is_high_sampler(sampler_reg)) ? 1 : 0;
    974    inst->base_mrf = 2;
    975    inst->mlen = inst->header_size;
    976    inst->dst.writemask = WRITEMASK_XYZW;
    977    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
    978 
    979    inst->src[1] = surface_reg;
    980    inst->src[2] = sampler_reg;
    981 
    982    /* MRF for the first parameter */
    983    int param_base = inst->base_mrf + inst->header_size;
    984 
    985    if (op == ir_txs || op == ir_query_levels) {
    986       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
    987       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
    988       inst->mlen++;
    989    } else if (op == ir_texture_samples) {
    990       inst->dst.writemask = WRITEMASK_X;
    991    } else {
    992       /* Load the coordinate */
    993       /* FINISHME: gl_clamp_mask and saturate */
    994       int coord_mask = (1 << coord_components) - 1;
    995       int zero_mask = 0xf & ~coord_mask;
    996 
    997       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
    998                coordinate));
    999       inst->mlen++;
   1000 
   1001       if (zero_mask != 0) {
   1002          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
   1003                   brw_imm_d(0)));
   1004       }
   1005       /* Load the shadow comparator */
   1006       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
   1007 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
   1008 			  WRITEMASK_X),
   1009 		  shadow_comparator));
   1010 	 inst->mlen++;
   1011       }
   1012 
   1013       /* Load the LOD info */
   1014       if (op == ir_tex || op == ir_txl) {
   1015 	 int mrf, writemask;
   1016 	 if (devinfo->gen >= 5) {
   1017 	    mrf = param_base + 1;
   1018 	    if (shadow_comparator.file != BAD_FILE) {
   1019 	       writemask = WRITEMASK_Y;
   1020 	       /* mlen already incremented */
   1021 	    } else {
   1022 	       writemask = WRITEMASK_X;
   1023 	       inst->mlen++;
   1024 	    }
   1025 	 } else /* devinfo->gen == 4 */ {
   1026 	    mrf = param_base;
   1027 	    writemask = WRITEMASK_W;
   1028 	 }
   1029 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
   1030       } else if (op == ir_txf) {
   1031          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
   1032       } else if (op == ir_txf_ms) {
   1033          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
   1034                   sample_index));
   1035          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
   1036             /* MCS data is stored in the first two channels of mcs, but we
   1037              * need to get it into the .y and .z channels of the second vec4
   1038              * of params.
   1039              */
   1040             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
   1041             emit(MOV(dst_reg(MRF, param_base + 1,
   1042                              glsl_type::uint_type, WRITEMASK_YZ),
   1043                      mcs));
   1044          } else if (devinfo->gen >= 7) {
   1045             /* MCS data is in the first channel of `mcs`, but we need to get it into
   1046              * the .y channel of the second vec4 of params, so replicate .x across
   1047              * the whole vec4 and then mask off everything except .y
   1048              */
   1049             mcs.swizzle = BRW_SWIZZLE_XXXX;
   1050             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
   1051                      mcs));
   1052          }
   1053          inst->mlen++;
   1054       } else if (op == ir_txd) {
   1055          const brw_reg_type type = lod.type;
   1056 
   1057 	 if (devinfo->gen >= 5) {
   1058 	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
   1059 	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
   1060 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
   1061 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
   1062 	    inst->mlen++;
   1063 
   1064 	    if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
   1065 	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
   1066 	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
   1067 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
   1068 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
   1069 	       inst->mlen++;
   1070 
   1071                if (shadow_comparator.file != BAD_FILE) {
   1072                   emit(MOV(dst_reg(MRF, param_base + 2,
   1073                                    shadow_comparator.type, WRITEMASK_Z),
   1074                            shadow_comparator));
   1075                }
   1076 	    }
   1077 	 } else /* devinfo->gen == 4 */ {
   1078 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
   1079 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
   1080 	    inst->mlen += 2;
   1081 	 }
   1082       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
   1083          if (shadow_comparator.file != BAD_FILE) {
   1084             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
   1085                      shadow_comparator));
   1086          }
   1087 
   1088          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
   1089                   offset_value));
   1090          inst->mlen++;
   1091       }
   1092    }
   1093 
   1094    emit(inst);
   1095 
   1096    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
   1097     * spec requires layers.
   1098     */
   1099    if (op == ir_txs && devinfo->gen < 7) {
   1100       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
   1101       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
   1102                   src_reg(inst->dst), brw_imm_d(1));
   1103    }
   1104 
   1105    if (devinfo->gen == 6 && op == ir_tg4) {
   1106       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   1107    }
   1108 
   1109    if (op == ir_query_levels) {
   1110       /* # levels is in .w */
   1111       src_reg swizzled(dest);
   1112       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
   1113                                       SWIZZLE_W, SWIZZLE_W);
   1114       emit(MOV(dest, swizzled));
   1115    }
   1116 }
   1117 
   1118 /**
   1119  * Apply workarounds for Gen6 gather with UINT/SINT
   1120  */
   1121 void
   1122 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
   1123 {
   1124    if (!wa)
   1125       return;
   1126 
   1127    int width = (wa & WA_8BIT) ? 8 : 16;
   1128    dst_reg dst_f = dst;
   1129    dst_f.type = BRW_REGISTER_TYPE_F;
   1130 
   1131    /* Convert from UNORM to UINT */
   1132    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
   1133    emit(MOV(dst, src_reg(dst_f)));
   1134 
   1135    if (wa & WA_SIGN) {
   1136       /* Reinterpret the UINT value as a signed INT value by
   1137        * shifting the sign bit into place, then shifting back
   1138        * preserving sign.
   1139        */
   1140       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
   1141       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
   1142    }
   1143 }
   1144 
   1145 void
   1146 vec4_visitor::gs_emit_vertex(int /* stream_id */)
   1147 {
   1148    unreachable("not reached");
   1149 }
   1150 
   1151 void
   1152 vec4_visitor::gs_end_primitive()
   1153 {
   1154    unreachable("not reached");
   1155 }
   1156 
   1157 void
   1158 vec4_visitor::emit_ndc_computation()
   1159 {
   1160    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
   1161       return;
   1162 
   1163    /* Get the position */
   1164    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
   1165 
   1166    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   1167    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   1168    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   1169    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
   1170 
   1171    current_annotation = "NDC";
   1172    dst_reg ndc_w = ndc;
   1173    ndc_w.writemask = WRITEMASK_W;
   1174    src_reg pos_w = pos;
   1175    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   1176    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
   1177 
   1178    dst_reg ndc_xyz = ndc;
   1179    ndc_xyz.writemask = WRITEMASK_XYZ;
   1180 
   1181    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
   1182 }
   1183 
   1184 void
   1185 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
   1186 {
   1187    if (devinfo->gen < 6 &&
   1188        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
   1189         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
   1190         devinfo->has_negative_rhw_bug)) {
   1191       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
   1192       dst_reg header1_w = header1;
   1193       header1_w.writemask = WRITEMASK_W;
   1194 
   1195       emit(MOV(header1, brw_imm_ud(0u)));
   1196 
   1197       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
   1198 	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
   1199 
   1200 	 current_annotation = "Point size";
   1201 	 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
   1202 	 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
   1203       }
   1204 
   1205       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
   1206          current_annotation = "Clipping flags";
   1207          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
   1208          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
   1209 
   1210          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
   1211          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
   1212          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
   1213 
   1214          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
   1215          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
   1216          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
   1217          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
   1218       }
   1219 
   1220       /* i965 clipping workaround:
   1221        * 1) Test for -ve rhw
   1222        * 2) If set,
   1223        *      set ndc = (0,0,0,0)
   1224        *      set ucp[6] = 1
   1225        *
   1226        * Later, clipping will detect ucp[6] and ensure the primitive is
   1227        * clipped against all fixed planes.
   1228        */
   1229       if (devinfo->has_negative_rhw_bug &&
   1230           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
   1231          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
   1232          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
   1233          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
   1234          vec4_instruction *inst;
   1235          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
   1236          inst->predicate = BRW_PREDICATE_NORMAL;
   1237          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
   1238          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
   1239          inst->predicate = BRW_PREDICATE_NORMAL;
   1240       }
   1241 
   1242       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   1243    } else if (devinfo->gen < 6) {
   1244       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   1245    } else {
   1246       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
   1247       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
   1248          dst_reg reg_w = reg;
   1249          reg_w.writemask = WRITEMASK_W;
   1250          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
   1251          reg_as_src.type = reg_w.type;
   1252          reg_as_src.swizzle = brw_swizzle_for_size(1);
   1253          emit(MOV(reg_w, reg_as_src));
   1254       }
   1255       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
   1256          dst_reg reg_y = reg;
   1257          reg_y.writemask = WRITEMASK_Y;
   1258          reg_y.type = BRW_REGISTER_TYPE_D;
   1259          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
   1260          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
   1261       }
   1262       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
   1263          dst_reg reg_z = reg;
   1264          reg_z.writemask = WRITEMASK_Z;
   1265          reg_z.type = BRW_REGISTER_TYPE_D;
   1266          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
   1267          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
   1268       }
   1269    }
   1270 }
   1271 
   1272 vec4_instruction *
   1273 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
   1274 {
   1275    assert(varying < VARYING_SLOT_MAX);
   1276 
   1277    unsigned num_comps = output_num_components[varying][component];
   1278    if (num_comps == 0)
   1279       return NULL;
   1280 
   1281    assert(output_reg[varying][component].type == reg.type);
   1282    current_annotation = output_reg_annotation[varying];
   1283    if (output_reg[varying][component].file != BAD_FILE) {
   1284       src_reg src = src_reg(output_reg[varying][component]);
   1285       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
   1286       reg.writemask =
   1287          brw_writemask_for_component_packing(num_comps, component);
   1288       return emit(MOV(reg, src));
   1289    }
   1290    return NULL;
   1291 }
   1292 
   1293 void
   1294 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
   1295 {
   1296    reg.type = BRW_REGISTER_TYPE_F;
   1297    output_reg[varying][0].type = reg.type;
   1298 
   1299    switch (varying) {
   1300    case VARYING_SLOT_PSIZ:
   1301    {
   1302       /* PSIZ is always in slot 0, and is coupled with other flags. */
   1303       current_annotation = "indices, point width, clip flags";
   1304       emit_psiz_and_flags(reg);
   1305       break;
   1306    }
   1307    case BRW_VARYING_SLOT_NDC:
   1308       current_annotation = "NDC";
   1309       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
   1310          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
   1311       break;
   1312    case VARYING_SLOT_POS:
   1313       current_annotation = "gl_Position";
   1314       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
   1315          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
   1316       break;
   1317    case VARYING_SLOT_EDGE:
   1318       /* This is present when doing unfilled polygons.  We're supposed to copy
   1319        * the edge flag from the user-provided vertex array
   1320        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
   1321        * of that attribute (starts as 1.0f).  This is then used in clipping to
   1322        * determine which edges should be drawn as wireframe.
   1323        */
   1324       current_annotation = "edge flag";
   1325       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
   1326                                     glsl_type::float_type, WRITEMASK_XYZW))));
   1327       break;
   1328    case BRW_VARYING_SLOT_PAD:
   1329       /* No need to write to this slot */
   1330       break;
   1331    default:
   1332       for (int i = 0; i < 4; i++) {
   1333          emit_generic_urb_slot(reg, varying, i);
   1334       }
   1335       break;
   1336    }
   1337 }
   1338 
   1339 static int
   1340 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
   1341 {
   1342    if (devinfo->gen >= 6) {
   1343       /* URB data written (does not include the message header reg) must
   1344        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
   1345        * section 5.4.3.2.2: URB_INTERLEAVED.
   1346        *
   1347        * URB entries are allocated on a multiple of 1024 bits, so an
   1348        * extra 128 bits written here to make the end align to 256 is
   1349        * no problem.
   1350        */
   1351       if ((mlen % 2) != 1)
   1352 	 mlen++;
   1353    }
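            /* For example, a message of one header plus three data registers
             * (mlen == 4) is padded to mlen == 5 so that the data portion stays a
             * multiple of two registers (256 bits); one header plus four data
             * registers (mlen == 5) is already aligned and left alone.
             */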
   1354 
   1355    return mlen;
   1356 }
   1357 
   1358 
   1359 /**
   1360  * Generates the VUE payload plus the necessary URB write instructions to
   1361  * output it.
   1362  *
   1363  * The VUE layout is documented in Volume 2a.
   1364  */
   1365 void
   1366 vec4_visitor::emit_vertex()
   1367 {
   1368    /* MRF 0 is reserved for the debugger, so start with message header
   1369     * in MRF 1.
   1370     */
   1371    int base_mrf = 1;
   1372    int mrf = base_mrf;
   1373    /* In the process of generating our URB write message contents, we
   1374     * may need to unspill a register or load from an array.  Those
   1375     * reads would use MRFs 14-15.
   1376     */
   1377    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
   1378 
   1379    /* The following assertion verifies that max_usable_mrf causes an
   1380     * even-numbered amount of URB write data, which will meet gen6's
   1381     * requirements for length alignment.
   1382     */
   1383    assert ((max_usable_mrf - base_mrf) % 2 == 0);
   1384 
   1385    /* First mrf is the g0-based message header containing URB handles and
   1386     * such.
   1387     */
   1388    emit_urb_write_header(mrf++);
   1389 
   1390    if (devinfo->gen < 6) {
   1391       emit_ndc_computation();
   1392    }
   1393 
   1394    /* We may need to split this up into several URB writes, so do them in a
   1395     * loop.
   1396     */
   1397    int slot = 0;
   1398    bool complete = false;
   1399    do {
   1400       /* URB offset is in URB row increments, and each of our MRFs is half of
   1401        * one of those, since we're doing interleaved writes.
   1402        */
   1403       int offset = slot / 2;
   1404 
   1405       mrf = base_mrf + 1;
   1406       for (; slot < prog_data->vue_map.num_slots; ++slot) {
   1407          emit_urb_slot(dst_reg(MRF, mrf++),
   1408                        prog_data->vue_map.slot_to_varying[slot]);
   1409 
   1410          /* If this was max_usable_mrf, we can't fit anything more into this
   1411           * URB WRITE. Same thing if we reached the maximum length available.
   1412           */
   1413          if (mrf > max_usable_mrf ||
   1414              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
   1415             slot++;
   1416             break;
   1417          }
   1418       }
   1419 
   1420       complete = slot >= prog_data->vue_map.num_slots;
   1421       current_annotation = "URB write";
   1422       vec4_instruction *inst = emit_urb_write_opcode(complete);
   1423       inst->base_mrf = base_mrf;
   1424       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
   1425       inst->offset += offset;
   1426    } while(!complete);
   1427 }
   1428 
   1429 
   1430 src_reg
   1431 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
   1432 				 src_reg *reladdr, int reg_offset)
   1433 {
   1434    /* Because we store the values to scratch interleaved like our
   1435     * vertex data, we need to scale the vec4 index by 2.
   1436     */
   1437    int message_header_scale = 2;
   1438 
   1439    /* Pre-gen6, the message header uses byte offsets instead of vec4
   1440     * (16-byte) offset units.
   1441     */
   1442    if (devinfo->gen < 6)
   1443       message_header_scale *= 16;
   1444 
   1445    if (reladdr) {
   1446       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
   1447        * to multiply the reladdr by 2. Notice that the reg_offset part
   1448        * is in units of 16 bytes and is used to select the low/high 16-byte
   1449        * chunk of a full dvec4, so we don't want to multiply that part.
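                *
                * In other words, for 64-bit types the computed index is
                *
                *    reladdr * 2 * message_header_scale +
                *    reg_offset * message_header_scale
                *
                * whereas for 32-bit types it is simply
                *
                *    (reladdr + reg_offset) * message_header_scale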
   1450        */
   1451       src_reg index = src_reg(this, glsl_type::int_type);
   1452       if (type_sz(inst->dst.type) < 8) {
   1453          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
   1454                                       brw_imm_d(reg_offset)));
   1455          emit_before(block, inst, MUL(dst_reg(index), index,
   1456                                       brw_imm_d(message_header_scale)));
   1457       } else {
   1458          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
   1459                                       brw_imm_d(message_header_scale * 2)));
   1460          emit_before(block, inst, ADD(dst_reg(index), index,
   1461                                       brw_imm_d(reg_offset * message_header_scale)));
   1462       }
   1463       return index;
   1464    } else {
   1465       return brw_imm_d(reg_offset * message_header_scale);
   1466    }
   1467 }
   1468 
   1469 /**
   1470  * Emits an instruction before @inst to load the value named by @orig_src
   1471  * from scratch space at @base_offset to @temp.
   1472  *
   1473  * @base_offset is measured in 32-byte units (the size of a register).
   1474  */
   1475 void
   1476 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
   1477 				dst_reg temp, src_reg orig_src,
   1478 				int base_offset)
   1479 {
   1480    assert(orig_src.offset % REG_SIZE == 0);
   1481    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   1482    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
   1483                                       reg_offset);
   1484 
   1485    if (type_sz(orig_src.type) < 8) {
   1486       emit_before(block, inst, SCRATCH_READ(temp, index));
   1487    } else {
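               /* 64-bit data is stored in scratch in a shuffled 32-bit layout
                * spanning two registers (see emit_scratch_write()), so emit two
                * 32-bit reads and un-shuffle the result into the destination.
                */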
   1488       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
   1489       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
   1490       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
   1491       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
   1492       vec4_instruction *last_read =
   1493          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
   1494       emit_before(block, inst, last_read);
   1495       shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
   1496    }
   1497 }
   1498 
   1499 /**
   1500  * Emits an instruction after @inst to store the value to be written
   1501  * to @orig_dst to scratch space at @base_offset, from @temp.
   1502  *
   1503  * @base_offset is measured in 32-byte units (the size of a register).
   1504  */
   1505 void
   1506 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
   1507                                  int base_offset)
   1508 {
   1509    assert(inst->dst.offset % REG_SIZE == 0);
   1510    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   1511    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
   1512                                       reg_offset);
   1513 
   1514    /* Create a temporary register to store *inst's result in.
   1515     *
   1516     * We have to be careful in MOVing from our temporary result register in
   1517     * the scratch write.  If we swizzle from channels of the temporary that
   1518     * weren't initialized, it will confuse live interval analysis, which will
   1519     * make spilling fail to make progress.
   1520     */
   1521    bool is_64bit = type_sz(inst->dst.type) == 8;
   1522    const glsl_type *alloc_type =
   1523       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   1524    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
   1525                                        inst->dst.type),
   1526                                 brw_swizzle_for_mask(inst->dst.writemask));
   1527 
   1528    if (!is_64bit) {
   1529       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
   1530 				          inst->dst.writemask));
   1531       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
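               /* Propagate the predicate to the scratch write, except for SEL:
                * there the predicate only chooses between the two sources and all
                * enabled channels of the destination are still written, so the
                * write itself must not be made conditional.
                */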
   1532       if (inst->opcode != BRW_OPCODE_SEL)
   1533          write->predicate = inst->predicate;
   1534       write->ir = inst->ir;
   1535       write->annotation = inst->annotation;
   1536       inst->insert_after(block, write);
   1537    } else {
   1538       dst_reg shuffled = dst_reg(this, alloc_type);
   1539       vec4_instruction *last =
   1540          shuffle_64bit_data(shuffled, temp, true, block, inst);
   1541       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
   1542 
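               /* After the shuffle each 64-bit channel occupies two 32-bit
                * channels: the original X/Y components land in the XY/ZW halves
                * of the first register and Z/W in the XY/ZW halves of the second,
                * so translate the writemask accordingly.
                */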
   1543       uint8_t mask = 0;
   1544       if (inst->dst.writemask & WRITEMASK_X)
   1545          mask |= WRITEMASK_XY;
   1546       if (inst->dst.writemask & WRITEMASK_Y)
   1547          mask |= WRITEMASK_ZW;
   1548       if (mask) {
   1549          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
   1550 
   1551          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
   1552          if (inst->opcode != BRW_OPCODE_SEL)
   1553             write->predicate = inst->predicate;
   1554          write->ir = inst->ir;
   1555          write->annotation = inst->annotation;
   1556          last->insert_after(block, write);
   1557       }
   1558 
   1559       mask = 0;
   1560       if (inst->dst.writemask & WRITEMASK_Z)
   1561          mask |= WRITEMASK_XY;
   1562       if (inst->dst.writemask & WRITEMASK_W)
   1563          mask |= WRITEMASK_ZW;
   1564       if (mask) {
   1565          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
   1566 
   1567          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
   1568                                             reg_offset + 1);
   1569          vec4_instruction *write =
   1570             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
   1571          if (inst->opcode != BRW_OPCODE_SEL)
   1572             write->predicate = inst->predicate;
   1573          write->ir = inst->ir;
   1574          write->annotation = inst->annotation;
   1575          last->insert_after(block, write);
   1576       }
   1577    }
   1578 
   1579    inst->dst.file = temp.file;
   1580    inst->dst.nr = temp.nr;
   1581    inst->dst.offset %= REG_SIZE;
   1582    inst->dst.reladdr = NULL;
   1583 }
   1584 
   1585 /**
   1586  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
   1587  * adds the scratch read(s) before \p inst. The function also checks for
   1588  * recursive reladdr scratch accesses, issuing the corresponding scratch
   1589  * loads and rewriting reladdr references accordingly.
   1590  *
   1591  * \return \p src if it did not require a scratch load, otherwise, the
   1592  * register holding the result of the scratch load that the caller should
   1593  * use to rewrite src.
   1594  */
   1595 src_reg
   1596 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
   1597                                    vec4_instruction *inst, src_reg src)
   1598 {
   1599    /* Resolve recursive reladdr scratch access by calling ourselves
   1600     * with src.reladdr
   1601     */
   1602    if (src.reladdr)
   1603       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
   1604                                           *src.reladdr);
   1605 
   1606    /* Now handle scratch access on src */
   1607    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
   1608       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
   1609          glsl_type::dvec4_type : glsl_type::vec4_type);
   1610       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
   1611       src.nr = temp.nr;
   1612       src.offset %= REG_SIZE;
   1613       src.reladdr = NULL;
   1614    }
   1615 
   1616    return src;
   1617 }
   1618 
   1619 /**
   1620  * We can't generally support array access in GRF space, because a
   1621  * single instruction's destination can only span 2 contiguous
   1622  * registers.  So, we send all GRF arrays that get variable index
   1623  * access to scratch space.
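          *
          * For example, a shader temporary indexed with a non-constant
          * expression, such as
          *
          *    vec4 verts[8];
          *    ...
          *    color = verts[i];
          *
          * has its backing storage moved to scratch and is then accessed through
          * scratch read/write messages.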
   1624  */
   1625 void
   1626 vec4_visitor::move_grf_array_access_to_scratch()
   1627 {
   1628    int scratch_loc[this->alloc.count];
   1629    memset(scratch_loc, -1, sizeof(scratch_loc));
   1630 
   1631    /* First, calculate the set of virtual GRFs that need to be punted
   1632     * to scratch due to having any array access on them, and where in
   1633     * scratch.
   1634     */
   1635    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
   1636       if (inst->dst.file == VGRF && inst->dst.reladdr) {
   1637          if (scratch_loc[inst->dst.nr] == -1) {
   1638             scratch_loc[inst->dst.nr] = last_scratch;
   1639             last_scratch += this->alloc.sizes[inst->dst.nr];
   1640          }
   1641 
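                  /* Anything in the reladdr chain that is itself accessed with a
                   * reladdr also has to live in scratch.
                   */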
   1642          for (src_reg *iter = inst->dst.reladdr;
   1643               iter->reladdr;
   1644               iter = iter->reladdr) {
   1645             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
   1646                scratch_loc[iter->nr] = last_scratch;
   1647                last_scratch += this->alloc.sizes[iter->nr];
   1648             }
   1649          }
   1650       }
   1651 
   1652       for (int i = 0 ; i < 3; i++) {
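               /* Likewise for the sources: src[i] and anything in its reladdr
                * chain that is accessed with a reladdr gets a scratch location.
                */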
   1653          for (src_reg *iter = &inst->src[i];
   1654               iter->reladdr;
   1655               iter = iter->reladdr) {
   1656             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
   1657                scratch_loc[iter->nr] = last_scratch;
   1658                last_scratch += this->alloc.sizes[iter->nr];
   1659             }
   1660          }
   1661       }
   1662    }
   1663 
   1664    /* Now, for anything that will be accessed through scratch, rewrite
   1665     * it to load/store.  Note that this is a _safe list walk, because
   1666     * we may generate a new scratch_write instruction after the one
   1667     * we're processing.
   1668     */
   1669    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
   1670       /* Set up the annotation tracking for new generated instructions. */
   1671       base_ir = inst->ir;
   1672       current_annotation = inst->annotation;
   1673 
   1674       /* First handle scratch access on the dst. Notice we have to handle
   1675        * the case where the dst's reladdr also points to scratch space.
   1676        */
   1677       if (inst->dst.reladdr)
   1678          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
   1679                                                    *inst->dst.reladdr);
   1680 
   1681       /* Now that we have handled any (possibly recursive) reladdr scratch
   1682        * accesses for dst we can safely do the scratch write for dst itself
   1683        */
   1684       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
   1685          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
   1686 
   1687       /* Now handle scratch access on any src. In this case, since inst->src[i]
   1688        * already is a src_reg, we can just call emit_resolve_reladdr with
   1689        * inst->src[i] and it will take care of handling scratch loads for
   1690        * both src and src.reladdr (recursively).
   1691        */
   1692       for (int i = 0 ; i < 3; i++) {
   1693          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
   1694                                              inst->src[i]);
   1695       }
   1696    }
   1697 }
   1698 
   1699 /**
   1700  * Emits an instruction before @inst to load the value named by @orig_src
   1701  * from the pull constant buffer (surface) at @base_offset to @temp.
   1702  */
   1703 void
   1704 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
   1705                                       dst_reg temp, src_reg orig_src,
   1706                                       int base_offset, src_reg indirect)
   1707 {
   1708    assert(orig_src.offset % 16 == 0);
   1709    const unsigned index = prog_data->base.binding_table.pull_constants_start;
   1710 
    1711    /* For 64-bit loads we need to emit two 32-bit load messages, and we also
    1712     * need to shuffle the 32-bit data result into proper 64-bit data. To do
    1713     * that we emit the 32-bit loads into a temporary and shuffle the result
    1714     * into the original destination.
    1715     */
   1716    dst_reg orig_temp = temp;
   1717    bool is_64bit = type_sz(orig_src.type) == 8;
   1718    if (is_64bit) {
   1719       assert(type_sz(temp.type) == 8);
   1720       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
   1721       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
   1722    }
   1723 
   1724    src_reg src = orig_src;
   1725    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
   1726       int reg_offset = base_offset + src.offset / 16;
   1727 
   1728       src_reg offset;
   1729       if (indirect.file != BAD_FILE) {
   1730          offset = src_reg(this, glsl_type::uint_type);
   1731          emit_before(block, inst, ADD(dst_reg(offset), indirect,
   1732                                       brw_imm_ud(reg_offset * 16)));
   1733       } else if (devinfo->gen >= 8) {
   1734          /* Store the offset in a GRF so we can send-from-GRF. */
   1735          offset = src_reg(this, glsl_type::uint_type);
   1736          emit_before(block, inst, MOV(dst_reg(offset),
   1737                                       brw_imm_ud(reg_offset * 16)));
   1738       } else {
   1739          offset = brw_imm_d(reg_offset * 16);
   1740       }
   1741 
   1742       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
   1743                                   brw_imm_ud(index),
   1744                                   offset,
   1745                                   block, inst);
   1746 
   1747       src = byte_offset(src, 16);
   1748    }
   1749 
   1750    brw_mark_surface_used(&prog_data->base, index);
   1751 
   1752    if (is_64bit) {
   1753       temp = retype(temp, BRW_REGISTER_TYPE_DF);
   1754       shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
   1755    }
   1756 }
   1757 
   1758 /**
   1759  * Implements array access of uniforms by inserting a
   1760  * PULL_CONSTANT_LOAD instruction.
   1761  *
   1762  * Unlike temporary GRF array access (where we don't support it due to
   1763  * the difficulty of doing relative addressing on instruction
   1764  * destinations), we could potentially do array access of uniforms
   1765  * that were loaded in GRF space as push constants.  In real-world
   1766  * usage we've seen, though, the arrays being used are always larger
   1767  * than we could load as push constants, so just always move all
   1768  * uniform array access out to a pull constant buffer.
   1769  */
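          *
          * For example, indexing a uniform array with a dynamic expression,
          * such as
          *
          *    uniform vec4 colors[64];
          *    ...
          *    color = colors[i];
          *
          * becomes a pull constant load from the constant buffer.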
   1770 void
   1771 vec4_visitor::move_uniform_array_access_to_pull_constants()
   1772 {
    1773    /* The Vulkan driver doesn't support pull constants other than UBOs, so
   1774     * everything has to be pushed regardless.
   1775     */
   1776    if (stage_prog_data->pull_param == NULL) {
   1777       split_uniform_registers();
   1778       return;
   1779    }
   1780 
   1781    int pull_constant_loc[this->uniforms];
   1782    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
   1783 
   1784    /* First, walk through the instructions and determine which things need to
   1785     * be pulled.  We mark something as needing to be pulled by setting
   1786     * pull_constant_loc to 0.
   1787     */
   1788    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
   1789       /* We only care about MOV_INDIRECT of a uniform */
   1790       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
   1791           inst->src[0].file != UNIFORM)
   1792          continue;
   1793 
   1794       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
   1795 
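               /* src[2] of MOV_INDIRECT holds the size of the region that may be
                * read, in bytes, so flag every 16-byte uniform slot in that range
                * as needing to be pulled.
                */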
   1796       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
   1797          pull_constant_loc[uniform_nr + j] = 0;
   1798    }
   1799 
   1800    /* Next, we walk the list of uniforms and assign real pull constant
   1801     * locations and set their corresponding entries in pull_param.
   1802     */
   1803    for (int j = 0; j < this->uniforms; j++) {
   1804       if (pull_constant_loc[j] < 0)
   1805          continue;
   1806 
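               /* nr_pull_params counts scalar components, so dividing by 4 gives
                * the vec4 slot this uniform will occupy in the pull buffer.
                */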
   1807       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
   1808 
   1809       for (int i = 0; i < 4; i++) {
   1810          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
   1811             = stage_prog_data->param[j * 4 + i];
   1812       }
   1813    }
   1814 
   1815    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
   1816     * instructions to actual uniform pulls.
   1817     */
   1818    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
   1819       /* We only care about MOV_INDIRECT of a uniform */
   1820       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
   1821           inst->src[0].file != UNIFORM)
   1822          continue;
   1823 
   1824       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
   1825 
   1826       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
   1827 
   1828       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
   1829                               pull_constant_loc[uniform_nr], inst->src[1]);
   1830       inst->remove(block);
   1831    }
   1832 
   1833    /* Now there are no accesses of the UNIFORM file with a reladdr, so
   1834     * no need to track them as larger-than-vec4 objects.  This will be
   1835     * relied on in cutting out unused uniform vectors from push
   1836     * constants.
   1837     */
   1838    split_uniform_registers();
   1839 }
   1840 
   1841 void
   1842 vec4_visitor::resolve_ud_negate(src_reg *reg)
   1843 {
   1844    if (reg->type != BRW_REGISTER_TYPE_UD ||
   1845        !reg->negate)
   1846       return;
   1847 
   1848    src_reg temp = src_reg(this, glsl_type::uvec4_type);
   1849    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   1850    *reg = temp;
   1851 }
   1852 
   1853 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
   1854                            void *log_data,
   1855                            const struct brw_sampler_prog_key_data *key_tex,
   1856                            struct brw_vue_prog_data *prog_data,
   1857                            const nir_shader *shader,
   1858 			   void *mem_ctx,
   1859                            bool no_spills,
   1860                            int shader_time_index)
   1861    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
   1862      key_tex(key_tex),
   1863      prog_data(prog_data),
   1864      fail_msg(NULL),
   1865      first_non_payload_grf(0),
   1866      need_all_constants_in_pull_buffer(false),
   1867      no_spills(no_spills),
   1868      shader_time_index(shader_time_index),
   1869      last_scratch(0)
   1870 {
   1871    this->failed = false;
   1872 
   1873    this->base_ir = NULL;
   1874    this->current_annotation = NULL;
   1875    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
   1876 
   1877    memset(this->output_num_components, 0, sizeof(this->output_num_components));
   1878 
   1879    this->virtual_grf_start = NULL;
   1880    this->virtual_grf_end = NULL;
   1881    this->live_intervals = NULL;
   1882 
   1883    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
   1884 
   1885    this->uniforms = 0;
   1886 }
   1887 
   1888 vec4_visitor::~vec4_visitor()
   1889 {
   1890 }
   1891 
   1892 
   1893 void
   1894 vec4_visitor::fail(const char *format, ...)
   1895 {
   1896    va_list va;
   1897    char *msg;
   1898 
   1899    if (failed)
   1900       return;
   1901 
   1902    failed = true;
   1903 
   1904    va_start(va, format);
   1905    msg = ralloc_vasprintf(mem_ctx, format, va);
   1906    va_end(va);
   1907    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
   1908 
   1909    this->fail_msg = msg;
   1910 
   1911    if (debug_enabled) {
   1912       fprintf(stderr, "%s",  msg);
   1913    }
   1914 }
   1915 
   1916 } /* namespace brw */
   1917