/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H

#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
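    *
    * A minimal usage sketch (illustrative only; "v" stands for any
    * backend_shader-derived visitor and the registers are placeholders):
    *
    *    const vec4_builder bld = vec4_builder(v).at_end();
    *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
    *    bld.MOV(tmp, brw_imm_f(1.0f));
    *    bld.ADD(tmp, src_reg(tmp), brw_imm_f(2.0f));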
    */
   class vec4_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef brw::src_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef brw::dst_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef vec4_instruction instruction;

      /**
       * Construct a vec4_builder that inserts instructions into \p shader.
       */
      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width), _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct a vec4_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size), _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct a vec4_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      vec4_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         vec4_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct a vec4_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      vec4_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
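       *
       * For example, on an 8-wide builder group(4, 1) yields a builder whose
       * instructions apply to the second group of four channels.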
       */
      vec4_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         vec4_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
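       *
       * For example, exec_all().MOV(dst, src) emits a MOV that writes every
       * channel regardless of predication and control flow.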
       */
      vec4_builder
      exec_all(bool b = true) const
      {
         vec4_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      vec4_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         vec4_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (four for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for four logical
       * components in this IR).
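       *
       * For example, vgrf(BRW_REGISTER_TYPE_F) allocates a single vec4 of
       * 32-bit floats, while vgrf(BRW_REGISTER_TYPE_DF) allocates twice the
       * space per unit since 64-bit components occupy two 32-bit slots.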
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                     n * DIV_ROUND_UP(type_sz(type), 4))),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dst, src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         inst->exec_size = dispatch_width();
         inst->group = group();
         inst->force_writemask_all = force_writemask_all;
         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
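       *
       * For example, emit_minmax(dst, a, b, BRW_CONDITIONAL_GE) computes the
       * per-channel maximum of a and b.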
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
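       *
       * Useful to obtain a dynamically uniform value (e.g. a surface index)
       * from a source that may differ across channels.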
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         const vec4_builder ubld = exec_all();
         const dst_reg chan_index =
            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

         return src_reg(dst);
      }

      /**
       * Assorted arithmetic ops.
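       *
       * Each macro below defines a convenience wrapper named after the
       * opcode; e.g. ALU2(ADD) defines ADD(dst, src0, src1), which is
       * shorthand for emit(BRW_OPCODE_ADD, dst, src0, src1).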
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
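       *
       * A typical use is to feed a subsequent predicated instruction
       * (illustrative sketch; dst, a, b, x and y are placeholders):
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_L);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, x, y));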
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Gen6 IF with embedded comparison.
       */
      instruction *
      IF(const src_reg &src0, const src_reg &src1,
         brw_conditional_mod condition) const
      {
         assert(shader->devinfo->gen == 6);
         return set_condmod(condition,
                            emit(BRW_OPCODE_IF,
                                 null_reg_d(),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Emit a linear interpolation instruction.
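       *
       * Computes x * (1 - a) + y * a per channel, i.e. \p x when \p a is
       * 0.0 and \p y when \p a is 1.0.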
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      backend_shader *shader;

   protected:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for the details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for register access modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
          * able to use vertical stride of zero to replicate the vec4 uniform, like
          *
          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
          *
          * But you can't, since vertical stride is always four in three-source
          * instructions. Instead, insert a MOV instruction to do the replication so
          * that the three-source instruction can consume it.
          */

         /* The MOV is only needed if the source is a uniform or immediate. */
         if (src.file != UNIFORM && src.file != IMM)
            return src;

         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
            return src;

         const dst_reg expanded = vgrf(src.type);
         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
         return src_reg(expanded);
      }

      /**
       * Workaround for register access modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* The gen6 math instruction ignores the source modifiers --
          * swizzle, abs, negate, and at least some parts of the register
          * region description.
          *
          * Rather than trying to enumerate all these cases, *always* expand the
          * operand to a temp GRF for gen6.
          *
          * For gen7, keep the operand as-is, except if immediate, which gen7 still
          * can't use.
          */
         if (shader->devinfo->gen == 6 ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return src_reg(tmp);
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen == 6 &&
             inst->dst.writemask != WRITEMASK_XYZW) {
            const dst_reg tmp = vgrf(inst->dst.type);
            MOV(inst->dst, src_reg(tmp));
            inst->dst = tmp;

         } else if (shader->devinfo->gen < 6) {
            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
            inst->base_mrf = 1;
            inst->mlen = sources;
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif