/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
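
      /* Usage sketch (illustrative; the names `v` and `tmp` are hypothetical):
       * a pass usually builds over the whole program and appends at the end of
       * the instruction list, possibly rescoping later with at() or group().
       *
       *    const fs_builder bld = fs_builder(v, v->dispatch_width).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, brw_imm_f(1.0f));
       */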

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }
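
      /* Usage sketch (illustrative; assumes a SIMD16 builder `bld` and SIMD16
       * registers `dst` and `src`): split an operation into two SIMD8 halves,
       * e.g. when a message only supports 8-wide execution.
       *
       *    for (unsigned i = 0; i < 2; i++)
       *       bld.half(i).MOV(half(dst, i), half(src, i));
       *
       * half(reg, i) above refers to the fs_reg helper that selects the i-th
       * SIMD8 half of a SIMD16 register.
       */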

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
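
      /* Usage sketch (illustrative; `header` is a hypothetical register):
       * exec_all() is the usual way to compute a value that must be written
       * regardless of which channels are enabled, e.g. message headers.
       *
       *    const fs_builder ubld = bld.exec_all().group(1, 0);
       *    ubld.MOV(header, brw_imm_ud(0));
       *
       * group(1, 0) additionally narrows the write to a single channel.
       */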

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
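
      /* Usage sketch (illustrative; `x` and `y` are hypothetical sources):
       * vgrf() sizes the allocation by the builder's dispatch width, so the
       * same call works for SIMD8 and SIMD16 code.  A two-component float
       * temporary would be:
       *
       *    const fs_reg xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *    bld.MOV(xy, x);
       *    bld.MOV(offset(xy, bld, 1), y);
       *
       * where offset(reg, bld, 1) selects the second dispatch_width-sized
       * component.
       */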

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
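
      /* Usage sketch (illustrative): emit_minmax() is a thin wrapper around a
       * conditional SEL.  Assuming a builder `bld` and float registers:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);   // dst = min(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);  // dst = max(a, b)
       */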

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
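
      /* Usage sketch (illustrative; `idx` is a hypothetical per-channel
       * value): emit_uniformize() turns a potentially divergent value into
       * one taken from a live channel, e.g. before using it as a surface or
       * sampler index in a send message.
       *
       *    const fs_reg uniform_idx = bld.emit_uniformize(idx);
       *    // ...emit the message using uniform_idx as the index source.
       */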

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
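
      /* Usage sketch (illustrative; a, b, c and dst are hypothetical float
       * registers): the helpers above expand to one method per opcode, so
       * straight-line code reads close to the generated assembly.
       *
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MUL(tmp, a, b);
       *    bld.ADD(dst, tmp, c);     // dst = a * b + c
       *    bld.MAD(dst, c, b, a);    // same result; MAD is src0 + src1 * src2
       */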

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
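
      /* Usage sketch (illustrative): CMP is usually paired with a predicated
       * instruction or an IF that consumes the flag register it writes.
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       *
       * selects a in channels where a >= b and b elsewhere.
       */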

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
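
      /* Usage sketch (illustrative; color0, color1 and t are hypothetical
       * float registers):
       *
       *    bld.LRP(color, color0, color1, t);
       *
       * computes color = color0 * (1 - t) + color1 * t on all generations,
       * using the hardware LRP instruction where available.
       */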

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
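
      /* Usage sketch (illustrative; u and v are hypothetical coordinate
       * registers): LOAD_PAYLOAD gathers its sources into one contiguous
       * register range, as required by send-like messages.
       *
       *    const fs_reg srcs[] = { u, v };
       *    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *    bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
       *
       * The final argument (0 here) is the number of leading header registers
       * at the start of the payload.
       */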

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif