Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright © 2012, 2013, 2014 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "brw_vec4.h"
     25 #include "brw_vec4_live_variables.h"
     26 #include "brw_cfg.h"
     27 
     28 using namespace brw;
     29 
     30 /** @file brw_vec4_cse.cpp
     31  *
     32  * Support for local common subexpression elimination.
     33  *
     34  * See Muchnick's Advanced Compiler Design and Implementation, section
     35  * 13.1 (p378).
     36  */
     37 
     38 namespace {
     39 struct aeb_entry : public exec_node {
     40    /** The instruction that generates the expression value. */
     41    vec4_instruction *generator;
     42 
     43    /** The temporary where the value is stored. */
     44    src_reg tmp;
     45 };
     46 }
     47 
     48 static bool
     49 is_expression(const vec4_instruction *const inst)
     50 {
     51    switch (inst->opcode) {
     52    case BRW_OPCODE_MOV:
     53    case BRW_OPCODE_SEL:
     54    case BRW_OPCODE_NOT:
     55    case BRW_OPCODE_AND:
     56    case BRW_OPCODE_OR:
     57    case BRW_OPCODE_XOR:
     58    case BRW_OPCODE_SHR:
     59    case BRW_OPCODE_SHL:
     60    case BRW_OPCODE_ASR:
     61    case BRW_OPCODE_CMP:
     62    case BRW_OPCODE_CMPN:
     63    case BRW_OPCODE_ADD:
     64    case BRW_OPCODE_MUL:
     65    case SHADER_OPCODE_MULH:
     66    case BRW_OPCODE_FRC:
     67    case BRW_OPCODE_RNDU:
     68    case BRW_OPCODE_RNDD:
     69    case BRW_OPCODE_RNDE:
     70    case BRW_OPCODE_RNDZ:
     71    case BRW_OPCODE_LINE:
     72    case BRW_OPCODE_PLN:
     73    case BRW_OPCODE_MAD:
     74    case BRW_OPCODE_LRP:
     75    case VEC4_OPCODE_UNPACK_UNIFORM:
     76    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
     77    case SHADER_OPCODE_BROADCAST:
     78    case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
     79    case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
     80       return true;
     81    case SHADER_OPCODE_RCP:
     82    case SHADER_OPCODE_RSQ:
     83    case SHADER_OPCODE_SQRT:
     84    case SHADER_OPCODE_EXP2:
     85    case SHADER_OPCODE_LOG2:
     86    case SHADER_OPCODE_POW:
     87    case SHADER_OPCODE_INT_QUOTIENT:
     88    case SHADER_OPCODE_INT_REMAINDER:
     89    case SHADER_OPCODE_SIN:
     90    case SHADER_OPCODE_COS:
     91       return inst->mlen == 0;
     92    default:
     93       return false;
     94    }
     95 }
     96 
     97 static bool
     98 operands_match(const vec4_instruction *a, const vec4_instruction *b)
     99 {
    100    const src_reg *xs = a->src;
    101    const src_reg *ys = b->src;
    102 
    103    if (a->opcode == BRW_OPCODE_MAD) {
    104       return xs[0].equals(ys[0]) &&
    105              ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
    106               (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
    107    } else if (!a->is_commutative()) {
    108       return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]);
    109    } else {
    110       return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
    111              (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
    112    }
    113 }
    114 
    115 static bool
    116 instructions_match(vec4_instruction *a, vec4_instruction *b)
    117 {
    118    return a->opcode == b->opcode &&
    119           a->saturate == b->saturate &&
    120           a->predicate == b->predicate &&
    121           a->predicate_inverse == b->predicate_inverse &&
    122           a->conditional_mod == b->conditional_mod &&
    123           a->flag_subreg == b->flag_subreg &&
    124           a->dst.type == b->dst.type &&
    125           a->offset == b->offset &&
    126           a->mlen == b->mlen &&
    127           a->base_mrf == b->base_mrf &&
    128           a->header_size == b->header_size &&
    129           a->shadow_compare == b->shadow_compare &&
    130           a->dst.writemask == b->dst.writemask &&
    131           a->force_writemask_all == b->force_writemask_all &&
    132           a->size_written == b->size_written &&
    133           a->exec_size == b->exec_size &&
    134           a->group == b->group &&
    135           operands_match(a, b);
    136 }
    137 
    138 bool
    139 vec4_visitor::opt_cse_local(bblock_t *block)
    140 {
    141    bool progress = false;
    142    exec_list aeb;
    143 
    144    void *cse_ctx = ralloc_context(NULL);
    145 
    146    int ip = block->start_ip;
    147    foreach_inst_in_block (vec4_instruction, inst, block) {
    148       /* Skip some cases. */
    149       if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
    150           ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
    151            inst->dst.is_null()))
    152       {
    153          bool found = false;
    154 
    155          foreach_in_list_use_after(aeb_entry, entry, &aeb) {
    156             /* Match current instruction's expression against those in AEB. */
    157             if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
    158                 instructions_match(inst, entry->generator)) {
    159                found = true;
    160                progress = true;
    161                break;
    162             }
    163          }
    164 
    165          if (!found) {
    166             if (inst->opcode != BRW_OPCODE_MOV ||
    167                 (inst->opcode == BRW_OPCODE_MOV &&
    168                  inst->src[0].file == IMM &&
    169                  inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
    170                /* Our first sighting of this expression.  Create an entry. */
    171                aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
    172                entry->tmp = src_reg(); /* file will be BAD_FILE */
    173                entry->generator = inst;
    174                aeb.push_tail(entry);
    175             }
    176          } else {
    177             /* This is at least our second sighting of this expression.
    178              * If we don't have a temporary already, make one.
    179              */
    180             bool no_existing_temp = entry->tmp.file == BAD_FILE;
    181             if (no_existing_temp && !entry->generator->dst.is_null()) {
    182                entry->tmp = retype(src_reg(VGRF, alloc.allocate(
    183                                               regs_written(entry->generator)),
    184                                            NULL), inst->dst.type);
    185 
    186                const unsigned width = entry->generator->exec_size;
    187                unsigned component_size = width * type_sz(entry->tmp.type);
    188                unsigned num_copy_movs =
    189                   DIV_ROUND_UP(entry->generator->size_written, component_size);
    190                for (unsigned i = 0; i < num_copy_movs; ++i) {
    191                   vec4_instruction *copy =
    192                      MOV(offset(entry->generator->dst, width, i),
    193                          offset(entry->tmp, width, i));
    194                   copy->exec_size = width;
    195                   copy->group = entry->generator->group;
    196                   copy->force_writemask_all =
    197                      entry->generator->force_writemask_all;
    198                   entry->generator->insert_after(block, copy);
    199                }
    200 
    201                entry->generator->dst = dst_reg(entry->tmp);
    202             }
    203 
    204             /* dest <- temp */
    205             if (!inst->dst.is_null()) {
    206                assert(inst->dst.type == entry->tmp.type);
    207                const unsigned width = inst->exec_size;
    208                unsigned component_size = width * type_sz(inst->dst.type);
    209                unsigned num_copy_movs =
    210                   DIV_ROUND_UP(inst->size_written, component_size);
    211                for (unsigned i = 0; i < num_copy_movs; ++i) {
    212                   vec4_instruction *copy =
    213                      MOV(offset(inst->dst, width, i),
    214                          offset(entry->tmp, width, i));
    215                   copy->exec_size = inst->exec_size;
    216                   copy->group = inst->group;
    217                   copy->force_writemask_all = inst->force_writemask_all;
    218                   inst->insert_before(block, copy);
    219                }
    220             }
    221 
    222             /* Set our iterator so that next time through the loop inst->next
    223              * will get the instruction in the basic block after the one we've
    224              * removed.
    225              */
    226             vec4_instruction *prev = (vec4_instruction *)inst->prev;
    227 
    228             inst->remove(block);
    229             inst = prev;
    230          }
    231       }
    232 
    233       foreach_in_list_safe(aeb_entry, entry, &aeb) {
    234          /* Kill all AEB entries that write a different value to or read from
    235           * the flag register if we just wrote it.
    236           */
    237          if (inst->writes_flag()) {
    238             if (entry->generator->reads_flag() ||
    239                 (entry->generator->writes_flag() &&
    240                  !instructions_match(inst, entry->generator))) {
    241                entry->remove();
    242                ralloc_free(entry);
    243                continue;
    244             }
    245          }
    246 
    247          for (int i = 0; i < 3; i++) {
    248             src_reg *src = &entry->generator->src[i];
    249 
    250             /* Kill all AEB entries that use the destination we just
    251              * overwrote.
    252              */
    253             if (inst->dst.file == entry->generator->src[i].file &&
    254                 inst->dst.nr == entry->generator->src[i].nr) {
    255                entry->remove();
    256                ralloc_free(entry);
    257                break;
    258             }
    259 
    260             /* Kill any AEB entries using registers that don't get reused any
    261              * more -- a sure sign they'll fail operands_match().
    262              */
    263             if (src->file == VGRF) {
    264                if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
    265                   entry->remove();
    266                   ralloc_free(entry);
    267                   break;
    268                }
    269             }
    270          }
    271       }
    272 
    273       ip++;
    274    }
    275 
    276    ralloc_free(cse_ctx);
    277 
    278    return progress;
    279 }
    280 
    281 bool
    282 vec4_visitor::opt_cse()
    283 {
    284    bool progress = false;
    285 
    286    calculate_live_intervals();
    287 
    288    foreach_block (block, cfg) {
    289       progress = opt_cse_local(block) || progress;
    290    }
    291 
    292    if (progress)
    293       invalidate_live_intervals();
    294 
    295    return progress;
    296 }
    297