/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "glsl/ir_print_visitor.h"

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
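 *
 * For example, once a long-latency math instruction is issued, its
 * consumers' unblocked times move well past the current clock, so the
 * scheduler keeps picking independent instructions (which unblock
 * immediately) and only returns to the math result's users when nothing
 * cheaper is available.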
 */

class schedule_node : public exec_node
{
public:
   schedule_node(fs_inst *inst)
   {
      this->inst = inst;
      this->child_array_size = 0;
      this->children = NULL;
      this->child_latency = NULL;
      this->child_count = 0;
      this->parent_count = 0;
      this->unblocked_time = 0;

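      /* Rough cost model for the shared math unit: latency is estimated
       * as (rounds per operation) * (channels) * (per-round cost), with
       * the channel count assuming SIMD8 dispatch.  Everything else gets
       * a small default latency.
       */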
      int chans = 8;
      int math_latency = 22;

      switch (inst->opcode) {
      case SHADER_OPCODE_RCP:
         this->latency = 1 * chans * math_latency;
         break;
      case SHADER_OPCODE_RSQ:
         this->latency = 2 * chans * math_latency;
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_LOG2:
         /* full precision log.  partial is 2. */
         this->latency = 3 * chans * math_latency;
         break;
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_EXP2:
         /* full precision.  partial is 3, same throughput. */
         this->latency = 4 * chans * math_latency;
         break;
      case SHADER_OPCODE_POW:
         this->latency = 8 * chans * math_latency;
         break;
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* minimum latency, max is 12 rounds. */
         this->latency = 5 * chans * math_latency;
         break;
      default:
         this->latency = 2;
         break;
      }
   }

   fs_inst *inst;
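   /* Dependency DAG edges: each child must be scheduled after this node,
    * and child_latency[i] is the delay before children[i] is considered
    * unblocked once this node has been scheduled.
    */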
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;
};

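/* Per-basic-block scheduler state: instructions are pulled off the
 * visitor's list into schedule_node wrappers, dependencies are computed
 * over them, and they are emitted back out in the chosen order.
 */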
class instruction_scheduler {
public:
   instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
   {
      this->v = v;
      this->mem_ctx = ralloc_context(mem_ctx);
      this->virtual_grf_count = virtual_grf_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void add_inst(fs_inst *inst);
   void calculate_deps();
   void schedule_instructions(fs_inst *next_block_header);

   bool is_compressed(fs_inst *inst);

   void *mem_ctx;

   int instructions_to_schedule;
   int virtual_grf_count;
   exec_list instructions;
   fs_visitor *v;
};

void
instruction_scheduler::add_inst(fs_inst *inst)
{
   schedule_node *n = new(mem_ctx) schedule_node(inst);

   assert(!inst->is_head_sentinel());
   assert(!inst->is_tail_sentinel());

   this->instructions_to_schedule++;

   inst->remove();
   instructions.push_tail(n);
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

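   /* Grow the child/latency arrays as needed: start at 16 entries and
    * double from there.
    */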
   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
instruction_scheduler::is_compressed(fs_inst *inst)
{
   return (v->c->dispatch_width == 16 &&
           !inst->force_uncompressed &&
           !inst->force_sechalf);
}

void
instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[virtual_grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* The last instruction always needs to still be the last
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
    * WHILE) and scheduling other things after it would disturb the
    * basic block, or it's FB_WRITE and we should do a better job at
    * dead code elimination anyway.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_list(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(last_grf_write[inst->src[i].reg], n);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(last_mrf_write[inst->base_mrf + i], n);
      }

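      /* Predicated instructions read the flag register, so they depend
       * on whatever last wrote a conditional_mod.
       */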
      if (inst->predicated) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
         add_dep(last_grf_write[inst->dst.reg], n);
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
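         /* A compressed (SIMD16) instruction writes two MRFs: reg + 4
          * for COMPR4 messages, otherwise the next register.
          */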
         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
       * conditional_mod, because it sets the flag register.
       */
      if (inst->conditional_mod ||
          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
        !node->is_head_sentinel();
        node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(n, last_grf_write[inst->src[i].reg]);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
      }

      if (inst->predicated) {
         add_dep(n, last_conditional_mod);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == GRF) {
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
       * conditional_mod, because it sets the flag register.
       */
      if (inst->conditional_mod ||
          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
         last_conditional_mod = n;
      }
   }
}

void
instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
{
   int time = 0;

   /* Remove non-DAG heads from the list. */
   foreach_list_safe(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      if (n->parent_count != 0)
         n->remove();
   }

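   /* Main loop: pick the ready instruction with the earliest unblocked
    * time, emit it ahead of the next block's header, and unblock its
    * children.
    */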
   while (!instructions.is_empty()) {
      schedule_node *chosen = NULL;
      int chosen_time = 0;

      foreach_list(node, &instructions) {
         schedule_node *n = (schedule_node *)node;

         if (!chosen || n->unblocked_time < chosen_time) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      next_block_header->insert_before(chosen->inst);
      instructions_to_schedule--;

      /* Bump the clock.  If we expected a delay for scheduling, then
       * bump the clock to reflect that.
       */
      time = MAX2(time + 1, chosen_time);

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled.  Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = 0; i < chosen->child_count; i++) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         child->parent_count--;
         if (child->parent_count == 0) {
            instructions.push_tail(child);
         }
      }

      /* Shared resource: the mathbox.  There's one per EU on later
       * generations (and it's even more limited pre-gen6), so once we
       * send something off to it, the next math instruction isn't going
       * to make progress until the first is done.
       */
      if (chosen->inst->is_math()) {
         foreach_list(node, &instructions) {
            schedule_node *n = (schedule_node *)node;

            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);
}

void
fs_visitor::schedule_instructions()
{
   fs_inst *next_block_header = (fs_inst *)instructions.head;
   instruction_scheduler sched(this, mem_ctx, this->virtual_grf_count);

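   /* Schedule one basic block at a time; control-flow instructions
    * terminate the current block.
    */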
   while (!next_block_header->is_tail_sentinel()) {
      /* Add things to be scheduled until we get to a new BB. */
      while (!next_block_header->is_tail_sentinel()) {
         fs_inst *inst = next_block_header;
         next_block_header = (fs_inst *)next_block_header->next;

         sched.add_inst(inst);
         if (inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_ELSE ||
             inst->opcode == BRW_OPCODE_ENDIF ||
             inst->opcode == BRW_OPCODE_DO ||
             inst->opcode == BRW_OPCODE_WHILE ||
             inst->opcode == BRW_OPCODE_BREAK ||
             inst->opcode == BRW_OPCODE_CONTINUE) {
            break;
         }
      }
      sched.calculate_deps();
      sched.schedule_instructions(next_block_header);
   }

   this->live_intervals_valid = false;
}