/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */
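
/* A rough sketch of the flow below, in simplified pseudocode (the real
 * entry points carry more state than shown):
 *
 *    calculate_forward_deps(c, &schedule_list);    // RAW/WAW edges
 *    calculate_reverse_deps(c, &schedule_list);    // WAR edges
 *    compute_delay(node);              // critical-path weight per node
 *    while (!list_empty(&schedule_list)) {
 *            chosen = choose_instruction_to_schedule(...);
 *            merge = choose_instruction_to_schedule(..., chosen);
 *            // emit chosen, merged with merge when legal, then
 *            // release newly unblocked children into the list:
 *            mark_instruction_scheduled(...);
 *    }
 */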

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct list_head link;
        struct qinst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}
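
/* For example, given the (hypothetical) sequence
 *
 *    mov  rf3, rf1
 *    add  rf4, rf3, rf2
 *    mov  rf3, rf5
 *
 * the forward pass records a read-after-write edge from the first mov to
 * the add through last_rf[3], plus a write-after-write edge between the two
 * movs.  The write-after-read edge from the add to the second mov is found
 * by the reverse pass, where add_dep() swaps before/after and tags the edge
 * write_after_read.
 */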

static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}


static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                switch (waddr) {
                case V3D_QPU_WADDR_TMUS:
                case V3D_QPU_WADDR_TMUSCM:
                case V3D_QPU_WADDR_TMUSF:
                case V3D_QPU_WADDR_TMUSLOD:
                        add_write_dep(state, &state->last_tmu_config, n);
                        break;
                default:
                        break;
                }
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  enum v3d_qpu_cond cond)
{
        if (cond != V3D_QPU_COND_NONE)
                add_read_dep(state, state->last_sf, n);
}

static void
process_pf_deps(struct schedule_state *state, struct schedule_node *n,
                enum v3d_qpu_pf pf)
{
        if (pf != V3D_QPU_PF_NONE)
                add_write_dep(state, &state->last_sf, n);
}

static void
process_uf_deps(struct schedule_state *state, struct schedule_node *n,
                enum v3d_qpu_uf uf)
{
        if (uf != V3D_QPU_UF_NONE)
                add_write_dep(state, &state->last_sf, n);
}
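
/* E.g. an instruction predicated on the flags (".ifa") takes a read dep on
 * last_sf, while one that pushes new flags (".pushz") takes a write dep,
 * serializing it against both earlier flag readers and writers.
 */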

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        case V3D_QPU_A_FLAPUSH:
        case V3D_QPU_A_FLBPUSH:
        case V3D_QPU_A_VFLA:
        case V3D_QPU_A_VFLNA:
        case V3D_QPU_A_VFLB:
        case V3D_QPU_A_VFLNB:
                add_read_dep(state, state->last_sf, n);
                break;

        case V3D_QPU_A_FLBPOP:
                add_write_dep(state, &state->last_sf, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (inst->sig.ldtmu) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
        }

        if (inst->sig.wrtmuc)
                add_write_dep(state, &state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_read_dep(state, state->last_tlb, n);

        if (inst->sig.ldvpm)
                add_write_dep(state, &state->last_vpm_read, n);

        /* inst->sig.ldunif or sideband uniform read */
        if (qinst->uniform != ~0)
                add_write_dep(state, &state->last_unif, n);

        process_cond_deps(state, n, inst->flags.ac);
        process_cond_deps(state, n, inst->flags.mc);
        process_pf_deps(state, n, inst->flags.apf);
        process_pf_deps(state, n, inst->flags.mpf);
        process_uf_deps(state, n, inst->flags.auf);
        process_uf_deps(state, n, inst->flags.muf);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
{
        struct list_head *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.devinfo = c->devinfo;
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_add, last_waddr_mul;
        bool tlb_locked;
};

static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                if (scoreboard->last_waddr_add == inst->raddr_a ||
                    scoreboard->last_waddr_mul == inst->raddr_a) {
                        return true;
                }
                break;

        case V3D_QPU_MUX_B:
                if (scoreboard->last_waddr_add == inst->raddr_b ||
                    scoreboard->last_waddr_mul == inst->raddr_b) {
                        return true;
                }
                break;

        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}
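
/* As an example of the physical regfile hazard above: if the previous
 * instruction wrote rf12 through the add ALU, last_waddr_add is 12, so a
 * candidate whose A mux reads raddr_a == 12 on the very next tick would
 * break the "must not read what the previous instruction wrote" rule and
 * is rejected.
 */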

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
}
    569 get_instruction_priority(const struct v3d_qpu_instr *inst)
    570 {
    571         uint32_t baseline_score;
    572         uint32_t next_score = 0;
    573 
    574         /* Schedule TLB operations as late as possible, to get more
    575          * parallelism between shaders.
    576          */
    577         if (qpu_inst_is_tlb(inst))
    578                 return next_score;
    579         next_score++;
    580 
    581         /* Schedule texture read results collection late to hide latency. */
    582         if (inst->sig.ldtmu)
    583                 return next_score;
    584         next_score++;
    585 
    586         /* Default score for things that aren't otherwise special. */
    587         baseline_score = next_score;
    588         next_score++;
    589 
    590         /* Schedule texture read setup early to hide their latency better. */
    591         if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
    592             ((inst->alu.add.magic_write &&
    593               v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
    594              (inst->alu.mul.magic_write &&
    595               v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
    596                 return next_score;
    597         }
    598         next_score++;
    599 
    600         return baseline_score;
    601 }
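
/* The resulting ranking: TLB ops score 0, ldtmu 1, ordinary instructions 2,
 * and TMU setup writes 3.  choose_instruction_to_schedule() prefers higher
 * scores, so TMU setup drifts early in the block and TLB work drifts late.
 */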

static bool
qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
{
        return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
                v3d_qpu_magic_waddr_is_sfu(waddr) ||
                v3d_qpu_magic_waddr_is_tlb(waddr) ||
                v3d_qpu_magic_waddr_is_vpm(waddr) ||
                v3d_qpu_magic_waddr_is_tsy(waddr));
}

static bool
qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_uses_vpm(inst))
                return true;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
                        return true;
                }

                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                    inst->alu.mul.magic_write &&
                    qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
                        return true;
                }
        }

        return (inst->sig.ldvpm ||
                inst->sig.ldtmu ||
                inst->sig.ldtlb ||
                inst->sig.ldtlbu ||
                inst->sig.wrtmuc);
}

static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        /* Can't do more than one peripheral access in an instruction.
         *
         * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
         * WRTMUC with a TMU magic register write (other than tmuc).
         */
        if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
                return false;

        struct v3d_qpu_instr merge = *a;

        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op != V3D_QPU_A_NOP)
                        return false;
                merge.alu.add = b->alu.add;

                merge.flags.ac = b->flags.ac;
                merge.flags.apf = b->flags.apf;
                merge.flags.auf = b->flags.auf;
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
                    a->raddr_a != b->raddr_a) {
                        return false;
                }
                merge.raddr_a = b->raddr_a;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
                    a->raddr_b != b->raddr_b) {
                        return false;
                }
                merge.raddr_b = b->raddr_b;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}
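
/* A sketch of a merge that succeeds (hypothetical disassembly): an
 * add-ALU-only instruction paired with a mul-ALU-only one, e.g.
 *
 *    fadd rf10, rf1, rf2          (mul op is NOP)
 *    fmul rf11, r0, r1            (add op is NOP)
 *
 * combine into one "fadd rf10, rf1, rf2 ; fmul rf11, r0, r1" QPU
 * instruction; the fmul reads accumulators, so there is no raddr_a/raddr_b
 * conflict, and v3d_qpu_instr_pack() has the final say on whether the
 * merged encoding is legal.
 */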

static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                               struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(schedule_list)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_sfu_write_tick = scoreboard->tick;
}

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst)
{
        scoreboard->last_waddr_add = ~0;
        scoreboard->last_waddr_mul = ~0;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr);
                } else {
                        scoreboard->last_waddr_add = inst->alu.add.waddr;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr);
                } else {
                        scoreboard->last_waddr_mul = inst->alu.mul.waddr;
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(const struct v3d_device_info *devinfo,
           struct list_head *schedule_list)
{
        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ? 'w' : 'r');
                }
        }
}

static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
                return 100;

        /* Assume that anything depending on us is consuming the SFU result. */
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                return 3;

        return 1;
}
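
/* E.g. with the numbers above, an SFU write feeding a dependent ALU op is
 * costed at 3 cycles, while a TMU coordinate write feeding an ldtmu is
 * costed at 100 so that unrelated work gets pulled in between the request
 * and the result collection.
 */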

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
        uint32_t latency = 1;

        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return latency;

        if (before_inst->alu.add.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(before_inst->alu.add.waddr,
                                                   after_inst));
        }

        if (before_inst->alu.mul.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(before_inst->alu.mul.waddr,
                                                   after_inst));
        }

        return latency;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay +
                                        instruction_latency(n, n->children[i].node));
                }
        }
}
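
/* E.g. for a chain "TMU setup -> ldtmu -> use": the leaf use gets delay 1,
 * the ldtmu gets 1 plus its latency to the use, and the setup node adds the
 * 100-cycle TMU latency on top, so the whole chain dominates the delay
 * tie-break in choose_instruction_to_schedule() and its head is favored
 * early.
 */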

static void
mark_instruction_scheduled(struct list_head *schedule_list,
                           uint32_t time,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                /* If the requirement is only that the node not appear before
                 * the last read of its destination, then it can be scheduled
                 * immediately after (or paired with!) the thing reading the
                 * destination.
                 */
                uint32_t latency = 0;
                if (!war_only) {
                        latency = instruction_latency(node,
                                                      node->children[i].node);
                }

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
                child->parent_count--;
                if (child->parent_count == 0)
                        list_add(&child->link, schedule_list);

                node->children[i].node = NULL;
        }
}
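
/* The war_only split matters in schedule_instructions() below: once an
 * instruction is chosen, its write-after-read children are released with
 * zero added latency (a WAR successor may even pair into the same QPU
 * instruction), while the remaining children are released afterwards with
 * the full instruction_latency() added to their unblocked_time.
 */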

static void
insert_scheduled_instruction(struct v3d_compile *c,
                             struct qblock *block,
                             struct choose_scoreboard *scoreboard,
                             struct qinst *inst)
{
        list_addtail(&inst->link, &block->instructions);

        update_scoreboard_for_chosen(scoreboard, &inst->qpu);
        c->qpu_inst_count++;
        scoreboard->tick++;
}

static struct qinst *
vir_nop()
{
        struct qreg undef = { QFILE_NULL, 0 };
        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}

static bool
qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
                                     const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Only TLB Z writes are prohibited in the last slot, but we don't
         * have those flagged so prohibit all TLB ops for now.
         */
        if (slot == 2 && qpu_inst_is_tlb(inst))
                return false;

        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_uses_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}

static bool
valid_thrsw_sequence(struct v3d_compile *c,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                /* No scheduling SFU when the result would land in the other
                 * thread.  The simulator complains for safety, though it
                 * would only occur for dead code in our case.
                 */
                if (slot > 0 &&
                    qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                    (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
                     v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                        return false;
                }

                if (slot > 0 && qinst->qpu.sig.ldvary)
                        return false;

                if (is_thrend &&
                    !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}

/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Find how far back into previous instructions we can put the THRSW. */
        int slots_filled = 0;
        struct qinst *merge_inst = NULL;
        vir_for_each_inst_rev(prev_inst, block) {
                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
                sig.thrsw = true;
                uint32_t packed_sig;

                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }

                merge_inst = prev_inst;
                if (++slots_filled == 3)
                        break;
        }

        bool needs_free = false;
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
        } else {
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
                merge_inst = inst;
        }

        /* Insert any extra delay slot NOPs we need. */
        for (int i = 0; i < 3 - slots_filled; i++) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
        if (inst->is_last_thrsw) {
                struct qinst *second_inst =
                        (struct qinst *)merge_inst->link.next;
                second_inst->qpu.sig.thrsw = true;
        }

        /* If we put our THRSW into another instruction, free up the
         * instruction that didn't end up scheduled into the list.
         */
        if (needs_free)
                free(inst);

        return time;
}
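
/* Placement examples for emit_thrsw(), sketching the extremes rather than
 * every case: if the three preceding instructions all pass
 * valid_thrsw_sequence(), the signal is folded onto the instruction three
 * back and its delay slots are covered by existing work, costing zero extra
 * NOPs; if nothing is mergeable, the THRSW is emitted on its own and two
 * delay-slot NOPs follow.
 */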
   1223 
   1224 static uint32_t
   1225 schedule_instructions(struct v3d_compile *c,
   1226                       struct choose_scoreboard *scoreboard,
   1227                       struct qblock *block,
   1228                       struct list_head *schedule_list,
   1229                       enum quniform_contents *orig_uniform_contents,
   1230                       uint32_t *orig_uniform_data,
   1231                       uint32_t *next_uniform)
   1232 {
   1233         const struct v3d_device_info *devinfo = c->devinfo;
   1234         uint32_t time = 0;
   1235 
   1236         if (debug) {
   1237                 fprintf(stderr, "initial deps:\n");
   1238                 dump_state(devinfo, schedule_list);
   1239                 fprintf(stderr, "\n");
   1240         }
   1241 
   1242         /* Remove non-DAG heads from the list. */
   1243         list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
   1244                 if (n->parent_count != 0)
   1245                         list_del(&n->link);
   1246         }
   1247 
   1248         while (!list_empty(schedule_list)) {
   1249                 struct schedule_node *chosen =
   1250                         choose_instruction_to_schedule(devinfo,
   1251                                                        scoreboard,
   1252                                                        schedule_list,
   1253                                                        NULL);
   1254                 struct schedule_node *merge = NULL;
   1255 
   1256                 /* If there are no valid instructions to schedule, drop a NOP
   1257                  * in.
   1258                  */
   1259                 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
   1260                 struct v3d_qpu_instr *inst = &qinst->qpu;
   1261 
   1262                 if (debug) {
   1263                         fprintf(stderr, "t=%4d: current list:\n",
   1264                                 time);
   1265                         dump_state(devinfo, schedule_list);
   1266                         fprintf(stderr, "t=%4d: chose:   ", time);
   1267                         v3d_qpu_dump(devinfo, inst);
   1268                         fprintf(stderr, "\n");
   1269                 }
   1270 
   1271                 /* Schedule this instruction onto the QPU list. Also try to
   1272                  * find an instruction to pair with it.
   1273                  */
   1274                 if (chosen) {
   1275                         time = MAX2(chosen->unblocked_time, time);
   1276                         list_del(&chosen->link);
   1277                         mark_instruction_scheduled(schedule_list, time,
   1278                                                    chosen, true);
   1279 
   1280                         merge = choose_instruction_to_schedule(devinfo,
   1281                                                                scoreboard,
   1282                                                                schedule_list,
   1283                                                                chosen);
   1284                         if (merge) {
   1285                                 time = MAX2(merge->unblocked_time, time);
   1286                                 list_del(&merge->link);
   1287                                 (void)qpu_merge_inst(devinfo, inst,
   1288                                                      inst, &merge->inst->qpu);
   1289                                 if (merge->inst->uniform != -1) {
   1290                                         chosen->inst->uniform =
   1291                                                 merge->inst->uniform;
   1292                                 }
   1293 
   1294                                 if (debug) {
   1295                                         fprintf(stderr, "t=%4d: merging: ",
   1296                                                 time);
   1297                                         v3d_qpu_dump(devinfo, &merge->inst->qpu);
   1298                                         fprintf(stderr, "\n");
   1299                                         fprintf(stderr, "         result: ");
   1300                                         v3d_qpu_dump(devinfo, inst);
   1301                                         fprintf(stderr, "\n");
   1302                                 }
   1303                         }
   1304                 }
   1305 
   1306                 /* Update the uniform index for the rewritten location --
   1307                  * branch target updating will still need to change
   1308                  * c->uniform_data[] using this index.
   1309                  */
   1310                 if (qinst->uniform != -1) {
   1311                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
   1312                                 block->branch_uniform = *next_uniform;
   1313 
   1314                         c->uniform_data[*next_uniform] =
   1315                                 orig_uniform_data[qinst->uniform];
   1316                         c->uniform_contents[*next_uniform] =
   1317                                 orig_uniform_contents[qinst->uniform];
   1318                         qinst->uniform = *next_uniform;
   1319                         (*next_uniform)++;
   1320                 }
   1321 
   1322                 if (debug) {
   1323                         fprintf(stderr, "\n");
   1324                 }
   1325 
   1326                 /* Now that we've scheduled a new instruction, some of its
   1327                  * children can be promoted to the list of instructions ready to
   1328                  * be scheduled.  Update the children's unblocked time for this
   1329                  * DAG edge as we do so.
   1330                  */
   1331                 mark_instruction_scheduled(schedule_list, time, chosen, false);
   1332 
   1333                 if (merge) {
   1334                         mark_instruction_scheduled(schedule_list, time, merge,
   1335                                                    false);
   1336 
   1337                         /* The merged VIR instruction doesn't get re-added to the
   1338                          * block, so free it now.
   1339                          */
   1340                         free(merge->inst);
   1341                 }
   1342 
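                        /* Thread switches have their own placement rules, so
                         * emit_thrsw() inserts the instruction itself and
                         * reports how many cycles it added.
                         */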
   1343                 if (inst->sig.thrsw) {
   1344                         time += emit_thrsw(c, block, scoreboard, qinst, false);
   1345                 } else {
   1346                         insert_scheduled_instruction(c, block,
   1347                                                      scoreboard, qinst);
   1348 
   1349                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
   1350                                 block->branch_qpu_ip = c->qpu_inst_count - 1;
   1351                                 /* Fill the delay slots with NOPs.
   1352                                  *
   1353                                  * Ideally these would be real instructions,
   1354                                  * but that has to wait for a later pass,
   1355                                  * once the leading instructions of the
   1356                                  * successor blocks are known (so that A/B
   1357                                  * register file write latency can be handled).
   1358                                  */
   1359                                 for (int i = 0; i < 3; i++)
   1360                                         emit_nop(c, block, scoreboard);
   1361                         }
   1362                 }
   1363         }
   1364 
   1365         return time;
   1366 }
   1367 
   1368 static uint32_t
   1369 qpu_schedule_instructions_block(struct v3d_compile *c,
   1370                                 struct choose_scoreboard *scoreboard,
   1371                                 struct qblock *block,
   1372                                 enum quniform_contents *orig_uniform_contents,
   1373                                 uint32_t *orig_uniform_data,
   1374                                 uint32_t *next_uniform)
   1375 {
   1376         void *mem_ctx = ralloc_context(NULL);
   1377         struct list_head schedule_list;
   1378 
   1379         list_inithead(&schedule_list);
   1380 
   1381         /* Wrap each instruction in a scheduler structure. */
   1382         while (!list_empty(&block->instructions)) {
   1383                 struct qinst *qinst = (struct qinst *)block->instructions.next;
   1384                 struct schedule_node *n =
   1385                         rzalloc(mem_ctx, struct schedule_node);
   1386 
   1387                 n->inst = qinst;
   1388 
   1389                 list_del(&qinst->link);
   1390                 list_addtail(&n->link, &schedule_list);
   1391         }
   1392 
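        /* Build the dependency DAG.  The forward walk records
         * read-after-write and write-after-write ordering, and the
         * reverse walk records write-after-read.
         */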
   1393         calculate_forward_deps(c, &schedule_list);
   1394         calculate_reverse_deps(c, &schedule_list);
   1395 
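        /* Compute each node's delay (the length of the slowest
         * dependency chain below it), which the instruction chooser
         * uses to prioritize the critical path.
         */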
   1396         list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
   1397                 compute_delay(n);
   1398         }
   1399 
   1400         uint32_t cycles = schedule_instructions(c, scoreboard, block,
   1401                                                 &schedule_list,
   1402                                                 orig_uniform_contents,
   1403                                                 orig_uniform_data,
   1404                                                 next_uniform);
   1405 
   1406         ralloc_free(mem_ctx);
   1407 
   1408         return cycles;
   1409 }
   1410 
   1411 static void
   1412 qpu_set_branch_targets(struct v3d_compile *c)
   1413 {
   1414         vir_for_each_block(block, c) {
   1415                 /* The end block of the program has no branch. */
   1416                 if (!block->successors[0])
   1417                         continue;
   1418 
   1419                 /* If there was no branch instruction, then the successor
   1420                  * block must follow immediately after this one.
   1421                  */
   1422                 if (block->branch_qpu_ip == ~0) {
   1423                         assert(block->end_qpu_ip + 1 ==
   1424                                block->successors[0]->start_qpu_ip);
   1425                         continue;
   1426                 }
   1427 
   1428                 /* Walk back past the three branch delay slots to find
   1429                  * the branch instruction.
   1430                  */
   1431                 struct list_head *entry = block->instructions.prev;
   1432                 for (int i = 0; i < 3; i++)
   1433                         entry = entry->prev;
   1434                 struct qinst *branch = container_of(entry, branch, link);
   1435                 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
   1436 
   1437                 /* Make sure that the fall-through (branch-not-taken)
   1438                  * successor was scheduled to start immediately after
   1439                  * the delay slots.
   1440                  */
   1441                 assert(!block->successors[1] ||
   1442                        block->successors[1]->start_qpu_ip ==
   1443                        block->branch_qpu_ip + 4);
   1444 
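                /* Branch offsets are in bytes, relative to the
                 * instruction after the delay slots (branch IP + 4),
                 * and each QPU instruction is 64 bits: a successor
                 * starting two instructions past the delay slots gets
                 * an offset of 2 * 8 = 16.
                 */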
   1445                 branch->qpu.branch.offset =
   1446                         ((block->successors[0]->start_qpu_ip -
   1447                           (block->branch_qpu_ip + 4)) *
   1448                          sizeof(uint64_t));
   1449 
   1450                 /* Set up the relative offset to jump in the uniform
   1451                  * stream, counted in 4-byte uniform slots.
   1452                  *
   1453                  * The entry can be patched in place: scheduling gave
   1454                  * each instruction its own uniform stream slot, so
   1455                  * uniform_data[branch->uniform] is not shared.
   1456                  */
   1457                 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
   1458                 c->uniform_data[branch->uniform] =
   1459                         (block->successors[0]->start_uniform -
   1460                          (block->branch_uniform + 1)) * 4;
   1461         }
   1462 }
   1463 
   1464 uint32_t
   1465 v3d_qpu_schedule_instructions(struct v3d_compile *c)
   1466 {
   1467         const struct v3d_device_info *devinfo = c->devinfo;
   1468         struct qblock *end_block = list_last_entry(&c->blocks,
   1469                                                    struct qblock, link);
   1470 
   1471         /* We reorder the uniforms as we schedule instructions, so save the
   1472          * old data off and replace it.
   1473          */
   1474         uint32_t *uniform_data = c->uniform_data;
   1475         enum quniform_contents *uniform_contents = c->uniform_contents;
   1476         c->uniform_contents = ralloc_array(c, enum quniform_contents,
   1477                                            c->num_uniforms);
   1478         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
   1479         c->uniform_array_size = c->num_uniforms;
   1480         uint32_t next_uniform = 0;
   1481 
   1482         struct choose_scoreboard scoreboard;
   1483         memset(&scoreboard, 0, sizeof(scoreboard));
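        /* Seed the hazard-tracking state so nothing at the top of the
         * program looks like a conflict: ~0 matches no real waddr, and
         * the tick counters start far enough in the past that their
         * latency windows have already expired.
         */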
   1484         scoreboard.last_waddr_add = ~0;
   1485         scoreboard.last_waddr_mul = ~0;
   1486         scoreboard.last_ldvary_tick = -10;
   1487         scoreboard.last_sfu_write_tick = -10;
   1488         scoreboard.last_uniforms_reset_tick = -10;
   1489 
   1490         if (debug) {
   1491                 fprintf(stderr, "Pre-schedule instructions\n");
   1492                 vir_for_each_block(block, c) {
   1493                         fprintf(stderr, "BLOCK %d\n", block->index);
   1494                         list_for_each_entry(struct qinst, qinst,
   1495                                             &block->instructions, link) {
   1496                                 v3d_qpu_dump(devinfo, &qinst->qpu);
   1497                                 fprintf(stderr, "\n");
   1498                         }
   1499                 }
   1500                 fprintf(stderr, "\n");
   1501         }
   1502 
   1503         uint32_t cycles = 0;
   1504         vir_for_each_block(block, c) {
   1505                 block->start_qpu_ip = c->qpu_inst_count;
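                /* ~0 marks "no branch emitted yet";
                 * qpu_set_branch_targets() checks for it when fixing
                 * up targets.
                 */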
   1506                 block->branch_qpu_ip = ~0;
   1507                 block->start_uniform = next_uniform;
   1508 
   1509                 cycles += qpu_schedule_instructions_block(c,
   1510                                                           &scoreboard,
   1511                                                           block,
   1512                                                           uniform_contents,
   1513                                                           uniform_data,
   1514                                                           &next_uniform);
   1515 
   1516                 block->end_qpu_ip = c->qpu_inst_count - 1;
   1517         }
   1518 
   1519         /* Emit the program-end THRSW instruction. */
   1520         struct qinst *thrsw = vir_nop();
   1521         thrsw->qpu.sig.thrsw = true;
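        /* emit_thrsw() will try to fold the THRSW signal into an
         * already-scheduled instruction at the end of the block rather
         * than appending a separate one when it can.
         */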
   1522         emit_thrsw(c, end_block, &scoreboard, thrsw, true);
   1523 
   1524         qpu_set_branch_targets(c);
   1525 
   1526         assert(next_uniform == c->num_uniforms);
   1527 
   1528         return cycles;
   1529 }
   1530