Home | History | Annotate | Download | only in vc4
      1 /*
      2  * Copyright  2014 Broadcom
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include <inttypes.h>
     25 
     26 #include "vc4_context.h"
     27 #include "vc4_qir.h"
     28 #include "vc4_qpu.h"
     29 #include "util/ralloc.h"
     30 
     31 static void
     32 vc4_dump_program(struct vc4_compile *c)
     33 {
     34         fprintf(stderr, "%s prog %d/%d QPU:\n",
     35                 qir_get_stage_name(c->stage),
     36                 c->program_id, c->variant_id);
     37 
     38         for (int i = 0; i < c->qpu_inst_count; i++) {
     39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
     40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
     41                 fprintf(stderr, "\n");
     42         }
     43         fprintf(stderr, "\n");
     44 }
     45 
     46 static void
     47 queue(struct qblock *block, uint64_t inst)
     48 {
     49         struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
     50         q->inst = inst;
     51         list_addtail(&q->link, &block->qpu_inst_list);
     52 }
     53 
     54 static uint64_t *
     55 last_inst(struct qblock *block)
     56 {
     57         struct queued_qpu_inst *q =
     58                 (struct queued_qpu_inst *)block->qpu_inst_list.prev;
     59         return &q->inst;
     60 }
     61 
     62 static void
     63 set_last_cond_add(struct qblock *block, uint32_t cond)
     64 {
     65         *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
     66 }
     67 
     68 static void
     69 set_last_cond_mul(struct qblock *block, uint32_t cond)
     70 {
     71         *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
     72 }
     73 
     74 /**
     75  * Some special registers can be read from either file, which lets us resolve
     76  * raddr conflicts without extra MOVs.
     77  */
     78 static bool
     79 swap_file(struct qpu_reg *src)
     80 {
     81         switch (src->addr) {
     82         case QPU_R_UNIF:
     83         case QPU_R_VARY:
     84                 if (src->mux == QPU_MUX_SMALL_IMM) {
     85                         return false;
     86                 } else {
     87                         if (src->mux == QPU_MUX_A)
     88                                 src->mux = QPU_MUX_B;
     89                         else
     90                                 src->mux = QPU_MUX_A;
     91                         return true;
     92                 }
     93 
     94         default:
     95                 return false;
     96         }
     97 }
     98 
     99 /**
    100  * Sets up the VPM read FIFO before we do any VPM read.
    101  *
    102  * VPM reads (vertex attribute input) and VPM writes (varyings output) from
    103  * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
    104  * VPM block.  In the VS/CS (unlike in the FS), the block starts out
    105  * uninitialized, and you need to emit setup to the block before any VPM
    106  * reads/writes.
    107  *
    108  * VRI has a FIFO in each direction, with each FIFO able to hold four
    109  * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
    110  * writes go through the write FIFO.  The read/write setup values from QPU go
    111  * through the write FIFO as well, with a sideband signal indicating that
    112  * they're setup values.  Once a read setup reaches the other side of the
    113  * FIFO, the VPM block will start asynchronously reading vertex attributes and
    114  * filling the read FIFO -- that way hopefully the QPU doesn't have to block
    115  * on reads later.
    116  *
    117  * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
    118  * time, which is 4 vec4s.  If more than that is being read (since we support
    119  * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
    120  *
    121  * The existence of the FIFO makes it seem like you should be able to emit
    122  * both setups for the 5-8 attribute cases and then do all the attribute
    123  * reads.  However, once the setup value makes it to the other end of the
    124  * write FIFO, it will immediately update the VPM block's setup register.
    125  * That updated setup register would be used for read FIFO fills from then on,
    126  * breaking whatever remaining VPM values were supposed to be read into the
    127  * read FIFO from the previous attribute set.
    128  *
    129  * As a result, we need to emit the read setup, pull every VPM read value from
    130  * that setup, and only then emit the second setup if applicable.
    131  */
    132 static void
    133 setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
    134 {
    135         if (c->num_inputs_in_fifo) {
    136                 c->num_inputs_in_fifo--;
    137                 return;
    138         }
    139 
    140         c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
    141 
    142         queue(block,
    143               qpu_load_imm_ui(qpu_vrsetup(),
    144                               c->vpm_read_offset |
    145                               0x00001a00 |
    146                               ((c->num_inputs_in_fifo & 0xf) << 20)));
    147         c->num_inputs_remaining -= c->num_inputs_in_fifo;
    148         c->vpm_read_offset += c->num_inputs_in_fifo;
    149 
    150         c->num_inputs_in_fifo--;
    151 }
    152 
    153 /**
    154  * This is used to resolve the fact that we might register-allocate two
    155  * different operands of an instruction to the same physical register file
    156  * even though instructions have only one field for the register file source
    157  * address.
    158  *
    159  * In that case, we need to move one to a temporary that can be used in the
    160  * instruction, instead.  We reserve ra14/rb14 for this purpose.
    161  */
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        /* A small immediate occupies the B-file raddr slot, so treat it as
         * a B-file read for conflict detection.
         */
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        /* No conflict if src0 doesn't use a regfile raddr (accumulator
         * muxes are <= r5), if the two sources read different files, or if
         * they are literally the same read.
         */
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        /* Some special addresses exist in both files (see swap_file()), so
         * flipping one source's file may resolve the conflict for free.
         */
        if (swap_file(src0) || swap_file(src1))
                return;

        /* Otherwise, copy src0 through the reserved ra14/rb14 temporary in
         * the opposite file.
         */
        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}
    203 
    204 static void
    205 set_last_dst_pack(struct qblock *block, struct qinst *inst)
    206 {
    207         bool had_pm = *last_inst(block) & QPU_PM;
    208         bool had_ws = *last_inst(block) & QPU_WS;
    209         uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
    210 
    211         if (!inst->dst.pack)
    212                 return;
    213 
    214         *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
    215 
    216         if (qir_is_mul(inst)) {
    217                 assert(!unpack || had_pm);
    218                 *last_inst(block) |= QPU_PM;
    219         } else {
    220                 assert(!unpack || !had_pm);
    221                 assert(!had_ws); /* dst must be a-file to pack. */
    222         }
    223 }
    224 
    225 static void
    226 handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
    227                     struct qpu_reg dst)
    228 {
    229         if (dst.mux != QPU_MUX_R4)
    230                 queue(block, qpu_a_MOV(dst, qpu_r4()));
    231         else if (qinst->sf)
    232                 queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
    233 }
    234 
/**
 * Translates one block of QIR instructions into queued QPU instructions.
 *
 * For each qinst: resolve its sources and destination to QPU register
 * encodings (accumulating any unpack bits), then either handle special ops
 * explicitly or emit a generic ALU instruction via the translate[] table,
 * patching condition, pack, and SF bits into the last queued instruction.
 */
static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        /* Used only to assert that VPM reads happen in attribute order. */
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Maps QIR opcodes to their QPU add/mul-pipe opcode for the
                 * generic case at the bottom of the switch.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },

                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
                };

                /* Unpack bits to OR into whichever instruction ends up
                 * reading the packed source.
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        /* Only one distinct unpack mode can
                                         * be encoded per instruction.
                                         */
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* May emit a VPM read setup first; see
                                 * setup_for_vpm_read().
                                 */
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

                /* Resolve the destination to a QPU write address. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                /* Tracks whether the emitted code already encoded
                 * qinst->cond; asserted at the bottom of the loop.
                 */
                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the operand to the SFU register and
                         * pick the result up from r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;


                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
                        /* Remembered so the last THRSW in the program can be
                         * upgraded to LAST_THRSW in vc4_generate_code().
                         */
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        /* Generic ALU op, looked up in translate[]. */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}
    580 
/**
 * Generates the final QPU program for a compile: register-allocates,
 * translates each block, schedules, and then applies the end-of-program
 * fixups required by the hardware (no VPM/uniform/TLB access or stray
 * signal in the thread-end instruction, PROG_END signal, trailing NOPs,
 * and scoreboard unlock for fragment shaders).
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* VS/CS must emit VPM write setup before any VPM output. */
                c->num_inputs_remaining = c->num_inputs;
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until earlier has
         * finished.  Otherwise, if the earlier thread was hitting the same
         * quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        /* Remembered so NOPs appended below can be added to the estimate. */
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate)
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* Tag the (now safe) final instruction as the thread end, and pad
         * with the two delay-slot NOPs that follow PROG_END.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders release the tile scoreboard in the last
                 * delay-slot instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}
    689