/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vc4_qpu.h"

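/* Reports a validation failure: prints MSG and the disassembly of the
 * offending instruction, then aborts.
 */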
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

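/* Returns true if either the add or the mul ALU writes to register address W. */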
static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}

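/* Returns true if any of the four ALU operands reads register address R,
 * from physical regfile A unless ignore_a is set, or from physical regfile B
 * unless ignore_b is set.
 */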
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        struct {
                uint32_t mux, addr;
        } src_regs[] = {
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        /* Load immediates don't read any registers. */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                if (!ignore_a &&
                    src_regs[i].mux == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_regs[i].mux == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}

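/* Convenience wrappers over _reads_reg(): check reads from either regfile,
 * from regfile A only, or from regfile B only.
 */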
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}

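/* Returns true if the instruction writes any of the SFU registers, which
 * kick off a special-function computation whose result lands in accumulator
 * r4.
 */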
static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions") of the VideoCore IV 3D architecture
 * specification.
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

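        /* Find the program end instruction, tracking along the way whether
         * the shader locks the scoreboard or switches threads, and check the
         * restrictions on the end instruction and its delay slots.
         */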
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }

                /* "The Thread End instruction must not write to either physical
                 *  regfile A or B."
                 */
                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
                        fail_instr(inst, "write to phys reg in thread end");
                }

                /* Can't trigger an implicit wait on scoreboard in the program
                 * end instruction.
                 */
                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
                        fail_instr(inst, "implicit sb wait in program end");

                /* Two delay slots will be executed after the Thread End, so
                 * both must be present in the instruction stream.
                 */
                assert(i + 2 < num_inst);

                for (int j = i; j < i + 2; j++) {
                        /* "The last three instructions of any program
                         *  (Thread End plus the following two delay-slot
                         *  instructions) must not do varyings read, uniforms
                         *  read or any kind of VPM, VDR, or VDW read or
                         *  write."
                         */
                        if (writes_reg(insts[j], QPU_W_VPM) ||
                            reads_reg(insts[j], QPU_R_VARY) ||
                            reads_reg(insts[j], QPU_R_UNIF) ||
                            reads_reg(insts[j], QPU_R_VPM)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "using fixed functions");
                        }

                        /* "The Thread End instruction and the following two
                         *  delay slot instructions must not write or read
                         *  address 14 in either regfile A or B."
                         */
                        if (writes_reg(insts[j], 14) ||
                            reads_reg(insts[j], 14)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "must not use r14");
                        }
                }

                /* "The final program instruction (the second delay slot
                 *  instruction) must not do a TLB Z write."
                 */
                if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
                        fail_instr(insts[i + 2], "final instruction doing "
                                   "Z write");
                }
        }

        /* "A scoreboard wait must not occur in the first two instructions of
         *  a fragment shader. This is either the explicit Wait for Scoreboard
         *  signal or an implicit wait with the first tile-buffer read or
         *  write instruction."
         */
        for (int i = 0; i < 2; i++) {
                uint64_t inst = insts[i];

                if (qpu_inst_is_tlb(inst))
                        fail_instr(inst, "sb wait in first two insts");
        }

        /* "If TMU_NOSWAP is written, the write must be three instructions
         *  before the first TMU write instruction.  For example, if
         *  TMU_NOSWAP is written in the first shader instruction, the first
         *  TMU write cannot occur before the 4th shader instruction."
         */
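        /* Start far enough in the past that instructions at the top of the
         * program can never trip the "too soon" check below.
         */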
        int last_tmu_noswap = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if ((i - last_tmu_noswap) <= 3 &&
                    (writes_reg(inst, QPU_W_TMU0_S) ||
                     writes_reg(inst, QPU_W_TMU1_S))) {
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
                }

                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                        last_tmu_noswap = i;
        }

        /* "An instruction must not read from a location in physical regfile A
         *  or B that was written to by the previous instruction."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                uint32_t waddr_a, waddr_b;

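                /* The WS bit swaps which physical regfile each ALU's write
                 * lands in: with WS set, the add ALU writes regfile B and the
                 * mul ALU writes regfile A.
                 */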
                if (inst & QPU_WS) {
                        waddr_b = add_waddr;
                        waddr_a = mul_waddr;
                } else {
                        waddr_a = add_waddr;
                        waddr_b = mul_waddr;
                }

                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
                        fail_instr(insts[i + 1],
                                   "Reads physical reg too soon after write");
                }
        }

        /* "After an SFU lookup instruction, accumulator r4 must not be read
         *  in the following two instructions. Any other instruction that
         *  results in r4 being written (that is, TMU read, TLB read, SFU
         *  lookup) cannot occur in the two instructions following an SFU
         *  lookup."
         */
        int last_sfu_inst = -10;
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (i - last_sfu_inst <= 2 &&
                    (writes_sfu(inst) ||
                     sig == QPU_SIG_LOAD_TMU0 ||
                     sig == QPU_SIG_LOAD_TMU1 ||
                     sig == QPU_SIG_COLOR_LOAD)) {
                        fail_instr(inst, "R4 write too soon after SFU write");
                }

                if (writes_sfu(inst))
                        last_sfu_inst = i;
        }

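        /* Checks on MUL ALU vector rotate operations, which are encoded
         * through the small-immediate field.
         */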
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         *  available when both of the mul ALU input arguments
                         *  are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 *  by r5 must not immediately follow an
                                 *  instruction that writes to r5."
                                 */
                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         *  immediately follow an instruction that writes to the
                         *  accumulator that is being rotated."
                         */
                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }

        /* "After an instruction that does a TLB Z write, the multisample mask
         *  must not be read as an instruction input argument in the following
         *  two instructions. The TLB Z write instruction can, however, be
         *  followed immediately by a TLB color write."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];
                if (writes_reg(inst, QPU_W_TLB_Z) &&
                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
                     (i + 2 < num_inst &&
                      reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS)))) {
                        fail_instr(inst, "TLB Z write followed by MS mask read");
                }
        }

        /*
         * "A single instruction can only perform a maximum of one of the
         *  following closely coupled peripheral accesses in a single
         *  instruction: TMU write, TMU read, TLB write, TLB read, TLB
         *  combined color read and write, SFU write, Mutex read or Semaphore
         *  access."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (qpu_num_sf_accesses(inst) > 1)
                        fail_instr(inst, "Single instruction writes SFU twice");
        }

        /* "The uniform base pointer can be written (from SIMD element 0) by
         *  the processor to reset the stream, there must be at least two
         *  nonuniform-accessing instructions following a pointer change
         *  before uniforms can be accessed once more."
         */
        int last_unif_pointer_update = -3;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

                if (reads_reg(inst, QPU_R_UNIF) &&
                    i - last_unif_pointer_update <= 2) {
                        fail_instr(inst,
                                   "uniform read too soon after pointer update");
                }

                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
                        last_unif_pointer_update = i;
        }

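        /* Additional checks for threaded fragment shaders: texture results
         * must be drained across thread switches, and a switch must not be
         * requested while the scoreboard is locked or another switch is
         * still queued.
         */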
        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;

                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked.  Doing so means we may deadlock
                                 * when the other thread tries to lock the
                                 * scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

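                                /* Record where the queued switch lands so the
                                 * outstanding-results check above fires when
                                 * execution reaches that instruction.
                                 */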
                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

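                        /* A write to a TMU S register submits a texture
                         * request whose result must later be collected with a
                         * TMU load signal.
                         */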
                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}