      1 /*
      2  * Copyright (c) 2014 Scott Mansell
      3  * Copyright © 2014 Broadcom
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      9  * and/or sell copies of the Software, and to permit persons to whom the
     10  * Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     22  * IN THE SOFTWARE.
     23  */
     24 
     25 #include <inttypes.h>
     26 #include "util/u_format.h"
     27 #include "util/crc32.h"
     28 #include "util/u_math.h"
     29 #include "util/u_memory.h"
     30 #include "util/ralloc.h"
     31 #include "util/hash_table.h"
     32 #include "tgsi/tgsi_dump.h"
     33 #include "tgsi/tgsi_parse.h"
     34 #include "compiler/nir/nir.h"
     35 #include "compiler/nir/nir_builder.h"
     36 #include "nir/tgsi_to_nir.h"
     37 #include "vc4_context.h"
     38 #include "vc4_qpu.h"
     39 #include "vc4_qir.h"
     40 #include "mesa/state_tracker/st_glsl_types.h"
     41 
     42 static struct qreg
     43 ntq_get_src(struct vc4_compile *c, nir_src src, int i);
     44 static void
     45 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
     46 
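         /**
          * Grows *regs to hold at least decl_size entries, doubling the
          * allocation to amortize reallocations and initializing any newly
          * added entries to c->undef.
          */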
     47 static void
     48 resize_qreg_array(struct vc4_compile *c,
     49                   struct qreg **regs,
     50                   uint32_t *size,
     51                   uint32_t decl_size)
     52 {
     53         if (*size >= decl_size)
     54                 return;
     55 
     56         uint32_t old_size = *size;
     57         *size = MAX2(*size * 2, decl_size);
     58         *regs = reralloc(c, *regs, struct qreg, *size);
     59         if (!*regs) {
     60                 fprintf(stderr, "Malloc failure\n");
     61                 abort();
     62         }
     63 
     64         for (uint32_t i = old_size; i < *size; i++)
     65                 (*regs)[i] = c->undef;
     66 }
     67 
     68 static void
     69 ntq_emit_thrsw(struct vc4_compile *c)
     70 {
     71         if (!c->fs_threaded)
     72                 return;
     73 
     74         /* Always thread switch after each texture operation for now.
     75          *
     76          * We could do better by batching a bunch of texture fetches up and
     77          * then doing one thread switch and collecting all their results
     78          * afterward.
     79          */
     80         qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
     81                                     c->undef, c->undef));
     82         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
     83 }
     84 
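         /**
          * Implements an indirectly-addressed uniform load as a memory fetch
          * through the TMU: finds the declared UBO range containing the base
          * offset, builds a clamped byte address relative to that range's
          * destination offset, adds the UBO base address, and reads the value
          * back with qir_TEX_RESULT().
          */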
     85 static struct qreg
     86 indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
     87 {
     88         struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
     89         uint32_t offset = nir_intrinsic_base(intr);
     90         struct vc4_compiler_ubo_range *range = NULL;
     91         unsigned i;
     92         for (i = 0; i < c->num_uniform_ranges; i++) {
     93                 range = &c->ubo_ranges[i];
     94                 if (offset >= range->src_offset &&
     95                     offset < range->src_offset + range->size) {
     96                         break;
     97                 }
     98         }
     99         /* The driver-location-based offset always has to be within a declared
    100          * uniform range.
    101          */
    102         assert(range);
    103         if (!range->used) {
    104                 range->used = true;
    105                 range->dst_offset = c->next_ubo_dst_offset;
    106                 c->next_ubo_dst_offset += range->size;
    107                 c->num_ubo_ranges++;
    108         }
    109 
    110         offset -= range->src_offset;
    111 
    112         /* Adjust for where we stored the TGSI register base. */
    113         indirect_offset = qir_ADD(c, indirect_offset,
    114                                   qir_uniform_ui(c, (range->dst_offset +
    115                                                      offset)));
    116 
    117         /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
    118         indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
    119         indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
    120                                         qir_uniform_ui(c, (range->dst_offset +
    121                                                            range->size - 4)));
    122 
    123         qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
    124                      indirect_offset,
    125                      qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
    126 
    127         c->num_texture_samples++;
    128 
    129         ntq_emit_thrsw(c);
    130 
    131         return qir_TEX_RESULT(c);
    132 }
    133 
    134 nir_ssa_def *
    135 vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
    136 {
    137         switch (swiz) {
    138         default:
    139         case PIPE_SWIZZLE_NONE:
    140                 fprintf(stderr, "warning: unknown swizzle\n");
    141                 /* FALLTHROUGH */
    142         case PIPE_SWIZZLE_0:
    143                 return nir_imm_float(b, 0.0);
    144         case PIPE_SWIZZLE_1:
    145                 return nir_imm_float(b, 1.0);
    146         case PIPE_SWIZZLE_X:
    147         case PIPE_SWIZZLE_Y:
    148         case PIPE_SWIZZLE_Z:
    149         case PIPE_SWIZZLE_W:
    150                 return srcs[swiz];
    151         }
    152 }
    153 
    154 static struct qreg *
    155 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
    156 {
    157         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
    158                                           def->num_components);
    159         _mesa_hash_table_insert(c->def_ht, def, qregs);
    160         return qregs;
    161 }
    162 
    163 /**
    164  * This function is responsible for getting QIR results into the associated
    165  * storage for a NIR instruction.
    166  *
    167  * If it's a NIR SSA def, then we just set the associated hash table entry to
    168  * the new result.
    169  *
    170  * If it's a NIR reg, then we need to update the existing qreg assigned to the
    171  * NIR destination with the incoming value.  To do that without introducing
    172  * new MOVs, we require that the incoming qreg either be a uniform, or be
    173  * SSA-defined by the previous QIR instruction in the block and rewritable by
    174  * this function.  That lets us sneak ahead and insert the SF flag beforehand
    175  * (knowing that the previous instruction doesn't depend on flags) and rewrite
    176  * its destination to be the NIR reg's destination
    177  */
    178 static void
    179 ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
    180                struct qreg result)
    181 {
    182         struct qinst *last_inst = NULL;
    183         if (!list_empty(&c->cur_block->instructions))
    184                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
    185 
    186         assert(result.file == QFILE_UNIF ||
    187                (result.file == QFILE_TEMP &&
    188                 last_inst && last_inst == c->defs[result.index]));
    189 
    190         if (dest->is_ssa) {
    191                 assert(chan < dest->ssa.num_components);
    192 
    193                 struct qreg *qregs;
    194                 struct hash_entry *entry =
    195                         _mesa_hash_table_search(c->def_ht, &dest->ssa);
    196 
    197                 if (entry)
    198                         qregs = entry->data;
    199                 else
    200                         qregs = ntq_init_ssa_def(c, &dest->ssa);
    201 
    202                 qregs[chan] = result;
    203         } else {
    204                 nir_register *reg = dest->reg.reg;
    205                 assert(dest->reg.base_offset == 0);
    206                 assert(reg->num_array_elems == 0);
    207                 struct hash_entry *entry =
    208                         _mesa_hash_table_search(c->def_ht, reg);
    209                 struct qreg *qregs = entry->data;
    210 
    211                 /* Insert a MOV if the source wasn't an SSA def in the
    212                  * previous instruction.
    213                  */
    214                 if (result.file == QFILE_UNIF) {
    215                         result = qir_MOV(c, result);
    216                         last_inst = c->defs[result.index];
    217                 }
    218 
    219                 /* We know they're both temps, so just rewrite index. */
    220                 c->defs[last_inst->dst.index] = NULL;
    221                 last_inst->dst.index = qregs[chan].index;
    222 
    223                 /* If we're in control flow, then make this update of the reg
    224                  * conditional on the execution mask.
    225                  */
    226                 if (c->execute.file != QFILE_NULL) {
    227                         last_inst->dst.index = qregs[chan].index;
    228 
    229                         /* Set the flags to the current exec mask.  To insert
    230                          * the SF, we temporarily remove our SSA instruction.
    231                          */
    232                         list_del(&last_inst->link);
    233                         qir_SF(c, c->execute);
    234                         list_addtail(&last_inst->link,
    235                                      &c->cur_block->instructions);
    236 
    237                         last_inst->cond = QPU_COND_ZS;
    238                         last_inst->cond_is_exec_mask = true;
    239                 }
    240         }
    241 }
    242 
    243 static struct qreg *
    244 ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
    245 {
    246         if (dest->is_ssa) {
    247                 struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa);
    248                 for (int i = 0; i < dest->ssa.num_components; i++)
    249                         qregs[i] = c->undef;
    250                 return qregs;
    251         } else {
    252                 nir_register *reg = dest->reg.reg;
    253                 assert(dest->reg.base_offset == 0);
    254                 assert(reg->num_array_elems == 0);
    255                 struct hash_entry *entry =
    256                         _mesa_hash_table_search(c->def_ht, reg);
    257                 return entry->data;
    258         }
    259 }
    260 
    261 static struct qreg
    262 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
    263 {
    264         struct hash_entry *entry;
    265         if (src.is_ssa) {
    266                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
    267                 assert(i < src.ssa->num_components);
    268         } else {
    269                 nir_register *reg = src.reg.reg;
    270                 entry = _mesa_hash_table_search(c->def_ht, reg);
    271                 assert(reg->num_array_elems == 0);
    272                 assert(src.reg.base_offset == 0);
    273                 assert(i < reg->num_components);
    274         }
    275 
    276         struct qreg *qregs = entry->data;
    277         return qregs[i];
    278 }
    279 
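         /**
          * Returns the single source channel used by a (scalarized) ALU
          * instruction: the write mask has exactly one bit set, and the
          * source swizzle for that channel picks the component to fetch.
          */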
    280 static struct qreg
    281 ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
    282                 unsigned src)
    283 {
    284         assert(util_is_power_of_two(instr->dest.write_mask));
    285         unsigned chan = ffs(instr->dest.write_mask) - 1;
    286         struct qreg r = ntq_get_src(c, instr->src[src].src,
    287                                     instr->src[src].swizzle[chan]);
    288 
    289         assert(!instr->src[src].abs);
    290         assert(!instr->src[src].negate);
    291 
    292         return r;
     293 }
    294 
    295 static inline struct qreg
    296 qir_SAT(struct vc4_compile *c, struct qreg val)
    297 {
    298         return qir_FMAX(c,
    299                         qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
    300                         qir_uniform_f(c, 0.0));
    301 }
    302 
    303 static struct qreg
    304 ntq_rcp(struct vc4_compile *c, struct qreg x)
    305 {
    306         struct qreg r = qir_RCP(c, x);
    307 
    308         /* Apply a Newton-Raphson step to improve the accuracy. */
    309         r = qir_FMUL(c, r, qir_FSUB(c,
    310                                     qir_uniform_f(c, 2.0),
    311                                     qir_FMUL(c, x, r)));
    312 
    313         return r;
    314 }
    315 
    316 static struct qreg
    317 ntq_rsq(struct vc4_compile *c, struct qreg x)
    318 {
    319         struct qreg r = qir_RSQ(c, x);
    320 
    321         /* Apply a Newton-Raphson step to improve the accuracy. */
    322         r = qir_FMUL(c, r, qir_FSUB(c,
    323                                     qir_uniform_f(c, 1.5),
    324                                     qir_FMUL(c,
    325                                              qir_uniform_f(c, 0.5),
    326                                              qir_FMUL(c, x,
    327                                                       qir_FMUL(c, r, r)))));
    328 
    329         return r;
    330 }
    331 
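         /**
          * Builds a full 32-bit (mod 2^32) multiply out of the QPU's 24x24-bit
          * multiplier.  With a = a_lo + (a_hi << 24) and b = b_lo + (b_hi << 24):
          *
          *     a * b = a_lo * b_lo + ((a_hi * b_lo + a_lo * b_hi) << 24)  (mod 2^32)
          *
          * since the a_hi * b_hi term is shifted entirely out of the low 32 bits.
          */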
    332 static struct qreg
    333 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
    334 {
    335         struct qreg src0_hi = qir_SHR(c, src0,
    336                                       qir_uniform_ui(c, 24));
    337         struct qreg src1_hi = qir_SHR(c, src1,
    338                                       qir_uniform_ui(c, 24));
    339 
    340         struct qreg hilo = qir_MUL24(c, src0_hi, src1);
    341         struct qreg lohi = qir_MUL24(c, src0, src1_hi);
    342         struct qreg lolo = qir_MUL24(c, src0, src1);
    343 
    344         return qir_ADD(c, lolo, qir_SHL(c,
    345                                         qir_ADD(c, hilo, lohi),
    346                                         qir_uniform_ui(c, 24)));
    347 }
    348 
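         /**
          * Converts a raw depth/stencil texture sample, with the 24-bit depth
          * value stored in the top 24 bits, to a float in [0, 1].
          */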
    349 static struct qreg
    350 ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
    351 {
    352         struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
    353                                                  qir_uniform_ui(c, 8)));
    354         return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
    355 }
    356 
    357 /**
    358  * Emits a lowered TXF_MS from an MSAA texture.
    359  *
    360  * The addressing math has been lowered in NIR, and now we just need to read
    361  * it like a UBO.
    362  */
    363 static void
    364 ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
    365 {
    366         uint32_t tile_width = 32;
    367         uint32_t tile_height = 32;
    368         uint32_t tile_size = (tile_height * tile_width *
    369                               VC4_MAX_SAMPLES * sizeof(uint32_t));
    370 
    371         unsigned unit = instr->texture_index;
    372         uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
    373         uint32_t w_tiles = w / tile_width;
    374         uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
    375         uint32_t h_tiles = h / tile_height;
    376         uint32_t size = w_tiles * h_tiles * tile_size;
    377 
    378         struct qreg addr;
    379         assert(instr->num_srcs == 1);
    380         assert(instr->src[0].src_type == nir_tex_src_coord);
    381         addr = ntq_get_src(c, instr->src[0].src, 0);
    382 
    383         /* Perform the clamping required by kernel validation. */
    384         addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
    385         addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
    386 
    387         qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
    388                      addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
    389 
    390         ntq_emit_thrsw(c);
    391 
    392         struct qreg tex = qir_TEX_RESULT(c);
    393         c->num_texture_samples++;
    394 
    395         enum pipe_format format = c->key->tex[unit].format;
    396         if (util_format_is_depth_or_stencil(format)) {
    397                 struct qreg scaled = ntq_scale_depth_texture(c, tex);
    398                 for (int i = 0; i < 4; i++)
    399                         ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));
    400         } else {
    401                 for (int i = 0; i < 4; i++)
    402                         ntq_store_dest(c, &instr->dest, i,
    403                                        qir_UNPACK_8_F(c, tex, i));
    404         }
    405 }
    406 
    407 static void
    408 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
    409 {
    410         struct qreg s, t, r, lod, compare;
    411         bool is_txb = false, is_txl = false;
    412         unsigned unit = instr->texture_index;
    413 
    414         if (instr->op == nir_texop_txf) {
    415                 ntq_emit_txf(c, instr);
    416                 return;
    417         }
    418 
    419         for (unsigned i = 0; i < instr->num_srcs; i++) {
    420                 switch (instr->src[i].src_type) {
    421                 case nir_tex_src_coord:
    422                         s = ntq_get_src(c, instr->src[i].src, 0);
    423                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
    424                                 t = qir_uniform_f(c, 0.5);
    425                         else
    426                                 t = ntq_get_src(c, instr->src[i].src, 1);
    427                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
    428                                 r = ntq_get_src(c, instr->src[i].src, 2);
    429                         break;
    430                 case nir_tex_src_bias:
    431                         lod = ntq_get_src(c, instr->src[i].src, 0);
    432                         is_txb = true;
    433                         break;
    434                 case nir_tex_src_lod:
    435                         lod = ntq_get_src(c, instr->src[i].src, 0);
    436                         is_txl = true;
    437                         break;
    438                 case nir_tex_src_comparator:
    439                         compare = ntq_get_src(c, instr->src[i].src, 0);
    440                         break;
    441                 default:
    442                         unreachable("unknown texture source");
    443                 }
    444         }
    445 
    446         if (c->stage != QSTAGE_FRAG && !is_txl) {
    447                 /* From the GLSL 1.20 spec:
    448                  *
    449                  *     "If it is mip-mapped and running on the vertex shader,
    450                  *      then the base texture is used."
    451                  */
    452                 is_txl = true;
    453                 lod = qir_uniform_ui(c, 0);
    454         }
    455 
    456         if (c->key->tex[unit].force_first_level) {
    457                 lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
    458                 is_txl = true;
    459                 is_txb = false;
    460         }
    461 
    462         struct qreg texture_u[] = {
    463                 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
    464                 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
    465                 qir_uniform(c, QUNIFORM_CONSTANT, 0),
    466                 qir_uniform(c, QUNIFORM_CONSTANT, 0),
    467         };
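                 /* Each TMU register write below is paired with the next entry
                  * of texture_u[] (via qir_get_tex_uniform_src()), supplying
                  * the sampler config words alongside the coordinates; the
                  * write to TEX_S goes last, which is what kicks off the
                  * actual lookup.
                  */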
    468         uint32_t next_texture_u = 0;
    469 
    470         /* There is no native support for GL texture rectangle coordinates, so
    471          * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
    472          * 1]).
    473          */
    474         if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
    475                 s = qir_FMUL(c, s,
    476                              qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, unit));
    477                 t = qir_FMUL(c, t,
    478                              qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, unit));
    479         }
    480 
    481         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
    482                 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
    483                                            unit | (is_txl << 16));
    484         }
    485 
    486         struct qinst *tmu;
    487         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
    488                 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
    489                 tmu->src[qir_get_tex_uniform_src(tmu)] =
    490                         texture_u[next_texture_u++];
    491         } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
    492                    c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
    493                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
    494                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
    495                 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
    496                                    qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
    497                                                unit));
    498                 tmu->src[qir_get_tex_uniform_src(tmu)] =
    499                         texture_u[next_texture_u++];
    500         }
    501 
    502         if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
    503                 s = qir_SAT(c, s);
    504         }
    505 
    506         if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
    507                 t = qir_SAT(c, t);
    508         }
    509 
    510         tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
    511         tmu->src[qir_get_tex_uniform_src(tmu)] =
    512                 texture_u[next_texture_u++];
    513 
    514         if (is_txl || is_txb) {
    515                 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
    516                 tmu->src[qir_get_tex_uniform_src(tmu)] =
    517                         texture_u[next_texture_u++];
    518         }
    519 
    520         tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
    521         tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
    522 
    523         c->num_texture_samples++;
    524 
    525         ntq_emit_thrsw(c);
    526 
    527         struct qreg tex = qir_TEX_RESULT(c);
    528 
    529         enum pipe_format format = c->key->tex[unit].format;
    530 
    531         struct qreg *dest = ntq_get_dest(c, &instr->dest);
    532         if (util_format_is_depth_or_stencil(format)) {
    533                 struct qreg normalized = ntq_scale_depth_texture(c, tex);
    534                 struct qreg depth_output;
    535 
    536                 struct qreg u0 = qir_uniform_f(c, 0.0f);
    537                 struct qreg u1 = qir_uniform_f(c, 1.0f);
    538                 if (c->key->tex[unit].compare_mode) {
    539                         /* From the GL_ARB_shadow spec:
    540                          *
    541                          *     "Let Dt (D subscript t) be the depth texture
    542                          *      value, in the range [0, 1].  Let R be the
    543                          *      interpolated texture coordinate clamped to the
    544                          *      range [0, 1]."
    545                          */
    546                         compare = qir_SAT(c, compare);
    547 
    548                         switch (c->key->tex[unit].compare_func) {
    549                         case PIPE_FUNC_NEVER:
    550                                 depth_output = qir_uniform_f(c, 0.0f);
    551                                 break;
    552                         case PIPE_FUNC_ALWAYS:
    553                                 depth_output = u1;
    554                                 break;
    555                         case PIPE_FUNC_EQUAL:
    556                                 qir_SF(c, qir_FSUB(c, compare, normalized));
    557                                 depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
    558                                 break;
    559                         case PIPE_FUNC_NOTEQUAL:
    560                                 qir_SF(c, qir_FSUB(c, compare, normalized));
    561                                 depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
    562                                 break;
    563                         case PIPE_FUNC_GREATER:
    564                                 qir_SF(c, qir_FSUB(c, compare, normalized));
    565                                 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
    566                                 break;
    567                         case PIPE_FUNC_GEQUAL:
    568                                 qir_SF(c, qir_FSUB(c, normalized, compare));
    569                                 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
    570                                 break;
    571                         case PIPE_FUNC_LESS:
    572                                 qir_SF(c, qir_FSUB(c, compare, normalized));
    573                                 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
    574                                 break;
    575                         case PIPE_FUNC_LEQUAL:
    576                                 qir_SF(c, qir_FSUB(c, normalized, compare));
    577                                 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
    578                                 break;
    579                         }
    580                 } else {
    581                         depth_output = normalized;
    582                 }
    583 
    584                 for (int i = 0; i < 4; i++)
    585                         dest[i] = depth_output;
    586         } else {
    587                 for (int i = 0; i < 4; i++)
    588                         dest[i] = qir_UNPACK_8_F(c, tex, i);
    589         }
    590 }
    591 
    592 /**
    593  * Computes x - floor(x), which is tricky because our FTOI truncates (rounds
    594  * to zero).
    595  */
    596 static struct qreg
    597 ntq_ffract(struct vc4_compile *c, struct qreg src)
    598 {
    599         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
    600         struct qreg diff = qir_FSUB(c, src, trunc);
    601         qir_SF(c, diff);
    602         return qir_MOV(c, qir_SEL(c, QPU_COND_NS,
    603                                   qir_FADD(c, diff, qir_uniform_f(c, 1.0)),
    604                                   diff));
    605 }
    606 
    607 /**
    608  * Computes floor(x), which is tricky because our FTOI truncates (rounds to
    609  * zero).
    610  */
    611 static struct qreg
    612 ntq_ffloor(struct vc4_compile *c, struct qreg src)
    613 {
    614         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
    615 
    616         /* This will be < 0 if we truncated and the truncation was of a value
    617          * that was < 0 in the first place.
    618          */
    619         qir_SF(c, qir_FSUB(c, src, trunc));
    620 
    621         return qir_MOV(c, qir_SEL(c, QPU_COND_NS,
    622                                   qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)),
    623                                   trunc));
    624 }
    625 
    626 /**
    627  * Computes ceil(x), which is tricky because our FTOI truncates (rounds to
    628  * zero).
    629  */
    630 static struct qreg
    631 ntq_fceil(struct vc4_compile *c, struct qreg src)
    632 {
    633         struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
    634 
    635         /* This will be < 0 if we truncated and the truncation was of a value
    636          * that was > 0 in the first place.
    637          */
    638         qir_SF(c, qir_FSUB(c, trunc, src));
    639 
    640         return qir_MOV(c, qir_SEL(c, QPU_COND_NS,
    641                                   qir_FADD(c, trunc, qir_uniform_f(c, 1.0)),
    642                                   trunc));
    643 }
    644 
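         /**
          * Computes sin(src) with a polynomial.  src is scaled by 1/(2*pi) and
          * range-reduced to x = fract(src / (2*pi)) - 0.5 in [-0.5, 0.5); the
          * coefficients are those of -sin(2*pi*x), and since
          * sin(src) = sin(2*pi*(x + 0.5)) = -sin(2*pi*x), the sum equals sin(src).
          */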
    645 static struct qreg
    646 ntq_fsin(struct vc4_compile *c, struct qreg src)
    647 {
    648         float coeff[] = {
    649                 -2.0 * M_PI,
    650                 pow(2.0 * M_PI, 3) / (3 * 2 * 1),
    651                 -pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
    652                 pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
    653                 -pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
    654         };
    655 
    656         struct qreg scaled_x =
    657                 qir_FMUL(c,
    658                          src,
    659                          qir_uniform_f(c, 1.0 / (M_PI * 2.0)));
    660 
    661         struct qreg x = qir_FADD(c,
    662                                  ntq_ffract(c, scaled_x),
    663                                  qir_uniform_f(c, -0.5));
    664         struct qreg x2 = qir_FMUL(c, x, x);
    665         struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
    666         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
    667                 x = qir_FMUL(c, x, x2);
    668                 sum = qir_FADD(c,
    669                                sum,
    670                                qir_FMUL(c,
    671                                         x,
    672                                         qir_uniform_f(c, coeff[i])));
    673         }
    674         return sum;
    675 }
    676 
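         /**
          * Computes cos(src) with the same range reduction as ntq_fsin(): with
          * x = fract(src / (2*pi)) - 0.5, cos(src) = cos(2*pi*x + pi) =
          * -cos(2*pi*x), and the coefficients are the even Taylor terms of
          * -cos(2*pi*x).
          */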
    677 static struct qreg
    678 ntq_fcos(struct vc4_compile *c, struct qreg src)
    679 {
    680         float coeff[] = {
    681                 -1.0f,
    682                 pow(2.0 * M_PI, 2) / (2 * 1),
    683                 -pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
    684                 pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
    685                 -pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
    686                 pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
    687         };
    688 
    689         struct qreg scaled_x =
    690                 qir_FMUL(c, src,
    691                          qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
    692         struct qreg x_frac = qir_FADD(c,
    693                                       ntq_ffract(c, scaled_x),
    694                                       qir_uniform_f(c, -0.5));
    695 
    696         struct qreg sum = qir_uniform_f(c, coeff[0]);
    697         struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
     698         struct qreg x = x2; /* Current x^2, x^4, ..., x^10 */
    699         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
    700                 if (i != 1)
    701                         x = qir_FMUL(c, x, x2);
    702 
    703                 struct qreg mul = qir_FMUL(c,
    704                                            x,
    705                                            qir_uniform_f(c, coeff[i]));
    706                 if (i == 0)
    707                         sum = mul;
    708                 else
    709                         sum = qir_FADD(c, sum, mul);
    710         }
    711         return sum;
    712 }
    713 
    714 static struct qreg
    715 ntq_fsign(struct vc4_compile *c, struct qreg src)
    716 {
    717         struct qreg t = qir_get_temp(c);
    718 
    719         qir_SF(c, src);
    720         qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
    721         qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
    722         qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
    723         return qir_MOV(c, t);
    724 }
    725 
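         /**
          * Reads one vertex attribute from the VPM, one QFILE_VPM register per
          * 32-bit dword of the (4-byte-aligned) attribute.
          */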
    726 static void
    727 emit_vertex_input(struct vc4_compile *c, int attr)
    728 {
    729         enum pipe_format format = c->vs_key->attr_formats[attr];
    730         uint32_t attr_size = util_format_get_blocksize(format);
    731 
    732         c->vattr_sizes[attr] = align(attr_size, 4);
    733         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
    734                 c->inputs[attr * 4 + i] =
    735                         qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
    736                 c->num_inputs++;
    737         }
    738 }
    739 
    740 static void
    741 emit_fragcoord_input(struct vc4_compile *c, int attr)
    742 {
    743         c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
    744         c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
    745         c->inputs[attr * 4 + 2] =
    746                 qir_FMUL(c,
    747                          qir_ITOF(c, qir_FRAG_Z(c)),
    748                          qir_uniform_f(c, 1.0 / 0xffffff));
    749         c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
    750 }
    751 
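         /**
          * Allocates one hardware varying slot for a single FS input component.
          * The raw interpolated value is multiplied by the per-fragment W and
          * the C plane coefficient is added (qir_VARY_ADD_C), which appears to
          * be how the hardware's perspective-correct interpolation is
          * completed.  The slot/swizzle pair is recorded so the driver can
          * match VS outputs to this varying.
          */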
    752 static struct qreg
    753 emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
    754                       uint8_t swizzle)
    755 {
    756         uint32_t i = c->num_input_slots++;
    757         struct qreg vary = {
    758                 QFILE_VARY,
    759                 i
    760         };
    761 
    762         if (c->num_input_slots >= c->input_slots_array_size) {
    763                 c->input_slots_array_size =
    764                         MAX2(4, c->input_slots_array_size * 2);
    765 
    766                 c->input_slots = reralloc(c, c->input_slots,
    767                                           struct vc4_varying_slot,
    768                                           c->input_slots_array_size);
    769         }
    770 
    771         c->input_slots[i].slot = slot;
    772         c->input_slots[i].swizzle = swizzle;
    773 
    774         return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
    775 }
    776 
    777 static void
    778 emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
    779 {
    780         for (int i = 0; i < 4; i++) {
    781                 c->inputs[attr * 4 + i] =
    782                         emit_fragment_varying(c, slot, i);
    783                 c->num_inputs++;
    784         }
    785 }
    786 
    787 static void
    788 add_output(struct vc4_compile *c,
    789            uint32_t decl_offset,
    790            uint8_t slot,
    791            uint8_t swizzle)
    792 {
    793         uint32_t old_array_size = c->outputs_array_size;
    794         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
    795                           decl_offset + 1);
    796 
    797         if (old_array_size != c->outputs_array_size) {
    798                 c->output_slots = reralloc(c,
    799                                            c->output_slots,
    800                                            struct vc4_varying_slot,
    801                                            c->outputs_array_size);
    802         }
    803 
    804         c->output_slots[decl_offset].slot = slot;
    805         c->output_slots[decl_offset].swizzle = swizzle;
    806 }
    807 
    808 static void
    809 declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
    810 {
    811         unsigned array_id = c->num_uniform_ranges++;
    812         if (array_id >= c->ubo_ranges_array_size) {
    813                 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
    814                                                 array_id + 1);
    815                 c->ubo_ranges = reralloc(c, c->ubo_ranges,
    816                                          struct vc4_compiler_ubo_range,
    817                                          c->ubo_ranges_array_size);
    818         }
    819 
    820         c->ubo_ranges[array_id].dst_offset = 0;
    821         c->ubo_ranges[array_id].src_offset = start;
    822         c->ubo_ranges[array_id].size = size;
    823         c->ubo_ranges[array_id].used = false;
    824 }
    825 
    826 static bool
    827 ntq_src_is_only_ssa_def_user(nir_src *src)
    828 {
    829         if (!src->is_ssa)
    830                 return false;
    831 
    832         if (!list_empty(&src->ssa->if_uses))
    833                 return false;
    834 
    835         return (src->ssa->uses.next == &src->use_link &&
    836                 src->ssa->uses.next->next == &src->ssa->uses);
    837 }
    838 
    839 /**
    840  * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
    841  * bit set.
    842  *
    843  * However, as an optimization, it tries to find the instructions generating
    844  * the sources to be packed and just emit the pack flag there, if possible.
    845  */
    846 static void
    847 ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
    848 {
    849         struct qreg result = qir_get_temp(c);
    850         struct nir_alu_instr *vec4 = NULL;
    851 
    852         /* If packing from a vec4 op (as expected), identify it so that we can
    853          * peek back at what generated its sources.
    854          */
    855         if (instr->src[0].src.is_ssa &&
    856             instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
    857             nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
    858             nir_op_vec4) {
    859                 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
    860         }
    861 
    862         /* If the pack is replicating the same channel 4 times, use the 8888
    863          * pack flag.  This is common for blending using the alpha
    864          * channel.
    865          */
    866         if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
    867             instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
    868             instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
    869                 struct qreg rep = ntq_get_src(c,
    870                                               instr->src[0].src,
    871                                               instr->src[0].swizzle[0]);
    872                 ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
    873                 return;
    874         }
    875 
    876         for (int i = 0; i < 4; i++) {
    877                 int swiz = instr->src[0].swizzle[i];
    878                 struct qreg src;
    879                 if (vec4) {
    880                         src = ntq_get_src(c, vec4->src[swiz].src,
    881                                           vec4->src[swiz].swizzle[0]);
    882                 } else {
    883                         src = ntq_get_src(c, instr->src[0].src, swiz);
    884                 }
    885 
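                         /* If this channel's source is a multiply whose only
                          * user is this pack, fold the pack into it: retarget
                          * its destination to our result and set the MUL-unit
                          * 8-bit pack mode for this byte lane instead of
                          * emitting a separate PACK_8_F.
                          */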
    886                 if (vec4 &&
    887                     ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
    888                     src.file == QFILE_TEMP &&
    889                     c->defs[src.index] &&
    890                     qir_is_mul(c->defs[src.index]) &&
    891                     !c->defs[src.index]->dst.pack) {
    892                         struct qinst *rewrite = c->defs[src.index];
    893                         c->defs[src.index] = NULL;
    894                         rewrite->dst = result;
    895                         rewrite->dst.pack = QPU_PACK_MUL_8A + i;
    896                         continue;
    897                 }
    898 
    899                 qir_PACK_8_F(c, result, src, i);
    900         }
    901 
    902         ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
    903 }
    904 
    905 /** Handles sign-extended bitfield extracts for 16 bits. */
    906 static struct qreg
    907 ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
    908               struct qreg bits)
    909 {
    910         assert(bits.file == QFILE_UNIF &&
    911                c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
    912                c->uniform_data[bits.index] == 16);
    913 
    914         assert(offset.file == QFILE_UNIF &&
    915                c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
    916         int offset_bit = c->uniform_data[offset.index];
    917         assert(offset_bit % 16 == 0);
    918 
    919         return qir_UNPACK_16_I(c, base, offset_bit / 16);
    920 }
    921 
    922 /** Handles unsigned bitfield extracts for 8 bits. */
    923 static struct qreg
    924 ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
    925               struct qreg bits)
    926 {
    927         assert(bits.file == QFILE_UNIF &&
    928                c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
    929                c->uniform_data[bits.index] == 8);
    930 
    931         assert(offset.file == QFILE_UNIF &&
    932                c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
    933         int offset_bit = c->uniform_data[offset.index];
    934         assert(offset_bit % 8 == 0);
    935 
    936         return qir_UNPACK_8_I(c, base, offset_bit / 8);
    937 }
    938 
    939 /**
     940  * If compare_instr is a valid comparison instruction, emits the
     941  * compare_instr's comparison, stores the value that sel_instr should
     942  * produce based on that comparison's result in *dest, and returns true.
    943  */
    944 static bool
    945 ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
    946                     nir_alu_instr *compare_instr,
    947                     nir_alu_instr *sel_instr)
    948 {
    949         enum qpu_cond cond;
    950 
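                 /* The comparison is implemented as a subtract followed by a
                  * conditional select, so pick the QPU condition that will be
                  * true after computing src0 - src1: zero set for equality,
                  * zero clear for inequality, negative clear for >=, and
                  * negative set for <.
                  */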
    951         switch (compare_instr->op) {
    952         case nir_op_feq:
    953         case nir_op_ieq:
    954         case nir_op_seq:
    955                 cond = QPU_COND_ZS;
    956                 break;
    957         case nir_op_fne:
    958         case nir_op_ine:
    959         case nir_op_sne:
    960                 cond = QPU_COND_ZC;
    961                 break;
    962         case nir_op_fge:
    963         case nir_op_ige:
    964         case nir_op_uge:
    965         case nir_op_sge:
    966                 cond = QPU_COND_NC;
    967                 break;
    968         case nir_op_flt:
    969         case nir_op_ilt:
    970         case nir_op_slt:
    971                 cond = QPU_COND_NS;
    972                 break;
    973         default:
    974                 return false;
    975         }
    976 
    977         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
    978         struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
    979 
    980         unsigned unsized_type =
    981                 nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
    982         if (unsized_type == nir_type_float)
    983                 qir_SF(c, qir_FSUB(c, src0, src1));
    984         else
    985                 qir_SF(c, qir_SUB(c, src0, src1));
    986 
    987         switch (sel_instr->op) {
    988         case nir_op_seq:
    989         case nir_op_sne:
    990         case nir_op_sge:
    991         case nir_op_slt:
    992                 *dest = qir_SEL(c, cond,
    993                                 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
    994                 break;
    995 
    996         case nir_op_bcsel:
    997                 *dest = qir_SEL(c, cond,
    998                                 ntq_get_alu_src(c, sel_instr, 1),
    999                                 ntq_get_alu_src(c, sel_instr, 2));
   1000                 break;
   1001 
   1002         default:
   1003                 *dest = qir_SEL(c, cond,
   1004                                 qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
   1005                 break;
   1006         }
   1007 
    1008         /* Make the temporary for ntq_store_dest(). */
   1009         *dest = qir_MOV(c, *dest);
   1010 
   1011         return true;
   1012 }
   1013 
   1014 /**
   1015  * Attempts to fold a comparison generating a boolean result into the
   1016  * condition code for selecting between two values, instead of comparing the
   1017  * boolean result against 0 to generate the condition code.
   1018  */
   1019 static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
   1020                                   struct qreg *src)
   1021 {
   1022         if (!instr->src[0].src.is_ssa)
   1023                 goto out;
   1024         if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
   1025                 goto out;
   1026         nir_alu_instr *compare =
   1027                 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
   1028         if (!compare)
   1029                 goto out;
   1030 
   1031         struct qreg dest;
   1032         if (ntq_emit_comparison(c, &dest, compare, instr))
   1033                 return dest;
   1034 
   1035 out:
   1036         qir_SF(c, src[0]);
   1037         return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
   1038 }
   1039 
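         /**
          * Computes the screen-space X derivative by rotating the value across
          * the SIMD lanes with the MUL-pipe rotator (by +1 and -1 elements) and
          * subtracting, then selecting which difference to use based on whether
          * this element is the left or right pixel of its 2x2 quad.
          */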
   1040 static struct qreg
   1041 ntq_fddx(struct vc4_compile *c, struct qreg src)
   1042 {
   1043         /* Make sure that we have a bare temp to use for MUL rotation, so it
   1044          * can be allocated to an accumulator.
   1045          */
   1046         if (src.pack || src.file != QFILE_TEMP)
   1047                 src = qir_MOV(c, src);
   1048 
   1049         struct qreg from_left = qir_ROT_MUL(c, src, 1);
   1050         struct qreg from_right = qir_ROT_MUL(c, src, 15);
   1051 
   1052         /* Distinguish left/right pixels of the quad. */
   1053         qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
   1054                           qir_uniform_ui(c, 1)));
   1055 
   1056         return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
   1057                                   qir_FSUB(c, from_right, src),
   1058                                   qir_FSUB(c, src, from_left)));
   1059 }
   1060 
   1061 static struct qreg
   1062 ntq_fddy(struct vc4_compile *c, struct qreg src)
   1063 {
   1064         if (src.pack || src.file != QFILE_TEMP)
   1065                 src = qir_MOV(c, src);
   1066 
   1067         struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
   1068         struct qreg from_top = qir_ROT_MUL(c, src, 14);
   1069 
   1070         /* Distinguish top/bottom pixels of the quad. */
   1071         qir_SF(c, qir_AND(c,
   1072                           qir_reg(QFILE_QPU_ELEMENT, 0),
   1073                           qir_uniform_ui(c, 2)));
   1074 
   1075         return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
   1076                                   qir_FSUB(c, from_top, src),
   1077                                   qir_FSUB(c, src, from_bottom)));
   1078 }
   1079 
   1080 static void
   1081 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
   1082 {
   1083         /* This should always be lowered to ALU operations for VC4. */
   1084         assert(!instr->dest.saturate);
   1085 
   1086         /* Vectors are special in that they have non-scalarized writemasks,
   1087          * and just take the first swizzle channel for each argument in order
   1088          * into each writemask channel.
   1089          */
   1090         if (instr->op == nir_op_vec2 ||
   1091             instr->op == nir_op_vec3 ||
   1092             instr->op == nir_op_vec4) {
   1093                 struct qreg srcs[4];
   1094                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
   1095                         srcs[i] = ntq_get_src(c, instr->src[i].src,
   1096                                               instr->src[i].swizzle[0]);
   1097                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
   1098                         ntq_store_dest(c, &instr->dest.dest, i,
   1099                                        qir_MOV(c, srcs[i]));
   1100                 return;
   1101         }
   1102 
   1103         if (instr->op == nir_op_pack_unorm_4x8) {
   1104                 ntq_emit_pack_unorm_4x8(c, instr);
   1105                 return;
   1106         }
   1107 
   1108         if (instr->op == nir_op_unpack_unorm_4x8) {
   1109                 struct qreg src = ntq_get_src(c, instr->src[0].src,
   1110                                               instr->src[0].swizzle[0]);
   1111                 for (int i = 0; i < 4; i++) {
   1112                         if (instr->dest.write_mask & (1 << i))
   1113                                 ntq_store_dest(c, &instr->dest.dest, i,
   1114                                                qir_UNPACK_8_F(c, src, i));
   1115                 }
   1116                 return;
   1117         }
   1118 
   1119         /* General case: We can just grab the one used channel per src. */
   1120         struct qreg src[nir_op_infos[instr->op].num_inputs];
   1121         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
   1122                 src[i] = ntq_get_alu_src(c, instr, i);
   1123         }
   1124 
   1125         struct qreg result;
   1126 
   1127         switch (instr->op) {
   1128         case nir_op_fmov:
   1129         case nir_op_imov:
   1130                 result = qir_MOV(c, src[0]);
   1131                 break;
   1132         case nir_op_fmul:
   1133                 result = qir_FMUL(c, src[0], src[1]);
   1134                 break;
   1135         case nir_op_fadd:
   1136                 result = qir_FADD(c, src[0], src[1]);
   1137                 break;
   1138         case nir_op_fsub:
   1139                 result = qir_FSUB(c, src[0], src[1]);
   1140                 break;
   1141         case nir_op_fmin:
   1142                 result = qir_FMIN(c, src[0], src[1]);
   1143                 break;
   1144         case nir_op_fmax:
   1145                 result = qir_FMAX(c, src[0], src[1]);
   1146                 break;
   1147 
   1148         case nir_op_f2i:
   1149         case nir_op_f2u:
   1150                 result = qir_FTOI(c, src[0]);
   1151                 break;
   1152         case nir_op_i2f:
   1153         case nir_op_u2f:
   1154                 result = qir_ITOF(c, src[0]);
   1155                 break;
   1156         case nir_op_b2f:
   1157                 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
   1158                 break;
   1159         case nir_op_b2i:
   1160                 result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
   1161                 break;
   1162         case nir_op_i2b:
   1163         case nir_op_f2b:
   1164                 qir_SF(c, src[0]);
   1165                 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
   1166                                             qir_uniform_ui(c, ~0),
   1167                                             qir_uniform_ui(c, 0)));
   1168                 break;
   1169 
   1170         case nir_op_iadd:
   1171                 result = qir_ADD(c, src[0], src[1]);
   1172                 break;
   1173         case nir_op_ushr:
   1174                 result = qir_SHR(c, src[0], src[1]);
   1175                 break;
   1176         case nir_op_isub:
   1177                 result = qir_SUB(c, src[0], src[1]);
   1178                 break;
   1179         case nir_op_ishr:
   1180                 result = qir_ASR(c, src[0], src[1]);
   1181                 break;
   1182         case nir_op_ishl:
   1183                 result = qir_SHL(c, src[0], src[1]);
   1184                 break;
   1185         case nir_op_imin:
   1186                 result = qir_MIN(c, src[0], src[1]);
   1187                 break;
   1188         case nir_op_imax:
   1189                 result = qir_MAX(c, src[0], src[1]);
   1190                 break;
   1191         case nir_op_iand:
   1192                 result = qir_AND(c, src[0], src[1]);
   1193                 break;
   1194         case nir_op_ior:
   1195                 result = qir_OR(c, src[0], src[1]);
   1196                 break;
   1197         case nir_op_ixor:
   1198                 result = qir_XOR(c, src[0], src[1]);
   1199                 break;
   1200         case nir_op_inot:
   1201                 result = qir_NOT(c, src[0]);
   1202                 break;
   1203 
   1204         case nir_op_imul:
   1205                 result = ntq_umul(c, src[0], src[1]);
   1206                 break;
   1207 
   1208         case nir_op_seq:
   1209         case nir_op_sne:
   1210         case nir_op_sge:
   1211         case nir_op_slt:
   1212         case nir_op_feq:
   1213         case nir_op_fne:
   1214         case nir_op_fge:
   1215         case nir_op_flt:
   1216         case nir_op_ieq:
   1217         case nir_op_ine:
   1218         case nir_op_ige:
   1219         case nir_op_uge:
   1220         case nir_op_ilt:
   1221                 if (!ntq_emit_comparison(c, &result, instr, instr)) {
   1222                         fprintf(stderr, "Bad comparison instruction\n");
   1223                 }
   1224                 break;
   1225 
   1226         case nir_op_bcsel:
   1227                 result = ntq_emit_bcsel(c, instr, src);
   1228                 break;
   1229         case nir_op_fcsel:
   1230                 qir_SF(c, src[0]);
   1231                 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
   1232                 break;
   1233 
   1234         case nir_op_frcp:
   1235                 result = ntq_rcp(c, src[0]);
   1236                 break;
   1237         case nir_op_frsq:
   1238                 result = ntq_rsq(c, src[0]);
   1239                 break;
   1240         case nir_op_fexp2:
   1241                 result = qir_EXP2(c, src[0]);
   1242                 break;
   1243         case nir_op_flog2:
   1244                 result = qir_LOG2(c, src[0]);
   1245                 break;
   1246 
   1247         case nir_op_ftrunc:
   1248                 result = qir_ITOF(c, qir_FTOI(c, src[0]));
   1249                 break;
   1250         case nir_op_fceil:
   1251                 result = ntq_fceil(c, src[0]);
   1252                 break;
   1253         case nir_op_ffract:
   1254                 result = ntq_ffract(c, src[0]);
   1255                 break;
   1256         case nir_op_ffloor:
   1257                 result = ntq_ffloor(c, src[0]);
   1258                 break;
   1259 
   1260         case nir_op_fsin:
   1261                 result = ntq_fsin(c, src[0]);
   1262                 break;
   1263         case nir_op_fcos:
   1264                 result = ntq_fcos(c, src[0]);
   1265                 break;
   1266 
   1267         case nir_op_fsign:
   1268                 result = ntq_fsign(c, src[0]);
   1269                 break;
   1270 
   1271         case nir_op_fabs:
   1272                 result = qir_FMAXABS(c, src[0], src[0]);
   1273                 break;
   1274         case nir_op_iabs:
   1275                 result = qir_MAX(c, src[0],
   1276                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
   1277                 break;
   1278 
   1279         case nir_op_ibitfield_extract:
   1280                 result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
   1281                 break;
   1282 
   1283         case nir_op_ubitfield_extract:
   1284                 result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
   1285                 break;
   1286 
   1287         case nir_op_usadd_4x8:
   1288                 result = qir_V8ADDS(c, src[0], src[1]);
   1289                 break;
   1290 
   1291         case nir_op_ussub_4x8:
   1292                 result = qir_V8SUBS(c, src[0], src[1]);
   1293                 break;
   1294 
   1295         case nir_op_umin_4x8:
   1296                 result = qir_V8MIN(c, src[0], src[1]);
   1297                 break;
   1298 
   1299         case nir_op_umax_4x8:
   1300                 result = qir_V8MAX(c, src[0], src[1]);
   1301                 break;
   1302 
   1303         case nir_op_umul_unorm_4x8:
   1304                 result = qir_V8MULD(c, src[0], src[1]);
   1305                 break;
   1306 
   1307         case nir_op_fddx:
   1308         case nir_op_fddx_coarse:
   1309         case nir_op_fddx_fine:
   1310                 result = ntq_fddx(c, src[0]);
   1311                 break;
   1312 
   1313         case nir_op_fddy:
   1314         case nir_op_fddy_coarse:
   1315         case nir_op_fddy_fine:
   1316                 result = ntq_fddy(c, src[0]);
   1317                 break;
   1318 
   1319         default:
   1320                 fprintf(stderr, "unknown NIR ALU inst: ");
   1321                 nir_print_instr(&instr->instr, stderr);
   1322                 fprintf(stderr, "\n");
   1323                 abort();
   1324         }
   1325 
   1326         /* We have a scalar result, so the instruction should only have a
   1327          * single channel written to.
   1328          */
   1329         assert(util_is_power_of_two(instr->dest.write_mask));
   1330         ntq_store_dest(c, &instr->dest.dest,
   1331                        ffs(instr->dest.write_mask) - 1, result);
   1332 }
   1333 
   1334 static void
   1335 emit_frag_end(struct vc4_compile *c)
   1336 {
   1337         struct qreg color;
   1338         if (c->output_color_index != -1) {
   1339                 color = c->outputs[c->output_color_index];
   1340         } else {
   1341                 color = qir_uniform_ui(c, 0);
   1342         }
   1343 
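        /* Note (added for clarity): discard is ~0 for killed pixels, so
         * setting flags from it lets the TLB writes below run only where Z
         * is set, i.e. where the pixel was not discarded.
         */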
   1344         uint32_t discard_cond = QPU_COND_ALWAYS;
   1345         if (c->s->info->fs.uses_discard) {
   1346                 qir_SF(c, c->discard);
   1347                 discard_cond = QPU_COND_ZS;
   1348         }
   1349 
   1350         if (c->fs_key->stencil_enabled) {
   1351                 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
   1352                              qir_uniform(c, QUNIFORM_STENCIL, 0));
   1353                 if (c->fs_key->stencil_twoside) {
   1354                         qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
   1355                                      qir_uniform(c, QUNIFORM_STENCIL, 1));
   1356                 }
   1357                 if (c->fs_key->stencil_full_writemasks) {
   1358                         qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
   1359                                      qir_uniform(c, QUNIFORM_STENCIL, 2));
   1360                 }
   1361         }
   1362 
   1363         if (c->output_sample_mask_index != -1) {
   1364                 qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
   1365         }
   1366 
   1367         if (c->fs_key->depth_enabled) {
   1368                 if (c->output_position_index != -1) {
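                        /* Note (added for clarity): gl_FragDepth is in
                         * [0, 1]; scaling by 0xffffff (2^24 - 1) before FTOI
                         * produces the integer value the 24-bit TLB Z write
                         * expects.
                         */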
   1369                         qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
   1370                                       qir_FMUL(c,
   1371                                                c->outputs[c->output_position_index],
   1372                                                qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
   1373                 } else {
   1374                         qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
   1375                                      qir_FRAG_Z(c))->cond = discard_cond;
   1376                 }
   1377         }
   1378 
   1379         if (!c->msaa_per_sample_output) {
   1380                 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
   1381                              color)->cond = discard_cond;
   1382         } else {
   1383                 for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
   1384                         qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
   1385                                      c->sample_colors[i])->cond = discard_cond;
   1386                 }
   1387         }
   1388 }
   1389 
   1390 static void
   1391 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
   1392 {
   1393         struct qreg packed = qir_get_temp(c);
   1394 
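        /* Note (added for clarity): the scaled X and Y are converted to
         * integers and packed into the low and high 16-bit halves of a
         * single VPM word (QPU_PACK_A_16A / QPU_PACK_A_16B).
         */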
   1395         for (int i = 0; i < 2; i++) {
   1396                 struct qreg scale =
   1397                         qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
   1398 
   1399                 struct qreg packed_chan = packed;
   1400                 packed_chan.pack = QPU_PACK_A_16A + i;
   1401 
   1402                 qir_FTOI_dest(c, packed_chan,
   1403                               qir_FMUL(c,
   1404                                        qir_FMUL(c,
   1405                                                 c->outputs[c->output_position_index + i],
   1406                                                 scale),
   1407                                        rcp_w));
   1408         }
   1409 
   1410         qir_VPM_WRITE(c, packed);
   1411 }
   1412 
   1413 static void
   1414 emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
   1415 {
   1416         struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
   1417         struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
   1418 
   1419         qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
   1420                                                           c->outputs[c->output_position_index + 2],
   1421                                                           zscale),
   1422                                               rcp_w),
   1423                                   zoffset));
   1424 }
   1425 
   1426 static void
   1427 emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
   1428 {
   1429         qir_VPM_WRITE(c, rcp_w);
   1430 }
   1431 
   1432 static void
   1433 emit_point_size_write(struct vc4_compile *c)
   1434 {
   1435         struct qreg point_size;
   1436 
   1437         if (c->output_point_size_index != -1)
   1438                 point_size = c->outputs[c->output_point_size_index];
   1439         else
   1440                 point_size = qir_uniform_f(c, 1.0);
   1441 
   1442         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
   1443          * BCM21553).
   1444          */
   1445         point_size = qir_FMAX(c, point_size, qir_uniform_f(c, .125));
   1446 
   1447         qir_VPM_WRITE(c, point_size);
   1448 }
   1449 
   1450 /**
   1451  * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
   1452  *
   1453  * The simulator insists that there be at least one vertex attribute, so
   1454  * vc4_draw.c will emit one if it wouldn't have otherwise.  The simulator also
   1455  * insists that all vertex attributes loaded get read by the VS/CS, so we have
   1456  * to consume it here.
   1457  */
   1458 static void
   1459 emit_stub_vpm_read(struct vc4_compile *c)
   1460 {
   1461         if (c->num_inputs)
   1462                 return;
   1463 
   1464         c->vattr_sizes[0] = 4;
   1465         (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
   1466         c->num_inputs++;
   1467 }
   1468 
   1469 static void
   1470 emit_vert_end(struct vc4_compile *c,
   1471               struct vc4_varying_slot *fs_inputs,
   1472               uint32_t num_fs_inputs)
   1473 {
   1474         struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
   1475 
   1476         emit_stub_vpm_read(c);
   1477 
   1478         emit_scaled_viewport_write(c, rcp_w);
   1479         emit_zs_write(c, rcp_w);
   1480         emit_rcp_wc_write(c, rcp_w);
   1481         if (c->vs_key->per_vertex_point_size)
   1482                 emit_point_size_write(c);
   1483 
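        /* Note (added for clarity): varyings are emitted in FS-input order,
         * since that is the order the hardware will feed them to the
         * fragment shader.
         */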
   1484         for (int i = 0; i < num_fs_inputs; i++) {
   1485                 struct vc4_varying_slot *input = &fs_inputs[i];
   1486                 int j;
   1487 
   1488                 for (j = 0; j < c->num_outputs; j++) {
   1489                         struct vc4_varying_slot *output =
   1490                                 &c->output_slots[j];
   1491 
   1492                         if (input->slot == output->slot &&
   1493                             input->swizzle == output->swizzle) {
   1494                                 qir_VPM_WRITE(c, c->outputs[j]);
   1495                                 break;
   1496                         }
   1497                 }
   1498                 /* Emit padding if we didn't find a declared VS output for
   1499                  * this FS input.
   1500                  */
   1501                 if (j == c->num_outputs)
   1502                         qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
   1503         }
   1504 }
   1505 
   1506 static void
   1507 emit_coord_end(struct vc4_compile *c)
   1508 {
   1509         struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]);
   1510 
   1511         emit_stub_vpm_read(c);
   1512 
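        /* Note (added for clarity): the coordinate shader feeds the binner,
         * which wants the raw clip-space position ahead of the usual
         * screen-space outputs.
         */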
   1513         for (int i = 0; i < 4; i++)
   1514                 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
   1515 
   1516         emit_scaled_viewport_write(c, rcp_w);
   1517         emit_zs_write(c, rcp_w);
   1518         emit_rcp_wc_write(c, rcp_w);
   1519         if (c->vs_key->per_vertex_point_size)
   1520                 emit_point_size_write(c);
   1521 }
   1522 
   1523 static void
   1524 vc4_optimize_nir(struct nir_shader *s)
   1525 {
   1526         bool progress;
   1527 
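        /* Note (added for clarity): repeat the lowering/cleanup passes until
         * they stop making progress, since each pass can expose new
         * opportunities for the others.
         */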
   1528         do {
   1529                 progress = false;
   1530 
   1531                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
   1532                 NIR_PASS(progress, s, nir_lower_alu_to_scalar);
   1533                 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
   1534                 NIR_PASS(progress, s, nir_copy_prop);
   1535                 NIR_PASS(progress, s, nir_opt_remove_phis);
   1536                 NIR_PASS(progress, s, nir_opt_dce);
   1537                 NIR_PASS(progress, s, nir_opt_dead_cf);
   1538                 NIR_PASS(progress, s, nir_opt_cse);
   1539                 NIR_PASS(progress, s, nir_opt_peephole_select, 8);
   1540                 NIR_PASS(progress, s, nir_opt_algebraic);
   1541                 NIR_PASS(progress, s, nir_opt_constant_folding);
   1542                 NIR_PASS(progress, s, nir_opt_undef);
   1543                 NIR_PASS(progress, s, nir_opt_loop_unroll,
   1544                          nir_var_shader_in |
   1545                          nir_var_shader_out |
   1546                          nir_var_local);
   1547         } while (progress);
   1548 }
   1549 
   1550 static int
   1551 driver_location_compare(const void *in_a, const void *in_b)
   1552 {
   1553         const nir_variable *const *a = in_a;
   1554         const nir_variable *const *b = in_b;
   1555 
   1556         return (*a)->data.driver_location - (*b)->data.driver_location;
   1557 }
   1558 
   1559 static void
   1560 ntq_setup_inputs(struct vc4_compile *c)
   1561 {
   1562         unsigned num_entries = 0;
   1563         nir_foreach_variable(var, &c->s->inputs)
   1564                 num_entries++;
   1565 
   1566         nir_variable *vars[num_entries];
   1567 
   1568         unsigned i = 0;
   1569         nir_foreach_variable(var, &c->s->inputs)
   1570                 vars[i++] = var;
   1571 
   1572         /* Sort the variables so that we emit the input setup in
   1573          * driver_location order.  This is required for VPM reads, whose data
   1574          * is fetched into the VPM in driver_location (TGSI register index)
   1575          * order.
   1576          */
   1577         qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
   1578 
   1579         for (unsigned i = 0; i < num_entries; i++) {
   1580                 nir_variable *var = vars[i];
   1581                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
   1582                 unsigned loc = var->data.driver_location;
   1583 
   1584                 assert(array_len == 1);
   1585                 (void)array_len;
   1586                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
   1587                                   (loc + 1) * 4);
   1588 
   1589                 if (c->stage == QSTAGE_FRAG) {
   1590                         if (var->data.location == VARYING_SLOT_POS) {
   1591                                 emit_fragcoord_input(c, loc);
   1592                         } else if (var->data.location == VARYING_SLOT_PNTC ||
   1593                                    (var->data.location >= VARYING_SLOT_VAR0 &&
   1594                                     (c->fs_key->point_sprite_mask &
   1595                                      (1 << (var->data.location -
   1596                                             VARYING_SLOT_VAR0))))) {
   1597                                 c->inputs[loc * 4 + 0] = c->point_x;
   1598                                 c->inputs[loc * 4 + 1] = c->point_y;
   1599                         } else {
   1600                                 emit_fragment_input(c, loc, var->data.location);
   1601                         }
   1602                 } else {
   1603                         emit_vertex_input(c, loc);
   1604                 }
   1605         }
   1606 }
   1607 
   1608 static void
   1609 ntq_setup_outputs(struct vc4_compile *c)
   1610 {
   1611         nir_foreach_variable(var, &c->s->outputs) {
   1612                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
   1613                 unsigned loc = var->data.driver_location * 4;
   1614 
   1615                 assert(array_len == 1);
   1616                 (void)array_len;
   1617 
   1618                 for (int i = 0; i < 4; i++)
   1619                         add_output(c, loc + i, var->data.location, i);
   1620 
   1621                 if (c->stage == QSTAGE_FRAG) {
   1622                         switch (var->data.location) {
   1623                         case FRAG_RESULT_COLOR:
   1624                         case FRAG_RESULT_DATA0:
   1625                                 c->output_color_index = loc;
   1626                                 break;
   1627                         case FRAG_RESULT_DEPTH:
   1628                                 c->output_position_index = loc;
   1629                                 break;
   1630                         case FRAG_RESULT_SAMPLE_MASK:
   1631                                 c->output_sample_mask_index = loc;
   1632                                 break;
   1633                         }
   1634                 } else {
   1635                         switch (var->data.location) {
   1636                         case VARYING_SLOT_POS:
   1637                                 c->output_position_index = loc;
   1638                                 break;
   1639                         case VARYING_SLOT_PSIZ:
   1640                                 c->output_point_size_index = loc;
   1641                                 break;
   1642                         }
   1643                 }
   1644         }
   1645 }
   1646 
   1647 static void
   1648 ntq_setup_uniforms(struct vc4_compile *c)
   1649 {
   1650         nir_foreach_variable(var, &c->s->uniforms) {
   1651                 uint32_t vec4_count = st_glsl_type_size(var->type);
   1652                 unsigned vec4_size = 4 * sizeof(float);
   1653 
   1654                 declare_uniform_range(c, var->data.driver_location * vec4_size,
   1655                                       vec4_count * vec4_size);
   1656 
   1657         }
   1658 }
   1659 
   1660 /**
   1661  * Sets up the mapping from nir_register to struct qreg *.
   1662  *
   1663  * Each nir_register gets a struct qreg per 32-bit component being stored.
   1664  */
   1665 static void
   1666 ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
   1667 {
   1668         foreach_list_typed(nir_register, nir_reg, node, list) {
   1669                 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
   1670                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
   1671                                                   array_len *
   1672                                                   nir_reg->num_components);
   1673 
   1674                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
   1675 
   1676                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
   1677                         qregs[i] = qir_get_temp(c);
   1678         }
   1679 }
   1680 
   1681 static void
   1682 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
   1683 {
   1684         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
   1685         for (int i = 0; i < instr->def.num_components; i++)
   1686                 qregs[i] = qir_uniform_ui(c, instr->value.u32[i]);
   1687 
   1688         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
   1689 }
   1690 
   1691 static void
   1692 ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
   1693 {
   1694         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
   1695 
   1696         /* QIR needs there to be *some* value, so pick 0 (same as for
   1697          * ntq_setup_registers()).
   1698          */
   1699         for (int i = 0; i < instr->def.num_components; i++)
   1700                 qregs[i] = qir_uniform_ui(c, 0);
   1701 }
   1702 
   1703 static void
   1704 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
   1705 {
   1706         nir_const_value *const_offset;
   1707         unsigned offset;
   1708 
   1709         switch (instr->intrinsic) {
   1710         case nir_intrinsic_load_uniform:
   1711                 assert(instr->num_components == 1);
   1712                 const_offset = nir_src_as_const_value(instr->src[0]);
   1713                 if (const_offset) {
   1714                         offset = nir_intrinsic_base(instr) + const_offset->u32[0];
   1715                         assert(offset % 4 == 0);
   1716                         /* The offset is in bytes; QUNIFORM_UNIFORM indexes dwords. */
   1717                         offset = offset / 4;
   1718                         ntq_store_dest(c, &instr->dest, 0,
   1719                                        qir_uniform(c, QUNIFORM_UNIFORM,
   1720                                                    offset));
   1721                 } else {
   1722                         ntq_store_dest(c, &instr->dest, 0,
   1723                                        indirect_uniform_load(c, instr));
   1724                 }
   1725                 break;
   1726 
   1727         case nir_intrinsic_load_user_clip_plane:
   1728                 for (int i = 0; i < instr->num_components; i++) {
   1729                         ntq_store_dest(c, &instr->dest, i,
   1730                                        qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
   1731                                                    nir_intrinsic_ucp_id(instr) *
   1732                                                    4 + i));
   1733                 }
   1734                 break;
   1735 
   1736         case nir_intrinsic_load_blend_const_color_r_float:
   1737         case nir_intrinsic_load_blend_const_color_g_float:
   1738         case nir_intrinsic_load_blend_const_color_b_float:
   1739         case nir_intrinsic_load_blend_const_color_a_float:
   1740                 ntq_store_dest(c, &instr->dest, 0,
   1741                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +
   1742                                            (instr->intrinsic -
   1743                                             nir_intrinsic_load_blend_const_color_r_float),
   1744                                            0));
   1745                 break;
   1746 
   1747         case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   1748                 ntq_store_dest(c, &instr->dest, 0,
   1749                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,
   1750                                            0));
   1751                 break;
   1752 
   1753         case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   1754                 ntq_store_dest(c, &instr->dest, 0,
   1755                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,
   1756                                            0));
   1757                 break;
   1758 
   1759         case nir_intrinsic_load_alpha_ref_float:
   1760                 ntq_store_dest(c, &instr->dest, 0,
   1761                                qir_uniform(c, QUNIFORM_ALPHA_REF, 0));
   1762                 break;
   1763 
   1764         case nir_intrinsic_load_sample_mask_in:
   1765                 ntq_store_dest(c, &instr->dest, 0,
   1766                                qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
   1767                 break;
   1768 
   1769         case nir_intrinsic_load_front_face:
   1770                 /* The register contains 0 (front) or 1 (back); adding -1 turns
   1771                  * it into a NIR bool where true means front (0 -> ~0, 1 -> 0).
   1772                  */
   1773                 ntq_store_dest(c, &instr->dest, 0,
   1774                                qir_ADD(c,
   1775                                        qir_uniform_ui(c, -1),
   1776                                        qir_reg(QFILE_FRAG_REV_FLAG, 0)));
   1777                 break;
   1778 
   1779         case nir_intrinsic_load_input:
   1780                 assert(instr->num_components == 1);
   1781                 const_offset = nir_src_as_const_value(instr->src[0]);
   1782                 assert(const_offset && "vc4 doesn't support indirect inputs");
   1783                 if (c->stage == QSTAGE_FRAG &&
   1784                     nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
   1785                         assert(const_offset->u32[0] == 0);
   1786                         /* Reads of the per-sample color need to be done in
   1787                          * order.
   1788                          */
   1789                         int sample_index = (nir_intrinsic_base(instr) -
   1790                                            VC4_NIR_TLB_COLOR_READ_INPUT);
   1791                         for (int i = 0; i <= sample_index; i++) {
   1792                                 if (c->color_reads[i].file == QFILE_NULL) {
   1793                                         c->color_reads[i] =
   1794                                                 qir_TLB_COLOR_READ(c);
   1795                                 }
   1796                         }
   1797                         ntq_store_dest(c, &instr->dest, 0,
   1798                                        qir_MOV(c, c->color_reads[sample_index]));
   1799                 } else {
   1800                         offset = nir_intrinsic_base(instr) + const_offset->u32[0];
   1801                         int comp = nir_intrinsic_component(instr);
   1802                         ntq_store_dest(c, &instr->dest, 0,
   1803                                        qir_MOV(c, c->inputs[offset * 4 + comp]));
   1804                 }
   1805                 break;
   1806 
   1807         case nir_intrinsic_store_output:
   1808                 const_offset = nir_src_as_const_value(instr->src[1]);
   1809                 assert(const_offset && "vc4 doesn't support indirect outputs");
   1810                 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
   1811 
   1812                 /* MSAA color outputs are the only case where we have an
   1813                  * output that's not lowered to being a store of a single
   1814                  * 32-bit value.
   1815                  */
   1816                 if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
   1817                         assert(offset == c->output_color_index);
   1818                         for (int i = 0; i < 4; i++) {
   1819                                 c->sample_colors[i] =
   1820                                         qir_MOV(c, ntq_get_src(c, instr->src[0],
   1821                                                                i));
   1822                         }
   1823                 } else {
   1824                         offset = offset * 4 + nir_intrinsic_component(instr);
   1825                         assert(instr->num_components == 1);
   1826                         c->outputs[offset] =
   1827                                 qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
   1828                         c->num_outputs = MAX2(c->num_outputs, offset + 1);
   1829                 }
   1830                 break;
   1831 
   1832         case nir_intrinsic_discard:
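                /* Note (added for clarity): under non-uniform control flow
                 * only the channels that are currently active (execute == 0)
                 * may set their discard flag.
                 */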
   1833                 if (c->execute.file != QFILE_NULL) {
   1834                         qir_SF(c, c->execute);
   1835                         qir_MOV_cond(c, QPU_COND_ZS, c->discard,
   1836                                      qir_uniform_ui(c, ~0));
   1837                 } else {
   1838                         qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));
   1839                 }
   1840                 break;
   1841 
   1842         case nir_intrinsic_discard_if: {
   1843                 /* true (~0) if we're discarding */
   1844                 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
   1845 
   1846                 if (c->execute.file != QFILE_NULL) {
   1847                         /* execute == 0 means the channel is active.  Invert
   1848                          * the condition so that we can use zero as "executing
   1849                          * and discarding."
   1850                          */
   1851                         qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));
   1852                         qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);
   1853                 } else {
   1854                         qir_OR_dest(c, c->discard, c->discard,
   1855                                     ntq_get_src(c, instr->src[0], 0));
   1856                 }
   1857 
   1858                 break;
   1859         }
   1860 
   1861         default:
   1862                 fprintf(stderr, "Unknown intrinsic: ");
   1863                 nir_print_instr(&instr->instr, stderr);
   1864                 fprintf(stderr, "\n");
   1865                 break;
   1866         }
   1867 }
   1868 
   1869 /* Clears (activates) the execute flags for any channels whose jump target
   1870  * matches this block.
   1871  */
   1872 static void
   1873 ntq_activate_execute_for_block(struct vc4_compile *c)
   1874 {
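        /* Note (added for clarity): execute holds the index of the block
         * each inactive channel is waiting for; channels whose value matches
         * this block (SUB result of 0) get execute reset to 0, i.e. active.
         */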
   1875         qir_SF(c, qir_SUB(c,
   1876                           c->execute,
   1877                           qir_uniform_ui(c, c->cur_block->index)));
   1878         qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));
   1879 }
   1880 
   1881 static void
   1882 ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
   1883 {
   1884         if (!c->vc4->screen->has_control_flow) {
   1885                 fprintf(stderr,
   1886                         "IF statement support requires updated kernel.\n");
   1887                 return;
   1888         }
   1889 
   1890         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
   1891         bool empty_else_block =
   1892                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
   1893                  exec_list_is_empty(&nir_else_block->instr_list));
   1894 
   1895         struct qblock *then_block = qir_new_block(c);
   1896         struct qblock *after_block = qir_new_block(c);
   1897         struct qblock *else_block;
   1898         if (empty_else_block)
   1899                 else_block = after_block;
   1900         else
   1901                 else_block = qir_new_block(c);
   1902 
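        /* Note (added for clarity): at the top level there is no execute
         * mask yet, so create one with all channels active (0) and drop it
         * again after the ENDIF.
         */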
   1903         bool was_top_level = false;
   1904         if (c->execute.file == QFILE_NULL) {
   1905                 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
   1906                 was_top_level = true;
   1907         }
   1908 
   1909         /* Set ZS for executing (execute == 0) and jumping (if->condition ==
   1910          * 0) channels, and then update execute flags for those to point to
   1911          * the ELSE block.
   1912          */
   1913         qir_SF(c, qir_OR(c,
   1914                          c->execute,
   1915                          ntq_get_src(c, if_stmt->condition, 0)));
   1916         qir_MOV_cond(c, QPU_COND_ZS, c->execute,
   1917                      qir_uniform_ui(c, else_block->index));
   1918 
   1919         /* Jump to ELSE if nothing is active for THEN, otherwise fall
   1920          * through.
   1921          */
   1922         qir_SF(c, c->execute);
   1923         qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);
   1924         qir_link_blocks(c->cur_block, else_block);
   1925         qir_link_blocks(c->cur_block, then_block);
   1926 
   1927         /* Process the THEN block. */
   1928         qir_set_emit_block(c, then_block);
   1929         ntq_emit_cf_list(c, &if_stmt->then_list);
   1930 
   1931         if (!empty_else_block) {
   1932                 /* Handle the end of the THEN block.  First, all currently
   1933                  * active channels update their execute flags to point to
   1934                  * ENDIF.
   1935                  */
   1936                 qir_SF(c, c->execute);
   1937                 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
   1938                              qir_uniform_ui(c, after_block->index));
   1939 
   1940                 /* If everything points at ENDIF, then jump there immediately. */
   1941                 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));
   1942                 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
   1943                 qir_link_blocks(c->cur_block, after_block);
   1944                 qir_link_blocks(c->cur_block, else_block);
   1945 
   1946                 qir_set_emit_block(c, else_block);
   1947                 ntq_activate_execute_for_block(c);
   1948                 ntq_emit_cf_list(c, &if_stmt->else_list);
   1949         }
   1950 
   1951         qir_link_blocks(c->cur_block, after_block);
   1952 
   1953         qir_set_emit_block(c, after_block);
   1954         if (was_top_level)
   1955                 c->execute = c->undef;
   1956         else
   1957                 ntq_activate_execute_for_block(c);
   1958 
   1959 }
   1960 
   1961 static void
   1962 ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)
   1963 {
   1964         struct qblock *jump_block;
   1965         switch (jump->type) {
   1966         case nir_jump_break:
   1967                 jump_block = c->loop_break_block;
   1968                 break;
   1969         case nir_jump_continue:
   1970                 jump_block = c->loop_cont_block;
   1971                 break;
   1972         default:
   1973                 unreachable("Unsupported jump type\n");
   1974         }
   1975 
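        /* Note (added for clarity): make every currently active channel wait
         * for the jump target by storing that block's index in its execute
         * value.
         */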
   1976         qir_SF(c, c->execute);
   1977         qir_MOV_cond(c, QPU_COND_ZS, c->execute,
   1978                      qir_uniform_ui(c, jump_block->index));
   1979 
   1980         /* Jump to the destination block if everyone has taken the jump. */
   1981         qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));
   1982         qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
   1983         struct qblock *new_block = qir_new_block(c);
   1984         qir_link_blocks(c->cur_block, jump_block);
   1985         qir_link_blocks(c->cur_block, new_block);
   1986         qir_set_emit_block(c, new_block);
   1987 }
   1988 
   1989 static void
   1990 ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
   1991 {
   1992         switch (instr->type) {
   1993         case nir_instr_type_alu:
   1994                 ntq_emit_alu(c, nir_instr_as_alu(instr));
   1995                 break;
   1996 
   1997         case nir_instr_type_intrinsic:
   1998                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
   1999                 break;
   2000 
   2001         case nir_instr_type_load_const:
   2002                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
   2003                 break;
   2004 
   2005         case nir_instr_type_ssa_undef:
   2006                 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
   2007                 break;
   2008 
   2009         case nir_instr_type_tex:
   2010                 ntq_emit_tex(c, nir_instr_as_tex(instr));
   2011                 break;
   2012 
   2013         case nir_instr_type_jump:
   2014                 ntq_emit_jump(c, nir_instr_as_jump(instr));
   2015                 break;
   2016 
   2017         default:
   2018                 fprintf(stderr, "Unknown NIR instr type: ");
   2019                 nir_print_instr(instr, stderr);
   2020                 fprintf(stderr, "\n");
   2021                 abort();
   2022         }
   2023 }
   2024 
   2025 static void
   2026 ntq_emit_block(struct vc4_compile *c, nir_block *block)
   2027 {
   2028         nir_foreach_instr(instr, block) {
   2029                 ntq_emit_instr(c, instr);
   2030         }
   2031 }
   2032 
   2033 static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
   2034 
   2035 static void
   2036 ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
   2037 {
   2038         if (!c->vc4->screen->has_control_flow) {
   2039                 fprintf(stderr,
   2040                         "loop support requires updated kernel.\n");
   2041                 ntq_emit_cf_list(c, &loop->body);
   2042                 return;
   2043         }
   2044 
   2045         bool was_top_level = false;
   2046         if (c->execute.file == QFILE_NULL) {
   2047                 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
   2048                 was_top_level = true;
   2049         }
   2050 
   2051         struct qblock *save_loop_cont_block = c->loop_cont_block;
   2052         struct qblock *save_loop_break_block = c->loop_break_block;
   2053 
   2054         c->loop_cont_block = qir_new_block(c);
   2055         c->loop_break_block = qir_new_block(c);
   2056 
   2057         qir_link_blocks(c->cur_block, c->loop_cont_block);
   2058         qir_set_emit_block(c, c->loop_cont_block);
   2059         ntq_activate_execute_for_block(c);
   2060 
   2061         ntq_emit_cf_list(c, &loop->body);
   2062 
   2063         /* If anything had explicitly continued, or is here at the end of the
   2064          * loop, then we need to loop again.  SF updates are masked by the
   2065          * instruction's condition, so we can do the OR of the two conditions
   2066          * within SF.
   2067          */
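        /* Note (added for clarity): Z is set first for channels whose
         * execute is 0 (still running at the loop end); the conditional SUB
         * below then also sets Z for channels waiting on the continue
         * block's index, leaving the other channels' flags untouched.
         */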
   2068         qir_SF(c, c->execute);
   2069         struct qinst *cont_check =
   2070                 qir_SUB_dest(c,
   2071                              c->undef,
   2072                              c->execute,
   2073                              qir_uniform_ui(c, c->loop_cont_block->index));
   2074         cont_check->cond = QPU_COND_ZC;
   2075         cont_check->sf = true;
   2076 
   2077         qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);
   2078         qir_link_blocks(c->cur_block, c->loop_cont_block);
   2079         qir_link_blocks(c->cur_block, c->loop_break_block);
   2080 
   2081         qir_set_emit_block(c, c->loop_break_block);
   2082         if (was_top_level)
   2083                 c->execute = c->undef;
   2084         else
   2085                 ntq_activate_execute_for_block(c);
   2086 
   2087         c->loop_break_block = save_loop_break_block;
   2088         c->loop_cont_block = save_loop_cont_block;
   2089 }
   2090 
   2091 static void
   2092 ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
   2093 {
   2094         fprintf(stderr, "FUNCTIONS not handled.\n");
   2095         abort();
   2096 }
   2097 
   2098 static void
   2099 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
   2100 {
   2101         foreach_list_typed(nir_cf_node, node, node, list) {
   2102                 switch (node->type) {
   2103                 case nir_cf_node_block:
   2104                         ntq_emit_block(c, nir_cf_node_as_block(node));
   2105                         break;
   2106 
   2107                 case nir_cf_node_if:
   2108                         ntq_emit_if(c, nir_cf_node_as_if(node));
   2109                         break;
   2110 
   2111                 case nir_cf_node_loop:
   2112                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
   2113                         break;
   2114 
   2115                 case nir_cf_node_function:
   2116                         ntq_emit_function(c, nir_cf_node_as_function(node));
   2117                         break;
   2118 
   2119                 default:
   2120                         fprintf(stderr, "Unknown NIR node type\n");
   2121                         abort();
   2122                 }
   2123         }
   2124 }
   2125 
   2126 static void
   2127 ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
   2128 {
   2129         ntq_setup_registers(c, &impl->registers);
   2130         ntq_emit_cf_list(c, &impl->body);
   2131 }
   2132 
   2133 static void
   2134 nir_to_qir(struct vc4_compile *c)
   2135 {
   2136         if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
   2137                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
   2138 
   2139         ntq_setup_inputs(c);
   2140         ntq_setup_outputs(c);
   2141         ntq_setup_uniforms(c);
   2142         ntq_setup_registers(c, &c->s->registers);
   2143 
   2144         /* Find the main function and emit the body. */
   2145         nir_foreach_function(function, c->s) {
   2146                 assert(strcmp(function->name, "main") == 0);
   2147                 assert(function->impl);
   2148                 ntq_emit_impl(c, function->impl);
   2149         }
   2150 }
   2151 
   2152 static const nir_shader_compiler_options nir_options = {
   2153         .lower_extract_byte = true,
   2154         .lower_extract_word = true,
   2155         .lower_ffma = true,
   2156         .lower_flrp32 = true,
   2157         .lower_fpow = true,
   2158         .lower_fsat = true,
   2159         .lower_fsqrt = true,
   2160         .lower_negate = true,
   2161         .native_integers = true,
   2162         .max_unroll_iterations = 32,
   2163 };
   2164 
   2165 const void *
   2166 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
   2167                                 enum pipe_shader_ir ir, unsigned shader)
   2168 {
   2169         return &nir_options;
   2170 }
   2171 
   2172 static int
   2173 count_nir_instrs(nir_shader *nir)
   2174 {
   2175         int count = 0;
   2176         nir_foreach_function(function, nir) {
   2177                 if (!function->impl)
   2178                         continue;
   2179                 nir_foreach_block(block, function->impl) {
   2180                         nir_foreach_instr(instr, block)
   2181                                 count++;
   2182                 }
   2183         }
   2184         return count;
   2185 }
   2186 
   2187 static struct vc4_compile *
   2188 vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
   2189                struct vc4_key *key, bool fs_threaded)
   2190 {
   2191         struct vc4_compile *c = qir_compile_init();
   2192 
   2193         c->vc4 = vc4;
   2194         c->stage = stage;
   2195         c->shader_state = &key->shader_state->base;
   2196         c->program_id = key->shader_state->program_id;
   2197         c->variant_id =
   2198                 p_atomic_inc_return(&key->shader_state->compiled_variant_count);
   2199         c->fs_threaded = fs_threaded;
   2200 
   2201         c->key = key;
   2202         switch (stage) {
   2203         case QSTAGE_FRAG:
   2204                 c->fs_key = (struct vc4_fs_key *)key;
   2205                 if (c->fs_key->is_points) {
   2206                         c->point_x = emit_fragment_varying(c, ~0, 0);
   2207                         c->point_y = emit_fragment_varying(c, ~0, 0);
   2208                 } else if (c->fs_key->is_lines) {
   2209                         c->line_x = emit_fragment_varying(c, ~0, 0);
   2210                 }
   2211                 break;
   2212         case QSTAGE_VERT:
   2213                 c->vs_key = (struct vc4_vs_key *)key;
   2214                 break;
   2215         case QSTAGE_COORD:
   2216                 c->vs_key = (struct vc4_vs_key *)key;
   2217                 break;
   2218         }
   2219 
   2220         c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);
   2221 
   2222         if (stage == QSTAGE_FRAG)
   2223                 NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
   2224 
   2225         struct nir_lower_tex_options tex_options = {
   2226                 /* We would need to implement txs, but we don't want the
   2227                  * int/float conversions.
   2228                  */
   2229                 .lower_rect = false,
   2230 
   2231                 .lower_txp = ~0,
   2232 
   2233                 /* Apply swizzles to all samplers. */
   2234                 .swizzle_result = ~0,
   2235         };
   2236 
   2237         /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
   2238          * The format swizzling applies before sRGB decode, and
   2239          * ARB_texture_swizzle is the last thing before returning the sample.
   2240          */
   2241         for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
   2242                 enum pipe_format format = c->key->tex[i].format;
   2243 
   2244                 if (!format)
   2245                         continue;
   2246 
   2247                 const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
   2248 
   2249                 for (int j = 0; j < 4; j++) {
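                /* Note (added for clarity): compose the two swizzles here.
                 * Values 0-3 select a channel and are routed through the
                 * format swizzle, while ZERO/ONE (4 and 5) pass through
                 * unchanged.
                 */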
   2250                         uint8_t arb_swiz = c->key->tex[i].swizzle[j];
   2251 
   2252                         if (arb_swiz <= 3) {
   2253                                 tex_options.swizzles[i][j] =
   2254                                         format_swizzle[arb_swiz];
   2255                         } else {
   2256                                 tex_options.swizzles[i][j] = arb_swiz;
   2257                         }
   2258                 }
   2259 
   2260                 if (util_format_is_srgb(format))
   2261                         tex_options.lower_srgb |= (1 << i);
   2262         }
   2263 
   2264         NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
   2265 
   2266         if (c->fs_key && c->fs_key->light_twoside)
   2267                 NIR_PASS_V(c->s, nir_lower_two_sided_color);
   2268 
   2269         if (c->vs_key && c->vs_key->clamp_color)
   2270                 NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
   2271 
   2272         if (c->key->ucp_enables) {
   2273                 if (stage == QSTAGE_FRAG) {
   2274                         NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);
   2275                 } else {
   2276                         NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables);
   2277                         NIR_PASS_V(c->s, nir_lower_io_to_scalar,
   2278                                    nir_var_shader_out);
   2279                 }
   2280         }
   2281 
   2282         /* FS input scalarizing must happen after nir_lower_two_sided_color,
   2283          * which only handles a vec4 at a time.  Similarly, VS output
   2284          * scalarizing must happen after nir_lower_clip_vs.
   2285          */
   2286         if (c->stage == QSTAGE_FRAG)
   2287                 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
   2288         else
   2289                 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
   2290 
   2291         NIR_PASS_V(c->s, vc4_nir_lower_io, c);
   2292         NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
   2293         NIR_PASS_V(c->s, nir_lower_idiv);
   2294 
   2295         vc4_optimize_nir(c->s);
   2296 
   2297         NIR_PASS_V(c->s, nir_convert_from_ssa, true);
   2298 
   2299         if (vc4_debug & VC4_DEBUG_SHADERDB) {
   2300                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
   2301                         qir_get_stage_name(c->stage),
   2302                         c->program_id, c->variant_id,
   2303                         count_nir_instrs(c->s));
   2304         }
   2305 
   2306         if (vc4_debug & VC4_DEBUG_NIR) {
   2307                 fprintf(stderr, "%s prog %d/%d NIR:\n",
   2308                         qir_get_stage_name(c->stage),
   2309                         c->program_id, c->variant_id);
   2310                 nir_print_shader(c->s, stderr);
   2311         }
   2312 
   2313         nir_to_qir(c);
   2314 
   2315         switch (stage) {
   2316         case QSTAGE_FRAG:
   2317                 /* FS threading requires that the thread execute
   2318                  * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
   2319                  * (with no other THRSW afterwards, obviously).  If we didn't
   2320                  * fetch a texture at a top level block, this wouldn't be
   2321                  * true.
   2322                  */
   2323                 if (c->fs_threaded && !c->last_thrsw_at_top_level) {
   2324                         c->failed = true;
   2325                         return c;
   2326                 }
   2327 
   2328                 emit_frag_end(c);
   2329                 break;
   2330         case QSTAGE_VERT:
   2331                 emit_vert_end(c,
   2332                               c->vs_key->fs_inputs->input_slots,
   2333                               c->vs_key->fs_inputs->num_inputs);
   2334                 break;
   2335         case QSTAGE_COORD:
   2336                 emit_coord_end(c);
   2337                 break;
   2338         }
   2339 
   2340         if (vc4_debug & VC4_DEBUG_QIR) {
   2341                 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
   2342                         qir_get_stage_name(c->stage),
   2343                         c->program_id, c->variant_id);
   2344                 qir_dump(c);
   2345                 fprintf(stderr, "\n");
   2346         }
   2347 
   2348         qir_optimize(c);
   2349         qir_lower_uniforms(c);
   2350 
   2351         qir_schedule_instructions(c);
   2352         qir_emit_uniform_stream_resets(c);
   2353 
   2354         if (vc4_debug & VC4_DEBUG_QIR) {
   2355                 fprintf(stderr, "%s prog %d/%d QIR:\n",
   2356                         qir_get_stage_name(c->stage),
   2357                         c->program_id, c->variant_id);
   2358                 qir_dump(c);
   2359                 fprintf(stderr, "\n");
   2360         }
   2361 
   2362         qir_reorder_uniforms(c);
   2363         vc4_generate_code(vc4, c);
   2364 
   2365         if (vc4_debug & VC4_DEBUG_SHADERDB) {
   2366                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
   2367                         qir_get_stage_name(c->stage),
   2368                         c->program_id, c->variant_id,
   2369                         c->qpu_inst_count);
   2370                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n",
   2371                         qir_get_stage_name(c->stage),
   2372                         c->program_id, c->variant_id,
   2373                         c->num_uniforms);
   2374         }
   2375 
   2376         ralloc_free(c->s);
   2377 
   2378         return c;
   2379 }
   2380 
   2381 static void *
   2382 vc4_shader_state_create(struct pipe_context *pctx,
   2383                         const struct pipe_shader_state *cso)
   2384 {
   2385         struct vc4_context *vc4 = vc4_context(pctx);
   2386         struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
   2387         if (!so)
   2388                 return NULL;
   2389 
   2390         so->program_id = vc4->next_uncompiled_program_id++;
   2391 
   2392         nir_shader *s;
   2393 
   2394         if (cso->type == PIPE_SHADER_IR_NIR) {
   2395                 /* The backend takes ownership of the NIR shader on state
   2396                  * creation.
   2397                  */
   2398                 s = cso->ir.nir;
   2399         } else {
   2400                 assert(cso->type == PIPE_SHADER_IR_TGSI);
   2401 
   2402                 if (vc4_debug & VC4_DEBUG_TGSI) {
   2403                         fprintf(stderr, "prog %d TGSI:\n",
   2404                                 so->program_id);
   2405                         tgsi_dump(cso->tokens, 0);
   2406                         fprintf(stderr, "\n");
   2407                 }
   2408                 s = tgsi_to_nir(cso->tokens, &nir_options);
   2409         }
   2410 
   2411         NIR_PASS_V(s, nir_opt_global_to_local);
   2412         NIR_PASS_V(s, nir_lower_regs_to_ssa);
   2413         NIR_PASS_V(s, nir_normalize_cubemap_coords);
   2414 
   2415         NIR_PASS_V(s, nir_lower_load_const_to_scalar);
   2416 
   2417         vc4_optimize_nir(s);
   2418 
   2419         NIR_PASS_V(s, nir_remove_dead_variables, nir_var_local);
   2420 
   2421         /* Garbage collect dead instructions */
   2422         nir_sweep(s);
   2423 
   2424         so->base.type = PIPE_SHADER_IR_NIR;
   2425         so->base.ir.nir = s;
   2426 
   2427         if (vc4_debug & VC4_DEBUG_NIR) {
   2428                 fprintf(stderr, "%s prog %d NIR:\n",
   2429                         gl_shader_stage_name(s->stage),
   2430                         so->program_id);
   2431                 nir_print_shader(s, stderr);
   2432                 fprintf(stderr, "\n");
   2433         }
   2434 
   2435         return so;
   2436 }
   2437 
   2438 static void
   2439 copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
   2440                              struct vc4_compile *c)
   2441 {
   2442         int count = c->num_uniforms;
   2443         struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
   2444 
   2445         uinfo->count = count;
   2446         uinfo->data = ralloc_array(shader, uint32_t, count);
   2447         memcpy(uinfo->data, c->uniform_data,
   2448                count * sizeof(*uinfo->data));
   2449         uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
   2450         memcpy(uinfo->contents, c->uniform_contents,
   2451                count * sizeof(*uinfo->contents));
   2452         uinfo->num_texture_samples = c->num_texture_samples;
   2453 
   2454         vc4_set_shader_uniform_dirty_flags(shader);
   2455 }
   2456 
   2457 static void
   2458 vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
   2459                              struct vc4_compiled_shader *shader)
   2460 {
   2461         struct vc4_fs_inputs inputs;
   2462 
   2463         memset(&inputs, 0, sizeof(inputs));
   2464         inputs.input_slots = ralloc_array(shader,
   2465                                           struct vc4_varying_slot,
   2466                                           c->num_input_slots);
   2467 
   2468         bool input_live[c->num_input_slots];
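        /* Note (added for clarity): scan the QIR for QFILE_VARY reads so
         * that only the varyings the FS actually uses end up in the compiled
         * input list.
         */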
   2469 
   2470         memset(input_live, 0, sizeof(input_live));
   2471         qir_for_each_inst_inorder(inst, c) {
   2472                 for (int i = 0; i < qir_get_nsrc(inst); i++) {
   2473                         if (inst->src[i].file == QFILE_VARY)
   2474                                 input_live[inst->src[i].index] = true;
   2475                 }
   2476         }
   2477 
   2478         for (int i = 0; i < c->num_input_slots; i++) {
   2479                 struct vc4_varying_slot *slot = &c->input_slots[i];
   2480 
   2481                 if (!input_live[i])
   2482                         continue;
   2483 
   2484                 /* Skip non-VS-output inputs. */
   2485                 if (slot->slot == (uint8_t)~0)
   2486                         continue;
   2487 
   2488                 if (slot->slot == VARYING_SLOT_COL0 ||
   2489                     slot->slot == VARYING_SLOT_COL1 ||
   2490                     slot->slot == VARYING_SLOT_BFC0 ||
   2491                     slot->slot == VARYING_SLOT_BFC1) {
   2492                         shader->color_inputs |= (1 << inputs.num_inputs);
   2493                 }
   2494 
   2495                 inputs.input_slots[inputs.num_inputs] = *slot;
   2496                 inputs.num_inputs++;
   2497         }
   2498         shader->num_inputs = inputs.num_inputs;
   2499 
   2500         /* Add our set of inputs to the set of all inputs seen.  This way, we
   2501          * can have a single pointer that identifies an FS inputs set,
   2502          * allowing VS to avoid recompiling when the FS is recompiled (or a
   2503          * new one is bound using separate shader objects) but the inputs
   2504          * don't change.
   2505          */
   2506         struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
   2507         if (entry) {
   2508                 shader->fs_inputs = entry->key;
   2509                 ralloc_free(inputs.input_slots);
   2510         } else {
   2511                 struct vc4_fs_inputs *alloc_inputs;
   2512 
   2513                 alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
   2514                 memcpy(alloc_inputs, &inputs, sizeof(inputs));
   2515                 ralloc_steal(alloc_inputs, inputs.input_slots);
   2516                 _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);
   2517 
   2518                 shader->fs_inputs = alloc_inputs;
   2519         }
   2520 }
   2521 
   2522 static struct vc4_compiled_shader *
   2523 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
   2524                         struct vc4_key *key)
   2525 {
   2526         struct hash_table *ht;
   2527         uint32_t key_size;
   2528         bool try_threading;
   2529 
   2530         if (stage == QSTAGE_FRAG) {
   2531                 ht = vc4->fs_cache;
   2532                 key_size = sizeof(struct vc4_fs_key);
   2533                 try_threading = vc4->screen->has_threaded_fs;
   2534         } else {
   2535                 ht = vc4->vs_cache;
   2536                 key_size = sizeof(struct vc4_vs_key);
   2537                 try_threading = false;
   2538         }
   2539 
   2540         struct vc4_compiled_shader *shader;
   2541         struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   2542         if (entry)
   2543                 return entry->data;
   2544 
   2545         struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
   2546         /* If the FS failed to compile threaded, fall back to single threaded. */
   2547         if (try_threading && c->failed) {
   2548                 qir_compile_destroy(c);
   2549                 c = vc4_shader_ntq(vc4, stage, key, false);
   2550         }
   2551 
   2552         shader = rzalloc(NULL, struct vc4_compiled_shader);
   2553 
   2554         shader->program_id = vc4->next_compiled_program_id++;
   2555         if (stage == QSTAGE_FRAG) {
   2556                 vc4_setup_compiled_fs_inputs(vc4, c, shader);
   2557 
   2558                 /* Note: the temporary clone in c->s has been freed. */
   2559                 nir_shader *orig_shader = key->shader_state->base.ir.nir;
   2560                 if (orig_shader->info->outputs_written & (1 << FRAG_RESULT_DEPTH))
   2561                         shader->disable_early_z = true;
   2562         } else {
   2563                 shader->num_inputs = c->num_inputs;
   2564 
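                /* Note (added for clarity): turn the per-attribute sizes
                 * into running offsets and record which vertex attributes
                 * are actually read.
                 */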
   2565                 shader->vattr_offsets[0] = 0;
   2566                 for (int i = 0; i < 8; i++) {
   2567                         shader->vattr_offsets[i + 1] =
   2568                                 shader->vattr_offsets[i] + c->vattr_sizes[i];
   2569 
   2570                         if (c->vattr_sizes[i])
   2571                                 shader->vattrs_live |= (1 << i);
   2572                 }
   2573         }
   2574 
   2575         shader->failed = c->failed;
   2576         if (c->failed) {
   2577                 shader->failed = true;
   2578         } else {
   2579                 copy_uniform_state_to_shader(shader, c);
   2580                 shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
   2581                                                  c->qpu_inst_count *
   2582                                                  sizeof(uint64_t));
   2583         }
   2584 
   2585         shader->fs_threaded = c->fs_threaded;
   2586 
   2587         /* Copy the compiler UBO range state to the compiled shader, dropping
   2588          * out arrays that were never referenced by an indirect load.
   2589          *
   2590          * (Note that QIR dead code elimination of an array access still
   2591          * leaves that array alive, though)
   2592          */
   2593         if (c->num_ubo_ranges) {
   2594                 shader->num_ubo_ranges = c->num_ubo_ranges;
   2595                 shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range,
   2596                                                   c->num_ubo_ranges);
   2597                 uint32_t j = 0;
   2598                 for (int i = 0; i < c->num_uniform_ranges; i++) {
   2599                         struct vc4_compiler_ubo_range *range =
   2600                                 &c->ubo_ranges[i];
   2601                         if (!range->used)
   2602                                 continue;
   2603 
   2604                         shader->ubo_ranges[j].dst_offset = range->dst_offset;
   2605                         shader->ubo_ranges[j].src_offset = range->src_offset;
   2606                         shader->ubo_ranges[j].size = range->size;
    2607                         shader->ubo_size += range->size;
   2608                         j++;
   2609                 }
   2610         }
   2611         if (shader->ubo_size) {
   2612                 if (vc4_debug & VC4_DEBUG_SHADERDB) {
   2613                         fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
   2614                                 qir_get_stage_name(c->stage),
   2615                                 c->program_id, c->variant_id,
   2616                                 shader->ubo_size / 4);
   2617                 }
   2618         }
   2619 
   2620         qir_compile_destroy(c);
   2621 
   2622         struct vc4_key *dup_key;
   2623         dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
   2624         memcpy(dup_key, key, key_size);
   2625         _mesa_hash_table_insert(ht, dup_key, shader);
   2626 
   2627         return shader;
   2628 }
   2629 
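         /* Fills in the key fields shared between the FS and VS keys: each
          * sampler's format, swizzle, and wrap/compare state (or MSAA
          * dimensions for multisampled textures), plus the enabled user clip
          * planes.
          */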
   2630 static void
   2631 vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
   2632                      struct vc4_texture_stateobj *texstate)
   2633 {
   2634         for (int i = 0; i < texstate->num_textures; i++) {
   2635                 struct pipe_sampler_view *sampler = texstate->textures[i];
   2636                 struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);
   2637                 struct pipe_sampler_state *sampler_state =
   2638                         texstate->samplers[i];
   2639 
   2640                 if (!sampler)
   2641                         continue;
   2642 
   2643                 key->tex[i].format = sampler->format;
   2644                 key->tex[i].swizzle[0] = sampler->swizzle_r;
   2645                 key->tex[i].swizzle[1] = sampler->swizzle_g;
   2646                 key->tex[i].swizzle[2] = sampler->swizzle_b;
   2647                 key->tex[i].swizzle[3] = sampler->swizzle_a;
   2648 
   2649                 if (sampler->texture->nr_samples > 1) {
   2650                         key->tex[i].msaa_width = sampler->texture->width0;
   2651                         key->tex[i].msaa_height = sampler->texture->height0;
    2652                 } else {
   2653                         key->tex[i].compare_mode = sampler_state->compare_mode;
   2654                         key->tex[i].compare_func = sampler_state->compare_func;
   2655                         key->tex[i].wrap_s = sampler_state->wrap_s;
   2656                         key->tex[i].wrap_t = sampler_state->wrap_t;
   2657                         key->tex[i].force_first_level =
   2658                                 vc4_sampler->force_first_level;
   2659                 }
   2660         }
   2661 
   2662         key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
   2663 }
   2664 
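         /* Rebuilds the FS key from the current state and fetches (or
          * compiles) the matching variant whenever any state feeding the key
          * is dirty.  If the new variant's flat-shaded color inputs or FS
          * input layout differ from the previous shader's, the corresponding
          * derived state is flagged dirty as well.
          */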
   2665 static void
   2666 vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
   2667 {
   2668         struct vc4_job *job = vc4->job;
   2669         struct vc4_fs_key local_key;
   2670         struct vc4_fs_key *key = &local_key;
   2671 
   2672         if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
   2673                             VC4_DIRTY_BLEND |
   2674                             VC4_DIRTY_FRAMEBUFFER |
   2675                             VC4_DIRTY_ZSA |
   2676                             VC4_DIRTY_RASTERIZER |
   2677                             VC4_DIRTY_SAMPLE_MASK |
   2678                             VC4_DIRTY_FRAGTEX |
   2679                             VC4_DIRTY_UNCOMPILED_FS))) {
   2680                 return;
   2681         }
   2682 
   2683         memset(key, 0, sizeof(*key));
   2684         vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
   2685         key->base.shader_state = vc4->prog.bind_fs;
   2686         key->is_points = (prim_mode == PIPE_PRIM_POINTS);
   2687         key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
   2688                          prim_mode <= PIPE_PRIM_LINE_STRIP);
   2689         key->blend = vc4->blend->rt[0];
   2690         if (vc4->blend->logicop_enable) {
   2691                 key->logicop_func = vc4->blend->logicop_func;
   2692         } else {
   2693                 key->logicop_func = PIPE_LOGICOP_COPY;
   2694         }
   2695         if (job->msaa) {
   2696                 key->msaa = vc4->rasterizer->base.multisample;
   2697                 key->sample_coverage = (vc4->rasterizer->base.multisample &&
   2698                                         vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
   2699                 key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
   2700                 key->sample_alpha_to_one = vc4->blend->alpha_to_one;
   2701         }
   2702 
   2703         if (vc4->framebuffer.cbufs[0])
   2704                 key->color_format = vc4->framebuffer.cbufs[0]->format;
   2705 
   2706         key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
   2707         key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
   2708         key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
   2709         key->depth_enabled = (vc4->zsa->base.depth.enabled ||
   2710                               key->stencil_enabled);
   2711         if (vc4->zsa->base.alpha.enabled) {
   2712                 key->alpha_test = true;
   2713                 key->alpha_test_func = vc4->zsa->base.alpha.func;
   2714         }
   2715 
   2716         if (key->is_points) {
   2717                 key->point_sprite_mask =
   2718                         vc4->rasterizer->base.sprite_coord_enable;
   2719                 key->point_coord_upper_left =
   2720                         (vc4->rasterizer->base.sprite_coord_mode ==
   2721                          PIPE_SPRITE_COORD_UPPER_LEFT);
   2722         }
   2723 
   2724         key->light_twoside = vc4->rasterizer->base.light_twoside;
   2725 
   2726         struct vc4_compiled_shader *old_fs = vc4->prog.fs;
   2727         vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
   2728         if (vc4->prog.fs == old_fs)
   2729                 return;
   2730 
   2731         vc4->dirty |= VC4_DIRTY_COMPILED_FS;
   2732 
   2733         if (vc4->rasterizer->base.flatshade &&
   2734             old_fs && vc4->prog.fs->color_inputs != old_fs->color_inputs) {
   2735                 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
   2736         }
   2737 
   2738         if (old_fs && vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
   2739                 vc4->dirty |= VC4_DIRTY_FS_INPUTS;
   2740 }
   2741 
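         /* Rebuilds the VS key (including the current FS's input layout and
          * the bound vertex attribute formats) and updates both the vertex
          * shader and the coordinate-shader variant used for binning.
          */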
   2742 static void
   2743 vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
   2744 {
   2745         struct vc4_vs_key local_key;
   2746         struct vc4_vs_key *key = &local_key;
   2747 
   2748         if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
   2749                             VC4_DIRTY_RASTERIZER |
   2750                             VC4_DIRTY_VERTTEX |
   2751                             VC4_DIRTY_VTXSTATE |
   2752                             VC4_DIRTY_UNCOMPILED_VS |
   2753                             VC4_DIRTY_FS_INPUTS))) {
   2754                 return;
   2755         }
   2756 
   2757         memset(key, 0, sizeof(*key));
   2758         vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
   2759         key->base.shader_state = vc4->prog.bind_vs;
   2760         key->fs_inputs = vc4->prog.fs->fs_inputs;
   2761         key->clamp_color = vc4->rasterizer->base.clamp_vertex_color;
   2762 
   2763         for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
   2764                 key->attr_formats[i] = vc4->vtx->pipe[i].src_format;
   2765 
   2766         key->per_vertex_point_size =
   2767                 (prim_mode == PIPE_PRIM_POINTS &&
   2768                  vc4->rasterizer->base.point_size_per_vertex);
   2769 
   2770         struct vc4_compiled_shader *vs =
   2771                 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
   2772         if (vs != vc4->prog.vs) {
   2773                 vc4->prog.vs = vs;
   2774                 vc4->dirty |= VC4_DIRTY_COMPILED_VS;
   2775         }
   2776 
   2777         key->is_coord = true;
   2778         /* Coord shaders don't care what the FS inputs are. */
   2779         key->fs_inputs = NULL;
   2780         struct vc4_compiled_shader *cs =
   2781                 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
   2782         if (cs != vc4->prog.cs) {
   2783                 vc4->prog.cs = cs;
   2784                 vc4->dirty |= VC4_DIRTY_COMPILED_CS;
   2785         }
   2786 }
   2787 
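         /* Updates the compiled CS/VS/FS for the current state; returns false
          * if any of them failed to compile.
          */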
   2788 bool
   2789 vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
   2790 {
   2791         vc4_update_compiled_fs(vc4, prim_mode);
   2792         vc4_update_compiled_vs(vc4, prim_mode);
   2793 
   2794         return !(vc4->prog.cs->failed ||
   2795                  vc4->prog.vs->failed ||
   2796                  vc4->prog.fs->failed);
   2797 }
   2798 
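         /* The shader cache keys are hashed and compared as raw bytes, which
          * relies on the callers memset()ing the whole key struct before
          * filling it in (as vc4_update_compiled_fs/vs do) so that padding
          * bytes don't cause spurious cache misses.
          */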
   2799 static uint32_t
   2800 fs_cache_hash(const void *key)
   2801 {
   2802         return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
   2803 }
   2804 
   2805 static uint32_t
   2806 vs_cache_hash(const void *key)
   2807 {
   2808         return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
   2809 }
   2810 
   2811 static bool
   2812 fs_cache_compare(const void *key1, const void *key2)
   2813 {
   2814         return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
   2815 }
   2816 
   2817 static bool
   2818 vs_cache_compare(const void *key1, const void *key2)
   2819 {
   2820         return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
   2821 }
   2822 
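         /* fs_inputs_set deduplicates FS input layouts so that compiled
          * shaders with identical inputs share one vc4_fs_inputs, letting
          * later code compare fs_inputs by pointer (see the
          * VC4_DIRTY_FS_INPUTS check above).
          */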
   2823 static uint32_t
   2824 fs_inputs_hash(const void *key)
   2825 {
   2826         const struct vc4_fs_inputs *inputs = key;
   2827 
   2828         return _mesa_hash_data(inputs->input_slots,
   2829                                sizeof(*inputs->input_slots) *
   2830                                inputs->num_inputs);
   2831 }
   2832 
   2833 static bool
   2834 fs_inputs_compare(const void *key1, const void *key2)
   2835 {
   2836         const struct vc4_fs_inputs *inputs1 = key1;
   2837         const struct vc4_fs_inputs *inputs2 = key2;
   2838 
   2839         return (inputs1->num_inputs == inputs2->num_inputs &&
   2840                 memcmp(inputs1->input_slots,
   2841                        inputs2->input_slots,
   2842                        sizeof(*inputs1->input_slots) *
   2843                        inputs1->num_inputs) == 0);
   2844 }
   2845 
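         /* Drops a cached compiled variant if it was built from the
          * uncompiled shader being deleted, freeing its BO and allocation.
          */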
   2846 static void
   2847 delete_from_cache_if_matches(struct hash_table *ht,
   2848                              struct hash_entry *entry,
   2849                              struct vc4_uncompiled_shader *so)
   2850 {
   2851         const struct vc4_key *key = entry->key;
   2852 
   2853         if (key->shader_state == so) {
   2854                 struct vc4_compiled_shader *shader = entry->data;
   2855                 _mesa_hash_table_remove(ht, entry);
   2856                 vc4_bo_unreference(&shader->bo);
   2857                 ralloc_free(shader);
   2858         }
   2859 }
   2860 
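         /* CSO delete hook: removes every compiled variant of this shader
          * from the FS and VS caches before freeing the uncompiled state.
          */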
   2861 static void
   2862 vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
   2863 {
   2864         struct vc4_context *vc4 = vc4_context(pctx);
   2865         struct vc4_uncompiled_shader *so = hwcso;
   2866 
   2867         struct hash_entry *entry;
   2868         hash_table_foreach(vc4->fs_cache, entry)
   2869                 delete_from_cache_if_matches(vc4->fs_cache, entry, so);
   2870         hash_table_foreach(vc4->vs_cache, entry)
   2871                 delete_from_cache_if_matches(vc4->vs_cache, entry, so);
   2872 
   2873         ralloc_free(so->base.ir.nir);
   2874         free(so);
   2875 }
   2876 
   2877 static void
   2878 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
   2879 {
   2880         struct vc4_context *vc4 = vc4_context(pctx);
   2881         vc4->prog.bind_fs = hwcso;
   2882         vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
   2883 }
   2884 
   2885 static void
   2886 vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
   2887 {
   2888         struct vc4_context *vc4 = vc4_context(pctx);
   2889         vc4->prog.bind_vs = hwcso;
   2890         vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
   2891 }
   2892 
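         /* Hooks up the shader CSO entry points and creates the per-context
          * shader caches and the FS-inputs deduplication set.
          */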
   2893 void
   2894 vc4_program_init(struct pipe_context *pctx)
   2895 {
   2896         struct vc4_context *vc4 = vc4_context(pctx);
   2897 
   2898         pctx->create_vs_state = vc4_shader_state_create;
   2899         pctx->delete_vs_state = vc4_shader_state_delete;
   2900 
   2901         pctx->create_fs_state = vc4_shader_state_create;
   2902         pctx->delete_fs_state = vc4_shader_state_delete;
   2903 
   2904         pctx->bind_fs_state = vc4_fp_state_bind;
   2905         pctx->bind_vs_state = vc4_vp_state_bind;
   2906 
   2907         vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
   2908                                                 fs_cache_compare);
   2909         vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
   2910                                                 vs_cache_compare);
   2911         vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
   2912                                               fs_inputs_compare);
   2913 }
   2914 
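         /* Frees all cached compiled shaders (and their BOs) at context
          * teardown.
          */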
   2915 void
   2916 vc4_program_fini(struct pipe_context *pctx)
   2917 {
   2918         struct vc4_context *vc4 = vc4_context(pctx);
   2919 
   2920         struct hash_entry *entry;
   2921         hash_table_foreach(vc4->fs_cache, entry) {
   2922                 struct vc4_compiled_shader *shader = entry->data;
   2923                 vc4_bo_unreference(&shader->bo);
   2924                 ralloc_free(shader);
   2925                 _mesa_hash_table_remove(vc4->fs_cache, entry);
   2926         }
   2927 
   2928         hash_table_foreach(vc4->vs_cache, entry) {
   2929                 struct vc4_compiled_shader *shader = entry->data;
   2930                 vc4_bo_unreference(&shader->bo);
   2931                 ralloc_free(shader);
   2932                 _mesa_hash_table_remove(vc4->vs_cache, entry);
   2933         }
   2934 }
   2935